# 2. Refine the Data
 
> "Data is messy"

We will perform the following operations on our onion price data to refine it:
- **Remove** e.g. remove redundant data from the data frame
- **Derive** e.g. State and City from the market field
- **Parse** e.g. extract the date from the year and month columns

Other steps you may need to take to refine the data are:
- **Missing** e.g. Check for missing or incomplete data
- **Quality** e.g. Check for duplicates, accuracy, unusual data
- **Convert** e.g. free text to coded value
- **Calculate** e.g. percentages, proportion
- **Merge** e.g. first and surname for full name
- **Aggregate** e.g. rollup by year, cluster by area
- **Filter** e.g. exclude based on location
- **Sample** e.g. extract a representative sample of the data
- **Summary** e.g. show summary stats like mean

In [231]:
library(plyr)
library(dplyr)

In [232]:
# Load the month-wise market arrivals data into a data frame
df <- read.csv(file = "MonthWiseMarketArrivals.csv")

In [233]:
# Size of the raw data: number of rows, then number of columns
dim(df)

In [234]:
# Preview the first few rows of the raw data
head(df)

Unnamed: 0,market,month,year,quantity,priceMin,priceMax,priceMod
1,ABOHAR(PB),January,2005,2350,404,493,446
2,ABOHAR(PB),January,2006,900,487,638,563
3,ABOHAR(PB),January,2010,790,1283,1592,1460
4,ABOHAR(PB),January,2011,245,3067,3750,3433
5,ABOHAR(PB),January,2012,1035,523,686,605
6,ABOHAR(PB),January,2013,675,1327,1900,1605


In [235]:
# Check what kind of object read.csv gave us
class(df)

In [236]:
# Class of every column as it was read from the file
sapply(X = df, FUN = class)

In [237]:
# Inspect the very last row of the file
tail(df, n = 1)

Unnamed: 0,market,month,year,quantity,priceMin,priceMax,priceMod
10228,,,Total,783438108,647(Avg),1213(Avg),984(Avg)


In [238]:
# The last row of the file holds the totals/averages (see the tail() output),
# not a market observation, so find its position and drop it.
# The data lives in `df` (read above); no other data frame is defined.
last_row <- nrow(df)

In [239]:
df <- df[-last_row, ]

In [240]:
# Re-check the dimensions after removing the last row
dim(df)

In [241]:
# Pick out just the three price columns and preview them
df %>%
  select(priceMin, priceMax, priceMod) %>%
  head()

Unnamed: 0,priceMin,priceMax,priceMod
1,404,493,446
2,487,638,563
3,1283,1592,1460
4,3067,3750,3433
5,523,686,605
6,1327,1900,1605


In [242]:
# Classes of the columns that should hold numbers
df %>%
  select(year, quantity, priceMin, priceMax, priceMod) %>%
  head() %>%
  sapply(class)

In [243]:
# We see that all of them are factors, but we know they hold numbers, so let's convert them

In [244]:
# Convert the measurement columns to numeric. Going through as.character()
# first makes a factor column yield its labels rather than its internal codes.
cols_to_convert <- c("quantity", "year", "priceMin", "priceMax", "priceMod")
df[cols_to_convert] <- lapply(
  df[cols_to_convert],
  function(column) as.numeric(as.character(column))
)

In [245]:
# Preview the data again after the type conversion
head(df)

Unnamed: 0,market,month,year,quantity,priceMin,priceMax,priceMod
1,ABOHAR(PB),January,2005,2350,404,493,446
2,ABOHAR(PB),January,2006,900,487,638,563
3,ABOHAR(PB),January,2010,790,1283,1592,1460
4,ABOHAR(PB),January,2011,245,3067,3750,3433
5,ABOHAR(PB),January,2012,1035,523,686,605
6,ABOHAR(PB),January,2013,675,1327,1900,1605


In [246]:
# Confirm the classes of the converted columns
df %>%
  select(year, quantity, priceMin, priceMax, priceMod) %>%
  head() %>%
  sapply(class)

In [247]:
# Class of every column in the refined data frame
sapply(X = df, FUN = class)

In [248]:
# Per-column overview of the data frame.
# summarise() needs explicit summary expressions; called bare it failed in the
# recorded run with "length(rows) == 1 is not TRUE". Base R's summary() is the
# function that prints summary statistics for every column.
summary(df)


In [249]:
library(psych)

In [250]:
# Summary statistics for quantity and the three price columns:
# one row per column, one column per statistic.
measures <- select(df, quantity, priceMin, priceMax, priceMod)
data.frame(
  mean = sapply(measures, mean),
  sd = sapply(measures, sd),
  median = sapply(measures, median),
  min = sapply(measures, min),
  max = sapply(measures, max)
)

Unnamed: 0,mean,sd,median,min,max
quantity,76604.88,124408.7,27460,20,1639032
priceMin,646.9444,673.1219,440,16,6000
priceMax,1212.761,979.6589,923,145,8192
priceMod,984.2843,818.4715,747,80,6400


## Extracting the states from market names

In [251]:
# Number of records per market.
# plyr::count is named explicitly: dplyr is attached after plyr, so a bare
# count() resolves to dplyr's version, which reads the quoted "market" as a
# constant value instead of a column name.
head(plyr::count(df, "market"))

Unnamed: 0,market,freq
1,ABOHAR(PB),90
2,AGRA(UP),133
3,AHMEDABAD(GUJ),125
4,AHMEDNAGAR(MS),144
5,AJMER(RAJ),15
6,ALIGARH(UP),16


In [216]:
# We see that city and state are combined in one field, so let's split them up.
# First look at a single market value ...
df$market[1]
# ... then split it on "(" and take the second piece; note that this piece
# still carries the closing ")".
strsplit(as.character(df$market[1]),'\\(')[[1]][2]

In [259]:
# Derive state and city from the combined market name (e.g. "ABOHAR(PB)").
# Each market is split on "(" row by row: the first piece is the city, the
# second piece (with its closing ")" removed) is the state. Markets without
# a "(" get NA as state.
market_parts <- strsplit(as.character(df$market), "\\(")
df <- mutate(
  df,
  state = sub("\\)$", "", vapply(market_parts, `[`, character(1), 2)),
  city = vapply(market_parts, `[`, character(1), 1)
)



In [260]:
head(df)

Unnamed: 0,market,month,year,quantity,priceMin,priceMax,priceMod,state,city
1,ABOHAR(PB),January,2005,2350,404,493,446,PB,ABOHAR
2,ABOHAR(PB),January,2006,900,487,638,563,PB,ABOHAR
3,ABOHAR(PB),January,2010,790,1283,1592,1460,PB,ABOHAR
4,ABOHAR(PB),January,2011,245,3067,3750,3433,PB,ABOHAR
5,ABOHAR(PB),January,2012,1035,523,686,605,PB,ABOHAR
6,ABOHAR(PB),January,2013,675,1327,1900,1605,PB,ABOHAR
