# 2. Refine the Data
 
> "Data is messy"

We will be performing the following operation on our Onion price to refine it
- **Remove** e.g. remove redundant data from the data frame
- **Derive** e.g. State and City from the market field
- **Parse** e.g. extract date from year and month column

Other stuff you may need to do to refine are...
- **Missing** e.g. Check for missing or incomplete data
- **Quality** e.g. Check for duplicates, accuracy, unusual data
- **Convert** e.g. free text to coded value
- **Calculate** e.g. percentages, proportion
- **Merge** e.g. first and surname for full name
- **Aggregate** e.g. rollup by year, cluster by area
- **Filter** e.g. exclude based on location
- **Sample** e.g. extract a representative data
- **Summary** e.g. show summary stats like mean

In [46]:
library(plyr)
library(dplyr)

In [47]:
df <- read.csv('MonthWiseMarketArrivals.csv')

In [48]:
dim(df)

In [49]:
head(df)

Unnamed: 0,market,month,year,quantity,priceMin,priceMax,priceMod
1,ABOHAR(PB),January,2005,2350,404,493,446
2,ABOHAR(PB),January,2006,900,487,638,563
3,ABOHAR(PB),January,2010,790,1283,1592,1460
4,ABOHAR(PB),January,2011,245,3067,3750,3433
5,ABOHAR(PB),January,2012,1035,523,686,605
6,ABOHAR(PB),January,2013,675,1327,1900,1605


In [50]:
class(df)

In [51]:
sapply(df , class)

In [52]:
tail(df,1)

Unnamed: 0,market,month,year,quantity,priceMin,priceMax,priceMod
10228,,,Total,783438108,647(Avg),1213(Avg),984(Avg)


In [53]:
last_row <- as.numeric(rownames(tail(df, 1)))

In [54]:
df <- df[-c(last_row),]

In [55]:
dim(df)

In [56]:
# selecting particular columns from the df
head(df %>% 
select(priceMin,priceMax,priceMod))

Unnamed: 0,priceMin,priceMax,priceMod
1,404,493,446
2,487,638,563
3,1283,1592,1460
4,3067,3750,3433
5,523,686,605
6,1327,1900,1605


In [57]:
sapply(head(df %>% 
select(year,quantity,priceMin,priceMax,priceMod)),class)

In [58]:
#we see that all of them are factors, we know that they are integers, so lets convert

In [59]:
df$quantity <- as.numeric(as.character(df$quantity))
df$year     <- as.numeric(as.character(df$year))
df$priceMin <- as.numeric(as.character(df$priceMin))
df$priceMax <- as.numeric(as.character(df$priceMax))
df$priceMod <- as.numeric(as.character(df$priceMod))

In [60]:
head(df)

Unnamed: 0,market,month,year,quantity,priceMin,priceMax,priceMod
1,ABOHAR(PB),January,2005,2350,404,493,446
2,ABOHAR(PB),January,2006,900,487,638,563
3,ABOHAR(PB),January,2010,790,1283,1592,1460
4,ABOHAR(PB),January,2011,245,3067,3750,3433
5,ABOHAR(PB),January,2012,1035,523,686,605
6,ABOHAR(PB),January,2013,675,1327,1900,1605


In [61]:
sapply(head(df %>% select(year,quantity,priceMin,priceMax,priceMod)),class)

In [62]:
sapply(df,class)

In [63]:
library(psych)

In [64]:
do.call(data.frame,list(
                        mean=apply(select(df,quantity,priceMin,priceMax,priceMod),2,mean),
                        sd=apply(select(df,quantity,priceMin,priceMax,priceMod),2,sd),
                        median=apply(select(df,quantity,priceMin,priceMax,priceMod),2,median),
                        min=apply(select(df,quantity,priceMin,priceMax,priceMod),2,min),
                        max=apply(select(df,quantity,priceMin,priceMax,priceMod),2,max)
                        )
        )

Unnamed: 0,mean,sd,median,min,max
quantity,76604.88,124408.7,27460,20,1639032
priceMin,646.9444,673.1219,440,16,6000
priceMax,1212.761,979.6589,923,145,8192
priceMod,984.2843,818.4715,747,80,6400


## Extracting the states from market names

In [65]:
head(count(df,'market'))

Unnamed: 0,"""market""",n
1,market,10227


In [66]:
# we see that city and state are combined, lets split them up
df$market[1]
strsplit(as.character(df$market[1]),'\\(')[[1]][2]

In [67]:
head(df)

Unnamed: 0,market,month,year,quantity,priceMin,priceMax,priceMod
1,ABOHAR(PB),January,2005,2350,404,493,446
2,ABOHAR(PB),January,2006,900,487,638,563
3,ABOHAR(PB),January,2010,790,1283,1592,1460
4,ABOHAR(PB),January,2011,245,3067,3750,3433
5,ABOHAR(PB),January,2012,1035,523,686,605
6,ABOHAR(PB),January,2013,675,1327,1900,1605


In [68]:
unique(df$market)

In [69]:
sapply(df,class)

In [70]:
unlist(strsplit(as.character(df$market),'\\('))[2]

In [71]:
grepl('\\(','BANGALORE')

In [72]:
stateSplit <- function(inpString){
    if(grepl('\\(',as.character(inpString))){
    m <- unlist(strsplit(as.character(inpString),'\\('))
    return(gsub(")","",m[2]))
    } else{
        return(as.character(inpString))
    }
}

In [73]:
citySplit <- function(inpString){
    if(grepl('\\(',as.character(inpString))){
    m <- unlist(strsplit(as.character(inpString),'\\('))
        return(m[1])
    } else{
        return(as.character(inpString))

    }
}

In [74]:
df <- data.frame(mutate(rowwise(df),
             state = stateSplit(market),
             city = citySplit(market)))

                          

In [75]:
sapply(df,class)

In [76]:
head(df)

Unnamed: 0,market,month,year,quantity,priceMin,priceMax,priceMod,state,city
1,ABOHAR(PB),January,2005,2350,404,493,446,PB,ABOHAR
2,ABOHAR(PB),January,2006,900,487,638,563,PB,ABOHAR
3,ABOHAR(PB),January,2010,790,1283,1592,1460,PB,ABOHAR
4,ABOHAR(PB),January,2011,245,3067,3750,3433,PB,ABOHAR
5,ABOHAR(PB),January,2012,1035,523,686,605,PB,ABOHAR
6,ABOHAR(PB),January,2013,675,1327,1900,1605,PB,ABOHAR


In [77]:
tail(df)

Unnamed: 0,market,month,year,quantity,priceMin,priceMax,priceMod,state,city
10222,YEOLA(MS),December,2010,57586,541,2713,1830,MS,YEOLA
10223,YEOLA(MS),December,2011,131326,282,612,526,MS,YEOLA
10224,YEOLA(MS),December,2012,207066,485,1327,1136,MS,YEOLA
10225,YEOLA(MS),December,2013,215883,472,1427,1177,MS,YEOLA
10226,YEOLA(MS),December,2014,201077,446,1654,1456,MS,YEOLA
10227,YEOLA(MS),December,2015,223315,609,1446,1126,MS,YEOLA


In [78]:
unique(df$state)

In [79]:
unique(df$city)

In [80]:
state_now <- c('PB', 'UP', 'GUJ', 'MS', 'RAJ', 'BANGALORE', 'KNT', 'BHOPAL', 'OR',
       'BHR', 'WB', 'CHANDIGARH', 'CHENNAI','UTT',
       'DELHI', 'MP', 'TN', 'Podis', 'GUWAHATI', 'HYDERABAD', 'JAIPUR',
       'WHITE', 'JAMMU', 'HR', 'KOLKATA', 'AP', 'LUCKNOW', 'MUMBAI',
       'NAGPUR', 'KER', 'PATNA', 'CHGARH', 'JH', 'SHIMLA', 'SRINAGAR',
       'TRIVENDRUM')

In [81]:
state_new <- c('PB', 'UP', 'GUJ', 'MS', 'RAJ', 'KNT', 'KNT', 'MP', 'OR',
       'BHR', 'WB', 'CH', 'TN', 'KNT', 'TN', 'UP',
       'DEL', 'MP', 'TN', 'TN', 'ASM', 'AP', 'RAJ',
       'MS', 'JK', 'HR', 'WB', 'AP', 'UP', 'MS',
       'MS', 'KER', 'BHR', 'HR', 'JH', 'HP', 'JK',
       'KEL')

In [82]:
df$state <- replace(df$state,state_now,state_new)

In replace(df$state, state_now, state_new): number of items to replace is not a multiple of replacement length

ERROR: Error in `$<-.data.frame`(`*tmp*`, "state", value = structure(c("PB", : replacement has 10263 rows, data has 10227


In [83]:
unique(df$state)