In [48]:
import numpy as np
import pandas as pd
import matplotlib as plt
from scipy.stats import lognorm, zscore
%matplotlib inline 
from IPython.display import display
from datetime import datetime 
import matplotlib.dates as mdates
from sklearn.cross_validation import train_test_split
from scipy.stats import linregress
from sklearn.ensemble import RandomForestRegressor 
from sklearn.preprocessing import LabelEncoder  


In [8]:
fig_size = plt.rcParams["figure.figsize"]

# Set figure width to 12 and height to 9
fig_size[0] = 12
fig_size[1] = 9
plt.rcParams["figure.figsize"] = fig_size

#### Importing file

In [3]:
df=pd.read_csv('crap.csv', header=0)
pd.set_option('display.max_colwidth', 50)


1.1 Outlier Detection
   Two cases come to mind:
       a) An input error by the contributor. This may an incorrect placement of a decimal point (i.e. quantity=1.00 -> quantity=100.).  The mistake can happen while inputing the price, quantity, size, units. Each of these contribute to the normalized price which will we use to find the outliers. 
       b) Given that contributors are financially compensated for the contributions, fraudulent submissions are innevitble. This often leads to a cat-and-mouse situation and the outlier algorithm needs to be updated
       
1.2 Not quite guassian Log-Normal plot(?):
    Giving that we do not expect prices to go below 0 and that decimal point mistakes can change the price values by order of magnitudes above and below the real price, a guassian distribution may not be suitable to model prices. On the otherhand, the log of the prices could follow a normal distributions so we use that.     
    
1.3 Method:
    After transforming the normalized price column to Log(normalized price), we define an outlier as a data point likes beyond 3 sigma of the mean. 

In [10]:
#Empty dataframe to fill later
outliers = pd.DataFrame()

#Create dataframes who's content has ben grouped by Product
#groupedDf = df.groupby(["p_item_product_lc","city"])
groupedDf = df.groupby(["p_item_product_lc"])


for prod_place, gp in groupedDf:
    #Add column of Log(normalized prices)
    gp.loc[:,'log_normalized_price'] = gp['normalized_price'].apply(lambda x: np.log(x))
    
    #Calculate z-score
    gp.loc[:,'zScore']= zscore(gp['log_normalized_price'])
    
    #store datapoints at least 1.96 sigma away
    #z = gp.loc[gp["zScore"]>9.96].loc[:,["p_item_product_lc","normalized_price", "zScore","city","l_place_name","u_uuid","t_time"]]
    z = gp[np.abs(gp["zScore"])>1.96]

    if not z.empty : 
        outliers = outliers.append(z)


In [None]:
#display all outliers by product
display(outliers)

I've noticed many Fruit Juice observations have a z score above ten.  Investingating a bit more:

In [None]:
fruitJuice = df[df["p_item_product_lc"]=="Fruit juice"]

fruitJuice.ix[:,"normalized_price"].plot(kind="hist",bins=100, logy=True, title="Fruit Juice Prices")

In [None]:
fruitJuiceOutliers= outliers[outliers["p_item_product_lc"]=="Fruit juice"]
for contributor in fruitJuiceOutliers.groupby("u_uuid"):
    contributor[1].loc[:,"normalized_price"].plot(kind="hist",bins=100, logy=True, xlim=(0,.4),title="Outlier Fruit Juice prices by contributor")

Largest outliers in fruit juices seem to be from one contributor who has been submitting the same picture of a damaged can of fruit juice at various angles over the past 2 years. 

2.1
    i)  
        Contributor
        Location:
        Variations in set of products
    ii)
      

In [None]:
def plotTrend(frame=None, y="", product="",grp=None):
    grouped = frame.groupby(["p_item_product_lc"])
    prod = grouped.get_group("%s" % product) 
    #outlierProd = outliers[outliers["p_item_product_lc"]==product]
    #cleanedProd = prod.drop(outlierProd.index)
    prod.loc[:,"date"]= prod["t_time"].apply(lambda x : datetime.strptime(x.split(".")[0],"%Y-%m-%d %X").date())
    avgCC = prod.groupby(["date"]).mean()
    #display(avgCC)

    if grp is not None:
        for x in prod.groupby(grp):
            if len(x[1].groupby(["date"]))> 1.0:
                height = x[1][y].max() - x[1][y].min()
                print x[1].groupby(["date"]).mean().plot( y = "%s" % y, title="%s Bar Prices (%s)" % (product,x[0]), ylim =(prod[y].min()-5*height,prod[y].max()+5*height))
    else:
        height = prod[y].max() - prod[y].min()
        print prod.plot(x="date",y="%s" % y, title="%s Prices (%s)" % (product,y), ylim =(prod[y].min()-5*height,prod[y].max()+5*height))

    return 


We might want to look at the composition of the sample. i.e..sample by sample is a specific product over/under represented relative to other products.  For example, if normally, one expects on any given day that Coca-Cola bottles make up for 5% of submissions but yet for a specific day they made up 15% of the observations (maybe because contributors did not submit other products as much), then we can expect a bias in our sample.  

i) This question is pretty open ended (probably purposefully).  Price prediction at what level? An apple in a supermarket? any apple in a Accra? Or apples Ghana?   

In [5]:
def addDate(dFrame):
    dFrame.loc[:,"date"]= dFrame["t_time"].apply(lambda x : datetime.strptime(x.split(".")[0],"%Y-%m-%d %X").date())
    return dFrame
df = addDate(df.drop(outliers.index))

Let's calculate the average daily price of each product for every purchase location for every city.

Keep in mind that that we need the additional requirement that all fields be filled.

In [6]:
newDf = pd.DataFrame()
groups = ["p_item_product_lc","city","l_place_name","date"]
df = df.dropna(subset = groups)

#test=df[df["p_item_product_lc"]=="Banana"]
#display(test[test["date"]<datetime(2014, 11, 20).date()][groups])

prodCityPlaceDateGrouped = df.groupby(groups, as_index=True)
prodCityPlaceDate = prodCityPlaceDateGrouped["normalized_price"].agg([np.mean])
prodCityPlaceDate = prodCityPlaceDate.reset_index()
#test=prodCityPlaceDate[prodCityPlaceDate["p_item_product_lc"]=="Banana"]
#display(test[test["date"]<datetime(2014, 11, 20).date()])

#prodCityPlaceDate["normalized_price"]["mean"]
#prodDateGrouped = prodCityPlaceDate.groupby(["city","l_place_name"])
prodDateGrouped = prodCityPlaceDate.groupby(["p_item_product_lc","date"])
unbiased = prodDateGrouped["mean"].agg([np.mean])

biasedGrouped = df.groupby(["p_item_product_lc","date"], as_index=True)
biased = biasedGrouped["normalized_price"].agg([np.mean])
unbiased = unbiased.reset_index()
biased = biased.reset_index()
#unbiased.sort_values(by="date")
#display(unbiased.head())
#display(biased.head())
#plotTrend(frame=unbiased,y="mean",product="Banana")

def estimateBias(unbiased=None, biased=None):
    biases = [] 
    unbiased["bias_est"] = (unbiased["mean"]-biased["mean"])/ biased["mean"]
    for prod,prodDf in unbiased.groupby(["p_item_product_lc"]):
        days = [(e - min(prodDf["date"])).days for e in prodDf["date"]] 
        l = linregress(days,prodDf["bias_est"])
        biases.append(l.intercept)
    return biases

max(estimateBias(unbiased, biased))
#prodDateGrouped["mean"].agg([np.mean])
#prodDate = prodDateGrouped["normalized_price"]["mean"].agg([np.mean])

#["normalized_price"]["mean"]
# test
#test2= test.groupby(level=2)
#test3 = test2["mean"].agg([np.mean])
#for name, prodCityPlace in grouped:
#    if len(prodCityPlace)>10:
#        
#        #prodCityPlace["dailyAvg_normalized_price"] = prodCityPlace["normalized_price"].mean() 
#        newDf = newDf.append(prodCityPlace.iloc[:1,:]))

0.086506281188178283

# Modelling Question

### i) This question is pretty open ended (probably purposefully). Price prediction at what level? An apple in a supermarket? any apple in a Accra? Or apples Ghana?

In [97]:
#unbiased
prodCityPlaceDate.sort_values(by=groups)

X = prodCityPlaceDate.loc[:,groups]
for group in groups:
        X.loc[:,group] = LabelEncoder().fit_transform(X.loc[:,group])
        
X_train, X_test, y_train, y_test = train_test_split(X,prodCityPlaceDate["mean"], test_size=0.33, random_state=42)

rfr = RandomForestRegressor(n_estimators=100)


rfr.fit(X_train,y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [101]:
rfr.score(X_train[:100],y_train[:100])

0.99372651171970405

In [102]:
rfr.score(X_test[:100],y_test[:100])

0.9333589864075843

In [None]:

#TrainDf = prodCityPlaceDate[prodCityPlaceDate["date"]<datetime(2015, 3, 17).date()]
#TestDf = prodCityPlaceDate[prodCityPlaceDate["date"]>=datetime(2015, 3, 17).date()]


TrainDf = sortedNewDF[sortedNewDF["date"]<datetime(2015, 3, 17).date()]
TestDf = sortedNewDF[sortedNewDF["date"]>=datetime(2015, 3, 17).date()]


for t in TrainDf.groupby(["p_item_product_lc"]):
    print t[0]

df