In [2]:
%matplotlib inline
import pandas as pd, numpy as np, matplotlib.pyplot as plt
from sklearn import grid_search, linear_model as lm, cross_validation as cv, tree, preprocessing as prep

iowa = pd.read_csv('IowaLiquor.csv',parse_dates=[0])
iowa.columns = ['Date','StoreNo','City','Zip','CountyNo','County','CategoryNo','Category','VendorNo','ItemNo','Item','Volume','Cost','Retail','Quantity','TotSales','TotVolume','TotVolumeGals']
iowa

Unnamed: 0,Date,StoreNo,City,Zip,CountyNo,County,CategoryNo,Category,VendorNo,ItemNo,Item,Volume,Cost,Retail,Quantity,TotSales,TotVolume,TotVolumeGals
0,2015-11-04,3717,SUMNER,50674,9.0,Bremer,1051100.0,APRICOT BRANDIES,55,54436,Mr. Boston Apricot Brandy,750,$4.50,$6.75,12,$81.00,9.00,2.38
1,2016-03-02,2614,DAVENPORT,52807,82.0,Scott,1011100.0,BLENDED WHISKIES,395,27605,Tin Cup,750,$13.75,$20.63,2,$41.26,1.50,0.40
2,2016-02-11,2106,CEDAR FALLS,50613,7.0,Black Hawk,1011200.0,STRAIGHT BOURBON WHISKIES,65,19067,Jim Beam,1000,$12.59,$18.89,24,$453.36,24.00,6.34
3,2016-02-03,2501,AMES,50010,85.0,Story,1071100.0,AMERICAN COCKTAILS,395,59154,1800 Ultimate Margarita,1750,$9.50,$14.25,6,$85.50,10.50,2.77
4,2015-08-18,3654,BELMOND,50421,99.0,Wright,1031080.0,VODKA 80 PROOF,297,35918,Five O'clock Vodka,1750,$7.20,$10.80,12,$129.60,21.00,5.55
5,2015-04-20,2569,CEDAR RAPIDS,52402,57.0,Linn,1041100.0,AMERICAN DRY GINS,205,31473,New Amsterdam Gin,1750,$13.32,$19.98,6,$119.88,10.50,2.77
6,2015-08-05,2596,OTTUMWA,52501,90.0,Wapello,1051010.0,AMERICAN GRAPE BRANDIES,85,52806,Korbel Brandy,750,$6.66,$9.99,3,$29.97,2.25,0.59
7,2015-06-25,3456,CLEAR LAKE,50428,17.0,Cerro Gordo,1012100.0,CANADIAN WHISKIES,65,10628,Canadian Club Whisky,1750,$15.75,$23.63,2,$47.26,3.50,0.92
8,2016-01-04,4757,BONDURANT,50035,77.0,Polk,1032080.0,IMPORTED VODKA,370,34006,Absolut Swedish Vodka 80 Prf,750,$11.49,$17.24,4,$68.96,3.00,0.79
9,2015-11-10,4346,SHELLSBURG,52332,6.0,Benton,1081315.0,CINNAMON SCHNAPPS,65,82610,Dekuyper Hot Damn!,1000,$7.62,$11.43,2,$22.86,2.00,0.53


In [3]:
#Change dollar columns to a number
iowa['TotSales'] = iowa[['TotSales']].replace('[\$,]','',regex=True).astype(float)
iowa['Retail'] = iowa[['Retail']].replace('[\$,]','',regex=True).astype(float)
iowa['Cost'] = iowa[['Cost']].replace('[\$,]','',regex=True).astype(float)

In [4]:
import datetime as dt
#Add the Year variable
iowa['Year'] = pd.DatetimeIndex(iowa['Date']).year
#Quarter
iowa['Quarter'] = pd.DatetimeIndex(iowa['Date']).quarter
#TotalMargin
iowa['TotMargin'] = (iowa['Retail']-iowa['Cost'])*iowa['Quantity']
#PricePerLitre
iowa['PricePerLitre'] = iowa['TotSales']/iowa['Quantity']

In [5]:
daterange = lambda x:np.max(x)-np.min(x)

def aggsales(year=0,quarter=0):
    
    if year != 0 and quarter!=0:
        df = iowa[(iowa['Year']==year) & (iowa['Quarter']==quarter)]
    else:
        df = iowa
    
    dfout = pd.pivot_table(df,
                   index=['Quarter','StoreNo'],
                   values=['TotSales',
                           'Date',
                           'Quantity',
                           'TotMargin',
                           'TotVolume'
                          ],
                   aggfunc = {'TotSales':{'TotSales':np.sum},
                              'TotMargin':{'TotMargin':np.sum},
                              'TotVolume':{'TotVolume':np.sum},
                              'Quantity':{'TotQty':np.sum,'TotNoSales':len},
                              'Date':{'DateRange':daterange},
                             })
    
    dfout.columns = dfout.columns.droplevel()
    dfout['AvgBottlePrice'] = dfout['TotSales']/dfout['TotQty']
    dfout['AvgSaleAmount'] = dfout['TotSales']/dfout['TotNoSales']
    dfout['AvgPricePerLitre'] = dfout['TotSales']/dfout['TotQty']
    dfout['AvgMarginPercent'] = dfout['TotMargin']/dfout['TotSales']
    
    return dfout

sales_2015q1 = aggsales(2015,1)
sales_2016q1 = aggsales(2016,1)
sales_agg = aggsales()
sales_2015q1

Unnamed: 0_level_0,Unnamed: 1_level_0,DateRange,TotMargin,TotNoSales,TotQty,TotVolume,TotSales,AvgBottlePrice,AvgSaleAmount,AvgPricePerLitre,AvgMarginPercent
Quarter,StoreNo,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,2106,77 days,13108.37,129,2705,2526.10,39287.29,14.523952,304.552636,14.523952,0.333654
1,2113,77 days,944.72,42,196,177.11,2833.25,14.455357,67.458333,14.455357,0.333440
1,2130,77 days,8108.68,87,1533,1447.25,24272.57,15.833379,278.995057,15.833379,0.334068
1,2152,77 days,668.60,32,154,151.74,2003.46,13.009481,62.608125,13.009481,0.333723
1,2178,77 days,1961.28,48,490,409.81,5856.41,11.951857,122.008542,11.951857,0.334895
1,2190,84 days,9828.30,347,2557,1666.58,29452.92,11.518545,84.878732,11.518545,0.333695
1,2191,84 days,9723.40,151,1868,1957.28,29085.57,15.570434,192.619669,15.570434,0.334303
1,2200,77 days,1641.80,84,338,367.72,4900.43,14.498314,58.338452,14.498314,0.335032
1,2205,84 days,2138.97,70,466,375.38,6407.74,13.750515,91.539143,13.750515,0.333810
1,2228,84 days,1736.43,60,372,405.62,5193.97,13.962285,86.566167,13.962285,0.334317


In [None]:
treeReg = tree.DecisionTreeRegressor()
linReg = lm.LinearRegression()
TSReg = lm.TheilSenRegressor()
RANReg = lm.RANSACRegressor()

In [None]:
clf = grid_search.GridSearchCV(ols,{'max_depth':range(1,10)})

clf.fit(train_X,train_Y)
clf.best_estimator_.max_depth