In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import math
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.ensemble import GradientBoostingRegressor, ExtraTreesRegressor, RandomForestRegressor
import sklearn.cross_validation as cross_val

In [16]:
sample = pd.read_csv('Data/sample_submission.csv',index_col='Id')

In [17]:
train = pd.read_csv('Data/train.csv')

In [18]:
store = pd.read_csv('Data/store.csv', index_col='Store')

In [19]:
train.head()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday
0,1,5,2015-07-31,5263,555,1,1,0,1
1,2,5,2015-07-31,6064,625,1,1,0,1
2,3,5,2015-07-31,8314,821,1,1,0,1
3,4,5,2015-07-31,13995,1498,1,1,0,1
4,5,5,2015-07-31,4822,559,1,1,0,1


In [20]:
store.CompetitionDistance.values

array([  1270.,    570.,  14130., ...,   9260.,    870.,   5350.])

In [21]:
store.loc[1]

StoreType                       c
Assortment                      a
CompetitionDistance          1270
CompetitionOpenSinceMonth       9
CompetitionOpenSinceYear     2008
Promo2                          0
Promo2SinceWeek               NaN
Promo2SinceYear               NaN
PromoInterval                 NaN
Name: 1, dtype: object

In [22]:
test = pd.read_csv('Data/test.csv', index_col='Id')

In [23]:
test.head()

Unnamed: 0_level_0,Store,DayOfWeek,Date,Open,Promo,StateHoliday,SchoolHoliday
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,1,4,2015-09-17,1,1,0,0
2,3,4,2015-09-17,1,1,0,0
3,7,4,2015-09-17,1,1,0,0
4,8,4,2015-09-17,1,1,0,0
5,9,4,2015-09-17,1,1,0,0


In [24]:
store.loc[np.isnan(store.CompetitionDistance)]

Unnamed: 0_level_0,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
Store,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
291,d,a,,,,0,,,
622,a,c,,,,0,,,
879,d,a,,,,1,5.0,2013.0,"Feb,May,Aug,Nov"


In [25]:
train.loc[np.isnan(train.Open)]

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday


In [26]:
train["StoreType"] = 1
train["Assortment"] = 1
train["CompDist"] = 1

In [27]:
train.loc[:,["Assortment", "StoreType"]] = store.loc[train.Store,{"StoreType", "Assortment"}].values
train.loc[:,"CompDist"] = store.loc[train.Store, "CompetitionDistance"].values
train.CompDist = train.CompDist.fillna(0)

In [28]:
store.loc[store.CompetitionOpenSinceYear < 1961]

Unnamed: 0_level_0,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
Store,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
815,a,a,590,1,1900,1,40,2014,"Jan,Apr,Jul,Oct"


In [29]:
train.loc[19]

Store                    20
DayOfWeek                 5
Date             2015-07-31
Sales                  9593
Customers               974
Open                      1
Promo                     1
StateHoliday              0
SchoolHoliday             0
StoreType                 d
Assortment                a
CompDist               2340
Name: 19, dtype: object

In [30]:
train.loc[train.Store == 291]

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompDist
290,291,5,2015-07-31,10052,973,1,1,0,1,d,a,0
1405,291,4,2015-07-30,9030,883,1,1,0,1,d,a,0
2520,291,3,2015-07-29,8204,814,1,1,0,1,d,a,0
3635,291,2,2015-07-28,10374,943,1,1,0,1,d,a,0
4750,291,1,2015-07-27,11692,1086,1,1,0,1,d,a,0
5865,291,7,2015-07-26,0,0,0,0,0,0,d,a,0
6980,291,6,2015-07-25,4132,461,1,0,0,0,d,a,0
8095,291,5,2015-07-24,6932,755,1,0,0,1,d,a,0
9210,291,4,2015-07-23,6263,720,1,0,0,1,d,a,0
10325,291,3,2015-07-22,5679,595,1,0,0,1,d,a,0


In [31]:
def type_to_numeric(x):
    if(x == "a"):
        return 1
    elif(x == "b"):
        return 2
    elif(x == "c"):
        return 3
    elif(x == "d"):
        return 4

In [32]:
def assort_to_numeric(x):
    if(x == "a"):
        return 1
    elif(x == "b"):
        return 2
    elif(x == "c"):
        return 3

In [33]:
train.StoreType = train.apply(lambda x: type_to_numeric(x.StoreType), axis = 1)

In [34]:
train.Assortment = train.apply(lambda x: assort_to_numeric(x.Assortment), axis = 1)

In [35]:
import sklearn.linear_model as lm

In [5]:
train = pd.read_csv("Data/ready_data_1.csv")

In [6]:
train_t = train.loc[:,["DayOfWeek", "Store", "Open", "Promo", "SchoolHoliday", "StoreType", "Assortment", "CompDist"]]
train_r = train.loc[:, "Sales"]

In [151]:
train_t.StoreType.values

array([3, 1, 1, ..., 1, 1, 4])

In [166]:
lr_model = lm.LinearRegression(n_jobs=4)

In [167]:
lr_model.fit(train_t.values, train_r.values)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=4, normalize=False)

In [168]:
train.head(7)

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompDist
0,1,5,2015-07-31,5263,555,1,1,0,1,3,1,1270
1,2,5,2015-07-31,6064,625,1,1,0,1,1,1,570
2,3,5,2015-07-31,8314,821,1,1,0,1,1,1,14130
3,4,5,2015-07-31,13995,1498,1,1,0,1,3,3,620
4,5,5,2015-07-31,4822,559,1,1,0,1,1,1,29910
5,6,5,2015-07-31,5651,589,1,1,0,1,1,1,310
6,7,5,2015-07-31,15344,1414,1,1,0,1,1,3,24000


In [169]:
train_t.loc[1]

DayOfWeek          5
Store              2
Open               1
Promo              1
SchoolHoliday      1
StoreType          1
Assortment         1
CompDist         570
Name: 1, dtype: float64

In [176]:
lr_model.predict(train_t.loc[6])

array([ 8112.5384462])

In [13]:
import sklearn.cross_validation as cross_val

In [184]:
score = cross_val.cross_val_score(lr_model, train_t, train_r, scoring='mean_squared_error')
(- score) ** 0.5

array([ 2661.75939396,  2666.00728358,  2543.90994078])

In [10]:
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor

In [None]:
model = RandomForestRegressor(n_estimators = 100, n_jobs = 4, random_state=213)
score = cross_val.cross_val_score(model, train_t, train_r, scoring='mean_squared_error')
(- score) ** 0.5

In [207]:
model = AdaBoostRegressor(RandomForestRegressor(n_estimators=100, n_jobs=4),random_state=213)
score = cross_val.cross_val_score(model, train_t, train_r, scoring='mean_squared_error')
(- score) ** 0.5

array([ 1325.86788605,  1254.0801347 ,  1348.81875763])

In [188]:
model = RandomForestRegressor(n_estimators = 400, n_jobs = 7, random_state=213)
score = cross_val.cross_val_score(model, train_t, train_r, scoring='mean_squared_error')
(- score) ** 0.5

array([ 1249.34842207,  1187.20813561,  1167.96771253])

In [3]:
import sklearn.grid_search as gs

In [245]:
grid_search_cv = gs.GridSearchCV(RandomForestRegressor(random_state=213, n_jobs = -1), {
        'max_depth': (10, None),
        'min_samples_split': (10, 20)}, scoring='mean_squared_error', n_jobs = -1, cv=4)
grid_search_cv.fit(train_t, train_r)

GridSearchCV(cv=4, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=-1, oob_score=False, random_state=213,
           verbose=0, warm_start=False),
       fit_params={}, iid=True, loss_func=None, n_jobs=-1,
       param_grid={'min_samples_split': (10, 20), 'max_depth': (10, None)},
       pre_dispatch='2*n_jobs', refit=True, score_func=None,
       scoring='mean_squared_error', verbose=0)

In [246]:
print(-grid_search_cv.best_score_) ** 0.5
grid_search_cv.best_estimator_

1167.98812152


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=20, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=-1, oob_score=False, random_state=213,
           verbose=0, warm_start=False)

In [7]:
model = RandomForestRegressor(n_estimators = 100, min_samples_leaf=1, n_jobs = 4, random_state=213,
                              min_samples_split=10)
score = cross_val.cross_val_score(model, train_t, train_r, scoring='mea')
(- score) ** 0.5

ValueError: 'mea' is not a valid scoring value. Valid options are ['accuracy', 'adjusted_rand_score', 'average_precision', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'log_loss', 'mean_absolute_error', 'mean_squared_error', 'median_absolute_error', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc']

In [247]:
model = RandomForestRegressor(n_estimators = 100, min_samples_leaf=1, n_jobs = 4, random_state=213,
                              min_samples_split=20)
score = cross_val.cross_val_score(model, train_t, train_r, scoring='mean_squared_error')
(- score) ** 0.5

array([ 1248.30613692,  1186.02022562,  1157.81162403])

In [None]:
model = RandomForestRegressor(n_estimators = 100, min_samples_leaf=1, n_jobs = 4, random_state=213,
                              min_samples_split=40)
score = cross_val.cross_val_score(model, train_t, train_r, scoring='mean_squared_error')
(- score) ** 0.5

In [248]:
from sklearn.ensemble import GradientBoostingRegressor, ExtraTreesRegressor

In [249]:
model = GradientBoostingRegressor(random_state=213)
score = cross_val.cross_val_score(model, train_t, train_r, scoring='mean_squared_error')
(- score) ** 0.5

array([ 2420.64778238,  2397.527671  ,  2291.55121457])

In [250]:
model = ExtraTreesRegressor(random_state=213, n_jobs=-1, n_estimators=100)
score = cross_val.cross_val_score(model, train_t, train_r, scoring='mean_squared_error')
(- score) ** 0.5

array([ 1249.29364295,  1187.0770099 ,  1172.03279546])

In [47]:
from sklearn.svm import SVR

In [51]:
model = SVR()
score = cross_val.cross_val_score(model, train_t[:40000], train_r[:40000], scoring='mean_squared_error')
(- score) ** 0.5

array([ 3464.01132749,  3782.12569713,  4115.82754782])

In [13]:
import time
import datetime

In [21]:
dt = datetime.datetime(*(time.strptime("2015-07-31","%Y-%m-%d")[0:3]))
time.mktime(dt.timetuple())

1438290000.0

### Metrics etc.

In [43]:
def scale(feature):
    return map(lambda x: math.log(x + 1), feature)
def recover(feature):
    return map(lambda x: math.pow(math.e, x) - 1, feature)
def scaleFeatures(dataF, features):
    data1 = dataF.copy()
    for f in features:
        data1[f] = scale(data1[f])
    return data1
def recoveFeatures(dataF, features):
    data1 = dataF.copy()
    for f in features:
        data1[f] = recover(data1[f])
    return data1
def calcRMSPE(test, predict):
    mpe = 0.0
    len1 = 0
    for t,p in zip(test, predict):
        if (t == 0):
            continue
        mpe += math.pow((1.0 * (t - p)) / t, 2)
        len1 +=1
    mpe = mpe / len1
    rmpe = math.sqrt(mpe)
    return rmpe
def getAccuracy(test, predict):
    accs = {}
    accs['RMSE'] = math.sqrt(mean_squared_error(test, predict))
    accs['MAE'] = mean_absolute_error(test, predict)
    accs['RMSPE'] = calcRMSPE(test,predict)
    return accs
def parseDate(data):
    data1 = data.copy()
    data1['Month'] = data1.Date.apply(lambda x: time.strptime(x, "%Y-%m-%d").tm_mon)
    data1['Day'] = data1.Date.apply(lambda x: time.strptime(x, "%Y-%m-%d").tm_mday)
    #data1['Year'] = data1.Date.apply(lambda x: time.strptime(x, "%Y-%m-%d").tm_year)
    return data1
def strToNumeric(data, columns):
    data1 = data.copy()
    for col in columns:
        uniq = data[col].drop_duplicates()
        k = 0
        for u in uniq:
            data1[col] = data1[col].replace(u, k)
            k += 1
    return data1

### tmp model

In [6]:
model = RandomForestRegressor(n_estimators = 10, min_samples_leaf=1, n_jobs = -1, random_state=213, criterion='asd',
                              min_samples_split=20)
#model.fit(train_t, train_r)

In [9]:
X_train, X_test, y_train, y_test = cross_val.train_test_split(train_t, train_r, test_size=0.2, random_state=23)
model.fit(X_train, y_train)
print "TEST: ", getAccuracy(y_test, model.predict(X_test))
print "TRAIN: ", getAccuracy(y_train, model.predict(X_train))

NameError: name 'train_t' is not defined

In [44]:
print "TEST: ", getAccuracy(y_test, model.predict(X_test))
print "TRAIN: ", getAccuracy(y_train, model.predict(X_train))

TEST:  {'RMSPE': 0.18018543608470253, 'MAE': 678.82698923530711, 'RMSE': 1136.67421762252}
TRAIN:  {'RMSPE': 0.23890193752165959, 'MAE': 651.6991772302863, 'RMSE': 1078.6556620723736}


In [55]:
test["StoreType"] = 1
test["Assortment"] = 1
test["CompDist"] = 1

test.loc[:,["Assortment", "StoreType"]] = store.loc[test.Store,{"StoreType", "Assortment"}].values
test.loc[:,"CompDist"] = store.loc[test.Store, "CompetitionDistance"].values
test.CompDist = test.CompDist.fillna(0)

test.StoreType = test.apply(lambda x: type_to_numeric(x.StoreType), axis = 1)
test.Assortment = test.apply(lambda x: assort_to_numeric(x.Assortment), axis = 1)

In [56]:
test_final = test.loc[:,["DayOfWeek", "Store", "Open", "Promo", "SchoolHoliday", "StoreType", "Assortment", "CompDist"]]

In [60]:
test_final_2 = test_final.fillna(0)

In [61]:
prediction = model.predict(test_final_2)

In [89]:
prediction_pd = pd.DataFrame(prediction)

In [79]:
prediction_pd.index.name = 'i'

In [82]:
prediction_pd.head()

Unnamed: 0_level_0,Sales
i,Unnamed: 1_level_1
0,4973.770704
1,7939.378606
2,9026.378349
3,7139.241977
4,6615.210181


In [98]:
sample.head()

Unnamed: 0_level_0,Sales
Id,Unnamed: 1_level_1
1,0
2,0
3,0
4,0
5,0


In [95]:
prediction_pd.columns = {'Sales'}
prediction_pd.index = np.arange(1,41089)
prediction_pd.index.name = "Id"

In [97]:
prediction_pd.head()

Unnamed: 0_level_0,Sales
Id,Unnamed: 1_level_1
1,4973.770704
2,7939.378606
3,9026.378349
4,7139.241977
5,6615.210181


In [100]:
prediction_pd.to_csv('Result/first.csv', index_label='Id')

In [101]:
tmp = pd.read_csv('Result/first.csv',index_col='Id')

In [102]:
tmp.head()

Unnamed: 0_level_0,Sales
Id,Unnamed: 1_level_1
1,4973.770704
2,7939.378606
3,9026.378349
4,7139.241977
5,6615.210181
