In [41]:
# PyData
import pandas
import numpy
from pandas import DataFrame, Series
from sklearn import datasets, linear_model, preprocessing, cross_validation
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.linear_model import LinearRegression, Ridge

# System
import datetime
import os
import math

# Graphing
#%matplotlib inline # Only works on Python 3 in the docker container
#import seaborn # Only works on Python 3 in the docker container

#os.environ['PYSPARK_PYTHON'] = 'python2'

%matplotlib inline


In [2]:
# Load the featurized training set.
# pandas.read_csv replaces the deprecated DataFrame.from_csv (removed in newer
# pandas releases).  parse_dates=True mirrors from_csv's default; it only asks
# pandas to try parsing the index, and index_col=None means no CSV column is
# used as the index here.
train_df = pandas.read_csv("train-featurized.csv", index_col=None, parse_dates=True)
train_df.head()

Unnamed: 0,Date,Store,Sales,DayOfWeek,Open,Promo,StateHoliday,SchoolHoliday,cal:dayOfMonth,cal:dayOfWeek,cal:dayofyear,cal:month,cal:quarter,cal:weekofyear,cal:year,Customers_predicted,Sales_predicted_by_mean
0,2015-07-31 00:00:00,1,5263,5,1,1,0,1,31,4,212,7,3,31,2015,588,5145
1,2015-07-31 00:00:00,2,6064,5,1,1,0,1,31,4,212,7,3,31,2015,600,5115
2,2015-07-31 00:00:00,3,8314,5,1,1,0,1,31,4,212,7,3,31,2015,838,8138
3,2015-07-31 00:00:00,4,13995,5,1,1,0,1,31,4,212,7,3,31,2015,1373,10275
4,2015-07-31 00:00:00,5,4822,5,1,1,0,1,31,4,212,7,3,31,2015,598,5308


# Model

In [3]:
def score(predicted, actual):
    """Return the root mean squared percentage error (RMSPE) of a prediction.

    Rows whose actual value is 0 are left out, because the percentage error
    divides by the actual value.

    Parameters
    ----------
    predicted : array-like or Series
        Predicted values.
    actual : array-like or Series
        Observed values paired with ``predicted``.

    Returns
    -------
    float
        ``sqrt(mean(((actual - predicted) / actual) ** 2))`` over the rows
        with a non-zero actual value; NaN when there is no such row.
    """
    score_df = DataFrame.from_dict({"Predicted": predicted, "Actual": actual})
    nonzero_df = score_df[score_df['Actual'] != 0]

    # Compute on standalone Series instead of writing helper columns back onto
    # the filtered frame; assigning columns to a filtered slice is what raises
    # pandas' chained-assignment (SettingWithCopy) warning.
    percent_error = (nonzero_df['Actual'] - nonzero_df['Predicted']) / nonzero_df['Actual']
    percent_error_squared = percent_error * percent_error

    return math.sqrt(percent_error_squared.mean())

In [42]:
# Feature columns for the global model: every column of train_df except the
# target 'Sales', 'Customers' and 'Date'.
# NOTE(review): the preview of train_df shows an 'Unnamed: 0' column, which
# stays in this list -- presumably a row number left over from an earlier
# CSV export; confirm it is meant to be a model feature.
features = [col for col in train_df.columns if col not in ('Sales', 'Customers', 'Date')]

# random_state pins the randomized tree construction so the score computed
# from these predictions can be reproduced on a re-run.
regressor_big = ExtraTreesRegressor(n_estimators=30, n_jobs=-1, random_state=0)

# Out-of-fold predictions for every training row.  cross_val_predict fits
# clones of the estimator, so regressor_big itself is still unfitted afterwards.
predictions = cross_validation.cross_val_predict(regressor_big, train_df[features], train_df['Sales'])

In [43]:
# RMSPE of the current `predictions` against the true training sales.
rmspe_big_model = score(actual=train_df['Sales'], predicted=predictions)
rmspe_big_model

0.3974366853563966

In [34]:
# Store the current `predictions` array as a column of the training frame.
# This adds a column to train_df in place, so any feature list built from
# train_df.columns after this cell has run will contain 'Predictions-big_model'.
train_df['Predictions-big_model'] = predictions

In [47]:
# Fit one LinearRegression per store and collect its cross-validated sales
# predictions, keyed by store id.
predictions_dfs = {}

# Use names of their own here: the global `features` list and `predictions`
# array belong to regressor_big and are read again by other cells, so this
# cell must not overwrite them.
# NOTE(review): if 'Predictions-big_model' has already been added to train_df
# it is part of store_features (as it was in the original list built here);
# confirm that feeding the global model's predictions to the per-store models
# is intended.
store_features = [col for col in train_df.columns if col not in ('Sales', 'Customers', 'Date')]

# groupby splits the frame once instead of re-filtering every row per store.
for i, (store, store_rows) in enumerate(train_df.groupby('Store')):
    if i % 100 == 0:
        print(i)  # coarse progress indicator
    store_df = store_rows.copy()

    # Alternative tried: ExtraTreesRegressor(n_estimators=20, n_jobs=-1)
    regressor = LinearRegression()
    store_predictions = cross_validation.cross_val_predict(regressor, store_df[store_features], store_df['Sales'])
    store_df['Sales-prediction'] = store_predictions
    predictions_dfs[store] = store_df


0
100
200
300
400
500
600
700
800
900
1000
1100


In [48]:
# Stack the per-store frames (each carrying a 'Sales-prediction' column) into one frame.
per_store_frames = list(predictions_dfs.values())
predictions_df = pandas.concat(per_store_frames)

In [49]:
# RMSPE of the per-store models' cross-validated predictions.
rmspe_per_store = score(actual=predictions_df['Sales'], predicted=predictions_df['Sales-prediction'])
rmspe_per_store

0.24522221269361744

In [16]:
# RMSPE of the 'Sales_predicted_by_mean' column, for comparison with the model scores.
rmspe_mean_baseline = score(actual=train_df['Sales'], predicted=train_df['Sales_predicted_by_mean'])
rmspe_mean_baseline

0.2297564142278406

# Submit

In [None]:
# Load the two test feature files, both indexed by (Date, Store) so they can be joined.
# pandas.read_csv replaces the deprecated DataFrame.from_csv (removed in newer
# pandas releases); parse_dates=True mirrors from_csv's default of trying to
# parse the index as dates.
test_index_cols = ['Date', 'Store']
df_basic = pandas.read_csv("test_features_basic.csv", index_col=test_index_cols, parse_dates=True)
df_means = pandas.read_csv("test-features-predicted_mean.csv", index_col=test_index_cols, parse_dates=True)

In [None]:
# Left join on the shared index: keep every row of df_means and attach the
# matching df_basic columns.
df_test_features = df_means.join(df_basic, how='left')

In [None]:
# Replace every missing value with 0, then convert the mean-based sales
# prediction to integers.
df_test_features = df_test_features.fillna(0)
df_test_features['Sales_predicted'] = df_test_features['Sales_predicted'].map(int)

In [None]:
# Turn the index levels back into ordinary columns and discard 'Date'.
df_test_features = df_test_features.reset_index().drop('Date', axis=1)

In [None]:
# Index the test rows by 'Id' and preview the result.
df_test_features = df_test_features.set_index('Id')
df_test_features.head()

In [None]:

# Preview the first rows of the test feature frame.
df_test_features.head()

In [None]:
# Number of columns in the test feature frame.
df_test_features.shape[1]

In [None]:
# Show the feature column list that is used to select the model inputs.
features

In [None]:

# cross_val_predict only fits clones of the estimator, so regressor_big has
# never been fitted itself and predict() would raise.  Fit it on the full
# training set, with the same `features` list used for the test frame, first.
regressor_big.fit(train_df[features], train_df['Sales'])
predictions = regressor_big.predict(df_test_features[features])

In [None]:
# Attach the model's predictions to the test frame as a 'Predictions' column.
df_test_features = df_test_features.assign(Predictions=predictions)

In [None]:
# Sanity check; the argument names do not fit this use.  The mean-based
# prediction is passed as `predicted` and the model's prediction as `actual`,
# so the result measures how far the two predictions are apart.
score(predicted=df_test_features['Sales_predicted'], actual=df_test_features['Predictions'])

In [None]:
# Write the submission file: the frame's index plus the predictions in a
# single column named 'Sales'.
ensemble_results = df_test_features[['Predictions']].rename(columns={'Predictions': 'Sales'})
ensemble_results.to_csv("results-ensemble.csv")

In [None]:
 # Preview the first rows of the frame that was written to the submission file.
 ensemble_results.head()