In [None]:
# PyData
import pandas
import numpy
from pandas import DataFrame, Series
from sklearn import datasets, linear_model, preprocessing, cross_validation
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.linear_model import LinearRegression, Ridge

# System
import datetime
import os
import math

# Graphing
#%matplotlib inline # Only works on Python 3 in the docker container
#import seaborn # Only works on Python 3 in the docker container

#os.environ['PYSPARK_PYTHON'] = 'python2'

%matplotlib inline


# Featurize - same as in Spark.ipynb

In [None]:
# Load the pre-computed training feature tables, each indexed by (Date, Store).
# pandas.read_csv replaces DataFrame.from_csv, which is deprecated and was removed
# in pandas 1.0. parse_dates=True reproduces from_csv's default of trying to
# parse the index as dates, so the resulting frames are unchanged.
df_basic = pandas.read_csv("train_features_basic.csv", index_col=['Date', 'Store'], parse_dates=True)
df_means = pandas.read_csv("train-features-predicted_mean.csv", index_col=['Date', 'Store'], parse_dates=True)
df_customers = pandas.read_csv("train-features-predicted_mean-customers.csv", index_col=['Date', 'Store'], parse_dates=True)
del df_means['Sales-prediction_mean_error'] # Not supported

In [None]:
# Target variable: read train.csv indexed by (Date, Store) and keep only Sales.
# pandas.read_csv with parse_dates=True is the drop-in replacement for the
# deprecated DataFrame.from_csv (removed in pandas 1.0).
df_sales = pandas.read_csv("train.csv", index_col=['Date', 'Store'], parse_dates=True)[['Sales']]

In [None]:
# Combine both feature tables (outer join keeps rows present in either one),
# then left-join them and the predicted customer count onto the sales rows.
df_features = df_means.join(df_basic, how='outer')
train_df = (
    df_sales
    .join(df_features)
    .join(df_customers['Customers_predicted'])
)
train_df.head()

In [None]:
# Drop the intermediate frames now that train_df holds the joined data.
# (df_customers is not released here.)
del df_features, df_basic, df_means, df_sales

In [None]:
# Replace missing mean-based sales predictions with the column average, then
# truncate every value to an integer.
train_df['Sales_predicted'] = (
    train_df['Sales_predicted']
    .fillna(train_df['Sales_predicted'].mean())
    .map(int)
)

In [None]:
# Replace missing customer predictions with the average of the *customer*
# column, then truncate to an integer. The previous version filled the gaps
# with the mean of Sales_predicted, a copy-paste slip that injected
# sales-scale values into a customer-count feature.
train_df['Customers_predicted'] = (
    train_df['Customers_predicted']
    .fillna(train_df['Customers_predicted'].mean())
    .map(int)
)

In [None]:
# Turn the (Date, Store) index back into ordinary columns and discard Date,
# leaving Store available as a feature column.
train_df = train_df.reset_index().drop('Date', axis=1)

In [None]:
# Reorder the frame so the target column comes first, followed by every other
# column in its existing order.
remaining_columns = train_df.columns.tolist()
remaining_columns.remove("Sales")
feature_columns = ['Sales'] + remaining_columns
train_df = train_df[feature_columns]

In [None]:

# Preview the first five rows of the reordered training frame.
train_df.head(n=5)

In [None]:
# Hold out 10% of the rows for validation. random_state pins the shuffle so the
# validation scores below are reproducible across kernel restarts.
# NOTE: this rebinds train_df to the 90% split, so re-running this cell on its
# own splits the already-reduced frame again.
train_df, test_df = cross_validation.train_test_split(train_df, train_size=.90, random_state=42)

In [None]:
# Model

In [None]:
def score(predicted, actual):
    """Return the root mean squared percentage error (RMSPE) of a prediction.

    Rows whose actual value is exactly 0 are dropped before averaging, since
    the percentage error is undefined there.

    Args:
        predicted: Predicted values (list, array or Series).
        actual: Reference values of the same length. The two inputs are
            combined through the DataFrame constructor, so pandas objects are
            aligned on their index.

    Returns:
        float: sqrt(mean(((actual - predicted) / actual) ** 2)) over the rows
        with a non-zero actual value; NaN when no such row exists.
    """
    score_df = DataFrame({"Predicted": predicted, "Actual": actual})
    nonzero = score_df[score_df['Actual'] != 0]
    # Work on standalone Series instead of adding columns to the filtered
    # slice, which avoids pandas' SettingWithCopyWarning.
    percent_error = (nonzero['Actual'] - nonzero['Predicted']) / nonzero['Actual']
    return math.sqrt((percent_error * percent_error).mean())

In [None]:
# Baseline run: 50 extremely-randomized trees trained on every column except
# the target (and Customers, if present), scored with RMSPE on the held-out rows.
# random_state makes the fitted forest, and therefore the score, reproducible.
features = [col for col in train_df.columns if col not in ('Sales', 'Customers')]
regressor_big = ExtraTreesRegressor(n_estimators=50, n_jobs=-1, random_state=42)
regressor_big.fit(train_df[features], train_df['Sales'])
score(predicted=regressor_big.predict(test_df[features]), actual=test_df['Sales'])

In [None]:
# Larger run: same features, 100 trees. This rebinds regressor_big, so this is
# the model used for the submission further down.
# random_state makes the fitted forest, and therefore the score, reproducible.
features = [col for col in train_df.columns if col not in ('Sales', 'Customers')]
regressor_big = ExtraTreesRegressor(n_estimators=100, n_jobs=-1, random_state=42)
regressor_big.fit(train_df[features], train_df['Sales'])
score(predicted=regressor_big.predict(test_df[features]), actual=test_df['Sales'])

In [None]:
# Submit

In [None]:
# Load the test-set feature tables, indexed by (Date, Store).
# pandas.read_csv with parse_dates=True is the drop-in replacement for the
# deprecated DataFrame.from_csv (removed in pandas 1.0).
df_basic = pandas.read_csv("test_features_basic.csv", index_col=['Date', 'Store'], parse_dates=True)
df_means = pandas.read_csv("test-features-predicted_mean.csv", index_col=['Date', 'Store'], parse_dates=True)

In [None]:
# Attach the basic features to the mean-prediction rows. This is a left join
# (the training side used an outer join), so only rows present in df_means are kept.
df_test_features = df_means.join(df_basic, how='left')

In [None]:
# Fill every missing value with 0 and truncate the mean-based prediction to an integer.
# NOTE(review): the training frame imputes Sales_predicted with the column mean,
# whereas here gaps become 0 - confirm this train/test difference is intended.
df_test_features = df_test_features.fillna(0)
df_test_features['Sales_predicted'] = df_test_features['Sales_predicted'].map(int)

In [None]:
# Turn the (Date, Store) index back into columns and discard Date, mirroring
# the layout of the training frame.
df_test_features = df_test_features.reset_index().drop('Date', axis=1)

In [None]:
# Key the rows by the Id column so it becomes the index of the CSV written later.
df_test_features = df_test_features.set_index('Id')
df_test_features.head()

In [None]:

# Preview the first five rows of the prepared test features.
df_test_features.head(n=5)

In [None]:
# Number of columns in the prepared test frame.
df_test_features.shape[1]

In [None]:

# Predict with the most recently fitted regressor, using the same feature
# columns that were selected from the training frame.
# NOTE(review): assumes every name in `features` also exists in
# df_test_features; Customers_predicted is only visibly joined on the
# training side - verify it is present in the test CSVs.
submission_inputs = df_test_features[features]
predictions = regressor_big.predict(submission_inputs)

In [None]:
# Store the model output next to the features as a Predictions column.
df_test_features = df_test_features.assign(Predictions=predictions)

In [None]:
# Sanity check only: neither input is ground truth. The mean-based prediction is
# passed as `predicted` and the ensemble prediction as `actual`, so the result
# measures how far the two predictions diverge (the parameter names are misleading here).
score(
    predicted=df_test_features['Sales_predicted'],
    actual=df_test_features['Predictions'],
)

In [None]:
# Build the submission: the Predictions column renamed to Sales, keyed by the
# frame's index, written to results-ensemble.csv.
ensemble_results = df_test_features[['Predictions']].rename(columns={'Predictions': 'Sales'})
ensemble_results.to_csv("results-ensemble.csv")

In [None]:
 # Preview the first five rows of the submission frame.
 ensemble_results.head(n=5)