In [1]:
# PyData
import pandas
import numpy
from pandas import DataFrame, Series
from sklearn import datasets, linear_model, preprocessing, cross_validation
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.linear_model import LinearRegression, Ridge

# System
import datetime
import os
import math

# Graphing
#%matplotlib inline # Only works on Python 3 in the docker container
#import seaborn # Only works on Python 3 in the docker container

#os.environ['PYSPARK_PYTHON'] = 'python2'

%matplotlib inline

# Spark
import pyspark
from pyspark.sql import SQLContext

from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.mllib.util import MLUtils
from pyspark.mllib.regression import LabeledPoint

# Start a local Spark context with 8 worker threads, or reuse the one already
# running in this kernel. Constructing a second SparkContext in the same
# process raises an error, so getOrCreate keeps this cell safe to re-run.
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setMaster('local[8]'))
sqlContext = SQLContext(sc)

In [2]:
# Load the two training feature tables, both keyed on (Date, Store).
# pandas.read_csv with parse_dates=True is the documented replacement for the
# deprecated DataFrame.from_csv (removed in pandas 1.0) and parses the index
# the same way from_csv did.
df_basic = pandas.read_csv("train_features_basic.csv", index_col=['Date', 'Store'], parse_dates=True)
df_means = pandas.read_csv("train-features-predicted_mean.csv", index_col=['Date', 'Store'], parse_dates=True)

In [3]:
# Load the label: only the Sales column of train.csv, keyed on (Date, Store).
# pandas.read_csv(..., parse_dates=True) replaces the deprecated
# DataFrame.from_csv (removed in pandas 1.0) with identical parsing.
df_sales = pandas.read_csv("train.csv", index_col=['Date', 'Store'], parse_dates=True)[['Sales']]

  data = self._reader.read(nrows)


In [4]:
# Combine both feature tables (keeping rows present in either), then attach
# them to the sales labels; every row of df_sales is kept.
df_features = df_means.join(df_basic, how='outer')
train_df = df_sales.join(df_features, how='left')
# when sales is 0, sales_predicted is nan.
train_df = train_df.fillna({'Sales_predicted': 0})
train_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Sales,Sales_predicted,DayOfWeek,Open,Promo,StateHoliday,SchoolHoliday,cal:dayOfMonth,cal:dayOfWeek,cal:dayofyear,cal:month,cal:quarter,cal:weekofyear,cal:year
Date,Store,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2015-07-31,1,5263,5145.283582,5,1,1,0,1,31,4,212,7,3,31,2015
2015-07-31,2,6064,5115.882353,5,1,1,0,1,31,4,212,7,3,31,2015
2015-07-31,3,8314,8138.089552,5,1,1,0,1,31,4,212,7,3,31,2015
2015-07-31,4,13995,10275.776119,5,1,1,0,1,31,4,212,7,3,31,2015
2015-07-31,5,4822,5308.835821,5,1,1,0,1,31,4,212,7,3,31,2015


In [5]:
# The intermediate frames are no longer needed once train_df exists;
# drop the names so the kernel can reclaim the memory.
del df_features, df_basic, df_means, df_sales

In [6]:
# Replace any missing mean-based prediction with the column average.
# NOTE(review): the join cell already fills NaNs in Sales_predicted with 0, so
# when the notebook is run top to bottom there is nothing left for this to fill.
# Confirm which of the two imputations is intended.
train_df['Sales_predicted'] = train_df['Sales_predicted'].fillna(train_df['Sales_predicted'].mean())

In [7]:
# Truncate the mean-based prediction to a whole number of sales.
train_df['Sales_predicted'] = train_df['Sales_predicted'].map(int)

In [8]:
# Move the (Date, Store) index levels back into ordinary columns.
train_df = train_df.reset_index()

In [9]:
# Date is not used as a feature; drop the column.
train_df = train_df.drop('Date', axis=1)

In [10]:
# Sales predicted difference (%): relative error of the mean-based prediction

In [11]:
# Relative error of the mean-based prediction: (predicted - actual) / actual.
# The denominator is cast to float so the ratio is a true division whatever
# the integer-division rules of the running Python/pandas are.
# Rows with Sales == 0 have no defined relative error: the division yields
# +/-inf (or NaN for 0/0), and fillna alone would let the infinities through,
# so both are normalised to 0 here.
# NOTE: this column is computed from the label (Sales); it is for inspection
# only and must not be used as a model feature.
train_df['Sales_mean_prediction_error'] = (
    (train_df['Sales_predicted'] - train_df['Sales']) / train_df['Sales'].astype(float)
).replace([numpy.inf, -numpy.inf], numpy.nan).fillna(0)

In [12]:
# Arrange the frame as [label, feature, feature, ...]; the Spark conversion
# relies on Sales being column 0.
# Sales_mean_prediction_error is derived from the label itself, so it is left
# out of the modelling frame: using it as a feature would leak the target, and
# the test feature table (14 columns) has no such column.
feature_columns = [
    column for column in train_df.columns
    if column not in ('Sales', 'Sales_mean_prediction_error')
]
feature_columns = ['Sales'] + feature_columns
train_df = train_df[feature_columns]

In [13]:

# Sanity check: Sales should now be the first column, followed by the features.
train_df.head()

Unnamed: 0,Sales,Store,Sales_predicted,DayOfWeek,Open,Promo,StateHoliday,SchoolHoliday,cal:dayOfMonth,cal:dayOfWeek,cal:dayofyear,cal:month,cal:quarter,cal:weekofyear,cal:year
0,5263,1,5145,5,1,1,0,1,31,4,212,7,3,31,2015
1,6064,2,5115,5,1,1,0,1,31,4,212,7,3,31,2015
2,8314,3,8138,5,1,1,0,1,31,4,212,7,3,31,2015
3,13995,4,10275,5,1,1,0,1,31,4,212,7,3,31,2015
4,4822,5,5308,5,1,1,0,1,31,4,212,7,3,31,2015


# Switch to Spark

In [14]:
# Hand the pandas frame over to Spark and turn each row into a LabeledPoint
# (label, feature vector), which toDF() exposes as `label` / `features` columns.
df = sqlContext.createDataFrame(train_df)
# This relies on correct order: [0] is sales, [1:] are all features
# Map through .rdd explicitly (as the RMSPE computation does): DataFrame.map
# is not available on newer Spark releases, while .rdd.map works on all.
df = df.rdd.map(lambda row: LabeledPoint(row[0], row[1:])).toDF()


In [15]:
# Fit the indexer on the whole frame: vector slots with at most 31 distinct
# values are treated as categorical and re-indexed, the rest stay continuous.
featureIndexer = VectorIndexer(
    maxCategories=31,
    inputCol="features",
    outputCol="indexedFeatures",
).fit(df)


In [16]:
# Hold out roughly 20% of the rows for evaluation. The split is seeded so that
# re-running the notebook partitions the data the same way and the reported
# scores stay comparable between runs.
(trainingData, testData) = df.randomSplit([0.8, 0.2], seed=42)


# Train a RandomForest model.
# RandomForestRegressor here is the pyspark.ml estimator: its import comes
# after, and therefore shadows, the sklearn class of the same name.
# The forest is seeded for the same reproducibility reason as the split.
rf = RandomForestRegressor(numTrees=10, maxDepth=7, maxBins=31,
                           featuresCol="indexedFeatures", seed=42)

# Chain indexer and forest in a Pipeline
pipeline = Pipeline(stages=[featureIndexer, rf])


In [17]:

# Train model.  This also runs the indexer.
# Fitting the pipeline on the training split applies featureIndexer and then
# fits the random forest; `model` is the fitted pipeline used by later cells.
model = pipeline.fit(trainingData)



In [18]:

# Score the held-out split with the fitted pipeline.
predictions = model.transform(testData)

# Show a few scored rows as a sanity check.
predictions.select("prediction", "label", "features").show(5)

# Plain RMSE through Spark's evaluator; the value is kept in `rmse` but is not
# displayed by this cell.
evaluator = RegressionEvaluator(
    metricName="rmse", labelCol="label", predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
# print "Root Mean Squared Error (RMSE) on test data = %g" % rmse

# Compute RMSPE (root mean squared percentage error). Rows whose label is 0
# are skipped because the relative error is undefined for them.
squares = (predictions.rdd
           .filter(lambda x: x.label != 0)
           .map(lambda x: (x.label - x.prediction) / x.label)
           .map(lambda ratio: ratio * ratio))

mean = squares.mean()
# `math` is imported in the first cell of the notebook.
math.sqrt(mean)

+------------------+-------+--------------------+
|        prediction|  label|            features|
+------------------+-------+--------------------+
|10830.990545688468|13995.0|[4.0,10275.0,5.0,...|
| 11521.47388318413|14180.0|[25.0,11812.0,5.0...|
| 6508.335345530728| 7301.0|[28.0,6024.0,5.0,...|
| 5229.552333256649| 4776.0|[30.0,5132.0,5.0,...|
| 9500.620433507702|10789.0|[33.0,9035.0,5.0,...|
+------------------+-------+--------------------+



0.3590068326232296

In [19]:
# Inspect one held-out row: a dense `features` vector plus its `label`.
testData.head()

Row(features=DenseVector([4.0, 10275.0, 5.0, 1.0, 1.0, 0.0, 1.0, 31.0, 4.0, 212.0, 7.0, 3.0, 31.0, 2015.0]), label=13995.0)

In [20]:
def score(prediction, actual):
    """Return the root mean squared percentage error (RMSPE).

    Computes sqrt(mean(((actual - prediction) / actual) ** 2)) over the
    entries whose actual value is non-zero. Entries with ``actual == 0`` are
    ignored because their relative error is undefined; left in, they would
    turn the whole score into inf or nan.

    Parameters
    ----------
    prediction : pandas.Series or numpy.ndarray
        Predicted values, aligned with ``actual``.
    actual : pandas.Series or numpy.ndarray
        Observed values.

    Returns
    -------
    float
        The RMSPE, or nan when no entry has a non-zero actual value.
    """
    nonzero = actual != 0
    pcts = (actual[nonzero] - prediction[nonzero]) / actual[nonzero]
    return math.sqrt((pcts * pcts).mean())


In [21]:
# Inspect one scored row: raw features, label, indexed features and prediction.
predictions.head()

Row(features=DenseVector([4.0, 10275.0, 5.0, 1.0, 1.0, 0.0, 1.0, 31.0, 4.0, 212.0, 7.0, 3.0, 31.0, 2015.0]), label=13995.0, indexedFeatures=DenseVector([4.0, 10275.0, 4.0, 1.0, 1.0, 0.0, 1.0, 30.0, 4.0, 212.0, 6.0, 2.0, 31.0, 2.0]), prediction=10830.990545688468)

In [22]:
# Pull (actual, mean-based prediction, forest prediction) back to the driver
# as a list of rows. features[1] is the Sales_predicted slot of the feature
# vector, i.e. the second feature column of the training frame.
# Map through .rdd explicitly (as the RMSPE computation does): DataFrame.map
# is not available on newer Spark releases, while .rdd.map works on all.
predictions_df = predictions.rdd.map(lambda x: [x.label, x.features[1], x.prediction]).collect()

In [23]:
# Turn the collected rows into a pandas frame with readable column names.
predictions_df = DataFrame.from_records(
    predictions_df,
    columns=['Actual', 'Prediction-Mean', 'Prediction-RF'],
)
predictions_df.head()

Unnamed: 0,Actual,Prediction-Mean,Prediction-RF
0,13995,10275,10830.990546
1,14180,11812,11521.473883
2,7301,6024,6508.335346
3,4776,5132,5229.552333
4,10789,9035,9500.620434


In [24]:
# Keep only rows with non-zero actual sales; the percentage error divides by it.
predictions_df = predictions_df.loc[predictions_df['Actual'] != 0]

In [25]:
# RMSPE of the mean-based baseline prediction.
score(prediction=predictions_df['Prediction-Mean'], actual=predictions_df['Actual'])

0.32211533242039947

In [26]:
# RMSPE of the random-forest prediction, for comparison with the baseline.
score(prediction=predictions_df['Prediction-RF'], actual=predictions_df['Actual'])

0.35900683262322935

# Eval: predict on the test set and write the submission file

In [27]:
# Load the two test feature tables, both keyed on (Date, Store).
# pandas.read_csv with parse_dates=True is the documented replacement for the
# deprecated DataFrame.from_csv (removed in pandas 1.0) and parses the index
# the same way from_csv did.
df_basic = pandas.read_csv("test_features_basic.csv", index_col=['Date', 'Store'], parse_dates=True)
df_means = pandas.read_csv("test-features-predicted_mean.csv", index_col=['Date', 'Store'], parse_dates=True)

In [28]:
# Combine the test feature tables, keeping exactly the rows of df_means.
# NOTE(review): the training path joins these two tables with how='outer';
# confirm that a left join cannot drop test rows that must be predicted.
df_test_features = df_means.join(df_basic, how='left')

In [29]:
# Replace every missing value with 0, then truncate the mean-based prediction
# to a whole number, mirroring the training frame.
df_test_features = df_test_features.fillna(0)
df_test_features['Sales_predicted'] = df_test_features['Sales_predicted'].map(int)

In [30]:
# Turn the (Date, Store) index back into columns and discard Date, which is
# not a model feature.
df_test_features = df_test_features.reset_index().drop('Date', axis=1)

In [31]:
# Index the rows by Id so that only feature columns remain as columns.
df_test_features = df_test_features.set_index('Id')
df_test_features.head()

Unnamed: 0_level_0,Store,Sales_predicted,DayOfWeek,Open,Promo,StateHoliday,SchoolHoliday,cal:dayOfMonth,cal:dayOfWeek,cal:dayofyear,cal:month,cal:quarter,cal:weekofyear,cal:year
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,1,4961,4,1,1,0,0,17,3,260,9,3,38,2015
2,3,7994,4,1,1,0,0,17,3,260,9,3,38,2015
3,7,9406,4,1,1,0,0,17,3,260,9,3,38,2015
4,8,7194,4,1,1,0,0,17,3,260,9,3,38,2015
5,9,6777,4,1,1,0,0,17,3,260,9,3,38,2015


In [32]:

# Repeat display of the test features (same output as the previous cell).
df_test_features.head()

Unnamed: 0_level_0,Store,Sales_predicted,DayOfWeek,Open,Promo,StateHoliday,SchoolHoliday,cal:dayOfMonth,cal:dayOfWeek,cal:dayofyear,cal:month,cal:quarter,cal:weekofyear,cal:year
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,1,4961,4,1,1,0,0,17,3,260,9,3,38,2015
2,3,7994,4,1,1,0,0,17,3,260,9,3,38,2015
3,7,9406,4,1,1,0,0,17,3,260,9,3,38,2015
4,8,7194,4,1,1,0,0,17,3,260,9,3,38,2015
5,9,6777,4,1,1,0,0,17,3,260,9,3,38,2015


In [33]:
# Number of feature columns; it has to match the length of the training
# feature vectors.
df_test_features.shape[1]

14

In [None]:
# Convert the complete test feature table to Spark. Scoring only a slice of it
# would yield fewer predictions than there are Ids, and the submission frame
# needs one prediction per Id.
df_test = sqlContext.createDataFrame(df_test_features)
# The test set has no label, so 0 is used as a placeholder; every column of
# the row is a feature.
# Map through .rdd explicitly (as the RMSPE computation does): DataFrame.map
# is not available on newer Spark releases, while .rdd.map works on all.
df_test = df_test.rdd.map(lambda row: LabeledPoint(0, features=row[0:])).toDF()

In [None]:
# Smoke test: score the test frame and look at the first resulting row.
model.transform(df_test).take(1)

In [None]:
# Start the submission frame with one row per test Id. It has to be a
# DataFrame with an explicit Id column: a bare pandas Index cannot take the
# Sales column assigned later, and the CSV is written with index=False, so ids
# kept only in the index would be lost.
prediction = DataFrame({'Id': df_test_features.index})

In [None]:
# Preview the first five predicted values for the test frame.
# Map through .rdd explicitly (as the RMSPE computation does): DataFrame.map
# is not available on newer Spark releases, while .rdd.map works on all.
model.transform(df_test).rdd.map(lambda x: x.prediction).take(5)

In [None]:
# Scratch cell: evaluates a constant and has no effect on notebook state.
1+1

In [None]:
# Collect the model's predictions for the test frame into the Sales column.
# Map through .rdd explicitly (as the RMSPE computation does): DataFrame.map
# is not available on newer Spark releases, while .rdd.map works on all.
prediction['Sales'] = model.transform(df_test).rdd.map(lambda x: x.prediction).collect()

In [None]:
# Check the submission frame before writing it out.
prediction.head()

In [None]:

# Write the submission file; index=False means the frame's index is not
# written, only its columns.
prediction.to_csv( "spark-v1.csv", index = False )

In [None]:
# Preview the first five predictions on the training split.
# Map through .rdd explicitly (as the RMSPE computation does): DataFrame.map
# is not available on newer Spark releases, while .rdd.map works on all.
model.transform(trainingData).rdd.map(lambda x: x.prediction).take(5)

In [None]:
 # Peek at the first five rows of the training split.
 trainingData.take(5)