In [1]:
# PyData
import pandas
import numpy
from pandas import DataFrame, Series
from sklearn import datasets, linear_model, preprocessing, cross_validation
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.linear_model import LinearRegression, Ridge

# System
import datetime
import os
import math

# Graphing
#%matplotlib inline # Only works on Python 3 in the docker container
#import seaborn # Only works on Python 3 in the docker container

#os.environ['PYSPARK_PYTHON'] = 'python2'


%matplotlib inline

# Spark
import pyspark
from pyspark.sql import SQLContext

from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.mllib.util import MLUtils
from pyspark.mllib.regression import LabeledPoint

pyspark.SparkContext.setSystemProperty('spark.executor.memory', '30g')

sc = pyspark.SparkContext('local[2]')
sqlContext = SQLContext(sc)

In [2]:
train_df = DataFrame.from_csv("train-featurized.csv", index_col=None)
train_df.head()

Unnamed: 0,Date,Store,Sales,Customers,DayOfWeek,Open,Promo,StateHoliday,SchoolHoliday,cal:dayOfMonth,cal:dayOfWeek,cal:dayofyear,cal:month,cal:quarter,cal:weekofyear,cal:year,Sales_predicted_by_mean
0,2015-07-31 00:00:00,1,5263,555,5,1,1,0,1,31,4,212,7,3,31,2015,5145
1,2015-07-31 00:00:00,2,6064,625,5,1,1,0,1,31,4,212,7,3,31,2015,5115
2,2015-07-31 00:00:00,3,8314,821,5,1,1,0,1,31,4,212,7,3,31,2015,8138
3,2015-07-31 00:00:00,4,13995,1498,5,1,1,0,1,31,4,212,7,3,31,2015,10275
4,2015-07-31 00:00:00,5,4822,559,5,1,1,0,1,31,4,212,7,3,31,2015,5308


In [3]:
del train_df['Date']

In [4]:
# Fix columns
feature_columns = list(train_df.columns)
feature_columns.remove("Sales")
feature_columns = ['Sales'] + feature_columns
train_df = train_df[feature_columns]

In [5]:
df = sqlContext.createDataFrame(train_df)
# This relies on correct order: [0] is sales, [1:] are all features
df = df.map(lambda row: LabeledPoint(row[0], row[1:])).toDF()


In [6]:
%%time
featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=1200).fit(df)


CPU times: user 4 ms, sys: 4 ms, total: 8 ms
Wall time: 23.9 s


In [7]:
(trainingData, testData) = df.randomSplit([0.8, 0.2])


# Train a RandomForest model.
# rf = RandomForestRegressor(numTrees=10, maxDepth=7, maxBins=31, featuresCol="indexedFeatures")
# OOMS with numTrees=10, maxDepth=7, maxBins=1200 on 30gb r.2xlarge
# Works with 30gb r.2xlarge: numTrees=12, maxDepth=7, maxBins=31
# OOMS with numTrees=12, maxDepth=10, maxBins=31 on 30gb r.2xlarge
# QUESTION: Does limiting the number of processes improve this by reducing memory contention of the processes?
# - YES! The above was using all 8 processes, by limiting to 2 processes:
# - 2procs: Works with 30gb r.2xlarge: numTrees=12, maxDepth=10, maxBins=31
# - OOMS: Works with 30gb r.2xlarge: numTrees=10, maxDepth=7, maxBins=1200
# - Does not finish: numTrees=30, maxDepth=20, maxBins=31
# - Takes 5 min to fit: numTrees=15, maxDepth=10, maxBins=31
# Actually runs with numTrees=10, maxDepth=10 and maxBins=1200 though! Good results
rf = RandomForestRegressor(numTrees=10, maxDepth=10, maxBins=1200, featuresCol="indexedFeatures")


# Chain indexer and forest in a Pipeline
pipeline = Pipeline(stages=[featureIndexer, rf])
#pipeline = Pipeline(stages=[rf])


In [8]:
%%time
# Train model.  This also runs the indexer.
model = pipeline.fit(trainingData)

 

CPU times: user 24 ms, sys: 4 ms, total: 28 ms
Wall time: 1min 47s


In [9]:
# Make predictions.
predictions = model.transform(testData)

# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)

# Compute RMPSE
squares = predictions.rdd.filter(lambda x: x.label != 0).map(lambda x: ((x.label - x.prediction) / x.label) *  ((x.label - x.prediction) / x.label))
math.sqrt(squares.mean())

0.11749226487144407

In [10]:
len(train_df.columns)

16

# Predictions

In [37]:

test_df = DataFrame.from_csv("test-featurized.csv", index_col='Date')

In [38]:
train_df.head()

Unnamed: 0,Sales,Store,Customers,DayOfWeek,Open,Promo,StateHoliday,SchoolHoliday,cal:dayOfMonth,cal:dayOfWeek,cal:dayofyear,cal:month,cal:quarter,cal:weekofyear,cal:year,Sales_predicted_by_mean
0,5263,1,555,5,1,1,0,1,31,4,212,7,3,31,2015,5145
1,6064,2,625,5,1,1,0,1,31,4,212,7,3,31,2015,5115
2,8314,3,821,5,1,1,0,1,31,4,212,7,3,31,2015,8138
3,13995,4,1498,5,1,1,0,1,31,4,212,7,3,31,2015,10275
4,4822,5,559,5,1,1,0,1,31,4,212,7,3,31,2015,5308


In [39]:
test_df.reset_index(inplace=True)

In [40]:
del test_df['Date']

In [41]:
train_df.head()

Unnamed: 0,Sales,Store,Customers,DayOfWeek,Open,Promo,StateHoliday,SchoolHoliday,cal:dayOfMonth,cal:dayOfWeek,cal:dayofyear,cal:month,cal:quarter,cal:weekofyear,cal:year,Sales_predicted_by_mean
0,5263,1,555,5,1,1,0,1,31,4,212,7,3,31,2015,5145
1,6064,2,625,5,1,1,0,1,31,4,212,7,3,31,2015,5115
2,8314,3,821,5,1,1,0,1,31,4,212,7,3,31,2015,8138
3,13995,4,1498,5,1,1,0,1,31,4,212,7,3,31,2015,10275
4,4822,5,559,5,1,1,0,1,31,4,212,7,3,31,2015,5308


In [42]:
columns = train_df.columns.tolist()
columns = columns[1:]
columns = ['Id']  + columns

In [43]:

test_df = test_df[columns]

In [44]:
print (train_df.columns)
print (test_df.columns)

Index(['Sales', 'Store', 'Customers', 'DayOfWeek', 'Open', 'Promo',
       'StateHoliday', 'SchoolHoliday', 'cal:dayOfMonth', 'cal:dayOfWeek',
       'cal:dayofyear', 'cal:month', 'cal:quarter', 'cal:weekofyear',
       'cal:year', 'Sales_predicted_by_mean'],
      dtype='object')
Index(['Id', 'Store', 'Customers', 'DayOfWeek', 'Open', 'Promo',
       'StateHoliday', 'SchoolHoliday', 'cal:dayOfMonth', 'cal:dayOfWeek',
       'cal:dayofyear', 'cal:month', 'cal:quarter', 'cal:weekofyear',
       'cal:year', 'Sales_predicted_by_mean'],
      dtype='object')


In [45]:
df = sqlContext.createDataFrame(test_df)
df = df.map(lambda row: LabeledPoint(row[0], row[1:])).toDF()


In [46]:
predictions = model.transform(df).map(lambda row: [row.label, row.prediction]).collect()

In [47]:
predictions_df = DataFrame.from_records(predictions, columns=['Id', 'Prediction'])

In [54]:
predictions_df['Id'] = predictions_df['Id'].map(lambda x: int(x))
predictions_df['Sales'] = predictions_df['Prediction']
del predictions_df['Prediction']

In [55]:
predictions_df.set_index("Id", inplace=True)

In [57]:
predictions_df.to_csv("results-ensemble.csv")

In [59]:
 predictions_df.reset_index(inplace=True)