In [1]:
import os
import sys
 
# Path for spark source folder
os.environ['SPARK_HOME'] = "/Users/wlsherica/Spark/spark-1.4.1-bin-hadoop2.6"
 
# Append pyspark to Python Path
sys.path.append("/Users/wlsherica/Spark/spark-1.4.1-bin-hadoop2.6/python")
 
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql import SQLContext
import findspark
findspark.init()

import pyspark
sc = pyspark.SparkContext()

Exception: Java gateway process exited before sending the driver its port number

In [1]:
sqlContext

<pyspark.sql.context.HiveContext at 0x10749ae90>

In [3]:
data = [1, 2, 3, 4, 5]

distData = sc.parallelize(data)

distData.count()

5

In [2]:
#Data prepare
import json
import pandas as pd
import os.path
from datetime import datetime

pd.set_option('display.max_colwidth', -1)

train='/Users/wlsherica/Desktop/Rossmann/train.csv'
store='/Users/wlsherica/Desktop/Rossmann/store.csv'

start_time = datetime.now()
train_df = pd.read_csv(train, sep=',', header=None)
store_df = pd.read_csv(store, sep=',', header=None)

end_time = datetime.now()
print('Duration: {}'.format(end_time - start_time))

Duration: 0:00:01.000658


  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
train_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday
1,1,5,2015-07-31,5263,555,1,1,0,1
2,2,5,2015-07-31,6064,625,1,1,0,1
3,3,5,2015-07-31,8314,821,1,1,0,1
4,4,5,2015-07-31,13995,1498,1,1,0,1


In [4]:
store_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,Store,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
1,1,c,a,1270,9,2008,0,,,
2,2,a,a,570,11,2007,1,13,2010,"Jan,Apr,Jul,Oct"
3,3,a,a,14130,12,2006,1,14,2011,"Jan,Apr,Jul,Oct"
4,4,c,c,620,9,2009,0,,,


In [15]:
store['Store'].count()

1115

In [6]:
import numpy as np
from sklearn.cross_validation import train_test_split
import xgboost as xgb
import operator
import matplotlib
matplotlib.use("Agg") #Needed to save figures
import matplotlib.pyplot as plt

In [8]:
def create_feature_map(features):
    outfile = open('xgb.fmap', 'w')
    for i, feat in enumerate(features):
        outfile.write('{0}\t{1}\tq\n'.format(i, feat))
    outfile.close()

def rmspe(y, yhat):
    return np.sqrt(np.mean((yhat/y-1) ** 2))

def rmspe_xg(yhat, y):
    y = np.expm1(y.get_label())
    yhat = np.expm1(yhat)
    return "rmspe", rmspe(y,yhat)

# Gather some features
def build_features(features, data):
    # remove NaNs
    data.fillna(0, inplace=True)
    data.loc[data.Open.isnull(), 'Open'] = 1
    # Use some properties directly
    features.extend(['Store', 'CompetitionDistance', 'CompetitionOpenSinceMonth',
                     'CompetitionOpenSinceYear', 'Promo', 'Promo2', 'Promo2SinceWeek',
                     'Promo2SinceYear'])

    # add some more with a bit of preprocessing
    features.append('SchoolHoliday')
    data['SchoolHoliday'] = data['SchoolHoliday'].astype(float)

    features.extend(['StoreType', 'Assortment', 'StateHoliday'])
    mappings = {'0':0, 'a':1, 'b':2, 'c':3, 'd':4}
    data.StoreType.replace(mappings, inplace=True)
    data.Assortment.replace(mappings, inplace=True)
    data.StateHoliday.replace(mappings, inplace=True)

    features.extend(['DayOfWeek', 'month', 'day', 'year'])
    data['year'] = data.Date.dt.year
    data['month'] = data.Date.dt.month
    data['day'] = data.Date.dt.day
    data['DayOfWeek'] = data.Date.dt.dayofweek

In [9]:
## Start of main script
start_time = datetime.now()

print("Load the training, test and store data using pandas")
types = {'CompetitionOpenSinceYear': np.dtype(int),
         'CompetitionOpenSinceMonth': np.dtype(int),
         'StateHoliday': np.dtype(str),
         'Promo2SinceWeek': np.dtype(int),
         'SchoolHoliday': np.dtype(float),
         'PromoInterval': np.dtype(str)}

train = pd.read_csv("/Users/wlsherica/Desktop/Rossmann/train.csv", parse_dates=[2], dtype=types)
test = pd.read_csv("/Users/wlsherica/Desktop/Rossmann/test.csv", parse_dates=[3], dtype=types)
store = pd.read_csv("/Users/wlsherica/Desktop/Rossmann/store.csv")

print("Assume store open, if not provided")
train.fillna(1, inplace=True)
test.fillna(1, inplace=True)

print("Consider only open stores for training. Closed stores wont count into the score.")
train = train[train["Open"] != 0]
print("Use only Sales bigger then zero. Simplifies calculation of rmspe")
train = train[train["Sales"] > 0]

print("Join with store")
train = pd.merge(train, store, on='Store')
test = pd.merge(test, store, on='Store')

features = []

print("augment features")
build_features(features, train)
build_features([], test)
print(features)

print('training data processed')

params = {"objective": "reg:linear",
          "booster" : "gbtree",
          "eta": 0.09,
          "max_depth": 10,
          "subsample": 0.9,
          "colsample_bytree": 0.7,
          "silent": 1,
          "seed": 1301
          }
num_boost_round = 1700

print("Train a XGBoost model")
X_train, X_valid = train_test_split(train, test_size=0.012, random_state=10)
y_train = np.log1p(X_train.Sales)
y_valid = np.log1p(X_valid.Sales)
dtrain = xgb.DMatrix(X_train[features], y_train)
dvalid = xgb.DMatrix(X_valid[features], y_valid)

watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, \
  early_stopping_rounds=100, feval=rmspe_xg, verbose_eval=True)

print("Validating")
yhat = gbm.predict(xgb.DMatrix(X_valid[features]))
error = rmspe(X_valid.Sales.values, np.expm1(yhat))
print('RMSPE: {:.6f}'.format(error))

print("Make predictions on the test set")
dtest = xgb.DMatrix(test[features])
test_probs = gbm.predict(dtest)
# Make Submission
result = pd.DataFrame({"Id": test["Id"], 'Sales': np.expm1(test_probs)})
result.to_csv("/Users/wlsherica/Desktop/Rossmann/xgboost_10_submission.csv", index=False)

# XGB feature importances
# Based on https://www.kaggle.com/mmueller/liberty-mutual-group-property-inspection-prediction/xgb-feature-importance-python/code

create_feature_map(features)
importance = gbm.get_fscore(fmap='xgb.fmap')
importance = sorted(importance.items(), key=operator.itemgetter(1))

df = pd.DataFrame(importance, columns=['feature', 'fscore'])
df['fscore'] = df['fscore'] / df['fscore'].sum()

featp = df.plot(kind='barh', x='feature', y='fscore', legend=False, figsize=(6, 10))
plt.title('XGBoost Feature Importance')
plt.xlabel('relative importance')
fig_featp = featp.get_figure()
fig_featp.savefig('feature_importance_xgb.png', bbox_inches='tight', pad_inches=1)

end_time = datetime.now()
print('Duration: {}'.format(end_time - start_time))

Load the training, test and store data using pandas
Assume store open, if not provided
Consider only open stores for training. Closed stores wont count into the score.
Use only Sales bigger then zero. Simplifies calculation of rmspe
Join with store
augment features
['Store', 'CompetitionDistance', 'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo', 'Promo2', 'Promo2SinceWeek', 'Promo2SinceYear', 'SchoolHoliday', 'StoreType', 'Assortment', 'StateHoliday', 'DayOfWeek', 'month', 'day', 'year']

Will train until eval error hasn't decreased in 100 rounds.
[0]	train-rmspe:0.999985	eval-rmspe:0.999571
[1]	train-rmspe:0.999916	eval-rmspe:0.999007
[2]	train-rmspe:0.999669	eval-rmspe:0.998029
[3]	train-rmspe:0.998930	eval-rmspe:0.996432
[4]	train-rmspe:0.997055	eval-rmspe:0.993981
[5]	train-rmspe:0.992912	eval-rmspe:0.990363
[6]	train-rmspe:0.986216	eval-rmspe:0.985275
[7]	train-rmspe:0.977212	eval-rmspe:0.978383
[8]	train-rmspe:0.968664	eval-rmspe:0.969386
[9]	train-rmspe:0.958213	eval-rmspe:0.957975
[10]	train-rmspe:0.944017	eval-rmspe:0.943975
[11]	train-rmspe:0.927203	eval-rmspe:0.927273
[12]	train-rmspe:0.907767	eval-rmspe:0.907790
[13]	train-rmspe:0.885608	eval-rmspe:0.885609
[14]	train-rmspe:0.860866	eval-rmspe:0.860840
[15]	train-rmspe:0.833829	eval-rmspe:0.833744
[16]	train-rmspe:0.804841	eval-rmspe:0.804637
[17]	train-rmspe:0.774173	eval-rmspe:0.773816
[18]	train-rmspe:0.742196	eval-rmspe:0.741697
[19]	train-rmspe:0.709477	eval-rmspe:0.708698
[20]	train-rmspe:0.676300	eval


training data processed
Train a XGBoost model
Validating
RMSPE: 0.088399
Make predictions on the test set


[1699]	train-rmspe:0.068408	eval-rmspe:0.088399


In [16]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
import xgboost as xgb

# Gather some features
def build_features(features, data):
    # remove NaNs
    data.fillna(0, inplace=True)
    data.loc[data.Open.isnull(), 'Open'] = 1
    # Use some properties directly
    features.extend(['Store', 'CompetitionDistance', 'Promo', 'Promo2', 'SchoolHoliday'])

    # Label encode some features
    features.extend(['StoreType', 'Assortment', 'StateHoliday'])
    mappings = {'0':0, 'a':1, 'b':2, 'c':3, 'd':4}
    data.StoreType.replace(mappings, inplace=True)
    data.Assortment.replace(mappings, inplace=True)
    data.StateHoliday.replace(mappings, inplace=True)

    features.extend(['DayOfWeek', 'Month', 'Day', 'Year', 'WeekOfYear'])
    data['Year'] = data.Date.dt.year
    data['Month'] = data.Date.dt.month
    data['Day'] = data.Date.dt.day
    data['DayOfWeek'] = data.Date.dt.dayofweek
    data['WeekOfYear'] = data.Date.dt.weekofyear

    # CompetionOpen en PromoOpen from https://www.kaggle.com/ananya77041/rossmann-store-sales/randomforestpython/code
    # Calculate time competition open time in months
    features.append('CompetitionOpen')
    data['CompetitionOpen'] = 12 * (data.Year - data.CompetitionOpenSinceYear) + \
        (data.Month - data.CompetitionOpenSinceMonth)
    # Promo open time in months
    features.append('PromoOpen')
    data['PromoOpen'] = 12 * (data.Year - data.Promo2SinceYear) + \
        (data.WeekOfYear - data.Promo2SinceWeek) / 4.0
    data['PromoOpen'] = data.PromoOpen.apply(lambda x: x if x > 0 else 0)
    data.loc[data.Promo2SinceYear == 0, 'PromoOpen'] = 0

    # Indicate that sales on that day are in promo interval
    features.append('IsPromoMonth')
    month2str = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun', \
             7:'Jul', 8:'Aug', 9:'Sept', 10:'Oct', 11:'Nov', 12:'Dec'}
    data['monthStr'] = data.Month.map(month2str)
    data.loc[data.PromoInterval == 0, 'PromoInterval'] = ''
    data['IsPromoMonth'] = 0
    for interval in data.PromoInterval.unique():
        if interval != '':
            for month in interval.split(','):
                data.loc[(data.monthStr == month) & (data.PromoInterval == interval), 'IsPromoMonth'] = 1

    return data

## Start of main script
start_time = datetime.now()
print("Load the training, test and store data using pandas")
types = {'CompetitionOpenSinceYear': np.dtype(int),
         'CompetitionOpenSinceMonth': np.dtype(int),
         'StateHoliday': np.dtype(str),
         'Promo2SinceWeek': np.dtype(int),
         'SchoolHoliday': np.dtype(int),
         'PromoInterval': np.dtype(str)}

train = pd.read_csv("/Users/wlsherica/Desktop/Rossmann/train.csv", parse_dates=[2], dtype=types)
test = pd.read_csv("/Users/wlsherica/Desktop/Rossmann/test.csv", parse_dates=[3], dtype=types)
store = pd.read_csv("/Users/wlsherica/Desktop/Rossmann/store.csv")

print("Assume store open, if not provided")
test.fillna(1, inplace=True)

print("Consider only open stores for training. Closed stores wont count into the score.")
train = train[train["Open"] != 0]
print("Use only Sales bigger then zero")
train = train[train["Sales"] > 0]

print("Join with store")
train = pd.merge(train, store, on='Store')
test = pd.merge(test, store, on='Store')

features = []

print("augment features")
train = build_features(features, train)
test = build_features([], test)
print(features)

print('training data processed')

def rmspe(y, yhat):
    return np.sqrt(np.mean(((y - yhat)/y) ** 2))

def rmspe_xg(yhat, y):
    y = np.expm1(y.get_label())
    yhat = np.expm1(yhat)
    return "rmspe", rmspe(y, yhat)

print("Train xgboost model")

params = {"objective": "reg:linear",
          "booster" : "gbtree",
          "eta": 0.1,
          "max_depth": 10,
          "subsample": 0.85,
          "colsample_bytree": 0.4,
          "min_child_weight": 6,
          "silent": 1,
          "thread": 1,
          "seed": 1301
          }
num_boost_round = 1200

print("Train a XGBoost model")
X_train, X_valid = train_test_split(train, test_size=0.012, random_state=10)
y_train = np.log1p(X_train.Sales)
y_valid = np.log1p(X_valid.Sales)
dtrain = xgb.DMatrix(X_train[features], y_train)
dvalid = xgb.DMatrix(X_valid[features], y_valid)

watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, early_stopping_rounds=200, \
  feval=rmspe_xg, verbose_eval=True)

print("Validating")
yhat = gbm.predict(xgb.DMatrix(X_valid[features]))
error = rmspe(X_valid.Sales.values, np.expm1(yhat))
print('RMSPE: {:.6f}'.format(error))

print("Make predictions on the test set")
dtest = xgb.DMatrix(test[features])
test_probs = gbm.predict(dtest)
# Make Submission
result = pd.DataFrame({"Id": test["Id"], 'Sales': np.expm1(test_probs)})
result.to_csv("xgboost_39_submission.csv", index=False)

end_time = datetime.now()
print('Duration: {}'.format(end_time - start_time))

Load the training, test and store data using pandas
Assume store open, if not provided
Consider only open stores for training. Closed stores wont count into the score.
Use only Sales bigger then zero
Join with store
augment features
['Store', 'CompetitionDistance', 'Promo', 'Promo2', 'SchoolHoliday', 'StoreType', 'Assortment', 'StateHoliday', 'DayOfWeek', 'Month', 'Day', 'Year', 'WeekOfYear', 'CompetitionOpen', 'PromoOpen', 'IsPromoMonth']

Will train until eval error hasn't decreased in 200 rounds.
[0]	train-rmspe:0.999980	eval-rmspe:0.999520
[1]	train-rmspe:0.999879	eval-rmspe:0.998814
[2]	train-rmspe:0.999483	eval-rmspe:0.997535
[3]	train-rmspe:0.998193	eval-rmspe:0.995371
[4]	train-rmspe:0.994792	eval-rmspe:0.991943
[5]	train-rmspe:0.988319	eval-rmspe:0.986808
[6]	train-rmspe:0.978789	eval-rmspe:0.979530
[7]	train-rmspe:0.968873	eval-rmspe:0.969590
[8]	train-rmspe:0.956882	eval-rmspe:0.956738
[9]	train-rmspe:0.940610	eval-rmspe:0.940531
[10]	train-rmspe:0.920978	eval-rmspe:0.920970
[11]	train-rmspe:0.898074	eval-rmspe:0.898019
[12]	train-rmspe:0.871714	eval-rmspe:0.871621
[13]	train-rmspe:0.842560	eval-rmspe:0.842368
[14]	train-rmspe:0.810569	eval-rmspe:0.810272
[15]	train-rmspe:0.776518	eval-rmspe:0.776044
[16]	train-rmspe:0.740919	eval-rmspe:0.740225
[17]	train-rmspe:0.704385	eval-rmspe:0.703363
[18]	train-rmspe:0.667680	eval-rmspe:0.666228
[19]	train-rmspe:0.631038	eval-rmspe:0.629169
[20]	train-rmspe:0.595041	eval


training data processed
Train xgboost model
Train a XGBoost model
Validating
RMSPE: 0.094526
Make predictions on the test set
Duration: 0:18:10.139952


[1199]	train-rmspe:0.103954	eval-rmspe:0.094526
