This notebook transforms the dataset into a time series format (with the option of some nre features such as block averages)

It also applies different models (XGboost, Random Forest and MLP) and performs validation on MLP model using parallel processing

In [1]:
import numpy as np
import pandas as pd
import math
import seaborn as sns
import matplotlib.pyplot as plt
import time

In [2]:
#save and load function
import pickle
def save_obj(obj, name ):
    with open('./'+ name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name ):
    with open('./' + name + '.pkl', 'rb') as f:
        return pickle.load(f)
    

In [3]:
#load the result from aws service
results_raw = load_obj("data/results_long_process")

In [4]:
len(results_raw)

60

In [5]:
#add up the dict from each engine
def Merge(dict1, dict2): 
    return(dict2.update(dict1))

In [6]:
result_target = {}
for d in results_raw:
    Merge(d,result_target)

In [7]:
len(result_target)

424124

In [None]:
#save result_target
save_obj(result_target,"data/result_target")

In [8]:
result_target = load_obj("data/result_target")

In [10]:
#example shop 2 item 4420
result_target[0,15259].head()

Unnamed: 0_level_0,shop_id,item_id,item_category_id,top_categories_id,item_price,item_cnt_day
date_block_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0.0,0.0,15259.0,63.0,16.0,0.0,0.0
1.0,0.0,15259.0,63.0,16.0,736.0,1.0
2.0,0.0,15259.0,63.0,16.0,0.0,0.0
3.0,0.0,15259.0,63.0,16.0,0.0,0.0
4.0,0.0,15259.0,63.0,16.0,0.0,0.0


In [11]:
X_val = load_obj("data/X_val")

In [12]:
X_val.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_category_id,top_categories_id,item_price
54,33,31,22154,37,13,-0.342151
55,33,42,22154,37,13,-0.342151
56,33,15,22154,37,13,-0.342151
57,33,28,22154,37,13,-0.342151
58,33,24,22154,37,13,-0.342151


In [None]:
#month 0-32 are train data
#month 33 is validation
#month 34 is test

In [13]:
#for test
result_test = {}
result_test[0,15259]= result_target[0,15259]

In [14]:
%%time
result_stacked = pd.concat(result_target.values()).reset_index().set_index(['shop_id','item_id','item_category_id','top_categories_id','date_block_num']).unstack('date_block_num')

Wall time: 3min 18s


In [15]:
result_stacked.columns = [col[0] + '_' + str(int(col[1])) for col in result_stacked.columns.values]

In [16]:
result_stacked.reset_index(inplace = True)

In [17]:
result_stacked.head()

Unnamed: 0,shop_id,item_id,item_category_id,top_categories_id,item_price_0,item_price_1,item_price_2,item_price_3,item_price_4,item_price_5,...,item_cnt_day_24,item_cnt_day_25,item_cnt_day_26,item_cnt_day_27,item_cnt_day_28,item_cnt_day_29,item_cnt_day_30,item_cnt_day_31,item_cnt_day_32,item_cnt_day_33
0,0.0,30.0,40.0,13.0,0.0,265.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,31.0,37.0,13.0,0.0,434.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,32.0,40.0,13.0,221.0,221.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,33.0,37.0,13.0,347.0,347.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,35.0,40.0,13.0,247.0,247.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
save_obj(result_stacked,"data/result_stacked")

In [3]:
result_stacked = load_obj("data/result_stacked")

In [None]:
#load the NA rows in test

In [18]:
#from Test_data_1
result_NA = load_obj("data/test_merge_NA")

In [19]:
#fill in missing values from test

In [20]:
result_fill = result_NA.copy()
result_fill.head()

Unnamed: 0,shop_id,item_id,ID,item_category_id,top_categories_id,item_cnt_month
0,5,5320,1,55,15,-999
1,4,5320,5101,55,15,-999
2,6,5320,10201,55,15,-999
3,3,5320,15301,55,15,-999
4,2,5320,20401,55,15,-999


In [21]:
result_fill = result_fill.drop(columns = ["ID","item_cnt_month"])

In [22]:
result_fill.head()

Unnamed: 0,shop_id,item_id,item_category_id,top_categories_id
0,5,5320,55,15
1,4,5320,55,15
2,6,5320,55,15
3,3,5320,55,15
4,2,5320,55,15


In [23]:
len(result_stacked.columns[4:])

68

In [24]:
cols_stacked = result_stacked.columns

In [25]:
#add 68 columns of zeros for missing values

for i in range(68):
    result_fill[i] = 0

In [26]:
result_fill.columns = cols_stacked 

In [27]:
#concat
result_stacked  = pd.concat([result_stacked ,result_fill])

In [28]:
#concat
result_stacked = result_stacked.sort_values(by="shop_id")

In [29]:
result_stacked.head()

Unnamed: 0,shop_id,item_id,item_category_id,top_categories_id,item_price_0,item_price_1,item_price_2,item_price_3,item_price_4,item_price_5,...,item_cnt_day_24,item_cnt_day_25,item_cnt_day_26,item_cnt_day_27,item_cnt_day_28,item_cnt_day_29,item_cnt_day_30,item_cnt_day_31,item_cnt_day_32,item_cnt_day_33
0,0.0,30.0,40.0,13.0,0.0,265.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2393,0.0,14798.0,40.0,13.0,268.0,268.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2394,0.0,14799.0,37.0,13.0,657.0,657.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2395,0.0,14800.0,40.0,13.0,76.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2396,0.0,14804.0,37.0,13.0,620.0,620.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
#save
save_obj(result_stacked,"data/result_stacked_complete")

In [30]:
train = result_stacked.drop(columns=["item_price_32","item_price_33","item_cnt_day_33"])
#normalize
mean_train_price = np.mean(np.array(train.iloc[:,4:36]))
std_train_price = np.std(np.array(train.iloc[:,4:36]))

In [31]:
std_train_price

509.66518271282956

In [32]:
#standardize
for c in range(32):
    train.iloc[:,c+4] = (train.iloc[:,c+4] - mean_train_price)/std_train_price

In [33]:
train.iloc[:,-2:-1].head()

Unnamed: 0,item_cnt_day_31
0,0.0
2393,0.0
2394,0.0
2395,0.0
2396,0.0


In [34]:
#train is 0-31 month of price, 0-31 month of cnt, to predict cnt #32 
X_train_cnt = train.drop(columns=["item_cnt_day_32"])
y_train_cnt = train["item_cnt_day_32"]

In [35]:
X_train_cnt.iloc[:,35].head()

0      -0.140107
2393   -0.140107
2394   -0.140107
2395   -0.140107
2396   -0.140107
Name: item_price_31, dtype: float64

In [36]:
X_train_cnt.head(5)

Unnamed: 0,shop_id,item_id,item_category_id,top_categories_id,item_price_0,item_price_1,item_price_2,item_price_3,item_price_4,item_price_5,...,item_cnt_day_22,item_cnt_day_23,item_cnt_day_24,item_cnt_day_25,item_cnt_day_26,item_cnt_day_27,item_cnt_day_28,item_cnt_day_29,item_cnt_day_30,item_cnt_day_31
0,0.0,30.0,40.0,13.0,-0.140107,0.379843,-0.140107,-0.140107,-0.140107,-0.140107,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2393,0.0,14798.0,40.0,13.0,0.385729,0.385729,-0.140107,-0.140107,-0.140107,-0.140107,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2394,0.0,14799.0,37.0,13.0,1.148975,1.148975,-0.140107,-0.140107,-0.140107,-0.140107,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2395,0.0,14800.0,40.0,13.0,0.009011,-0.140107,-0.140107,-0.140107,-0.140107,-0.140107,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2396,0.0,14804.0,37.0,13.0,1.076378,1.076378,-0.140107,-0.140107,-0.140107,-0.140107,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [37]:
#val is 1-32 month of price, 1-32 month of cnt, to predict cnt #33
#test is 2-33 month of price, 2-33 month of cnt, to predict cnt #34
val = result_stacked.drop(columns=["item_price_0","item_cnt_day_0","item_price_33"])
val.iloc[:,36].head()

0       31.0
2393     1.0
2394     2.0
2395     0.0
2396     1.0
Name: item_cnt_day_1, dtype: float64

In [38]:
val.iloc[:,4:36].head()

Unnamed: 0,item_price_1,item_price_2,item_price_3,item_price_4,item_price_5,item_price_6,item_price_7,item_price_8,item_price_9,item_price_10,...,item_price_23,item_price_24,item_price_25,item_price_26,item_price_27,item_price_28,item_price_29,item_price_30,item_price_31,item_price_32
0,265.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2393,268.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2394,657.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2395,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2396,620.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [39]:
#normalize
mean_val_price = np.mean(np.array(val.iloc[:,4:36]))
std_val_price = np.std(np.array(val.iloc[:,4:36]))

In [40]:
#standardize
for c in range(32):
    val.iloc[:,c+4] = (val.iloc[:,c+4] - mean_val_price)/std_val_price

In [41]:
X_val_cnt = val.drop(columns=["item_cnt_day_33"])
y_val_cnt = val["item_cnt_day_33"]

In [42]:
X_val_cnt.iloc[:,36:].head()

Unnamed: 0,item_cnt_day_1,item_cnt_day_2,item_cnt_day_3,item_cnt_day_4,item_cnt_day_5,item_cnt_day_6,item_cnt_day_7,item_cnt_day_8,item_cnt_day_9,item_cnt_day_10,...,item_cnt_day_23,item_cnt_day_24,item_cnt_day_25,item_cnt_day_26,item_cnt_day_27,item_cnt_day_28,item_cnt_day_29,item_cnt_day_30,item_cnt_day_31,item_cnt_day_32
0,31.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2393,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2394,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2395,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2396,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [43]:
#test
test = result_stacked.drop(columns=["item_price_0","item_cnt_day_0","item_price_1","item_cnt_day_1"])
test.head()

Unnamed: 0,shop_id,item_id,item_category_id,top_categories_id,item_price_2,item_price_3,item_price_4,item_price_5,item_price_6,item_price_7,...,item_cnt_day_24,item_cnt_day_25,item_cnt_day_26,item_cnt_day_27,item_cnt_day_28,item_cnt_day_29,item_cnt_day_30,item_cnt_day_31,item_cnt_day_32,item_cnt_day_33
0,0.0,30.0,40.0,13.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2393,0.0,14798.0,40.0,13.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2394,0.0,14799.0,37.0,13.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2395,0.0,14800.0,40.0,13.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2396,0.0,14804.0,37.0,13.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [44]:
test.iloc[:,4:36].head()

Unnamed: 0,item_price_2,item_price_3,item_price_4,item_price_5,item_price_6,item_price_7,item_price_8,item_price_9,item_price_10,item_price_11,...,item_price_24,item_price_25,item_price_26,item_price_27,item_price_28,item_price_29,item_price_30,item_price_31,item_price_32,item_price_33
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2393,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2394,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2395,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [45]:
#normalize
mean_test_price = np.mean(np.array(test.iloc[:,4:36]))
std_test_price = np.std(np.array(test.iloc[:,4:36]))

In [46]:
#standardize
for c in range(32):
    test.iloc[:,c+4] = (test.iloc[:,c+4] - mean_test_price)/std_test_price

In [47]:
X_test_cnt = test

In [44]:
#save the data
save_obj(X_train_cnt,"data/X_train_cnt_complete")
save_obj(y_train_cnt,"data/y_train_cnt_complete")
save_obj(X_val_cnt,"data/X_val_cnt_complete")
save_obj(y_val_cnt,"data/y_val_cnt_complete")

In [45]:
save_obj(X_val_cnt,"data/X_test_cnt_complete")

In [46]:
#load data
X_train_cnt = load_obj("data/X_train_cnt_complete")
y_train_cnt = load_obj("data/y_train_cnt_complete")
X_val_cnt = load_obj("data/X_val_cnt_complete")
y_val_cnt = load_obj("data/y_val_cnt_complete")

In [48]:
X_val_cnt.iloc[:,36:].head()

Unnamed: 0,item_cnt_day_1,item_cnt_day_2,item_cnt_day_3,item_cnt_day_4,item_cnt_day_5,item_cnt_day_6,item_cnt_day_7,item_cnt_day_8,item_cnt_day_9,item_cnt_day_10,...,item_cnt_day_23,item_cnt_day_24,item_cnt_day_25,item_cnt_day_26,item_cnt_day_27,item_cnt_day_28,item_cnt_day_29,item_cnt_day_30,item_cnt_day_31,item_cnt_day_32
0,31.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2393,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2394,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2395,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2396,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [52]:
#baseline is taking the average of past sale counts
y_val_pred = np.mean(X_val_cnt.iloc[:,36:],axis=1)
y_val_true = y_val_cnt

In [53]:
from sklearn.metrics import mean_squared_error

In [54]:
rmse_base= np.sqrt(mean_squared_error(y_val_true,y_val_pred))

In [55]:
rmse_base

3.3191501725184964

In [56]:
#clip the data to (0,20)
y_val_pred_clip = y_val_pred.clip(0,20)
y_val_true_clip = y_val_true.clip(0,20)

In [57]:
rmse_base_clipped = np.sqrt(mean_squared_error(y_val_true_clip,y_val_pred_clip))

In [58]:
rmse_base_clipped

0.8458738697344089

In [59]:
#use XGBoost
import xgboost as xgb

In [60]:
model_xgb=xgb.XGBRegressor(random_state=123,n_estimators=150,learning_rate=0.001,max_depth = 50,objective='reg:squarederror')
model_xgb.fit(X_train_cnt, y_train_cnt)

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.001, max_delta_step=0,
             max_depth=50, min_child_weight=1, missing=None, n_estimators=150,
             n_jobs=1, nthread=None, objective='reg:squarederror',
             random_state=123, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=None, silent=None, subsample=1, verbosity=1)

In [61]:
#rename X_val_cnt
cols = X_train_cnt.columns
X_val_cnt.columns = cols
X_test_cnt.columns = cols

In [62]:
pred_val_xgb = model_xgb.predict(X_val_cnt)

In [63]:
max(pred_val_xgb)

107.1159

In [64]:
rmse_xgb = np.sqrt(mean_squared_error(y_val_true,pred_val_xgb))

In [65]:
rmse_xgb

3.5875433567861927

In [66]:
#clip the data to (0,20)
xgb_val_pred_clip = pred_val_xgb.clip(0,20)

In [67]:
rmse_xgb_clipped = np.sqrt(mean_squared_error(y_val_true_clip,xgb_val_pred_clip))

In [68]:
rmse_xgb_clipped 

0.8191243432577053

In [69]:
#feature selection reduce features by the importance
#random forest
from sklearn.ensemble import RandomForestRegressor

In [70]:
rf_model = RandomForestRegressor(n_estimators=100, criterion='mse',max_features =20,bootstrap=False,n_jobs=1,random_state = 123)
rf_model.fit(X_train_cnt, y_train_cnt)

RandomForestRegressor(bootstrap=False, criterion='mse', max_depth=None,
                      max_features=20, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
                      oob_score=False, random_state=123, verbose=0,
                      warm_start=False)

In [72]:
#features of importance

lst_importance = zip(X_train_cnt.columns, rf_model.feature_importances_)
lst_imp_sorted = sorted (lst_importance, key = lambda t: t[1])[::-1]

#top 10 importance
for name, importance in lst_imp_sorted[:10]:
    print(name, "=", importance)

item_cnt_day_31 = 0.15621564982269118
item_category_id = 0.12215163619101642
top_categories_id = 0.11779148704481171
item_cnt_day_29 = 0.07272682154437723
item_cnt_day_30 = 0.06909484229865002
item_cnt_day_28 = 0.0612301621345856
item_id = 0.05974449163626635
shop_id = 0.04669923804765287
item_cnt_day_27 = 0.04156589318280339
item_price_28 = 0.034875089960411644


In [73]:
pred_train_rf = rf_model.predict(X_train_cnt) 

In [74]:
rmse_rf_train = np.sqrt(mean_squared_error(y_train_cnt,pred_train_rf))
rmse_rf_train

0.0

In [75]:
#prediction performance
pred_val_rf = rf_model.predict(X_val_cnt) 

In [76]:
rmse_rf = np.sqrt(mean_squared_error(y_val_true,pred_val_rf))
rmse_rf 

5.069690661375155

In [77]:
pred_val_rf_clip = pred_val_rf.clip(0,20)

In [78]:
rmse_rf_clipped = np.sqrt(mean_squared_error(y_val_true_clip,pred_val_rf_clip))
rmse_rf_clipped 

0.7842472998590544

In [79]:
#mlp
from sklearn.neural_network import MLPRegressor

In [80]:
MLP_model = MLPRegressor(hidden_layer_sizes = (100,10),activation = "tanh",learning_rate = "adaptive", max_iter = 500,alpha = 0.01,random_state=123)
MLP_model.fit(X_train_cnt, y_train_cnt)

MLPRegressor(activation='tanh', alpha=0.01, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=(100, 10), learning_rate='adaptive',
             learning_rate_init=0.001, max_iter=500, momentum=0.9,
             n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
             random_state=123, shuffle=True, solver='adam', tol=0.0001,
             validation_fraction=0.1, verbose=False, warm_start=False)

In [81]:
#prediction performance
pred_val_mlp = MLP_model.predict(X_val_cnt)

In [82]:
rmse_mlp = np.sqrt(mean_squared_error(y_val_true,pred_val_mlp))
rmse_mlp

3.595107328822808

In [83]:
pred_val_mlp_clip = pred_val_mlp.clip(0,20)
rmse_mlp_clip = np.sqrt(mean_squared_error(y_val_true_clip,pred_val_mlp_clip))
rmse_mlp_clip

0.7746837318526485

In [84]:
#creating features using block averages on every 6 months
X_train_cnt.head()

Unnamed: 0,shop_id,item_id,item_category_id,top_categories_id,item_price_0,item_price_1,item_price_2,item_price_3,item_price_4,item_price_5,...,item_cnt_day_22,item_cnt_day_23,item_cnt_day_24,item_cnt_day_25,item_cnt_day_26,item_cnt_day_27,item_cnt_day_28,item_cnt_day_29,item_cnt_day_30,item_cnt_day_31
0,0.0,30.0,40.0,13.0,-0.140107,0.379843,-0.140107,-0.140107,-0.140107,-0.140107,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2393,0.0,14798.0,40.0,13.0,0.385729,0.385729,-0.140107,-0.140107,-0.140107,-0.140107,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2394,0.0,14799.0,37.0,13.0,1.148975,1.148975,-0.140107,-0.140107,-0.140107,-0.140107,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2395,0.0,14800.0,40.0,13.0,0.009011,-0.140107,-0.140107,-0.140107,-0.140107,-0.140107,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2396,0.0,14804.0,37.0,13.0,1.076378,1.076378,-0.140107,-0.140107,-0.140107,-0.140107,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [85]:
#add block averages for every 6 months (non-overlapping)
X_train_block_avg = X_train_cnt.copy()

In [86]:
X_train_block_avg.iloc[:,4:36].head()

Unnamed: 0,item_price_0,item_price_1,item_price_2,item_price_3,item_price_4,item_price_5,item_price_6,item_price_7,item_price_8,item_price_9,...,item_price_22,item_price_23,item_price_24,item_price_25,item_price_26,item_price_27,item_price_28,item_price_29,item_price_30,item_price_31
0,-0.140107,0.379843,-0.140107,-0.140107,-0.140107,-0.140107,-0.140107,-0.140107,-0.140107,-0.140107,...,-0.140107,-0.140107,-0.140107,-0.140107,-0.140107,-0.140107,-0.140107,-0.140107,-0.140107,-0.140107
2393,0.385729,0.385729,-0.140107,-0.140107,-0.140107,-0.140107,-0.140107,-0.140107,-0.140107,-0.140107,...,-0.140107,-0.140107,-0.140107,-0.140107,-0.140107,-0.140107,-0.140107,-0.140107,-0.140107,-0.140107
2394,1.148975,1.148975,-0.140107,-0.140107,-0.140107,-0.140107,-0.140107,-0.140107,-0.140107,-0.140107,...,-0.140107,-0.140107,-0.140107,-0.140107,-0.140107,-0.140107,-0.140107,-0.140107,-0.140107,-0.140107
2395,0.009011,-0.140107,-0.140107,-0.140107,-0.140107,-0.140107,-0.140107,-0.140107,-0.140107,-0.140107,...,-0.140107,-0.140107,-0.140107,-0.140107,-0.140107,-0.140107,-0.140107,-0.140107,-0.140107,-0.140107
2396,1.076378,1.076378,-0.140107,-0.140107,-0.140107,-0.140107,-0.140107,-0.140107,-0.140107,-0.140107,...,-0.140107,-0.140107,-0.140107,-0.140107,-0.140107,-0.140107,-0.140107,-0.140107,-0.140107,-0.140107


In [87]:
X_train_block_avg.iloc[:,36:].head()

Unnamed: 0,item_cnt_day_0,item_cnt_day_1,item_cnt_day_2,item_cnt_day_3,item_cnt_day_4,item_cnt_day_5,item_cnt_day_6,item_cnt_day_7,item_cnt_day_8,item_cnt_day_9,...,item_cnt_day_22,item_cnt_day_23,item_cnt_day_24,item_cnt_day_25,item_cnt_day_26,item_cnt_day_27,item_cnt_day_28,item_cnt_day_29,item_cnt_day_30,item_cnt_day_31
0,0.0,31.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2393,10.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2394,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2395,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2396,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [88]:
np.mean(X_train_cnt.iloc[:,4:10],axis=1)

0        -0.053448
2393      0.035172
2394      0.289587
2395     -0.115254
2396      0.265388
            ...   
418875    0.039423
96082    -0.140107
424018   -0.140107
423139   -0.140107
422843   -0.140107
Length: 526920, dtype: float64

In [89]:
cols_to_drop = cols[4:]

In [90]:
#adding price averages
for i in range(6):
    print(i)
    if i < 5:
        X_train_block_avg["item_price_avg_"+str(i)] = np.mean(X_train_cnt.iloc[:,(6*i+4):(6*i+4+6)],axis=1)
    else:
        X_train_block_avg["item_price_avg_"+str(i)] = np.mean(X_train_cnt.iloc[:,(6*i+4):36],axis=1)

0
1
2
3
4
5


In [91]:
#same for cnt averages
for i in range(6):
    print(i)
    if i < 5:
        X_train_block_avg["cnt_avg_"+str(i)] = np.mean(X_train_cnt.iloc[:,(6*i+36):(6*i+36+6)],axis=1)
    else:
        X_train_block_avg["cnt_avg_"+str(i)] = np.mean(X_train_cnt.iloc[:,(6*i+36):],axis=1)

0
1
2
3
4
5


In [92]:
X_train_block =  X_train_block_avg.drop(columns = cols_to_drop)

In [93]:
X_train_block.head() 

Unnamed: 0,shop_id,item_id,item_category_id,top_categories_id,item_price_avg_0,item_price_avg_1,item_price_avg_2,item_price_avg_3,item_price_avg_4,item_price_avg_5,cnt_avg_0,cnt_avg_1,cnt_avg_2,cnt_avg_3,cnt_avg_4,cnt_avg_5
0,0.0,30.0,40.0,13.0,-0.053448,-0.140107,-0.140107,-0.140107,-0.140107,-0.140107,5.166667,0.0,0.0,0.0,0.0,0.0
2393,0.0,14798.0,40.0,13.0,0.035172,-0.140107,-0.140107,-0.140107,-0.140107,-0.140107,1.833333,0.0,0.0,0.0,0.0,0.0
2394,0.0,14799.0,37.0,13.0,0.289587,-0.140107,-0.140107,-0.140107,-0.140107,-0.140107,0.666667,0.0,0.0,0.0,0.0,0.0
2395,0.0,14800.0,40.0,13.0,-0.115254,-0.140107,-0.140107,-0.140107,-0.140107,-0.140107,0.333333,0.0,0.0,0.0,0.0,0.0
2396,0.0,14804.0,37.0,13.0,0.265388,-0.140107,-0.140107,-0.140107,-0.140107,-0.140107,0.333333,0.0,0.0,0.0,0.0,0.0


In [94]:
#do the same to validation data
X_val_block_avg = X_val_cnt.copy()

In [95]:
#adding price averages
for i in range(6):
    print(i)
    if i < 5:
        X_val_block_avg["item_price_avg_"+str(i)] = np.mean(X_val_cnt.iloc[:,(6*i+4):(6*i+4+6)],axis=1)
    else:
        X_val_block_avg["item_price_avg_"+str(i)] = np.mean(X_val_cnt.iloc[:,(6*i+4):36],axis=1)

0
1
2
3
4
5


In [96]:
#same for cnt averages
for i in range(6):
    print(i)
    if i < 5:
        X_val_block_avg["cnt_avg_"+str(i)] = np.mean(X_val_cnt.iloc[:,(6*i+36):(6*i+36+6)],axis=1)
    else:
        X_val_block_avg["cnt_avg_"+str(i)] = np.mean(X_val_cnt.iloc[:,(6*i+36):],axis=1)

0
1
2
3
4
5


In [97]:
X_val_block =  X_val_block_avg.drop(columns = cols_to_drop)

In [98]:
X_val_block.head(5)

Unnamed: 0,shop_id,item_id,item_category_id,top_categories_id,item_price_avg_0,item_price_avg_1,item_price_avg_2,item_price_avg_3,item_price_avg_4,item_price_avg_5,cnt_avg_0,cnt_avg_1,cnt_avg_2,cnt_avg_3,cnt_avg_4,cnt_avg_5
0,0.0,30.0,40.0,13.0,-0.051996,-0.13808,-0.13808,-0.13808,-0.13808,-0.13808,5.166667,0.0,0.0,0.0,0.0,0.0
2393,0.0,14798.0,40.0,13.0,-0.051021,-0.13808,-0.13808,-0.13808,-0.13808,-0.13808,0.166667,0.0,0.0,0.0,0.0,0.0
2394,0.0,14799.0,37.0,13.0,0.075343,-0.13808,-0.13808,-0.13808,-0.13808,-0.13808,0.333333,0.0,0.0,0.0,0.0,0.0
2395,0.0,14800.0,40.0,13.0,-0.13808,-0.13808,-0.13808,-0.13808,-0.13808,-0.13808,0.0,0.0,0.0,0.0,0.0,0.0
2396,0.0,14804.0,37.0,13.0,0.063324,-0.13808,-0.13808,-0.13808,-0.13808,-0.13808,0.166667,0.0,0.0,0.0,0.0,0.0


In [99]:
X_test_block_avg = X_test_cnt.copy()

In [100]:
#adding price averages
for i in range(6):
    print(i)
    if i < 5:
        X_test_block_avg["item_price_avg_"+str(i)] = np.mean(X_test_cnt.iloc[:,(6*i+4):(6*i+4+6)],axis=1)
    else:
        X_test_block_avg["item_price_avg_"+str(i)] = np.mean(X_test_cnt.iloc[:,(6*i+4):36],axis=1)

0
1
2
3
4
5


In [101]:
#same for cnt averages
for i in range(6):
    print(i)
    if i < 5:
        X_test_block_avg["cnt_avg_"+str(i)] = np.mean(X_test_cnt.iloc[:,(6*i+36):(6*i+36+6)],axis=1)
    else:
        X_test_block_avg["cnt_avg_"+str(i)] = np.mean(X_test_cnt.iloc[:,(6*i+36):],axis=1)

0
1
2
3
4
5


In [102]:
X_test_block =  X_test_block_avg.drop(columns = cols_to_drop)

In [77]:
#save block inputs
save_obj(X_train_block, "data/X_train_block")
save_obj(X_val_block, "data/X_val_block")
save_obj(X_test_block, "data/X_test_block")

In [103]:
#try XGboost and MLP
model_xgb=xgb.XGBRegressor(random_state=123,n_estimators=200,learning_rate=0.001,max_depth = 50,objective='reg:squarederror')
model_xgb.fit(X_train_block, y_train_cnt)

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.001, max_delta_step=0,
             max_depth=50, min_child_weight=1, missing=None, n_estimators=200,
             n_jobs=1, nthread=None, objective='reg:squarederror',
             random_state=123, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=None, silent=None, subsample=1, verbosity=1)

In [104]:
pred_val_xgb_block = model_xgb.predict(X_val_block)

In [105]:
rmse_xgb_block = np.sqrt(mean_squared_error(y_val_true,pred_val_xgb_block))
rmse_xgb_block

3.577362026715606

In [106]:
pred_val_xgb_block_clip = pred_val_xgb_block.clip(0,20)
rmse_xgb_block_clip = np.sqrt(mean_squared_error(y_val_true_clip,pred_val_xgb_block_clip))
rmse_xgb_block_clip

0.8153236886984665

In [107]:
#rf
rf_model_block = RandomForestRegressor(n_estimators=200, criterion='mse',max_features =10,bootstrap=False,n_jobs=1,random_state = 123)
rf_model_block.fit(X_train_block, y_train_cnt)

RandomForestRegressor(bootstrap=False, criterion='mse', max_depth=None,
                      max_features=10, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
                      oob_score=False, random_state=123, verbose=0,
                      warm_start=False)

In [108]:
#features of importance

lst_importance = zip(X_train_block.columns, rf_model_block.feature_importances_)
lst_imp_sorted = sorted (lst_importance, key = lambda t: t[1])[::-1]

#top 10 importance
for name, importance in lst_imp_sorted:
    print(name, "=", importance)

cnt_avg_5 = 0.3613256380449574
top_categories_id = 0.13974079441195866
item_category_id = 0.1354005512154699
item_price_avg_4 = 0.08012791032372166
item_price_avg_5 = 0.06510766984895508
item_id = 0.05841571044067629
shop_id = 0.05697723048252361
cnt_avg_4 = 0.05051525573840822
cnt_avg_3 = 0.012871873184392002
item_price_avg_3 = 0.007688670879925964
item_price_avg_1 = 0.006990455700241864
cnt_avg_2 = 0.006849840343090148
cnt_avg_1 = 0.005583980717865858
item_price_avg_0 = 0.005091413050937835
cnt_avg_0 = 0.0042504409224638
item_price_avg_2 = 0.0030625646944117822


In [109]:
#prediction performance
pred_val_rf_block = rf_model_block.predict(X_val_block) 

In [110]:
rmse_rf_block = np.sqrt(mean_squared_error(y_val_true,pred_val_rf_block))
rmse_rf_block 

5.27006392692154

In [111]:
pred_val_rf_block_clip = pred_val_rf_block.clip(0,20)

In [112]:
rmse_rf_block_clipped = np.sqrt(mean_squared_error(y_val_true_clip,pred_val_rf_block_clip))
rmse_rf_block_clipped 

0.8633522734645953

In [113]:
#MLP
MLP_model_block = MLPRegressor(hidden_layer_sizes = (100,2),activation = "tanh",learning_rate = "adaptive", max_iter = 500,alpha = 0.01,random_state=123)
MLP_model_block.fit(X_train_block, y_train_cnt)

MLPRegressor(activation='tanh', alpha=0.01, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=(100, 2), learning_rate='adaptive',
             learning_rate_init=0.001, max_iter=500, momentum=0.9,
             n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
             random_state=123, shuffle=True, solver='adam', tol=0.0001,
             validation_fraction=0.1, verbose=False, warm_start=False)

In [114]:
#prediction performance
pred_val_mlp_block = MLP_model_block.predict(X_val_block)

In [115]:
rmse_mlp_block = np.sqrt(mean_squared_error(y_val_true,pred_val_mlp_block))
rmse_mlp_block

3.5952428207037377

In [116]:
#clipped version
pred_val_mlp_block_clip = pred_val_mlp_block.clip(0,20)
rmse_mlp_block_clip = np.sqrt(mean_squared_error(y_val_true_clip,pred_val_mlp_block_clip))
rmse_mlp_block_clip

0.7764198486593755

In [117]:
#validation step for MLP to optimize the result
#test on hidden layer sizes, max_iter,alpha
neutron_a = [50,80,100,150,200,500]
neutron_b = [1,2,3,4,5]
max_iter = [50,100,200,500]
alpha = [0.001,0.002,0.005,0.05,0.1,0.2,0.5,1,2]
count = 1

In [19]:
#using  parallel processing by ipython parallel
import os
import ipyparallel as ipp

In [22]:
#parallel process
rc = ipp.Client()
ar = rc[:].apply_async(os.getpid)
pid_map = ar.get_dict()
v = rc.load_balanced_view()

In [23]:
from IPython.display import clear_output
import sys

In [24]:
rc[:]

<DirectView [0, 1, 2, 3,...]>

In [25]:
dview = rc[:]

In [94]:
#importing packages
with dview.sync_imports():
    from sklearn.neural_network import MLPRegressor
    import numpy as np
    from sklearn.metrics import mean_squared_error

importing MLPRegressor from sklearn.neural_network on engine(s)
importing numpy on engine(s)
importing mean_squared_error from sklearn.metrics on engine(s)


In [26]:
def validation_MLP(args):
    from sklearn.neural_network import MLPRegressor
    import numpy as np
    from sklearn.metrics import mean_squared_error
    #(neutron_a,neutron_b,maxiter_m,alpha_a,x_train,y_train,x_val)
    val_result={}
    
    a = args[0]
    b = args[1]
    m = args[2]
    al = args[3]
    X_train_block = args[4]
    y_train_cnt = args[5]
    X_val_block = args[6]
    y_val_true = args[7]
    
    MLP_model_block_temp = MLPRegressor(hidden_layer_sizes = (a,b),activation = "tanh",learning_rate = "adaptive", max_iter = m,alpha = al,random_state=123)
    MLP_model_block_temp.fit(X_train_block, y_train_cnt)
    pred_train = MLP_model_block_temp.predict(X_train_block)
    pred_val = MLP_model_block_temp.predict(X_val_block)
    pred_val_clip = pred_val.clip(0,20)
    y_val_true_clip = y_val_true.clip(0,20)
    train_error = np.sqrt(mean_squared_error(y_train_cnt,pred_train))
    val_error = np.sqrt(mean_squared_error(y_val_true,pred_val))
    val_error_clip = np.sqrt(mean_squared_error(y_val_true_clip,pred_val_clip))
    val_result[a,b,m,al] = train_error,val_error,val_error_clip
    
    #delete all the inputs and outputs to free up memories
    del MLP_model_block_temp,pred_train,pred_val,pred_val_clip,train_error,val_error,val_error_clip,a,b,m,al,X_train_block,y_train_cnt,X_val_block,y_val_true
    
    return val_result

    del val_result
    del MLPRegressor,np,mean_squared_error
    

In [27]:
rc.queue_status(targets='all', verbose=True)

{0: {'completed': ['4e18219e-cea9-4cdc-a3b8-1661fa9543a9'],
  'queue': [],
  'tasks': []},
 1: {'completed': ['890aa4d1-688f-460d-bb43-fc3d5ef86c0f'],
  'queue': [],
  'tasks': []},
 10: {'completed': ['f22c70df-0740-41f9-b94b-1fbd668a4fe6'],
  'queue': [],
  'tasks': []},
 11: {'completed': ['2cda185f-08f4-4bcc-aff4-daf3dcbf2802'],
  'queue': [],
  'tasks': []},
 12: {'completed': ['04d45db9-d54f-4b24-9677-38db90856e50'],
  'queue': [],
  'tasks': []},
 13: {'completed': ['cd90a688-6642-49f2-b75f-d416791a9275'],
  'queue': [],
  'tasks': []},
 14: {'completed': ['e44bdf42-427b-44c3-a906-74eaaac64b5d'],
  'queue': [],
  'tasks': []},
 15: {'completed': ['35dec269-cc03-4dc2-8477-924d1a54e841'],
  'queue': [],
  'tasks': []},
 2: {'completed': ['2abaf4b5-f251-44c6-aedc-cf7cee19a560'],
  'queue': [],
  'tasks': []},
 3: {'completed': ['8ebbac40-2c21-484c-a053-81204852466b'],
  'queue': [],
  'tasks': []},
 4: {'completed': ['fbb5a07b-2842-4d54-a4e5-fe1aa298af03'],
  'queue': [],
  'tasks'

In [28]:
#using the backend db to gather all jobs (in case a lost connection from user end)
task_id = rc.hub_history()

In [29]:
task_id #buffers

['4e18219e-cea9-4cdc-a3b8-1661fa9543a9',
 '890aa4d1-688f-460d-bb43-fc3d5ef86c0f',
 '2abaf4b5-f251-44c6-aedc-cf7cee19a560',
 '8ebbac40-2c21-484c-a053-81204852466b',
 'fbb5a07b-2842-4d54-a4e5-fe1aa298af03',
 '161da3d0-9971-4d47-a781-108bc02e15b3',
 '73867a21-2546-4a10-8ca5-cbdc5d91bbf5',
 '382d4300-0929-4f12-8d6f-bdd943760806',
 '38306822-545e-45dd-b7ba-18ef48a2fe85',
 'd586e40c-57f0-4ddd-9c26-8f5afa02bfc0',
 'f22c70df-0740-41f9-b94b-1fbd668a4fe6',
 '2cda185f-08f4-4bcc-aff4-daf3dcbf2802',
 '04d45db9-d54f-4b24-9677-38db90856e50',
 'cd90a688-6642-49f2-b75f-d416791a9275',
 'e44bdf42-427b-44c3-a906-74eaaac64b5d',
 '35dec269-cc03-4dc2-8477-924d1a54e841']

In [27]:
#retrive the tasks run from back rend
result_temp = rc.get_result(task_id)

In [118]:
#parallel process
#val_result = {}
### a dict to store parameters as they're submitted

inputs = []
i = 1
for a in neutron_a:
    for b in neutron_b:
        for m in max_iter:
            for al in alpha:
                inputs.append((a,b,m,al,X_train_block,y_train_cnt,X_val_block,y_val_cnt))    
              

In [119]:
len(inputs)

1080

In [120]:
inputs_batch_lst = []
inputs_batch_lst = [inputs[(x)*100:(x+1)*100] for x in range(10)]
inputs_batch_lst.append(inputs[1000:])

In [34]:
val_result_lst = []
count = 1
    
for input_batch in inputs_batch_lst:
    print("loop "+str(count) + "/10 started...")
    t = time.time()
    val_result_tmp = v.map(validation_MLP,input_batch)
    print("loop " + str(count) + " costed ", time.time() - t) 
    
    #clear buffer while waiting
    while not val_result_tmp.ready():
        #clear_output()
        for stdout in val_result_tmp.stdout:
            if stdout:
                lines = stdout.split('\n')
                for line in lines[-4:-1]:
                    if line:
                        print(line)
        sys.stdout.flush()
        time.sleep(30)

    if (val_result_tmp.ready() == True) & (val_result_tmp.successful() == True):
        val_result_lst.append(val_result_tmp.get())
    else:
        print("job failed at loop: ",count)
        break
        
    count = count + 1
    #clear intermediate result before moving to the next batch
    del val_result_tmp
    rc.purge_everything()
    clear_output()

In [53]:
val_result_tmp.ready()

True

In [54]:
val_result_tmp.successful()

True

In [55]:
val_result_holder = val_result_tmp.get()

In [149]:
save_obj(val_result_holder,"data/val_result")

In [103]:
del val_result_tmp

In [61]:
rc.purge_everything()
clear_output()

In [None]:
rc.close()

In [40]:
#merge dictionaries
result = {}
for b in range(len(val_result_holder)):
    for d in val_result_holder[b]:
        Merge(d,result)

In [56]:
#get the key corresponding to the minimum of the validation value (clipped)
min(result, key=result.get)

(80, 2, 100, 0.001)

In [79]:
#the best parameter is neutron of size (80,2),max_iter = 100, and alpha = 0.001
#mlp
from sklearn.neural_network import MLPRegressor

In [80]:
#MLP
MLP_model_block = MLPRegressor(hidden_layer_sizes = (80,2),activation = "tanh",learning_rate = "adaptive", max_iter = 100,alpha = 0.001,random_state=123)
MLP_model_block.fit(X_train_block, y_train_cnt)

MLPRegressor(activation='tanh', alpha=0.001, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=(80, 2), learning_rate='adaptive',
             learning_rate_init=0.001, max_iter=100, momentum=0.9,
             n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
             random_state=123, shuffle=True, solver='adam', tol=0.0001,
             validation_fraction=0.1, verbose=False, warm_start=False)

In [81]:
#prediction performance
pred_val_mlp_block = MLP_model_block.predict(X_val_block)

In [82]:
rmse_mlp_block = np.sqrt(mean_squared_error(y_val_true,pred_val_mlp_block))
rmse_mlp_block

3.59525220115477

In [83]:
#clipped version
pred_val_mlp_block_clip = pred_val_mlp_block.clip(0,20)
rmse_mlp_block_clip = np.sqrt(mean_squared_error(y_val_true_clip,pred_val_mlp_block_clip))
rmse_mlp_block_clip

0.7764864356096106

In [84]:
pred_val_mlp_block_clip

array([0.16507926, 0.17073566, 0.17073566, ..., 0.17073566, 0.17073566,
       0.17073566])

In [85]:
pred_test_xgb_block = MLP_model_block.predict(X_test_block)

In [86]:
pred_test_xgb_block

array([0.1643625 , 0.17073566, 0.17073566, ..., 0.17073566, 0.17073566,
       0.17073566])

In [87]:
X_test_block.head()

Unnamed: 0,shop_id,item_id,item_category_id,top_categories_id,item_price_avg_0,item_price_avg_1,item_price_avg_2,item_price_avg_3,item_price_avg_4,item_price_avg_5,cnt_avg_0,cnt_avg_1,cnt_avg_2,cnt_avg_3,cnt_avg_4,cnt_avg_5
0,0.0,30.0,40.0,13.0,-0.135903,-0.135903,-0.135903,-0.135903,-0.135903,-0.135903,0.0,0.0,0.0,0.0,0.0,0.0
2393,0.0,14798.0,40.0,13.0,-0.135903,-0.135903,-0.135903,-0.135903,-0.135903,-0.135903,0.0,0.0,0.0,0.0,0.0,0.0
2394,0.0,14799.0,37.0,13.0,-0.135903,-0.135903,-0.135903,-0.135903,-0.135903,-0.135903,0.0,0.0,0.0,0.0,0.0,0.0
2395,0.0,14800.0,40.0,13.0,-0.135903,-0.135903,-0.135903,-0.135903,-0.135903,-0.135903,0.0,0.0,0.0,0.0,0.0,0.0
2396,0.0,14804.0,37.0,13.0,-0.135903,-0.135903,-0.135903,-0.135903,-0.135903,-0.135903,0.0,0.0,0.0,0.0,0.0,0.0


In [88]:
test_pred_df = X_test_block[["shop_id","item_id","item_category_id","top_categories_id"]]

In [89]:
test_pred_df["mlp_pred_block"] = pred_test_xgb_block.clip(0,20)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [90]:
min(test_pred_df["mlp_pred_block"])

0.03996355837429816

In [91]:
save_obj(test_pred_df,"data/test_pred_df_mlp_complete")