<h1 style="text-align:center;font-size:40px">Black Friday Model Training</h1>

In [17]:
import numpy as np
import pandas as pd

In [18]:
import sklearn

In [19]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [20]:
from sklearn.metrics import r2_score

In [21]:
def rmse(ytrue, ypred):
    """Return the root-mean-squared error between targets and predictions.

    Both inputs are converted to float NumPy arrays before subtracting, so
    values are compared strictly by position. This lets plain lists be
    passed, stops pandas from aligning two Series on their index labels
    (which yields NaN wherever the labels differ), and avoids wrap-around
    when the inputs carry a narrow or unsigned integer dtype.

    Parameters
    ----------
    ytrue : array-like
        Observed target values.
    ypred : array-like
        Predicted values, broadcastable against ``ytrue``.

    Returns
    -------
    float
        Square root of the mean squared difference.
    """
    residuals = np.asarray(ytrue, dtype=float) - np.asarray(ypred, dtype=float)
    return np.sqrt(np.mean(np.square(residuals)))

In [22]:
# Load the training sample from a path relative to the working directory.
train = pd.read_csv(filepath_or_buffer='Data/small.csv')

In [23]:
# Preview the first rows of the loaded frame.
train.head()

Unnamed: 0,Gender,Product_Category_1,Product_Category_2_Mode,Purchase
0,0,3,8.0,8370
1,0,1,6.0,15200
2,0,12,8.0,1422
3,0,12,14.0,1057
4,1,8,8.0,7969


In [24]:
# Check the dtype pandas assigned to each column.
train.dtypes

Gender                       int64
Product_Category_1           int64
Product_Category_2_Mode    float64
Purchase                     int64
dtype: object

In [25]:
# Separate the regression target from the predictor columns.
target = train['Purchase']
features = train.drop(labels='Purchase', axis=1)

In [26]:
# Hold out a fixed 10,000 rows for evaluation; the seed keeps the split
# repeatable across runs.
xtrain, xtest, ytrain, ytest = train_test_split(
    features,
    target,
    random_state=0,
    test_size=10000,
)

In [27]:
# Sanity-check the size of each piece of the split, one count per line.
for split in (xtrain, xtest, ytrain, ytest):
    print(len(split))

50000
10000
50000
10000


In [28]:
# Candidate models, keyed by a short tag. Each pipeline standardises the
# features and then fits a tree ensemble with the same leaf-size floor and
# the same seed.
pipelines = {
    'RF': make_pipeline(
        StandardScaler(),
        RandomForestRegressor(random_state=0, min_samples_leaf=50),
    ),
    'GB': make_pipeline(
        StandardScaler(),
        GradientBoostingRegressor(random_state=0, min_samples_leaf=50),
    ),
}

In [29]:
# List every parameter of the GB pipeline, including the step names that
# make_pipeline generated for it.
pipelines['GB'].get_params()

{'memory': None,
 'steps': [('standardscaler',
   StandardScaler(copy=True, with_mean=True, with_std=True)),
  ('gradientboostingregressor',
   GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
                max_leaf_nodes=None, min_impurity_decrease=0.0,
                min_impurity_split=None, min_samples_leaf=50,
                min_samples_split=2, min_weight_fraction_leaf=0.0,
                n_estimators=100, presort='auto', random_state=0,
                subsample=1.0, verbose=0, warm_start=False))],
 'standardscaler': StandardScaler(copy=True, with_mean=True, with_std=True),
 'gradientboostingregressor': GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
              max_leaf_nodes=None, min_impurity_decrease=0.0,
              min_impurity_split=None, min_samples_leaf=50,
   

In [30]:
# Hyperparameter grids, keyed by the same tags as `pipelines`.
# Parameter names use the '<pipeline step name>__<parameter>' form, where the
# step name is the lowercased estimator class name.
hypers = {
    'RF': {
        'randomforestregressor__n_estimators': [150, 200],
        # 1.0 means "consider every feature at each split". That is what
        # 'auto' meant for RandomForestRegressor, but 'auto' was deprecated
        # in scikit-learn 1.1 and removed in 1.3, so the fraction is spelled
        # out to keep the grid valid on both old and new releases.
        'randomforestregressor__max_features': [1.0, 0.5],
    },

    'GB': {
        'gradientboostingregressor__n_estimators': [100, 200],
        'gradientboostingregressor__learning_rate': [0.05, 0.03],
        # Single value: depth is fixed rather than searched.
        'gradientboostingregressor__max_depth': [5]
    }
}

In [31]:
# Run a grid search for every candidate pipeline and keep the fitted search
# object under the same tag.
models = {}

for name in pipelines:
    pipeline = pipelines[name]

    # Two-fold cross-validation over this pipeline's grid; n_jobs=-1 lets
    # scikit-learn run the fits in parallel.
    model = GridSearchCV(estimator=pipeline, param_grid=hypers[name], cv=2, n_jobs=-1)

    # fit() returns the search object itself, so this stores the fitted search.
    models[name] = model.fit(xtrain, ytrain)
    print('{} has been fitted'.format(name))

RF has been fitted
GB has been fitted


In [32]:
# Score every tuned model on the held-out split and print a small report.
for name in models:
    model = models[name]
    yhat = model.predict(xtest)

    print(name)
    r2 = r2_score(ytest, yhat)
    print('r2_score: \t{0:.2f}'.format(r2))
    err = rmse(ytest, yhat)
    print('rmse: \t\t{0:.2f}'.format(err))
    print('-------------------------')

RF
r2_score: 	0.63
rmse: 		3008.48
-------------------------
GB
r2_score: 	0.63
rmse: 		2988.99
-------------------------


In [None]:
import pickle

In [None]:
# Persist the best gradient-boosting pipeline found by the grid search.
with open('BlackG.pkl', mode='wb') as f:
    best_gb = models['GB'].best_estimator_
    pickle.dump(best_gb, f)

In [None]:
# Persist the best random-forest pipeline found by the grid search.
with open('BlackF.pkl', mode='wb') as f:
    best_rf = models['RF'].best_estimator_
    pickle.dump(best_rf, f)