In [72]:
import pandas as pd

from utils.common_transformers import DateHandler, IsNull, Dropper, DTypeTransformer, FeatureSelector

from sklearn.preprocessing import StandardScaler, PolynomialFeatures

from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import make_pipeline

from sklearn.linear_model import LinearRegression, Lasso, Ridge, BayesianRidge
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor

from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import mean_squared_log_error, make_scorer

import numpy as np

In [73]:
train = pd.read_csv("../data/Train.csv")
train.dtypes

session_id         object
session_number      int64
client_agent       object
device_details     object
date               object
purchased           int64
added_in_cart       int64
checked_out         int64
time_spent        float64
dtype: object

In [168]:
data_prepping = make_pipeline(
    FeatureSelector(["date","purchased","added_in_cart"]),
    DateHandler(["date"],date_format="%Y-%m-%d",include=["quater","month","day_num"]),
)

# Modelling

In [172]:
models = {
    "LR" : LinearRegression(),
    "Lasso" : Lasso(alpha=1.0),
    "Ridge" : BayesianRidge(),
    "RFR" : RandomForestRegressor(criterion="mse"),
}
scores = []

In [173]:
for model_name,model in models.items():
    losses = []
    print(model_name)
    for i in range(5):
        X_train, X_test, y_train, y_test = train_test_split(train.drop("time_spent",axis=1),train["time_spent"],test_size=0.2)
        pipe = make_pipeline(data_prepping,model)
        pipe.fit(X_train,y_train)
        y_pred = pipe.predict(X_test)
        y_pred = np.where(y_pred<0,0,y_pred)
        
        loss = np.sqrt(mean_squared_log_error(y_test,y_pred))
        losses.append(loss)
    
    mean = np.array(losses).mean()
    std = np.array(losses).std()
    
    score = {"Model" : model_name, "Mean" : mean, "Std" : std}
    scores.append(score)

LR
Lasso
Ridge
RFR


In [174]:
pd.DataFrame(scores).sort_values(["Mean","Std"])

Unnamed: 0,Model,Mean,Std
3,RFR,1.996681,0.020965
0,LR,2.021113,0.026799
1,Lasso,2.048192,0.030146
2,Ridge,2.297208,0.151823


In [169]:
X = train.drop("time_spent",axis=1)
y = train["time_spent"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
X_train = data_prepping.fit_transform(X_train)

X_train["y"] = y_train
X_train = X_train.query("date_month!=9")
y_train = X_train["y"]
X_train.drop("y",axis=1,inplace=True)

In [171]:
y_pred = rf.predict(data_prepping.transform(X_test))
np.sqrt(mean_squared_log_error(y_test,y_pred))

1.8367469092818474

In [170]:
rf = RandomForestRegressor()
rf.fit(X_train,y_train)

RandomForestRegressor()

In [28]:
X = train.drop("time_spent",axis=1)
y = train["time_spent"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


In [33]:
X_train = data_prepping.fit_transform(X_train)

In [34]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}


In [35]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  7.0min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 14.3min finished


RandomizedSearchCV(cv=3, estimator=RandomForestRegressor(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   verbose=2)

In [38]:
rf_random.best_estimator_

RandomForestRegressor(bootstrap=False, max_depth=10, max_features='sqrt',
                      min_samples_leaf=2, min_samples_split=5,
                      n_estimators=2000)

In [39]:
rf = RandomForestRegressor(bootstrap=False, max_depth=10, max_features='sqrt',
                      min_samples_leaf=2, min_samples_split=5,
                      n_estimators=2000)

In [63]:
rf = RandomForestRegressor()

In [68]:
X = train.drop("time_spent",axis=1)
y = train["time_spent"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
rf.fit(data_prepping.fit_transform(X_train),y_train)

RandomForestRegressor()

In [69]:
y_pred = rf.predict(data_prepping.transform(X_test))

In [70]:
np.sqrt(mean_squared_log_error(y_test,y_pred))

2.0098270309288124

In [188]:
data_prepping = make_pipeline(
    FeatureSelector(["date","purchased","added_in_cart"]),
    DateHandler(["date"],date_format="%Y-%m-%d",include=["quater","month","day_num"]),
)

rf = RandomForestRegressor(bootstrap=False, max_depth=15, max_features='sqrt',
                      min_samples_leaf=2, min_samples_split=5,
                      n_estimators=2000)

X = train.drop("time_spent",axis=1)
y = train["time_spent"]

X = data_prepping.fit_transform(X)

X["y"] = y
X = X.query("date_month!=9")
y = X["y"]
X.drop("y",axis=1,inplace=True)

In [189]:
rf.fit(X,y)

RandomForestRegressor(bootstrap=False, max_depth=15, max_features='sqrt',
                      min_samples_leaf=2, min_samples_split=5,
                      n_estimators=2000)

In [190]:
test = pd.read_csv("../data/Test.csv")
print(test.shape)
test = data_prepping.transform(test)

(2327, 8)


In [191]:
y_pred = rf.predict(test)

In [192]:
ans = pd.DataFrame(y_pred)
ans.columns = ["time_spent"]

In [194]:
ans.min()

time_spent    65.516388
dtype: float64

In [195]:
print(ans.shape)

(2327, 1)


In [196]:
ans.to_csv("../submissions/sub_27_12_2.csv",index=False)