In [19]:
# general packages
import numpy as np
import pandas as pd
import datatable as dt

# model
from sklearn.ensemble import RandomForestClassifier

# Tools and metrics
from sklearn.model_selection import GridSearchCV, KFold, RandomizedSearchCV

In [12]:
%%time
data = pd.read_csv('features_processed.csv') 

Wall time: 58 s


In [13]:
# Keep every row but discard the first column of the freshly loaded frame.
# NOTE(review): presumably that column is a saved index written out with the CSV — confirm.
# NOTE(review): this rebinding is not idempotent; each re-run of the cell removes one more column.
data = data.iloc[:, 1:]

In [15]:
# Show the frame as the cell result; a full `[:, :]` slice selects every row and column,
# so displaying the frame itself renders the same preview.
data

Unnamed: 0,Conditions,AreaNumber,PuLat,PuLong,create_by_hour,request_by_hour,pick,BookingSource_Internet,BookingSource_Permanent,BookingSource_StratumIVR,...,create_part_of_day_Afternoon,create_part_of_day_Evening,create_part_of_day_Morning,create_part_of_day_Night,request_part_of_day_Afternoon,request_part_of_day_Evening,request_part_of_day_Morning,request_part_of_day_Night,BookingisDispatch_False,BookingisDispatch_True
0,0,177,-33.921022,151.244202,10,11,1,0,0,0,...,0,0,1,0,0,0,1,0,0,1
1,0,111,-33.755931,151.273028,9,9,1,0,0,0,...,0,0,1,0,0,0,1,0,0,1
2,0,92,-33.897201,151.261205,0,0,1,1,0,0,...,0,0,0,1,0,0,0,1,0,1
3,0,182,-33.895716,151.228609,9,9,1,0,0,1,...,0,0,1,0,0,0,1,0,0,1
4,0,4,-33.859737,151.213307,21,21,1,1,0,0,...,0,0,0,1,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3299995,0,265,-33.827898,151.126799,20,6,1,1,0,0,...,0,1,0,0,0,0,1,0,1,0
3299996,0,149,-33.712023,151.095004,17,17,1,0,0,0,...,1,0,0,0,1,0,0,0,0,1
3299997,0,219,-33.850450,151.134428,8,8,1,0,0,1,...,0,0,1,0,0,0,1,0,0,1
3299998,0,177,-33.915317,151.251008,8,8,1,0,0,0,...,0,0,1,0,0,0,1,0,1,0


In [16]:
# Inspect the dtypes of the first ten columns only (the frame is wide).
data.dtypes.iloc[:10]

Conditions                    int64
AreaNumber                    int64
PuLat                       float64
PuLong                      float64
create_by_hour                int64
request_by_hour               int64
pick                          int64
BookingSource_Internet        int64
BookingSource_Permanent       int64
BookingSource_StratumIVR      int64
dtype: object

## Training

In [17]:
# Positional hold-out split: the first 3,000,000 rows train the model, the remainder validates it.
# Rows are taken in file order (no shuffling).
training_data, validation_data = data.iloc[:3000000], data.iloc[3000000:]

In [18]:
# Columns that make up the prediction target (presumably one indicator column per outcome — confirm).
target_cols = ['TargetVariable_Cancelled', 'TargetVariable_Completed', 'TargetVariable_ELSE', 'TargetVariable_No Job']

# Features are every column except the targets; labels are the target columns only.
X_train, y_train = training_data.drop(columns=target_cols), training_data[target_cols]
X_val, y_val = validation_data.drop(columns=target_cols), validation_data[target_cols]

## Modeling

Set up cross-validation and define error metrics

In [None]:
# Set up the cross-validation splitter: 12 shuffled folds with a fixed seed, so fold
# membership is the same on every run. KFold is imported from sklearn.model_selection
# in the import cell at the top of the notebook.
# NOTE(review): the randomized search further down passes cv=5 rather than this splitter.
kf = KFold(n_splits=12, shuffle=True, random_state=42)

In [None]:
# Baseline random forest with default hyper-parameters.
# The seed is fixed (42, matching the rest of the notebook) so the fit is repeatable run to run.
rf = RandomForestClassifier(random_state=42)

# fit() returns the estimator itself, so `rf_data` refers to the same fitted object as `rf`.
rf_data = rf.fit(X_train, y_train)

In [None]:
%%time

tuning_parameters = {
    'n_estimators' : [50, 100, 200, 500],
     'max_depth' :[2,4,8,10, 16],
     'min_samples_split' :[2, 4, 6, 8, 10],
    'min_samples_leaf' :[2, 4, 6, 8, 10]
}

rf = RandomizedSearchCV(rf, tuning_parameters, cv = 5, n_jobs=4) #n_iter = 32,
rf_full_data = rf.fit(X_train, y_train)
rf.fit(X_train, yj_y_train)

print('Best parameters found by grid search:', rf.best_params_, '\n')

In [None]:
# Candidate regressors and a stacked ensemble built from them.
#
# NOTE(review): every estimator in this cell is a *regressor*, whereas the targets prepared
# earlier in this notebook are the four 'TargetVariable_*' columns fitted with a
# RandomForestClassifier. Presumably this cell was carried over from a regression
# notebook — confirm whether it is still needed.
# NOTE(review): LGBMRegressor, XGBRegressor, make_pipeline, RobustScaler, RidgeCV, SVR,
# GradientBoostingRegressor, RandomForestRegressor and StackingCVRegressor are not imported
# in the visible import cell, so this cell raises NameError on a fresh kernel unless they
# are imported somewhere outside this view.
# NOTE(review): nothing in this cell calls fit(); the objects are only constructed.

# Light Gradient Boosting Regressor
# NOTE(review): the variable name `lightgbm` would shadow the package of the same name if
# that package is imported as a module — verify.
lightgbm = LGBMRegressor(objective='regression', 
                       num_leaves=6,
                       learning_rate=0.01, 
                       n_estimators=7000,
                       max_bin=200, 
                       bagging_fraction=0.8,
                       bagging_freq=4, 
                       bagging_seed=8,
                       feature_fraction=0.2,
                       feature_fraction_seed=8,
                       min_sum_hessian_in_leaf = 11,
                       verbose=-1,
                       random_state=42)

# XGBoost Regressor
# NOTE(review): both `seed` (27) and `random_state` (42) are passed; which one takes effect
# presumably depends on the installed xgboost version — confirm and keep only one.
# NOTE(review): 'reg:linear' is, as far as I know, a legacy alias that newer xgboost
# releases spell 'reg:squarederror' — verify against the installed version.
xgboost = XGBRegressor(learning_rate=0.01,
                       n_estimators=6000,
                       max_depth=4,
                       min_child_weight=0,
                       gamma=0.6,
                       subsample=0.7,
                       colsample_bytree=0.7,
                       objective='reg:linear',
                       nthread=-1,
                       scale_pos_weight=1,
                       seed=27,
                       reg_alpha=0.00006,
                       random_state=42)

# Ridge Regressor
# Candidate regularisation strengths handed to RidgeCV; `kf` is the splitter defined in the
# cross-validation cell above. Inputs are passed through RobustScaler first.
ridge_alphas = [1e-15, 1e-10, 1e-8, 9e-4, 7e-4, 5e-4, 3e-4, 1e-4, 1e-3, 5e-2, 1e-2, 0.1, 0.3, 1, 3, 5, 10, 15, 18, 20, 30, 50, 75, 100]
ridge = make_pipeline(RobustScaler(), RidgeCV(alphas=ridge_alphas, cv=kf))

# Support Vector Regressor
# Also scaled with RobustScaler before fitting.
svr = make_pipeline(RobustScaler(), SVR(C= 20, epsilon= 0.008, gamma=0.0003))

# Gradient Boosting Regressor
gbr = GradientBoostingRegressor(n_estimators=6000,
                                learning_rate=0.01,
                                max_depth=4,
                                max_features='sqrt',
                                min_samples_leaf=15,
                                min_samples_split=10,
                                loss='huber',
                                random_state=42)  

# Random Forest Regressor
# NOTE(review): this rebinds `rf`, which the tuning cell above assigned to the
# RandomizedSearchCV object; after this cell runs, `rf.best_params_` is no longer available.
rf = RandomForestRegressor(n_estimators=1200,
                          max_depth=15,
                          min_samples_split=5,
                          min_samples_leaf=5,
                          max_features=None,
                          oob_score=True,
                          random_state=42)

# Stack up all the models above, optimized using xgboost
# The same `xgboost` object is passed both as a base regressor and as the meta-regressor;
# use_features_in_secondary=True is forwarded to StackingCVRegressor unchanged.
stack_gen = StackingCVRegressor(regressors=(xgboost, lightgbm, svr, ridge, gbr, rf),
                                meta_regressor=xgboost,
                                use_features_in_secondary=True)