In [24]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter
from matplotlib import dates
import seaborn as sns
from scipy import stats
import time
import catboost
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
import xgboost as xgb
import lightgbm as lgb
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import KFold
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate,cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline, make_pipeline
from imblearn.under_sampling import TomekLinks
from sklearn.ensemble import VotingClassifier 
import warnings
import holidays
import calendar
from varname import nameof
warnings.filterwarnings("ignore")

# Functions

In [25]:
## function of getting the optimized paramters and score
def hypertuning_rscv(est,p_distr,nbr_iter,X,y):
    rdmsearch = RandomizedSearchCV(est, param_distributions=p_distr,n_jobs=-1, n_iter=nbr_iter,cv=5)
    
    rdmsearch.fit(X,y)
    ht_params = rdmsearch.best_params_
    ht_score = rdmsearch.best_score_
    return(ht_params,ht_score)

## submission
def submission(clf,test):
    preds = clf.predict_proba(test)
    test["ID"] = test.index +1
    ctbsubmission = pd.concat([pd.DataFrame(preds[:,1]),test['ID']],axis=1)
    ctbsubmission.columns = ['Delay.Indicator','ID']
    ctbsubmission.to_csv('{}_predictions_1.csv'.format(nameof(clf)),index=False)

In [26]:
class ModelOptimizer:
    best_score = None
    opt = None
    
    def __init__(self, model, X_train, y_train, categorical_columns_indices=None, n_fold=3, seed=2405, early_stopping_rounds=30, is_stratified=True, is_shuffle=True):
        self.model = model
        self.X_train = X_train
        self.y_train = y_train
        self.categorical_columns_indices = categorical_columns_indices
        self.n_fold = n_fold
        self.seed = seed
        self.early_stopping_rounds = early_stopping_rounds
        self.is_stratified = is_stratified
        self.is_shuffle = is_shuffle
        
        
    def update_model(self, **kwargs):
        for k, v in kwargs.items():
            setattr(self.model, k, v)
            
    def evaluate_model(self):
        pass
    
    def optimize(self, param_space, max_evals=10, n_random_starts=2):
        start_time = time.time()
        
        @use_named_args(param_space)
        def _minimize(**params):
            self.model.set_params(**params)
            return self.evaluate_model()
        
        opt = gp_minimize(_minimize, param_space, n_calls=max_evals, n_random_starts=n_random_starts, random_state=2405, n_jobs=-1)
        best_values = opt.x
        optimal_values = dict(zip([param.name for param in param_space], best_values))
        best_score = opt.fun
        self.best_score = best_score
        self.opt = opt
        
        print('optimal_parameters: {}\noptimal score: {}\noptimization time: {}'.format(optimal_values, best_score, time.time() - start_time))
        print('updating model with optimal values')
        self.update_model(**optimal_values)
        plot_convergence(opt)
        return optimal_values

class XgbOptimizer(ModelOptimizer):
    def evaluate_model(self):
        scores = xgboost.cv(self.model.get_xgb_params(), 
                    xgboost.DMatrix(self.X_train, label=self.y_train),
                    num_boost_round=self.model.n_estimators, 
                    metrics='auc', 
                    nfold=self.n_fold, 
                    stratified=self.is_stratified,
                    shuffle=self.is_shuffle,
                    seed=self.seed,
                    early_stopping_rounds=self.early_stopping_rounds)
        self.scores = scores
        test_scores = scores.iloc[:, 2]
        best_metric = test_scores.max()
        return 1 - best_metric
    
class CatboostOptimizer(ModelOptimizer):
    def evaluate_model(self):
        validation_scores = catboost.cv(
        catboost.Pool(self.X_train, 
                      self.y_train, 
                      cat_features=self.categorical_columns_indices),
        self.model.get_params(), 
        nfold=self.n_fold,
        stratified=self.is_stratified,
        seed=self.seed,
        early_stopping_rounds=self.early_stopping_rounds,
        shuffle=self.is_shuffle,
        plot=False)
        self.scores = validation_scores
        test_scores = validation_scores.iloc[:, 2]
        best_metric = test_scores.max()
        return 1 - best_metric
    
class LightGBMOptimizer(ModelOptimizer):
    def evaluate_model(self):
        lgb_dataset = lgb.Dataset(self.X_train, 
                                  self.y_train, 
                                  self.categorical_columns_indices)
        eval_hist = lgb.cv(self.model.get_params(), 
                           lgb_dataset,
                           self.model.n_estimators, 
                           nfold=self.n_fold,
                           seed=self.seed, 
                           stratified=self.is_stratified, 
                           shuffle=self.is_shuffle,
                           early_stopping_rounds=self.early_stopping_rounds, 
                           metrics='auc')
        self.scores = eval_hist
        test_scores = eval_hist[list(eval_hist.keys())[0]]
        best_metric = max(test_scores)
        return 1 - best_metric

In [27]:
train = pd.read_csv("traffic_clean3.csv")
train1 = pd.read_csv("train3.csv")
test = pd.read_csv("test_clean3.csv")

In [28]:
train1[["lag1","lag2","lag3","lag4","lag_std1","lag_std2","lag_std3"]] = train[["lag1","lag2","lag3","lag4","lag_std1","lag_std2","lag_std3"]]

# Secondary feature preprocessing 

## Reduce the Vessel.name categories that are not present in the test set

We have noticed that there are some Vessel.Name categorical levels in the training dataset that are not present in the test data. Considering that the redundant information would negatively influence the predictive power of our models, we decided to cut out the Vessel name categorical levels that are not seen in the test data from the training data.

In [29]:
vessel_redun = [i for i in np.unique(train1["Vessel.Name"]) if i not in np.unique(test["Vessel.Name"])]
train1 = train1.loc[~train1["Vessel.Name"].isin(vessel_redun),:]

## Categorize the traffic lags from 0 to 5 to reduce noises.

Since the distribution of the lag variables are significant imbalanced, we decided to categorize traffic lags into categroical variables.


In [30]:
trai = train1["Scheduled.Departure"].apply(lambda x: pd.to_datetime(x).strftime("%H:%M"))
tes = test["Scheduled.Departure"].apply(lambda x: pd.to_datetime(x).strftime("%H:%M"))
vessel_sche = [i for i in np.unique(trai) if i not in np.unique(tes)]
train1 = train1.loc[~train1["Scheduled.Departure"].isin(vessel_sche),:]

train1.reset_index(drop = True,inplace = True)

## train cut
train1["cut1"] = pd.cut(train.lag1 , [0,2,3,4,5], right=True)
train1["cut2"] = pd.cut(train.lag2 , [0,2,3,4,5], right=True)
train1["cut3"] = pd.cut(train.lag3 , [0,2,3,4,5], right=True)
train1["cut4"] = pd.cut(train.lag4 , [0,2,3,4,5], right=True)

## test cut
test["cut1"] = pd.cut(test.lag1 , [0,2,3,4,5], right=True)
test["cut2"] = pd.cut(test.lag2 , [0,2,3,4,5], right=True)
test["cut3"] = pd.cut(test.lag3 , [0,2,3,4,5], right=True)
test["cut4"] = pd.cut(test.lag4 , [0,2,3,4,5], right=True)

## Combine the Vessel categories with similar delay frequencies

In [31]:
## Visualize the delay frequency by category
train1[["Vessel.Name","Delay.Indicator"]].groupby("Vessel.Name").mean().sort_values(by = 'Delay.Indicator')

## combine the Vessel categories with similar delay frequencies
train1.loc[train1["Vessel.Name"].isin(["Skeena Queen","Mayne Queen",'Queen of Alberni']),"Vessel"] = "name1"
train1.loc[train1["Vessel.Name"].isin(["Coastal Renaissance","Queen of New Westminster","Bowen Queen","Coastal Inspiration","Queen of Cumberland","Coastal Celebration"]),"Vessel"] = "name2"
train1.loc[train1["Vessel.Name"].isin(['Queen of Coquitlam',"Spirit of Vancouver Island","Queen of Cowichan","Queen of Capilano"]),"Vessel"] = "name3"
train1.loc[train1["Vessel.Name"].isin(["Queen of Oak Bay","Salish Raven","Queen of Surrey","Salish Eagle"]),"Vessel"] = "name4"

test.loc[train1["Vessel.Name"].isin(["Skeena Queen","Mayne Queen",'Queen of Alberni']),"Vessel"] = "name1"
test.loc[train1["Vessel.Name"].isin(["Coastal Renaissance","Queen of New Westminster","Bowen Queen","Coastal Inspiration","Queen of Cumberland","Coastal Celebration"]),"Vessel"] = "name2"
test.loc[train1["Vessel.Name"].isin(['Queen of Coquitlam',"Spirit of Vancouver Island","Queen of Cowichan","Queen of Capilano"]),"Vessel"] = "name3"
test.loc[train1["Vessel.Name"].isin(["Queen of Oak Bay","Salish Raven","Queen of Surrey","Salish Eagle"]),"Vessel"] = "name4"

## Extract the feature Round Hour

In [32]:
## Extract the feature Round Hour
train1.drop("Unnamed: 0",axis=1,inplace =True)
train1["Date_time"] = pd.to_datetime(train1["Date_time"])
test["Date_time"] = pd.to_datetime(test["Date_time"])

train1["Round_time"] = train1["Date_time"].dt.round("H")
train1["Round_Hour"] = train1["Round_time"].apply(lambda x: x.strftime("%H"))

test["Round_time"] = test["Date_time"].dt.round("H")
test["Round_Hour"] = test["Round_time"].apply(lambda x: x.strftime("%H"))

In [33]:
test.loc[test.Round_Hour == "00","Round_Hour"] = "23"

In [34]:
train1["Month"] = train1["Month"].astype("int")
test["Month"] = test["Month"].astype("int")

## Feature extraction

In [36]:
train1 = train1.loc[train1["Status"].isin(["On Time","Traffic delay","Operational delay","Mechanical issue"]),:]

In [47]:
train1["Month"] = train1["Month"].astype("int")
train1["Day"] = train1["Day"].astype("object")

test["Month"] = test["Month"].astype("int")
test["Day"] = test["Day"].astype("object")

In [63]:
X2 = train1[["Round_Hour","Day","Month","Day.of.Month","Num.of.sailings",
       'Vessel.Name_Kfold_Target_Enc','Trip_Kfold_Target_Enc',"holidays_indicator","lag2","lag3","lag4","lag_std1","lag_std2","weekend.Indicator"]]
X2_enc = pd.get_dummies(X2,prefix=["Day","Round_Hour"])

y = train1["Delay.Indicator"]

X_test2 = test[["Round_Hour","Day","Month","Day.of.Month","Num.of.sailings",
       'Vessel.Name_Kfold_Target_Enc','Trip_Kfold_Target_Enc',"holidays_indicator","lag2","lag3","lag4","lag_std1","lag_std2","weekend.Indicator"]]

X2_test_enc = pd.get_dummies(X_test2,prefix=["Day","Round_Hour"])

In [77]:
X2_test_enc.drop("ID",axis = 1,inplace = True)

# Logistic Regression

In [70]:
from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler, MaxAbsScaler

logreg = LogisticRegression()

imba_pipeline = make_pipeline(SMOTE(random_state=42),
                     PolynomialFeatures(interaction_only=True), 
                     logreg)

param_grid = {'polynomialfeatures__degree': [1]}

nbr_iter=200
random_params, random_score = hypertuning_rscv(imba_pipeline, param_grid,nbr_iter,X2_enc,y)

In [71]:
clf = LogisticRegression()
kf = KFold(n_splits=10, random_state=42, shuffle=False)
imba_pipeline_lg = make_pipeline(SMOTE(random_state=42),PolynomialFeatures(interaction_only=True), clf)

cv_results_auc_log = cross_val_score(imba_pipeline_lg, X2_enc, y, scoring='roc_auc', cv=kf)

In [73]:
cv_results_auc_log.mean()

0.7350214765097542

In [74]:
imba_pipeline_lg.fit(X2_enc,y)

Pipeline(steps=[('smote', SMOTE(random_state=42)),
                ('polynomialfeatures',
                 PolynomialFeatures(interaction_only=True)),
                ('logisticregression', LogisticRegression())])

In [78]:
## logistic regression prediction submitted
submission(imba_pipeline_lg,X2_test_enc)

# Random Forest

For random forest implementation, we use random search to tune hyperparameters to find the maximal 3 fold cross validation
- Oversampling (SMOTE)
- Undersampling 
- class_weight (a hyperparameter accomodating imbalanced dataset)

## Oversampling by SMOTE

### Hyperparameter tuning

In [None]:
param_grid = {
    'criterion':['gini','entropy'],
    'min_samples_split':[2,3,5,6,8,10,12],
    'bootstrap': [True],
    'max_depth': [i for i in range(1,6)],
    'max_features': [i for i in range(6,12)],
    'min_samples_leaf': [3,4,5,6,7],
    'min_samples_split': [2,3,5,6],
    'n_estimators': [300,500,800]
    #"class_weight":["balanced","balanced_subsample"]
}
nbr_iter = 200
new_params = {'randomforestclassifier__' + key: param_grid[key] for key in param_grid}
imba_pipeline = make_pipeline(SMOTE(random_state=42),RandomForestClassifier(random_state=13))


random_params, random_score = hypertuning_rscv(imba_pipeline, new_params,nbr_iter,X2_enc,y)

In [111]:
random_params

{'randomforestclassifier__n_estimators': 300,
 'randomforestclassifier__min_samples_split': 3,
 'randomforestclassifier__min_samples_leaf': 4,
 'randomforestclassifier__max_features': 6,
 'randomforestclassifier__max_depth': 2,
 'randomforestclassifier__criterion': 'gini',
 'randomforestclassifier__bootstrap': True}

### Cross validation

In [112]:
clf = RandomForestClassifier(n_estimators = 300,
 min_samples_split = 3,
 min_samples_leaf = 4,
 max_features = 6,
 max_depth = 2,
 criterion = 'gini',
 #class_weight = "balanced_subsample",
 bootstrap = True,
 n_jobs=-1)

imba_pipeline_rf_SMOTE = make_pipeline(SMOTE(random_state=42),clf)

crossval_scores = cross_validate(imba_pipeline_rf_SMOTE, X2_enc, y, cv = 10,scoring = "roc_auc",return_train_score =True)

In [114]:
crossval_scores["test_score"]

array([0.70422465, 0.75364616, 0.63425216, 0.52105357, 0.69579012,
       0.68146531, 0.67068015, 0.66615306, 0.68821541, 0.70747874])

### Random forest prediction submmited (SMOTE)

In [None]:
submission(imba_pipeline_rf_SMOTE,,X2_test_enc)

## Undersampling

In [115]:
# define resampling
resample = TomekLinks()

## undersampling pipeline
imba_pipeline_rf_un = make_pipeline(resample,RandomForestClassifier(random_state=13))

## tuned hyperparameter
random_params, random_score = hypertuning_rscv(imba_pipeline_rf_un, new_params,nbr_iter,X2_enc,y)

In [116]:
random_params

{'randomforestclassifier__n_estimators': 800,
 'randomforestclassifier__min_samples_split': 5,
 'randomforestclassifier__min_samples_leaf': 4,
 'randomforestclassifier__max_features': 7,
 'randomforestclassifier__max_depth': 2,
 'randomforestclassifier__criterion': 'gini',
 'randomforestclassifier__bootstrap': True}

### Cross validation

In [117]:
clf = RandomForestClassifier(n_estimators = 800,
 min_samples_split = 5,
 min_samples_leaf = 4,
 max_features = 7,
 max_depth = 2,
 criterion = 'gini',
 bootstrap = True,
 n_jobs=-1)

imba_pipeline_rf = make_pipeline(resample,clf)

## cross validation
crossval_scores = cross_validate(imba_pipeline_rf, X2_enc, y, cv = 10,scoring = "roc_auc",return_train_score =True)

## Class weight 

In [79]:
param_grid = {
    'criterion':['gini','entropy'],
    'min_samples_split':[2,3,5,6,8,10,12],
    'bootstrap': [True],
    'max_depth': [i for i in range(1,6)],
    'max_features': [i for i in range(6,12)],
    'min_samples_leaf': [3,4,5,6,7],
    'min_samples_split': [2,3,5,6],
    'n_estimators': [300,500,800],
    "class_weight":["balanced","balanced_subsample"]
}

clf_3 = RandomForestClassifier()

random_params, random_score = hypertuning_rscv(clf_3, param_grid,nbr_iter,X2_enc,y)

In [80]:
random_params

{'n_estimators': 800,
 'min_samples_split': 6,
 'min_samples_leaf': 4,
 'max_features': 7,
 'max_depth': 1,
 'criterion': 'entropy',
 'class_weight': 'balanced',
 'bootstrap': True}

In [81]:
clf = RandomForestClassifier(n_estimators = 300,
 min_samples_split = 3,
 min_samples_leaf = 4,
 max_features = 10,
 max_depth = 1,
 criterion = 'entropy',
 class_weight = 'balanced_subsample',
 bootstrap = True,
 n_jobs=-1)

## cross validation
crossval_scores = cross_validate(clf, X2_enc, y, cv = 10,scoring = "roc_auc",return_train_score =True)

In [82]:
crossval_scores['test_score'].mean()

0.6749240107454988

In [133]:
clf.fit(X2_enc,y)

submission(clf,X2_test_enc)

# XGBOOST

In [140]:
X2["Round_Hour"] = X2["Round_Hour"].astype("category")
X2["Day"] = X2["Day"].astype("category")
#X2["Day.of.Month"] = X2["Day.of.Month"].astype("category")
X2["Month"] = X2["Month"].astype("category")


X_test2["Round_Hour"] = X_test2["Round_Hour"].astype("category")
X_test2["Day"] = X_test2["Day"].astype("category")
#X_test2["Day.of.Month"] = X_test2["Day.of.Month"].astype("category")
X_test2["Month"] = X_test2["Month"].astype("int")

In [141]:
X2["Month"] = X2["Month"].astype("category")
#X2["Day.of.Month"] = X2["Day.of.Month"].astype("category")
X2_encode = pd.get_dummies(X2,prefix=["Round_Hour","Day","Month"])

#X_test2["Day.of.Month"] = X_test2["Day.of.Month"].astype("category")
X_test2["Month"] = X_test2["Month"].astype("category")
X_test_encode = pd.get_dummies(X_test2,prefix=["Round_Hour","Day","Month"])

In [83]:
X2_encode.rename(columns = {"cut4_(0, 2]":"cut1","cut4_(2, 3]":"cut2","cut4_(3, 4]":"cut3","cut4_(4, 5]":"cut4"},inplace =True)
X_test_encode.rename(columns = {"cut4_(0, 2]":"cut1","cut4_(2, 3]":"cut2","cut4_(3, 4]":"cut3","cut4_(4, 5]":"cut4"},inplace =True)

In [84]:
dtrain = xgb.DMatrix(X2_encode, label=y)

In [85]:
# A parameter grid for XGBoost
p_distr ={
        "objective":['reg:logistic'],
        "booster":["gbtree"],
        'n_estimators': [500,700],
        'min_child_weight': list(range(1,10,1)),
        'gamma': [0,1,3,5,7,10],
        'subsample': [0.6,0.7,0.8],
        'colsample_bytree': [0.7],
        'max_depth': list(range(8,12,1)),
        'eta': [0.1,0.2],  
        "scale_pos_weight":[3,5,6],
        #"lambda":[0.4],
}
model = XGBClassifier()
nbr_iter = 168
random_params, random_score = hypertuning_rscv(model, p_distr, nbr_iter,X2_encode,y)

In [86]:
random_params

{'subsample': 0.6,
 'scale_pos_weight': 3,
 'objective': 'reg:logistic',
 'n_estimators': 500,
 'min_child_weight': 1,
 'max_depth': 8,
 'gamma': 0,
 'eta': 0.1,
 'colsample_bytree': 0.7,
 'booster': 'gbtree'}

In [87]:
 cv_results = xgb.cv(dtrain= dtrain, params=random_params, nfold=10,num_boost_round=300,
                     metrics='auc', early_stopping_rounds = 50,
                        as_pandas=True, seed=123)

Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings bu

In [88]:
cv_results

Unnamed: 0,train-auc-mean,train-auc-std,test-auc-mean,test-auc-std
0,0.768170,0.002473,0.738616,0.010368
1,0.801723,0.008453,0.771416,0.013302
2,0.815890,0.005237,0.787373,0.008863
3,0.823562,0.004091,0.794752,0.008909
4,0.829023,0.002847,0.800170,0.009660
...,...,...,...,...
295,0.982038,0.000477,0.854014,0.008391
296,0.982139,0.000484,0.854025,0.008352
297,0.982253,0.000510,0.853958,0.008334
298,0.982373,0.000529,0.853991,0.008384


## Test 

In [93]:
xg_reg = XGBClassifier(subsample = 0.6,
              scale_pos_weight = 3,
              objective = "reg:logistic",
              min_child_weight = 1,
              n_estimators = 500,
              max_depth = 8,
              gamma = 0,
              eta = 0.1,
              colsample_bytree = 0.7,
              booster = 'gbtree',
              num_boost_round=300)

xg_reg.fit(X2_enc,y)
## submission
submission(xg_reg,X2_test_enc)

Parameters: { num_boost_round } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




# LightGBM

## LightGBM hyperparameter tuning<a class="tocSkip">
Since hyperparameter tuning is a time consuming process, we will take 7 steps to perform hyperparamter tuning by AUC: 
- step 1: tuning **max_depth** and **num_leaves** (tree complexity)
- step 2: tuning **min_data_in_leaf** and **min_sum_hessian_in_leaf** (prevent overfitting)
- step 3: tuning **feature_fraction** (prevent overfitting, decorrelate trees)
- step 4: tuning **bagging_fraction** and **bagging_freq** (prevent overfitting)
- step 5: tuning **lambda_l1(reg_alpha)** and **lambda_l2(reg_lambda)** (prevent overfitting)
- step 6: tuning **cat_smooth** (reduce the effect of noises in categorical features)
- step 7: tuning **learning_rate** and **num_iterations** (final tuning)



In [97]:
dtrain = lgb.Dataset(X2_encode,y)

### Tuning all lightGBM hyperparameters in one time

In [94]:
para_lgb = {
    'max_depth': [4,5,6,7,8,9,10,11,12,13,14,15,16,17,18],
    'num_leaves': [40,50,60,70,80,90,100],
    'min_data_in_leaf':range(1,102,10),
    'min_sum_hessian_in_leaf':[0.02,0.03,0.04,0.05],
    'feature_fraction': [0.6, 0.7, 0.8, 0.9, 0.95],
    'bagging_fraction': [0.6, 0.7, 0.8, 0.9, 0.95],
    'bagging_freq': [2, 4, 5, 6, 8],
    'lambda_l1': [0, 0.1, 0.4, 0.5, 0.6],
    'lambda_l2': [0, 10, 15, 35, 40],
    'cat_smooth': [1, 10, 15, 20, 35],
    'learning_rate': [0.01, 0.02, 0.05, 0.1, 0.15],
    #'num_iterations':range(100,1000,100)
}


model_lgb = lgb.LGBMClassifier(is_unbalance = True,metric = 'auc')

random_params_lgb, random_score_lgb = hypertuning_rscv(model_lgb, para_lgb,nbr_iter,X2_encode,y)

In [95]:
random_params_lgb

{'num_leaves': 40,
 'min_sum_hessian_in_leaf': 0.04,
 'min_data_in_leaf': 61,
 'max_depth': 4,
 'learning_rate': 0.01,
 'lambda_l2': 35,
 'lambda_l1': 0,
 'feature_fraction': 0.6,
 'cat_smooth': 20,
 'bagging_freq': 8,
 'bagging_fraction': 0.7}

### Tuning by min_data_in_leaf and min_sum_hessian_in_leaf  (step 2)

In [98]:
# LightGBM, cross-validation
cv_result_lgb = lgb.cv(random_params_lgb, 
                       dtrain, 
                       num_boost_round = 1000, 
                       metrics = "auc",
                       nfold=10, 
                       stratified=True, 
                       early_stopping_rounds=50, 
                       verbose_eval=100, 
                       eval_train_metric =True)

[100]	cv_agg's train auc: 0.775119 + 0.00111859	cv_agg's valid auc: 0.769922 + 0.0104099
[200]	cv_agg's train auc: 0.787696 + 0.00127297	cv_agg's valid auc: 0.781594 + 0.00964075
[300]	cv_agg's train auc: 0.796683 + 0.00114024	cv_agg's valid auc: 0.789772 + 0.0100093
[400]	cv_agg's train auc: 0.803041 + 0.0012455	cv_agg's valid auc: 0.795453 + 0.0100901
[500]	cv_agg's train auc: 0.80791 + 0.00112774	cv_agg's valid auc: 0.799375 + 0.0101369
[600]	cv_agg's train auc: 0.811825 + 0.0010572	cv_agg's valid auc: 0.802474 + 0.0103189
[700]	cv_agg's train auc: 0.815412 + 0.00111558	cv_agg's valid auc: 0.805255 + 0.0102686
[800]	cv_agg's train auc: 0.81867 + 0.0010543	cv_agg's valid auc: 0.807684 + 0.0104024
[900]	cv_agg's train auc: 0.822027 + 0.00098294	cv_agg's valid auc: 0.810171 + 0.0104422
[1000]	cv_agg's train auc: 0.824941 + 0.00110989	cv_agg's valid auc: 0.812159 + 0.0105018


In [120]:
X2_test_enc.drop(["ID"],axis=1,inplace = True)

In [111]:
## classfier with tuned hyperparamters
lgb_clf = lgb.LGBMClassifier(n_estimators=200, boosting_type = "gbdt",
    learning_rate = 0.01,
    max_depth = 4,
    num_leaves = 40, 
    min_sum_hessian_in_leaf = 0.04,
    min_data_in_leaf = 61,
    objective = 'binary',
    lambda_l1 = 0,
    lambda_l2 = 35,
    is_unbalance = True,
    class_weight = "balanced",
    feature_fraction = 0.6,
    cat_smooth = 20,
    bagging_fraction = 0.7,
    bagging_freq = 8,
    metrics ='auc')

## model fitting
lgb_clf.fit(X2_enc, y)

## submission
submission(lgb_clf,X2_test_enc)

## Support vector machine

In [86]:
from sklearn.svm import SVC
svclassifier = SVC(kernel='linear')
svclassifier.fit(X_train, y_train)

In [433]:
acv_results = cross_validate(svclassifier, X2_enc, y, cv = 10,return_train_score =True)

In [434]:
cv_results

{'fit_time': array([22649.269274  , 10399.8090055 , 21673.22888589,  6811.99339414,
        39778.07141256,  7923.59264469, 12070.7809546 , 11497.37369609,
        15605.28890753, 48036.19392228]),
 'score_time': array([2.89868355, 3.73362684, 2.06549835, 2.35109138, 2.04566884,
        1.91014409, 1.90039968, 1.97342229, 1.92709255, 2.09032583]),
 'test_score': array([0.81842052, 0.81842052, 0.81842052, 0.81842052, 0.81858586,
        0.81858586, 0.81838384, 0.81838384, 0.81838384, 0.81838384]),
 'train_score': array([0.81844096, 0.81844096, 0.81844096, 0.81844096, 0.81842259,
        0.81842259, 0.81844503, 0.81844503, 0.81844503, 0.81844503])}

## Voting Classifier

In [92]:
# group / ensemble of models 
estimator = [] 

## Logistic regression
kf = KFold(n_splits=10, random_state=42, shuffle=False)
log_imba_pipeline = make_pipeline(SMOTE(random_state=42),PolynomialFeatures(interaction_only=True),LogisticRegression())

## Random forest
df_imba_pipeline = make_pipeline(SMOTE(random_state=42),RandomForestClassifier(random_state=13))

## XGBoost
xgb = XGBClassifier(subsample = 0.6,
              scale_pos_weight = 3,
              objective = "reg:logistic",
              min_child_weight = 6,
              max_depth = 11,
              gamma = 0,
              eta = 0.2,
              colsample_bytree = 0.7,
              booster = 'gbtree',
              num_boost_round=300)

estimator.append(('LR',log_imba_pipeline)) 
estimator.append(('RF', df_imba_pipeline)) 
estimator.append(('XGB', xgb)) 

vot_soft = VotingClassifier(estimators = estimator, voting ='soft') 
vot_soft.fit(X2_enc, y) 

Parameters: { num_boost_round } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




VotingClassifier(estimators=[('LR',
                              Pipeline(steps=[('smote', SMOTE(random_state=42)),
                                              ('polynomialfeatures',
                                               PolynomialFeatures(interaction_only=True)),
                                              ('logisticregression',
                                               LogisticRegression())])),
                             ('RF',
                              Pipeline(steps=[('smote', SMOTE(random_state=42)),
                                              ('randomforestclassifier',
                                               RandomForestClassifier(random_state=13))])),
                             ('XGB',
                              XGBClassifier(base_sco...
                                            learning_rate=None,
                                            max_delta_step=None, max_depth=11,
                                            min_child_weight=6,

## Stacking Classifier