In [138]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter
from matplotlib import dates
import seaborn as sns
from scipy import stats
import time
import catboost
import catboost.datasets
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
import xgboost as xgb
import lightgbm as lgb
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import KFold
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate,cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import warnings
import holidays
import calendar
warnings.filterwarnings('ignore')

# Functions

In [139]:
## function of getting the optimized paramters and score
def hypertuning_rscv(est, p_distr, nbr_iter,X,y):
    rdmsearch = RandomizedSearchCV(est, param_distributions=p_distr,
                                  n_jobs=-1, n_iter=nbr_iter, cv=3)
    
    #CV = Cross-Validation (here using Stratified KFold CV)
    rdmsearch.fit(X,y)
    ht_params = rdmsearch.best_params_
    ht_score = rdmsearch.best_score_
    return ht_params, ht_score

In [None]:
class ModelOptimizer:
    best_score = None
    opt = None
    
    def __init__(self, model, X_train, y_train, categorical_columns_indices=None, n_fold=3, seed=2405, early_stopping_rounds=30, is_stratified=True, is_shuffle=True):
        self.model = model
        self.X_train = X_train
        self.y_train = y_train
        self.categorical_columns_indices = categorical_columns_indices
        self.n_fold = n_fold
        self.seed = seed
        self.early_stopping_rounds = early_stopping_rounds
        self.is_stratified = is_stratified
        self.is_shuffle = is_shuffle
        
        
    def update_model(self, **kwargs):
        for k, v in kwargs.items():
            setattr(self.model, k, v)
            
    def evaluate_model(self):
        pass
    
    def optimize(self, param_space, max_evals=10, n_random_starts=2):
        start_time = time.time()
        
        @use_named_args(param_space)
        def _minimize(**params):
            self.model.set_params(**params)
            return self.evaluate_model()
        
        opt = gp_minimize(_minimize, param_space, n_calls=max_evals, n_random_starts=n_random_starts, random_state=2405, n_jobs=-1)
        best_values = opt.x
        optimal_values = dict(zip([param.name for param in param_space], best_values))
        best_score = opt.fun
        self.best_score = best_score
        self.opt = opt
        
        print('optimal_parameters: {}\noptimal score: {}\noptimization time: {}'.format(optimal_values, best_score, time.time() - start_time))
        print('updating model with optimal values')
        self.update_model(**optimal_values)
        plot_convergence(opt)
        return optimal_values

class XgbOptimizer(ModelOptimizer):
    def evaluate_model(self):
        scores = xgboost.cv(self.model.get_xgb_params(), 
                    xgboost.DMatrix(self.X_train, label=self.y_train),
                    num_boost_round=self.model.n_estimators, 
                    metrics='auc', 
                    nfold=self.n_fold, 
                    stratified=self.is_stratified,
                    shuffle=self.is_shuffle,
                    seed=self.seed,
                    early_stopping_rounds=self.early_stopping_rounds)
        self.scores = scores
        test_scores = scores.iloc[:, 2]
        best_metric = test_scores.max()
        return 1 - best_metric
    
class CatboostOptimizer(ModelOptimizer):
    def evaluate_model(self):
        validation_scores = catboost.cv(
        catboost.Pool(self.X_train, 
                      self.y_train, 
                      cat_features=self.categorical_columns_indices),
        self.model.get_params(), 
        nfold=self.n_fold,
        stratified=self.is_stratified,
        seed=self.seed,
        early_stopping_rounds=self.early_stopping_rounds,
        shuffle=self.is_shuffle,
        plot=False)
        self.scores = validation_scores
        test_scores = validation_scores.iloc[:, 2]
        best_metric = test_scores.max()
        return 1 - best_metric
    
class LightGBMOptimizer(ModelOptimizer):
    def evaluate_model(self):
        lgb_dataset = lgb.Dataset(self.X_train, 
                                  self.y_train, 
                                  self.categorical_columns_indices)
        eval_hist = lgb.cv(self.model.get_params(), 
                           lgb_dataset,
                           self.model.n_estimators, 
                           nfold=self.n_fold,
                           seed=self.seed, 
                           stratified=self.is_stratified, 
                           shuffle=self.is_shuffle,
                           early_stopping_rounds=self.early_stopping_rounds, 
                           metrics='auc')
        self.scores = eval_hist
        test_scores = eval_hist[list(eval_hist.keys())[0]]
        best_metric = max(test_scores)
        return 1 - best_metric

In [140]:
train = pd.read_csv("traffic_clean2.csv")
train1 = pd.read_csv("train1.csv")
test = pd.read_csv("test_clean2.csv")

# Secondary feature preprocessing 

## Reduce the Vessel.name categories that are not present in the test set

In [141]:
vessel_redun = [i for i in np.unique(train1["Vessel.Name"]) if i not in np.unique(test["Vessel.Name"])]
train1 = train1.loc[~train1["Vessel.Name"].isin(vessel_redun),:]

## Categorize the traffic lags from 0 to 5 to reduce noises.

In [142]:
trai = train1["Scheduled.Departure"].apply(lambda x: pd.to_datetime(x).strftime("%H:%M"))
tes = test["Scheduled.Departure"].apply(lambda x: pd.to_datetime(x).strftime("%H:%M"))
vessel_sche = [i for i in np.unique(trai) if i not in np.unique(tes)]
train1 = train1.loc[~train1["Scheduled.Departure"].isin(vessel_sche),:]

train1.reset_index(drop = True,inplace = True)

## train cut
train1["cut1"] = pd.cut(train.lag1 , [0,2,3,4,5], right=True)
train1["cut2"] = pd.cut(train.lag2 , [0,2,3,4,5], right=True)
train1["cut3"] = pd.cut(train.lag3 , [0,2,3,4,5], right=True)
train1["cut4"] = pd.cut(train.lag4 , [0,2,3,4,5], right=True)

## test cut
test["cut1"] = pd.cut(test.lag1 , [0,2,3,4,5], right=True)
test["cut2"] = pd.cut(test.lag2 , [0,2,3,4,5], right=True)
test["cut3"] = pd.cut(test.lag3 , [0,2,3,4,5], right=True)
test["cut4"] = pd.cut(test.lag4 , [0,2,3,4,5], right=True)

In [143]:
train1["Vessel.Name"].value_counts()

Queen of Capilano             6504
Queen of Surrey               5634
Queen of Oak Bay              3433
Skeena Queen                  3293
Coastal Celebration           2863
Coastal Inspiration           2813
Queen of Coquitlam            2702
Spirit of Vancouver Island    2658
Queen of Alberni              2547
Coastal Renaissance           2499
Queen of Cowichan             2317
Queen of New Westminster      1987
Queen of Cumberland           1739
Mayne Queen                   1033
Bowen Queen                    261
Salish Eagle                   241
Salish Raven                   192
Name: Vessel.Name, dtype: int64

## Combine the Vessel categories with similar delay frequencies

In [190]:
## Visualize the delay frequency by category
train1[["Vessel.Name","Delay.Indicator"]].groupby("Vessel.Name").mean().sort_values(by = 'Delay.Indicator')

## combine the Vessel categories with similar delay frequencies
train1.loc[train1["Vessel.Name"].isin(["Skeena Queen","Mayne Queen",'Queen of Alberni']),"Vessel"] = "name1"
train1.loc[train1["Vessel.Name"].isin(["Coastal Renaissance","Queen of New Westminster","Bowen Queen","Coastal Inspiration","Queen of Cumberland","Coastal Celebration"]),"Vessel"] = "name2"
train1.loc[train1["Vessel.Name"].isin(['Queen of Coquitlam',"Spirit of Vancouver Island","Queen of Cowichan","Queen of Capilano"]),"Vessel"] = "name3"
train1.loc[train1["Vessel.Name"].isin(["Queen of Oak Bay","Salish Raven","Queen of Surrey","Salish Eagle"]),"Vessel"] = "name4"

test.loc[train1["Vessel.Name"].isin(["Skeena Queen","Mayne Queen",'Queen of Alberni']),"Vessel"] = "name1"
test.loc[train1["Vessel.Name"].isin(["Coastal Renaissance","Queen of New Westminster","Bowen Queen","Coastal Inspiration","Queen of Cumberland","Coastal Celebration"]),"Vessel"] = "name2"
test.loc[train1["Vessel.Name"].isin(['Queen of Coquitlam',"Spirit of Vancouver Island","Queen of Cowichan","Queen of Capilano"]),"Vessel"] = "name3"
test.loc[train1["Vessel.Name"].isin(["Queen of Oak Bay","Salish Raven","Queen of Surrey","Salish Eagle"]),"Vessel"] = "name4"

## Extract the feature Round Hour

In [147]:
## Extract the feature Round Hour
train1.drop("Unnamed: 0",axis=1,inplace =True)
train1["Date_time"] = pd.to_datetime(train1["Date_time"])
test["Date_time"] = pd.to_datetime(test["Date_time"])

train1["Round_time"] = train1["Date_time"].dt.round("H")
train1["Round_Hour"] = train1["Round_time"].apply(lambda x: x.strftime("%H"))

test["Round_time"] = test["Date_time"].dt.round("H")
test["Round_Hour"] = test["Round_time"].apply(lambda x: x.strftime("%H"))

## Feature extraction

In [191]:
X2 = train1[["Round_Hour","Month","Day",
       'Vessel',"Day.of.Month","Trip","holidays_indicator","lag3","van_tem_lag2","weekend.Indicator"]]
X2_enc = pd.get_dummies(X2,prefix=["Round_Hour","Vessel","Trip"])

y = train1["Delay.Indicator"]

X_test2 = test[["Round_Hour","Month","Day",
       'Vessel',"Day.of.Month","Trip","holidays_indicator","lag3","van_tem_lag2","weekend.Indicator"]]

X2_test_enc = pd.get_dummies(X_test2,prefix=["Round_Hour","Vessel","Trip"])

# Logistic Regression

In [154]:
logreg = LogisticRegression(class_weight ="balanced")
cv_results = cross_validate(logreg, X2_enc, y, cv=10,return_train_score =True)
crossval_scores_log = cross_val_score(logreg, X2_enc, y, scoring='roc_auc', cv=10)

In [155]:
cv_results["test_score"].mean()

0.6349779115224442

In [156]:
cv_results["train_score"].mean()

0.6491035824053807

In [55]:
# feed new data to the model
logreg.fit(X2_enc, y)
# make a prediction
ynew = logreg.predict_proba(X2_test_enc)

In [56]:
X2_test_enc

Unnamed: 0,Month,Day,Day.of.Month,holidays_indicator,van_tem_lag2,weekend.Indicator,Round_Hour_00,Round_Hour_05,Round_Hour_06,Round_Hour_07,...,Trip_Horseshoe Bay to Departure Bay,Trip_Horseshoe Bay to Langdale,Trip_Horseshoe Bay to Snug Cove (Bowen Is.),Trip_Langdale to Horseshoe Bay,Trip_Swartz Bay to Fulford Harbour (Saltspring Is.),Trip_Swartz Bay to Southern Gulf Islands,Trip_Swartz Bay to Tsawwassen,Trip_Tsawwassen to Duke Point,Trip_Tsawwassen to Southern Gulf Islands,Trip_Tsawwassen to Swartz Bay
0,11,0,27,False,6.0,False,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,11,0,27,False,5.0,False,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,11,0,27,False,5.0,False,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
3,11,0,27,False,5.0,False,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
4,11,0,27,False,5.0,False,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12371,3,4,30,True,7.6,False,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
12372,3,4,30,True,7.6,False,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
12373,3,4,30,True,7.2,False,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
12374,3,4,30,True,7.2,False,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [57]:
ynew[:,1]

array([0.20166592, 0.05440241, 0.05893955, ..., 0.37421961, 0.26746578,
       0.63784809])

In [58]:
test["ID"] = test.index +1
ctbsubmission = pd.concat([pd.DataFrame(ynew[:,1]),test['ID']],axis=1)
ctbsubmission.columns = ['Delay.Indicator','ID']

In [50]:
train1.iloc[0,:]

Vessel.Name                             Queen of Alberni
Scheduled.Departure                                05:15
Trip                            Tsawwassen to Duke Point
Day                                                    6
Month                                                  8
Day.of.Month                                          28
Year                                                2016
Full.Date                                     2016-08-28
Delay.Indicator                                        0
Vessel.Name_Kfold_Target_Enc                   0.0875576
Trip_Kfold_Target_Enc                           0.112426
weekend.Indicator                                   True
Date_time                            2016-08-28 05:15:00
holidays_indicator                                 False
lag1                                                   1
lag2                                                   1
lag3                                                   1
lag4                           

In [59]:
ctbsubmission

Unnamed: 0,Delay.Indicator,ID
0,0.201666,1
1,0.054402,2
2,0.058940,3
3,0.060338,4
4,0.075535,5
...,...,...
12371,0.602196,12372
12372,0.557607,12373
12373,0.374220,12374
12374,0.267466,12375


In [60]:
ctbsubmission.to_csv('log_predictions_1.csv',index=False)

# Random Forest

For random forest implementation, we use random search to tune hyperparameters to find the maximal 3 fold cross validation

In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

param_grid = {
    'criterion':['gini','entropy'],
    'min_samples_split':[2,3,5,6,8,10,12],
    'bootstrap': [True],
    'max_depth': [i for i in range(1,6)],
    'max_features': [i for i in range(6,12)],
    'min_samples_leaf': [3,4,5,6,7],
    'min_samples_split': [2,3,5,6],
    'n_estimators': [300,500,800],
    "class_weight":["balanced","balanced_subsample"]
}
nbr_iter = 200
est = RandomForestClassifier()
random_params, random_score = hypertuning_rscv(est, param_grid, nbr_iter,X2,y)

In [21]:
random_params

{'n_estimators': 500,
 'min_samples_split': 3,
 'min_samples_leaf': 4,
 'max_features': 6,
 'max_depth': 5,
 'criterion': 'entropy',
 'class_weight': 'balanced_subsample',
 'bootstrap': True}

In [157]:
clf = RandomForestClassifier(n_estimators = 500,
 min_samples_split = 3,
 min_samples_leaf = 4,
 max_features = 6,
 max_depth = 5,
 criterion = 'entropy',
 class_weight = "balanced_subsample",
 bootstrap = True,
 n_jobs=-1)

In [158]:
crossval_scores = cross_validate(clf, X2_enc, y, cv = 10,return_train_score =True)

In [160]:
crossval_scores

{'fit_time': array([6.62857175, 4.4141984 , 4.13211751, 4.15775824, 4.04812169,
        4.39369369, 3.96480107, 4.36510873, 3.98481607, 4.02431536]),
 'score_time': array([0.22401762, 0.25931978, 0.2210207 , 0.23071909, 0.22601795,
        0.3298676 , 0.22127795, 0.22601819, 0.22501969, 0.23403502]),
 'test_score': array([0.53979401, 0.66104869, 0.80571161, 0.69990637, 0.52879213,
        0.4946161 , 0.50760946, 0.49449778, 0.55841723, 0.61367361]),
 'train_score': array([0.63172407, 0.64764333, 0.60396941, 0.65261159, 0.64535428,
        0.65164915, 0.65459748, 0.66253089, 0.65982573, 0.65568995])}

In [161]:
crossval_scores["train_score"].mean()

0.6465595862968554

In [162]:
crossval_scores["test_score"].mean()

0.5904066993932602

In [41]:
clf.fit(X2_enc, y)

# make a prediction
ynew = clf.predict_proba(X2_test_enc)

In [43]:
test["ID"] = test.index +1

In [44]:
ctbsubmission = pd.concat([pd.DataFrame(ynew[:,1]),test['ID']],axis=1)
ctbsubmission.columns = ['Delay.Indicator','ID']

In [45]:
ctbsubmission.to_csv('random_predictions_2.csv',index=False)

In [42]:
ynew[:,1]

array([0.43528374, 0.31501186, 0.31061838, ..., 0.42821229, 0.37339628,
       0.48209109])

In [46]:
ctbsubmission

Unnamed: 0,Delay.Indicator,ID
0,0.435284,1
1,0.315012,2
2,0.310618,3
3,0.321119,4
4,0.340970,5
...,...,...
12371,0.485879,12372
12372,0.455230,12373
12373,0.428212,12374
12374,0.373396,12375


In [168]:
X2.dtypes

Round_Hour            category
Month                    int32
Day                   category
Vessel                  object
Day.of.Month             int32
Trip                    object
holidays_indicator        bool
cut4                  category
van_tem_lag2           float64
weekend.Indicator         bool
dtype: object

# XGBOOST

In [192]:
X2["Round_Hour"] = X2["Round_Hour"].astype("category")
X2["Day"] = X2["Day"].astype("category")
X2["Day.of.Month"] = X2["Day.of.Month"].astype("category")
X2["Month"] = X2["Month"].astype("category")


X_test2["Round_Hour"] = X_test2["Round_Hour"].astype("category")
X_test2["Day"] = X_test2["Day"].astype("category")
X_test2["Day.of.Month"] = X_test2["Day.of.Month"].astype("category")
X_test2["Month"] = X_test2["Month"].astype("int")

In [193]:
X2["Month"] = X2["Month"].astype("int")
X2["Day.of.Month"] = X2["Day.of.Month"].astype("int")
X2_encode = pd.get_dummies(X2,prefix=["Round_Hour",'Vessel',"Trip","Day"])

X_test2["Day.of.Month"] = X_test2["Day.of.Month"].astype("int")
X_test2["Month"] = X_test2["Month"].astype("int")
X_test_encode = pd.get_dummies(X_test2,prefix=["Round_Hour",'Vessel',"Trip","Day"])

In [194]:
X2_encode.rename(columns = {"cut4_(0, 2]":"cut1","cut4_(2, 3]":"cut2","cut4_(3, 4]":"cut3","cut4_(4, 5]":"cut4"},inplace =True)
X_test_encode.rename(columns = {"cut4_(0, 2]":"cut1","cut4_(2, 3]":"cut2","cut4_(3, 4]":"cut3","cut4_(4, 5]":"cut4"},inplace =True)

In [187]:
dtrain = xgb.DMatrix(X2_encode, label=y)

In [202]:
# A parameter grid for XGBoost
p_distr ={
        "objective":['reg:logistic'],
        "booster":["gbtree"],
        #'n_estimators': [300],
        #'min_child_weight': [7],
        #'gamma': [7],
        'subsample': [0.7],
        'colsample_bytree': [0.7],
        #'max_depth': [3,4,5,6,7,8,9,10],
        #'eta': [0.2],
        #"scale_pos_weight":[5],
        "lambda":[0.4],
        #"alpha":[0.9]
}

est =  XGBClassifier()
nbr_iter = 120

random_params, random_score = hypertuning_rscv(est, p_distr, nbr_iter,X2_encode,y)

In [70]:
random_params

{'subsample': 0.7,
 'objective': 'reg:logistic',
 'lambda': 0.4,
 'colsample_bytree': 0.7,
 'booster': 'gbtree'}

In [188]:
 cv_results = xgb.cv(dtrain= dtrain, params=random_params, nfold=10,num_boost_round=300,
                     metrics='auc', early_stopping_rounds = 50,
                        as_pandas=True, seed=123)

Parameters: { bootstrap, class_weight, criterion, max_features, min_samples_leaf, min_samples_split, n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { bootstrap, class_weight, criterion, max_features, min_samples_leaf, min_samples_split, n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { bootstrap, class_weight, criterion, max_features, min_samples_leaf, min_samples_split, n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost 

In [189]:
cv_results

Unnamed: 0,train-auc-mean,train-auc-std,test-auc-mean,test-auc-std
0,0.738000,0.009857,0.733703,0.012302
1,0.756530,0.002475,0.751421,0.009920
2,0.770740,0.002577,0.764712,0.007153
3,0.779473,0.002121,0.771080,0.006872
4,0.786726,0.002449,0.777465,0.007017
...,...,...,...,...
295,0.951457,0.000909,0.846751,0.007307
296,0.951626,0.000923,0.846760,0.007332
297,0.951789,0.000916,0.846744,0.007294
298,0.951917,0.000914,0.846713,0.007298


In [22]:
est =  XGBClassifier()

## Test 

In [195]:
xg_reg = XGBClassifier(subsample = 0.7,
 objective = "reg:logistic",
 colsample_bytree = 0.7,
 booster = 'gbtree',num_boost_round=300)

In [196]:
xg_reg.fit(X2_encode,y)

Parameters: { num_boost_round } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_boost_round=300,
              num_parallel_tree=1, objective='reg:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.7,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [197]:
preds = xg_reg.predict_proba(X_test_encode)

In [198]:
test["ID"] = test.index +1

In [199]:
ctbsubmission = pd.concat([pd.DataFrame(preds[:,1]),test['ID']],axis=1)
ctbsubmission.columns = ['Delay.Indicator','ID']

In [200]:
ctbsubmission

Unnamed: 0,Delay.Indicator,ID
0,0.016838,1
1,0.005968,2
2,0.002639,3
3,0.005356,4
4,0.004510,5
...,...,...
12371,0.119778,12372
12372,0.316162,12373
12373,0.434904,12374
12374,0.080282,12375


In [201]:
ctbsubmission.to_csv('xgb_logistic_predictions_2.csv',index=False)

In [107]:
ctbsubmission["Delay.Indicator"].value_counts()

0.072874    2
0.064088    2
0.001841    2
0.076633    2
0.069706    2
           ..
0.004682    1
0.301621    1
0.091147    1
0.018880    1
0.005249    1
Name: Delay.Indicator, Length: 12208, dtype: int64

## LightGBM

In [373]:
X2["Round_Hour"] = X2["Round_Hour"].astype('category')

In [395]:
dtrain = lgb.Dataset(X2,y)

params = {
    'boosting_type':'gbdt',
    'learning_rate': 0.01,
    'max_depth': 5,
    'num_leaves': 40, 
    'objective':'binary',
    'class_weight':'balanced',
    'feature_fraction': 0.75,
    'bagging_fraction': 0.75,
    'max_bin': 100}

# LightGBM, cross-validation
cv_result_lgb = lgb.cv(params, 
                       dtrain, 
                       num_boost_round=1000, 
                       metrics = "auc",
                       nfold=10, 
                       stratified=True, 
                       early_stopping_rounds=50, 
                       verbose_eval=100, 
                       eval_train_metric =True)

[100]	cv_agg's train auc: 0.789679 + 0.000814879	cv_agg's valid auc: 0.782431 + 0.0066354
[200]	cv_agg's train auc: 0.801386 + 0.000638589	cv_agg's valid auc: 0.792902 + 0.00655273
[300]	cv_agg's train auc: 0.812434 + 0.000816405	cv_agg's valid auc: 0.802217 + 0.00590343
[400]	cv_agg's train auc: 0.823671 + 0.000943514	cv_agg's valid auc: 0.811534 + 0.0051792
[500]	cv_agg's train auc: 0.832589 + 0.0011072	cv_agg's valid auc: 0.818514 + 0.00470185
[600]	cv_agg's train auc: 0.841104 + 0.00119141	cv_agg's valid auc: 0.825065 + 0.0045277
[700]	cv_agg's train auc: 0.848504 + 0.00109121	cv_agg's valid auc: 0.83064 + 0.00435385
[800]	cv_agg's train auc: 0.854919 + 0.000882007	cv_agg's valid auc: 0.835295 + 0.00440489
[900]	cv_agg's train auc: 0.860224 + 0.000776781	cv_agg's valid auc: 0.83905 + 0.00450707
[1000]	cv_agg's train auc: 0.864786 + 0.000682995	cv_agg's valid auc: 0.841996 + 0.0046066


In [425]:
lgb_clf = lgb.LGBMClassifier(n_estimators=200, boosting_type = "gbdt",
    learning_rate = 0.01,
    max_depth = 5,
    num_leaves = 40, 
    objective = 'binary',
    class_weight = "balanced",
    feature_fraction = 0.75,
    bagging_fraction = 0.75,
    max_bin = 100,metrics ='auc')
lgb_clf.fit(X2_enc, y)
preds = lgb_clf.predict_proba(X2_test_enc)

In [406]:
X2_enc
import re
X2_enc = X2_enc.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

In [428]:
preds[:,1]

array([0.42294236, 0.13748757, 0.13705473, ..., 0.31954765, 0.20150389,
       0.5732822 ])

In [427]:
test["ID"] = test.index +1
ctbsubmission = pd.concat([pd.DataFrame(preds[:,1]),test['ID']],axis=1)
ctbsubmission.columns = ['Delay.Indicator','ID']
ctbsubmission.to_csv('lgt_predictions_1.csv',index=False)

## Support vector machine

In [431]:
from sklearn.svm import SVC
svclassifier = SVC(kernel='linear')
svclassifier.fit(X_train, y_train)

In [433]:
acv_results = cross_validate(svclassifier, X2_enc, y, cv = 10,return_train_score =True)

In [434]:
cv_results

{'fit_time': array([22649.269274  , 10399.8090055 , 21673.22888589,  6811.99339414,
        39778.07141256,  7923.59264469, 12070.7809546 , 11497.37369609,
        15605.28890753, 48036.19392228]),
 'score_time': array([2.89868355, 3.73362684, 2.06549835, 2.35109138, 2.04566884,
        1.91014409, 1.90039968, 1.97342229, 1.92709255, 2.09032583]),
 'test_score': array([0.81842052, 0.81842052, 0.81842052, 0.81842052, 0.81858586,
        0.81858586, 0.81838384, 0.81838384, 0.81838384, 0.81838384]),
 'train_score': array([0.81844096, 0.81844096, 0.81844096, 0.81844096, 0.81842259,
        0.81842259, 0.81844503, 0.81844503, 0.81844503, 0.81844503])}