In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, KFold, RandomizedSearchCV
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from lightgbm import LGBMClassifier
import lightgbm as lgb


In [3]:
policy_df= pd.read_csv(r"C:\Ziyuan Sui\MSBA\Fall22\Travelers\data\finalized.csv")

Model Building

In [4]:
import re
policy_df = policy_df.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

In [5]:
ptrain = policy_df[policy_df['split'] == 'Train'].drop('split', axis=1)
msk = np.random.rand(len(ptrain)) < 0.75
df = ptrain.drop(['policy_id'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(df.drop(['convert_ind'], axis=1), df['convert_ind'], test_size=0.25)
train, test = df[msk], df[~msk]
predictors = X_test.columns.values
target = 'convert_ind'

In [6]:
lgb2 = LGBMClassifier()

In [7]:
param_test = {
 'num_leaves':range(5,45,5),
 'min_data_in_leaf':range(5,100,5),
 'reg_alpha': range(4,20,2)
}

In [53]:
gs = GridSearchCV(estimator=lgb2,
                  param_grid=param_test,
                  scoring='roc_auc',
                  cv=5,
                  n_jobs=-1)

gs= gs.fit(X_train,y_train)
print("\n Parameter Tuning #4")
print("Non-nested CV Accuracy: ", gs.best_score_)
print("Optimal Parameter: ", gs.best_params_)
print("Optimal Estimator: ", gs.best_estimator_)



 Parameter Tuning #4
Non-nested CV Accuracy:  0.6973321217286484
Optimal Parameter:  {'min_data_in_leaf': 95, 'num_leaves': 10, 'reg_alpha': 10}
Optimal Estimator:  LGBMClassifier(min_data_in_leaf=95, num_leaves=10, reg_alpha=10)


In [15]:
rs = RandomizedSearchCV(estimator=lgb2,
                  param_distributions=param_test,
                  scoring='roc_auc',
                  cv=10,
                  n_jobs=-1)

rs= rs.fit(X_train,y_train)
print("\n Parameter Tuning #4")
print("Non-nested CV Accuracy: ", rs.best_score_)
print("Optimal Parameter: ", rs.best_params_)
print("Optimal Estimator: ", rs.best_estimator_)


 Parameter Tuning #4
Non-nested CV Accuracy:  0.692298484435789
Optimal Parameter:  {'reg_alpha': 10, 'num_leaves': 30, 'min_data_in_leaf': 35, 'feature_fraction': 0.7, 'bagging_fraction': 0.8}
Optimal Estimator:  LGBMClassifier(bagging_fraction=0.8, feature_fraction=0.7, min_data_in_leaf=35,
               num_leaves=30, reg_alpha=10)


In [34]:
policy_train_df = policy_df[policy_df['split'] == 'Train'].drop('split', axis=1)
policy_test_df = policy_df[policy_df['split'] == 'Test'].drop('split', axis=1)

In [35]:
ptrain, ptest = policy_train_df.copy(), policy_test_df.copy()
X, y = ptrain.drop(['convert_ind', 'policy_id'], axis=1), ptrain['convert_ind']
X_test = ptest.drop(['convert_ind', 'policy_id'], axis=1)

In [36]:
X_test.shape

(12291, 34)

In [41]:
policy_df.shape

(49162, 37)

In [22]:
lgb3 = LGBMClassifier(min_data_in_leaf=70, num_leaves=20, reg_alpha=10)

In [25]:
param_test2 = {
'n_estimators' : range(500,6000,500)
}

In [26]:
gs2 = GridSearchCV(estimator=lgb3,
                  param_grid=param_test2,
                  scoring='roc_auc',
                  cv=10,
                  n_jobs=-1)

gs2= gs2.fit(X_train,y_train)
print("auc: ", gs2.best_score_)
print("Optimal Parameter: ", gs2.best_params_)
print("Optimal Estimator: ", gs2.best_estimator_)


auc:  0.6852384834309744
Optimal Parameter:  {'n_estimators': 500}
Optimal Estimator:  LGBMClassifier(min_data_in_leaf=70, n_estimators=500, num_leaves=20,
               reg_alpha=10)


In [39]:
lgb3.fit(X,y)
y_pred = lgb3.predict_proba(X_test)

In [40]:
TARGET=y_pred[:,1]

In [43]:
submission = pd.DataFrame({
    "policy_id": policy_test_df["policy_id"],
    "TARGET": TARGET
})

In [44]:
submission.to_csv(r"C:\Ziyuan Sui\MSBA\Fall22\Travelers\submission4.csv",index=False, mode='w')

In [36]:
lgb4 = LGBMClassifier(min_data_in_leaf=95, num_leaves=10, reg_alpha=10)

In [39]:
param_test4 = {
'bagging_fraction':[i/10.0 for i in range(1,10)],
 'feature_fraction':[i/10.0 for i in range(1,10)]
}

In [40]:
gs4 = GridSearchCV(estimator=lgb4,
                  param_grid=param_test4,
                  scoring='roc_auc',
                  cv=5,
                  n_jobs=-1)

gs4= gs4.fit(X_train,y_train)
print("auc: ", gs4.best_score_)
print("Optimal Parameter: ", gs4.best_params_)
print("Optimal Estimator: ", gs4.best_estimator_)

auc:  0.698302226981198
Optimal Parameter:  {'bagging_fraction': 0.1, 'feature_fraction': 0.5}
Optimal Estimator:  LGBMClassifier(bagging_fraction=0.1, feature_fraction=0.5, min_data_in_leaf=95,
               num_leaves=10, reg_alpha=10)


In [44]:
lgb5 = LGBMClassifier(min_data_in_leaf=95, num_leaves=10, reg_alpha=10)

In [47]:
param_test5 = {
'bagging_fraction':[i/10.0 for i in range(1,10)],
 'feature_fraction':[i/10.0 for i in range(1,10)],
 'min_sum_hessian_in_leaf':[i for i in range(11,40,2)]
}

In [48]:
gs5 = GridSearchCV(estimator=lgb5,
                  param_grid=param_test5,
                  scoring='roc_auc',
                  cv=5,
                  n_jobs=-1)

gs5= gs5.fit(X_train,y_train)
print("auc: ", gs5.best_score_)
print("Optimal Parameter: ", gs5.best_params_)
print("Optimal Estimator: ", gs5.best_estimator_)

auc:  0.6987419811050097
Optimal Parameter:  {'bagging_fraction': 0.1, 'feature_fraction': 0.5, 'min_sum_hessian_in_leaf': 29}
Optimal Estimator:  LGBMClassifier(bagging_fraction=0.1, feature_fraction=0.5, min_data_in_leaf=95,
               min_sum_hessian_in_leaf=29, num_leaves=10, reg_alpha=10)


In [22]:
lgb6 = LGBMClassifier(bagging_fraction=0.1, feature_fraction=0.5, min_data_in_leaf=95,
               min_sum_hessian_in_leaf=29, num_leaves=10, reg_alpha=10)

param_test6 = {
'random_seed':range(8800,9200,1)

}
gs6 = GridSearchCV(estimator=lgb6,
                  param_grid=param_test6,
                  scoring='roc_auc',
                  cv=5,
                  n_jobs=-1)

gs6= gs6.fit(X_train,y_train)
print("auc: ", gs6.best_score_)
print("Optimal Parameter: ", gs6.best_params_)
print("Optimal Estimator: ", gs6.best_estimator_)

auc:  0.7040793278128996
Optimal Parameter:  {'random_seed': 9166}
Optimal Estimator:  LGBMClassifier(bagging_fraction=0.1, feature_fraction=0.5, min_data_in_leaf=95,
               min_sum_hessian_in_leaf=29, num_leaves=10, random_seed=9166,
               reg_alpha=10)


In [17]:
lgb6 = LGBMClassifier(bagging_fraction=0.1, feature_fraction=0.5, min_data_in_leaf=95,
               min_sum_hessian_in_leaf=29, num_leaves=10, reg_alpha=10)

param_test6 = {
'random_seed':range(6800,7200,1)
}
gs6 = GridSearchCV(estimator=lgb6,
                  param_grid=param_test6,
                  scoring='roc_auc',
                  cv=5,
                  n_jobs=-1)

gs6= gs6.fit(X_train,y_train)
print("auc: ", gs6.best_score_)
print("Optimal Parameter: ", gs6.best_params_)
print("Optimal Estimator: ", gs6.best_estimator_)

auc:  0.7045604612701543
Optimal Parameter:  {'random_seed': 7044}
Optimal Estimator:  LGBMClassifier(bagging_fraction=0.1, feature_fraction=0.5, min_data_in_leaf=95,
               min_sum_hessian_in_leaf=29, num_leaves=10, random_seed=7044,
               reg_alpha=10)


In [14]:
lgb7 = LGBMClassifier(bagging_fraction=0.1, feature_fraction=0.4, min_data_in_leaf=90,
               min_sum_hessian_in_leaf=6, num_leaves=10, reg_alpha=10)

param_test7 = {
'learning_rate':[0,0.1,0.15,0.05]
}
gs7 = GridSearchCV(estimator=lgb7,
                  param_grid=param_test7,
                  scoring='roc_auc',
                  cv=5,
                  n_jobs=-1)

gs7= gs7.fit(X_train,y_train)
print("auc: ", gs7.best_score_)
print("Optimal Parameter: ", gs7.best_params_)
print("Optimal Estimator: ", gs7.best_estimator_)





auc:  0.6978408156805933
Optimal Parameter:  {'learning_rate': 0.1}
Optimal Estimator:  LGBMClassifier(bagging_fraction=0.1,
               class_weight={0: 0.5627391127391127, 1: 4.484755108660396},
               feature_fraction=0.4, min_data_in_leaf=90,
               min_sum_hessian_in_leaf=6, num_leaves=10, reg_alpha=10)


In [15]:
policy_train_df = policy_df[policy_df['split'] == 'Train'].drop('split', axis=1)
policy_test_df = policy_df[policy_df['split'] == 'Test'].drop('split', axis=1)
ptrain, ptest = policy_train_df.copy(), policy_test_df.copy()
X, y = ptrain.drop(['convert_ind', 'policy_id'], axis=1), ptrain['convert_ind']
X_test = ptest.drop(['convert_ind', 'policy_id'], axis=1)

In [29]:
lgb = LGBMClassifier(bagging_fraction=0.1,
               feature_fraction=0.4, min_data_in_leaf=90,
               min_sum_hessian_in_leaf=6, num_leaves=10, reg_alpha=10)

In [30]:
lgb.fit(X,y)
y_pred = lgb.predict_proba(X_test)



In [31]:
TARGET=y_pred[:,1]
submission = pd.DataFrame({
    "policy_id": policy_test_df["policy_id"],
    "TARGET": TARGET
})

In [32]:
submission.to_csv(r"C:\Ziyuan Sui\MSBA\Fall22\Travelers\submission8.csv",index=False, mode='w')