In [14]:
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import KFold
from sklearn import preprocessing
from datetime import datetime
import xgboost as xgb
import pandas as pd
import numpy as np
import csv

In [12]:
features_to_encode = ['Gender', 'Married', 'Children', 'Phone_Service', 'Multiple_Lines', 'Internet_Service', 'Online_Security', 'Online_Backup', 'Device_Protection', 'Tech_Support', 'Streaming_TV', 'Streaming_Movies', 'Type_contract', 'Paperless_Billing', 'Payment_Method']
train = pd.read_csv('train.csv')
train = train.replace({'': 0, ' ': 0})
le = preprocessing.LabelEncoder()
for feature in features_to_encode:
    train[feature] = le.fit_transform(train[feature])
train.head()

Unnamed: 0,id,Leave_Next_Month,Gender,Pensioner,Married,Children,Months_with_company,Phone_Service,Multiple_Lines,Internet_Service,...,Online_Backup,Device_Protection,Tech_Support,Streaming_TV,Streaming_Movies,Type_contract,Paperless_Billing,Payment_Method,Monthly_Payment,Total_Payment
0,104,1,0,0,0,0,2,1,0,1,...,0,0,0,0,0,0,1,1,70.7,151.65
1,107,0,0,0,0,0,10,0,1,0,...,0,0,0,0,0,0,0,2,29.75,301.9
2,108,1,0,0,1,0,28,1,2,1,...,0,2,2,2,2,0,1,1,104.8,3046.05
3,109,0,1,0,0,1,62,1,0,0,...,2,0,0,0,0,1,0,0,56.15,3487.95
4,111,0,1,0,0,0,16,1,0,2,...,1,1,1,1,1,2,0,3,18.95,326.8


In [13]:
test = pd.read_csv('test.csv')
test = test.replace({'': 0, ' ': 0})
for feature in features_to_encode:
    test[feature] = le.fit_transform(test[feature])
train.head()

Unnamed: 0,id,Leave_Next_Month,Gender,Pensioner,Married,Children,Months_with_company,Phone_Service,Multiple_Lines,Internet_Service,...,Online_Backup,Device_Protection,Tech_Support,Streaming_TV,Streaming_Movies,Type_contract,Paperless_Billing,Payment_Method,Monthly_Payment,Total_Payment
0,104,1,0,0,0,0,2,1,0,1,...,0,0,0,0,0,0,1,1,70.7,151.65
1,107,0,0,0,0,0,10,0,1,0,...,0,0,0,0,0,0,0,2,29.75,301.9
2,108,1,0,0,1,0,28,1,2,1,...,0,2,2,2,2,0,1,1,104.8,3046.05
3,109,0,1,0,0,1,62,1,0,0,...,2,0,0,0,0,1,0,0,56.15,3487.95
4,111,0,1,0,0,0,16,1,0,2,...,1,1,1,1,1,2,0,3,18.95,326.8


In [28]:
features = list(test.columns)
features.remove('id')

params_xgb = {
    "objective": "binary:logistic",
    "eval_metric": 'auc',
    "eta": 0.05,
    "max_depth": 2,
    "subsample": 0.6,
    "colsample_bytree": 0.8,
}
num_boost_round = 200
early_stopping_rounds = 10
kf = KFold(5, shuffle=True, random_state=5)
pred = np.zeros(len(test))
for train_index, test_index in kf.split(train):
    train_data = train.loc[train_index]
    test_data = train.loc[test_index]
    x_train = train_data[features]
    y_train = train_data['Leave_Next_Month']
    x_test = test_data[features]
    y_test = test_data['Leave_Next_Month']
    dtrain = xgb.DMatrix(x_train.values, y_train.values)
    dvalid = xgb.DMatrix(x_test.values, y_test.values)
    watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
    gbm = xgb.train(params_xgb, dtrain, num_boost_round, evals=watchlist, early_stopping_rounds=early_stopping_rounds, verbose_eval=10)
    test_pred = gbm.predict(xgb.DMatrix(x_test.values), iteration_range=(0, gbm.best_iteration + 1))
    accuracy = accuracy_score(y_test.values, np.round(test_pred))
    auc = roc_auc_score(y_test.values, test_pred)
    print('Accuracy: {:.2f} %, ROC AUC: {:.2f}'.format(100*accuracy, auc))
    pred += gbm.predict(xgb.DMatrix(test[features].values), iteration_range=(0, gbm.best_iteration + 1))

[0]	train-auc:0.77224	eval-auc:0.75993
[10]	train-auc:0.84132	eval-auc:0.81573
[20]	train-auc:0.84761	eval-auc:0.81997
[30]	train-auc:0.85016	eval-auc:0.82204
[40]	train-auc:0.85537	eval-auc:0.82396
[50]	train-auc:0.85951	eval-auc:0.82550
[60]	train-auc:0.86251	eval-auc:0.82659
[70]	train-auc:0.86549	eval-auc:0.83022
[80]	train-auc:0.86717	eval-auc:0.83288
[90]	train-auc:0.86872	eval-auc:0.83349
[100]	train-auc:0.87008	eval-auc:0.83452
[110]	train-auc:0.87149	eval-auc:0.83435
[120]	train-auc:0.87249	eval-auc:0.83554
[130]	train-auc:0.87342	eval-auc:0.83624
[140]	train-auc:0.87418	eval-auc:0.83539
[144]	train-auc:0.87444	eval-auc:0.83570
Accuracy: 78.72 %, ROC AUC: 0.84
[0]	train-auc:0.77446	eval-auc:0.77138
[10]	train-auc:0.83927	eval-auc:0.82684
[20]	train-auc:0.84231	eval-auc:0.83126
[30]	train-auc:0.84624	eval-auc:0.83827
[40]	train-auc:0.85092	eval-auc:0.84119
[50]	train-auc:0.85454	eval-auc:0.84308
[60]	train-auc:0.85692	eval-auc:0.84550
[70]	train-auc:0.85914	eval-auc:0.84753
[80

In [7]:
ids = test['id'].values
pred = pred/5
# Saving predictions
date = datetime.now().strftime('%d%m%y_%H%M%S')
with open(f'output/{date}.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['id', 'Leave_Next_Month'])
    for i in range(len(ids)):
        writer.writerow([ids[i], float(pred[i])])
print(f'Prediction saved to output/{date}.csv')

Prediction saved to output/180522_163013.csv
