In [23]:
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from datetime import datetime
import xgboost as xgb
import pandas as pd
import numpy as np
import csv

In [24]:
features_to_encode = ['Gender', 'Married', 'Children', 'Phone_Service', 'Multiple_Lines', 'Internet_Service', 'Online_Security', 'Online_Backup', 'Device_Protection', 'Tech_Support', 'Streaming_TV', 'Streaming_Movies', 'Type_contract', 'Paperless_Billing', 'Payment_Method']
train = pd.read_csv('train.csv')
train = train.replace({'': 0, ' ': 0})
le = preprocessing.LabelEncoder()
for feature in features_to_encode:
    train[feature] = le.fit_transform(train[feature])
train.head()

Unnamed: 0,id,Leave_Next_Month,Gender,Pensioner,Married,Children,Months_with_company,Phone_Service,Multiple_Lines,Internet_Service,...,Online_Backup,Device_Protection,Tech_Support,Streaming_TV,Streaming_Movies,Type_contract,Paperless_Billing,Payment_Method,Monthly_Payment,Total_Payment
0,104,1,0,0,0,0,2,1,0,1,...,0,0,0,0,0,0,1,1,70.7,151.65
1,107,0,0,0,0,0,10,0,1,0,...,0,0,0,0,0,0,0,2,29.75,301.9
2,108,1,0,0,1,0,28,1,2,1,...,0,2,2,2,2,0,1,1,104.8,3046.05
3,109,0,1,0,0,1,62,1,0,0,...,2,0,0,0,0,1,0,0,56.15,3487.95
4,111,0,1,0,0,0,16,1,0,2,...,1,1,1,1,1,2,0,3,18.95,326.8


In [25]:
test = pd.read_csv('test.csv')
test = test.replace({'': 0, ' ': 0})
for feature in features_to_encode:
    test[feature] = le.fit_transform(test[feature])
train.head()

Unnamed: 0,id,Leave_Next_Month,Gender,Pensioner,Married,Children,Months_with_company,Phone_Service,Multiple_Lines,Internet_Service,...,Online_Backup,Device_Protection,Tech_Support,Streaming_TV,Streaming_Movies,Type_contract,Paperless_Billing,Payment_Method,Monthly_Payment,Total_Payment
0,104,1,0,0,0,0,2,1,0,1,...,0,0,0,0,0,0,1,1,70.7,151.65
1,107,0,0,0,0,0,10,0,1,0,...,0,0,0,0,0,0,0,2,29.75,301.9
2,108,1,0,0,1,0,28,1,2,1,...,0,2,2,2,2,0,1,1,104.8,3046.05
3,109,0,1,0,0,1,62,1,0,0,...,2,0,0,0,0,1,0,0,56.15,3487.95
4,111,0,1,0,0,0,16,1,0,2,...,1,1,1,1,1,2,0,3,18.95,326.8


In [90]:
features = list(test.columns)
features.remove('id')

params_xgb = {
    "objective": "binary:logistic",
    "eval_metric": 'auc',
    "eta": 0.05,
    "max_depth": 2,
    "subsample": 0.6,
    "colsample_bytree": 0.8,
}
num_boost_round = 200
early_stopping_rounds = 10
x_train, x_test, y_train, y_test = train_test_split(train[features], train['Leave_Next_Month'], test_size=0.2, random_state=1, shuffle=True, stratify=train['Leave_Next_Month'])
dtrain = xgb.DMatrix(x_train.values, y_train.values)
dvalid = xgb.DMatrix(x_test.values, y_test.values)
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
gbm = xgb.train(params_xgb, dtrain, num_boost_round, evals=watchlist, early_stopping_rounds=early_stopping_rounds, verbose_eval=10)
test_pred = gbm.predict(xgb.DMatrix(x_test.values), iteration_range=(0, gbm.best_iteration + 1))
accuracy = accuracy_score(y_test.values, np.round(test_pred))
auc = roc_auc_score(y_test.values, test_pred)
print('Accuracy: {:.2f} %, ROC AUC: {:.2f}'.format(100*accuracy, auc))
pred = gbm.predict(xgb.DMatrix(test[features].values), iteration_range=(0, gbm.best_iteration + 1))

[0]	train-auc:0.77403	eval-auc:0.71252
[10]	train-auc:0.83850	eval-auc:0.81533
[20]	train-auc:0.84405	eval-auc:0.81471
[30]	train-auc:0.85078	eval-auc:0.82083
[40]	train-auc:0.85576	eval-auc:0.82201
[50]	train-auc:0.86027	eval-auc:0.82520
[60]	train-auc:0.86291	eval-auc:0.82818
[70]	train-auc:0.86545	eval-auc:0.82897
[80]	train-auc:0.86740	eval-auc:0.83112
[90]	train-auc:0.86897	eval-auc:0.83071
[92]	train-auc:0.86936	eval-auc:0.83090
Accuracy: 77.73 %, ROC AUC: 0.83


In [94]:
ids = test['id'].values
# Saving predictions
date = datetime.now().strftime('%d%m%y_%H%M%S')
with open(f'output/{date}.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['id', 'Leave_Next_Month'])
    for i in range(len(ids)):
        writer.writerow([ids[i], float(pred[i])])
print(f'Prediction saved to output/{date}.csv')

Prediction saved to output/180522_170449.csv
