In [54]:
import pandas as pd
from xgboost import XGBClassifier,DMatrix,train
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split,RepeatedStratifiedKFold,cross_val_score
from numpy import mean,std
from datetime import datetime
from sklearn.metrics import accuracy_score,classification_report,precision_score,recall_score

## Loading Data

In [1]:
# data = pd.read_csv('../Datasets/telecommunications_churn.csv',sep=';')
# data.head()

In [60]:
# Read the oversampled churn data and discard the stray 'Unnamed: 0' column
# that comes in with the CSV; preview the first rows.
data = pd.read_csv('../Datasets/oversampled_data.csv').drop(columns='Unnamed: 0')
data.head()

Unnamed: 0,account_length,voice_mail_plan,voice_mail_messages,day_mins,evening_mins,night_mins,international_mins,customer_service_calls,international_plan,day_calls,day_charge,evening_calls,evening_charge,night_calls,night_charge,international_calls,international_charge,total_charge,churn
0,128,1,25,265.1,197.4,244.7,10.0,1,0,110,45.07,99,16.78,91,11.01,3,2.7,75.56,0
1,107,1,26,161.6,195.5,254.4,13.7,1,0,123,27.47,103,16.62,103,11.45,3,3.7,59.24,0
2,137,0,0,243.4,121.2,162.6,12.2,0,0,114,41.38,110,10.3,104,7.32,5,3.29,62.29,0
3,84,0,0,299.4,61.9,196.9,6.6,2,1,71,50.9,88,5.26,89,8.86,7,1.78,66.8,0
4,75,0,0,166.7,148.3,186.9,10.1,3,1,113,28.34,122,12.61,121,8.41,3,2.73,52.09,0


### Scaling data and separating X and y

In [61]:
# Standardise every feature column with StandardScaler; the 'churn' target is
# split off unscaled.
std_sclr = StandardScaler()
X = data.drop(labels='churn',axis=1)
cols = X.columns
# fit_transform returns a bare array, so rebuild the frame with the original
# column labels AND the original row index. Without `index=` the frame would get
# a fresh 0..n-1 index and could silently fall out of row alignment with `y`
# whenever `data` does not carry a default index.
X = pd.DataFrame(std_sclr.fit_transform(X), columns=cols, index=X.index)

# NOTE(review): the scaler is fitted on all rows before the train/test split, so
# test-set statistics are included in the fit — confirm this is acceptable.

# Target kept as a single-column DataFrame (shape (n, 1)), as downstream cells expect.
y = data[['churn']]
X.shape,y.shape

((5700, 18), (5700, 1))

### Train Test Split

In [62]:
# Hold out 15% of the rows as a test set; the split is shuffled, seeded for
# reproducibility, and stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    shuffle=True,
    random_state=10,
    stratify=y,
)
# Shapes of the four resulting pieces, in the order they were unpacked.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((4845, 18), (855, 18), (4845, 1), (855, 1))

### Model cross validation using RepeatedStratifiedKFold

In [63]:

# Repeated stratified k-fold: 10 folds, repeated 3 times, with a fixed seed.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=10)

# Accuracy of a default-parameter XGBClassifier on every fold, using all CPU cores.
# NOTE(review): this scores the full X/y (not only the training split) and uses a
# default XGBClassifier, which is not the Booster configuration trained further
# down — the numbers are therefore not directly comparable.
cv_scores = cross_val_score(
    XGBClassifier(),
    X,
    y,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1,
)
print('accuarcy %.3f : ' % cv_scores.mean())
print('with a std of %.3f : ' % cv_scores.std())

accuarcy 0.998 : 
with a std of 0.002 : 


### Model Building

In [64]:
# XGBoost classifier (scikit-learn style wrapper) created with default parameters.
# NOTE(review): the training cell below rebinds `xgb_model` to the object returned
# by xgboost's `train`, so this instance is never fitted in the visible cells —
# confirm whether it is still needed.
xgb_model = XGBClassifier()

### Model Training

In [65]:
## Train an XGBoost booster through the native API on a DMatrix.
## The timed section covers both the DMatrix construction and the training call.

start_time = datetime.now()
train_matix = DMatrix(X_train, label=y_train)

parameters = dict(
    max_depth=4,
    objective='binary:logistic',
    eval_metric='auc',
    learning_rate=.05,
)

# num_boost_round is spelled out at xgboost's default of 10 so the (small) number
# of boosting rounds is visible to the reader.
# Note: `xgb_model` is rebound here from the XGBClassifier created earlier to the
# object returned by `train`.
xgb_model = train(params=parameters, dtrain=train_matix, num_boost_round=10)
end_time = datetime.now()

In [66]:
# Elapsed wall-clock time between the two datetime.now() stamps taken around
# the training cell (DMatrix construction + training).
execution_time = end_time - start_time
print(execution_time)

0:00:00.029982


### Model testing using train data

In [67]:
## Score the training rows with the trained booster (model output per row).
# NOTE(review): the variable is called `test_matrix` but wraps X_train here; the
# name is kept unchanged because a later cell rebinds it to the real test data.
test_matrix = DMatrix(X_train)
y_train_pred = xgb_model.predict(data=test_matrix)
y_train_pred.shape

(4845,)

#### converting probabilities to labels

In [68]:
# Convert predicted probabilities to hard labels with a 0.5 cut-off:
# strictly greater than 0.5 -> 1, anything else -> 0.
# A single vectorised comparison replaces the per-element Python loop (and its
# redundant `elif`). Assigning through [:] writes into the existing array, so the
# float dtype is kept and labels still print as 1. / 0.
# Edge case: a NaN probability, which the loop left untouched, now becomes 0.
y_train_pred[:] = y_train_pred > 0.5

print(y_train_pred)

[1. 1. 0. ... 0. 0. 0.]


### Model evaluation using train data

In [69]:
# Training-set accuracy expressed as a percentage with two decimals.
round(accuracy_score(y_true=y_train, y_pred=y_train_pred) * 100, 2)

90.2

### Model Testing using test data

In [70]:
## Score the held-out test rows with the trained booster (model output per row).
test_matrix = DMatrix(X_test)
y_test_pred = xgb_model.predict(data=test_matrix)
y_test_pred.shape

(855,)

### converting predictions into labels

In [71]:
# Convert predicted probabilities to hard labels with a 0.5 cut-off:
# strictly greater than 0.5 -> 1, anything else -> 0.
# A single vectorised comparison replaces the per-element Python loop (and its
# redundant `elif`). Assigning through [:] writes into the existing array, so the
# float dtype is kept and labels still print as 1. / 0.
# Edge case: a NaN probability, which the loop left untouched, now becomes 0.
y_test_pred[:] = y_test_pred > 0.5

print(y_test_pred)

[0. 1. 0. 0. 1. 0. 0. 1. 1. 1. 0. 1. 1. 1. 0. 1. 1. 1. 1. 0. 0. 1. 0. 0.
 1. 0. 0. 1. 1. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0.
 0. 1. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 1. 0. 0. 0. 0.
 0. 1. 1. 0. 0. 1. 1. 1. 0. 0. 1. 1. 0. 0. 1. 1. 0. 1. 1. 1. 1. 1. 0. 0.
 1. 0. 1. 0. 0. 1. 1. 1. 1. 0. 1. 0. 0. 1. 0. 0. 1. 1. 0. 1. 1. 1. 0. 1.
 1. 0. 1. 0. 1. 1. 0. 0. 1. 1. 1. 0. 1. 1. 0. 0. 0. 1. 1. 1. 0. 1. 0. 1.
 1. 0. 1. 0. 0. 0. 1. 1. 0. 1. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 1. 1. 0.
 0. 1. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 0. 1. 0. 0.
 0. 1. 0. 0. 1. 0. 0. 1. 0. 0. 1. 0. 1. 1. 0. 0. 1. 1. 0. 1. 0. 0. 0. 0.
 0. 1. 0. 1. 1. 0. 0. 0. 0. 1. 0. 1. 0. 1. 0. 1. 1. 0. 1. 0. 1. 1. 0. 0.
 0. 1. 1. 1. 0. 1. 0. 0. 1. 0. 1. 1. 0. 0. 1. 0. 1. 0. 1. 1. 0. 1. 0. 0.
 0. 1. 0. 1. 1. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0. 1. 0. 1. 0. 1. 1. 0. 1. 1.
 0. 0. 1. 1. 0. 1. 0. 1. 0. 1. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 1. 0. 1.
 0. 1. 0. 1. 1. 0. 1. 1. 1. 0. 1. 1. 0. 1. 1. 0. 0.

### Model evaluation using test data

In [72]:
# Test-set accuracy expressed as a percentage with two decimals.
round(accuracy_score(y_true=y_test, y_pred=y_test_pred) * 100, 2)

90.76

In [73]:
# Per-class precision / recall / F1 on the test set.
# The original passed '/n' (forward slash), which was printed literally instead of
# producing a line break; use a real newline escape and concatenate it so the
# report's first line is not shifted by print's separator space.
print('\n' + classification_report(y_test,y_test_pred))

/n               precision    recall  f1-score   support

           0       0.88      0.94      0.91       427
           1       0.94      0.88      0.90       428

    accuracy                           0.91       855
   macro avg       0.91      0.91      0.91       855
weighted avg       0.91      0.91      0.91       855



In [74]:
# Test-set precision as a percentage with two decimals.
round(precision_score(y_true=y_test, y_pred=y_test_pred) * 100, 2)

93.52

In [75]:
# Test-set recall as a percentage with two decimals.
round(recall_score(y_true=y_test, y_pred=y_test_pred) * 100, 2)

87.62