In [2]:
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, KFold, cross_val_score, StratifiedKFold, RandomizedSearchCV, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score
import pickle

In [2]:
data = pd.read_csv('final_telecom_churn_data.csv')

In [3]:
inputs = data.drop('Churn', axis=1)
outputs = data['Churn']

In [4]:
x_train, x_test, y_train, y_test = train_test_split(inputs, outputs, test_size=0.2, random_state=42)

In [5]:
kfolds = 5
startified_split = StratifiedKFold(n_splits=kfolds, shuffle=True, random_state=42)

In [6]:
models = [('XGB', XGBClassifier()), ('DT', DecisionTreeClassifier()), ('GB', GradientBoostingClassifier()),
         ('LGBM', LGBMClassifier()), ('RF', RandomForestClassifier()), ('SVM', SVC())]

In [7]:
for name, model in models:
    cv_result = cross_val_score(model, x_train,y_train, cv=startified_split, scoring='accuracy', n_jobs=-1)
    score = round(np.mean(cv_result), 4)
    print(f'{name} cross validation accuracy score is : {score}')

XGB cross validation accuracy score is : 0.7854
DT cross validation accuracy score is : 0.7299
GB cross validation accuracy score is : 0.7977
LGBM cross validation accuracy score is : 0.7888
RF cross validation accuracy score is : 0.7883
SVM cross validation accuracy score is : 0.7863


Gradient boosting, random forest and Light gradient boosting performing best. <br>
due to system limitations we will hypertune only these models.

In [8]:
selected_model = [('GB', GradientBoostingClassifier()), ('LGBM', LGBMClassifier()), ('RF', RandomForestClassifier())]

In [9]:
hypertuning_params = {
    'GB': {
        'learning_rate': [0.05],
        'n_estimators' : [10, 50, 100, 200, 500],
        'max_depth' : [3, 5, 10],
        'subsample': [0.7, 0.8, 1]
    },
    'LGBM': {
        'learning_rate': [0.05],
        'n_estimators' : [10, 50, 100, 200, 500],
        'max_depth' : [3, 5, 10],
        'subsample': [0.7, 0.8, 1],
        'max_bin ': [200, 250, 300]
    },
    'RF': {
        'max_depth' : [3, 5, 10],
        'max_leaf_nodes': [10, 25, 30, None],
        'min_samples_leaf': [1, 10, 50, 100],
        'n_estimators' : [10, 50, 100, 200, 500],
        'bootstrap' : [True, False],
    }
}

In [11]:
for name, model in selected_model:
    randomize_result = RandomizedSearchCV(model, hypertuning_params[name], cv=startified_split, verbose=1, n_jobs=-1)
    result = randomize_result.fit(x_train, y_train)
    print(name)
    print('Best Parameters',result.best_params_)
    print('Best Score',result.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   57.6s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  1.1min finished


GB
Best Parameters {'subsample': 0.8, 'n_estimators': 200, 'max_depth': 3, 'learning_rate': 0.05}
Best Score 0.8010296280068904
Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    9.9s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   11.4s finished


LGBM
Best Parameters {'subsample': 1, 'n_estimators': 100, 'max_depth': 5, 'max_bin ': 250, 'learning_rate': 0.05}
Best Score 0.7990788036583079
Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   20.0s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   24.3s finished


RF
Best Parameters {'n_estimators': 50, 'min_samples_leaf': 10, 'max_leaf_nodes': None, 'max_depth': 10, 'bootstrap': True}
Best Score 0.8010302584235486


### Let's stack all three models.

In [12]:
gb = GradientBoostingClassifier(subsample=0.8, n_estimators=200, max_depth=3, learning_rate=0.05)
lgbm = LGBMClassifier(subsample=0.8, n_estimators=100, max_depth=5, learning_rate=0.05)
rf = RandomForestClassifier(n_estimators=50, min_samples_leaf=10, max_leaf_nodes=None, max_depth=10, bootstrap=True)

In [14]:
gb.fit(x_train, y_train)
lgbm.fit(x_train, y_train)
rf.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=10, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=10, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=50,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [16]:
pickle.dump(gb, open('gradient_boosting_model.sav', 'wb'))
pickle.dump(lgbm, open('light_gradient_boosting.sav', 'wb'))
pickle.dump(rf, open('random_forest.sav', 'wb'))

In [34]:
def most_frequent(List): 
    return max(set(List), key = List.count)
def predict(x):
    gb_re = np.round(gb.predict(x))
    lgbm_re = np.round(lgbm.predict(x))
    rf_re = np.round(rf.predict(x))
    final_result = []
    for i in range(len(rf_re)):
        result = most_frequent([gb_re[i], lgbm_re[i], rf_re[i]])
        final_result.append(result)
    return np.array(final_result)

In [35]:
predictions = predict(x_test)

In [37]:
accuracy_score(y_test, predictions)

0.8126330731014905

#### By stacking models accuracy increased by one percent.
We would need more data to increase accuracy.