### Importing Libraries

In [49]:
import pandas as pd # data analysis
import numpy as np # linear algebra

import matplotlib.pyplot as plt # data_viz
import seaborn as sns # data_viz
import scipy.stats as stats

### Config
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
pd.set_option('display.width', 500)
pd.set_option('display.float_format', lambda x: '%.3f' % x)



### Preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold,StratifiedKFold,cross_val_score,cross_validate
from sklearn.preprocessing import OneHotEncoder,LabelEncoder,OrdinalEncoder

### Feature Selection
### Filtering
from sklearn.feature_selection import chi2,f_classif,f_regression,mutual_info_classif,mutual_info_regression,RFE,SelectFromModel,SelectKBest,SelectPercentile
from scipy.stats import chi2_contingency
### Wrapper 

from sklearn.feature_selection import RFE




### pipeline
from sklearn.pipeline import Pipeline

## Feature Selection
from sklearn.feature_selection import chi2,f_regression,f_classif,mutual_info_classif,mutual_info_regression,RFECV
from sklearn.feature_selection import SelectKBest,SelectPercentile,SelectFromModel


### Machine Learning Model

### Tree Based Algorithms

from sklearn.tree import DecisionTreeRegressor,DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier,VotingClassifier,VotingRegressor
from xgboost import XGBClassifier,XGBRegressor
from lightgbm import LGBMClassifier,LGBMRegressor

### Linear Algoritms


from sklearn.linear_model import LinearRegression,LogisticRegression,ElasticNet
from sklearn.svm import SVC,SVR

#### Voting Classifier



### Metrics ###
from sklearn.metrics import classification_report,accuracy_score,roc_auc_score,confusion_matrix,recall_score,precision_score,f1_score,\
r2_score,mean_absolute_error,mean_squared_error

#### Model Explain


## Others
## Warning
import warnings
warnings.filterwarnings('ignore')

### Saving Model
import pickle

### Reading Data

In [50]:
df = pd.read_csv('datasets\selected_features.csv')

In [51]:
df.head()

Unnamed: 0,CONTRACT,TENURE,TECHSUPPORT,INTERNETSERVICE,ONLINESECURITY,DEVICEPROTECTION,MONTHLYCHARGES,PAYMENTMETHOD,STREAMINGMOVIES,ONLINEBACKUP,CHURN
0,Month-to-month,1,No,DSL,No,No,29.85,Electronic check,No,Yes,0
1,One year,34,No,DSL,Yes,Yes,56.95,Mailed check,No,No,0
2,Month-to-month,2,No,DSL,Yes,No,53.85,Mailed check,No,Yes,1
3,One year,45,Yes,DSL,Yes,Yes,42.3,Bank transfer (automatic),No,No,0
4,Month-to-month,2,No,Fiber optic,No,No,70.7,Electronic check,No,No,1


In [52]:
df['ONLINESECURITY']= df.ONLINESECURITY.replace({'No internet service':'No_internet'})
df['DEVICEPROTECTION'] = df.ONLINESECURITY.replace({'No internet service':'No_internet'})
df['STREAMINGMOVIES'] = df['STREAMINGMOVIES'].replace({'No internet service':'No_internet'})
df['ONLINEBACKUP'] = df['ONLINEBACKUP'].replace({'No internet service':'No_internet'})
df['TECHSUPPORT'] =df['TECHSUPPORT'].replace({'No internet service':'No_internet'})

In [55]:
X = df.copy()
y = X.pop('CHURN')
X = pd.get_dummies(X,drop_first=True)

In [56]:
X.head()

Unnamed: 0,TENURE,MONTHLYCHARGES,CONTRACT_One year,CONTRACT_Two year,TECHSUPPORT_No_internet,TECHSUPPORT_Yes,INTERNETSERVICE_Fiber optic,INTERNETSERVICE_No,ONLINESECURITY_No_internet,ONLINESECURITY_Yes,DEVICEPROTECTION_No_internet,DEVICEPROTECTION_Yes,PAYMENTMETHOD_Credit card (automatic),PAYMENTMETHOD_Electronic check,PAYMENTMETHOD_Mailed check,STREAMINGMOVIES_No_internet,STREAMINGMOVIES_Yes,ONLINEBACKUP_No_internet,ONLINEBACKUP_Yes
0,1,29.85,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
1,34,56.95,1,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0
2,2,53.85,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,1
3,45,42.3,1,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0
4,2,70.7,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0


### Hyperparameter Tuning

In [59]:
lr_params = {'solver' : ['lbfgs'],
             'penalty' : ['l2'],
             'C' : [100, 10, 1.0, 0.1, 0.01]}

cart_params = {'max_depth': range(1, 20),
               "min_samples_split": range(2, 30)}

rf_params = {"max_depth": [5, 8, 15, None],'n_estimators': [100,200,300]}

xgboost_params = {"learning_rate": [0.1, 0.01],
                  "max_depth": [5, 8, 12, 20],
                  "n_estimators": [100, 200],
                  "colsample_bytree": [0.5, 0.8, 1]}

lightgbm_params = {"learning_rate": [0.01, 0.1],
                   "n_estimators": [300, 500, 1500],
                   "colsample_bytree": [0.5, 0.7, 1]}


gbm_params =      {"learning_rate": [0.01, 0.1],
                   "n_estimators": [300, 500, 1500],}


classifiers = [('LR', LogisticRegression(), lr_params),
               ("CART", DecisionTreeClassifier(random_state=1), cart_params),
               ("RF", RandomForestClassifier(random_state=1), rf_params),
               ("GBM", GradientBoostingClassifier(),gbm_params),
               ('XGBoost', XGBClassifier(random_state=1,eval_metric='auc'), xgboost_params),
               ('LightGBM', LGBMClassifier(random_state=1), lightgbm_params)]


best_models = {}


for name, classifier, params in classifiers:
    print(f"########## {name} ##########")
    cv_results = cross_validate(classifier, X, y, cv=10, scoring=["roc_auc"])
    print(f"AUC (Before): {round(cv_results['test_roc_auc'].mean(),4)}")

    gs_best = GridSearchCV(classifier, params, cv=10, n_jobs=-1, verbose=False).fit(X, y)
    final_model = classifier.set_params(**gs_best.best_params_)

    cv_results = cross_validate(final_model, X, y, cv=10, scoring=["roc_auc"])
    print(f"AUC (After): {round(cv_results['test_roc_auc'].mean(), 4)}")
    print(f"{name} best params: {gs_best.best_params_}", end="\n\n")

    best_models[name] = final_model

########## LR ##########
AUC (Before): 0.8395
AUC (After): 0.8396
LR best params: {'C': 100, 'penalty': 'l2', 'solver': 'lbfgs'}

########## CART ##########
AUC (Before): 0.6554
AUC (After): 0.8284
CART best params: {'max_depth': 5, 'min_samples_split': 8}

########## RF ##########
AUC (Before): 0.7997
AUC (After): 0.8429
RF best params: {'max_depth': 8, 'n_estimators': 100}

########## GBM ##########
AUC (Before): 0.8447
AUC (After): 0.8447
GBM best params: {'learning_rate': 0.01, 'n_estimators': 300}

########## XGBoost ##########
AUC (Before): 0.8243
AUC (After): 0.843
XGBoost best params: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}

########## LightGBM ##########
AUC (Before): 0.8351
AUC (After): 0.8453
LightGBM best params: {'colsample_bytree': 0.7, 'learning_rate': 0.01, 'n_estimators': 300}



#### Ensemble Approach ( Voting Classifier )

In [60]:
voting_clf = VotingClassifier(estimators=[('LightGBM',best_models['LightGBM']),
                                          ('XGBoost',best_models['XGBoost']),
                                          ('LogisticRegression',best_models['LR']),
                                          ('GBM',best_models['GBM'])],
                                           voting ='soft')


In [61]:
voting_clf.fit(X,y)

VotingClassifier(estimators=[('LightGBM',
                              LGBMClassifier(colsample_bytree=0.7,
                                             learning_rate=0.01,
                                             n_estimators=300,
                                             random_state=1)),
                             ('XGBoost',
                              XGBClassifier(base_score=None, booster=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=0.8,
                                            enable_categorical=False,
                                            eval_metric='auc', gamma=None,
                                            gpu_id=None, importance_type=None,
                                            interaction_c...
                                            monotone_constraints=None,
                           

In [62]:
cv_result = cross_validate(voting_clf,X,y,cv=10,scoring=["accuracy", "f1", "roc_auc"])

In [63]:
print('Accuracy: ',cv_result['test_accuracy'].mean())
print('F1_Score: ',cv_result['test_f1'].mean())
print('Roc_auc_Score: ',cv_result['test_roc_auc'].mean())

Accuracy:  0.8010500856717963
F1_Score:  0.5730062386950437
Roc_auc_Score:  0.8469842119582017


In [65]:
pickle.dump(voting_clf,open('model.pkl','wb'))

In [66]:
model = pickle.load(open('model.pkl','rb'))

In [67]:
model.predict(X.sample(2,random_state=1))

array([0, 1], dtype=int64)

In [69]:
os.getcwd()

'C:\\Users\\Ugur\\Desktop\\Hazir\\Customer_Churn_Analysis\\notebooks'

In [70]:
print('Churn Probability: %',int(100*round(model.predict_proba(X.sample(1,random_state=1))[0][1],2)))

Churn Probability: % 2


In [75]:
np.array([1,2,3,4]).reshape(-1,1)

array([[1],
       [2],
       [3],
       [4]])