# Package and Data Importing

In [1]:
import pandas as pd
import numpy as np
import category_encoders as ce
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from category_encoders import OneHotEncoder
import seaborn as sns
from xgboost import XGBClassifier

df = pd.read_csv('../assets/churn_ds.csv')
df.head()

ModuleNotFoundError: No module named 'category_encoders'

In [2]:
# Graphing features for exploratory analysis

import plotly.express as px

px.bar(df, x='Contract', y='tenure', color='Churn')

ModuleNotFoundError: No module named 'plotly'

# Data Cleaning and Train/Validate/Test Split

In [None]:
def fix_float(cell):
  try: 
    return float(cell)
  except: 
    print(cell)
    return np.NaN

### Feature / Target Split 
- By using this set of data our goal is to predict if a customer is likely to continue service with the provider. The **Target** will be if the customer has Churned - that means they have discontinued service. Churn is a term used in marketing to describe when a customer comes into the system and leaves rather quickly, yeild a low CLV. The Feature we will be using to predict this will be a combination of person charateristics: Age, Gender, Family. Service offers they currently have with the provider: Phone Service, Internet Service, Online Security, Streaming TV, etc. Also Subscription details: Contract, Billing, Payment Method, Monthly and Total Charges. 

In [None]:
# Cleaning Wrangle function 

def wrangle(X):
    
    X = X.copy()
    
    # fixing column to change to float
    X['TotalCharges'] = X['TotalCharges'].apply(fix_float)
                                                  
    # replacing Yes/No with True/False
    columns = ['Partner', 'Dependents', 'PhoneService', 'PaperlessBilling', 'Churn']                                              
    for col in columns:
        X[col] = X[col].apply(lambda cell: cell.lower() == 'yes')

    y = X['Churn']    
        
    X.drop(['customerID', 'Churn'], axis=1, inplace=True)
                                                  
    return X, y

X, y = wrangle(df)

In [None]:
df1.sort_values(by='Importances', ascending=False)

In [None]:
print(X.shape)
print(y.shape)

In [None]:
# Train Val split 
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=.30, stratify=y, random_state=42)

In [None]:
# Val test split 
X_val, X_test, y_val, y_test = train_test_split(X_val, y_val, test_size=.50, stratify=y_val, random_state=42)

In [None]:
# checking for CV sizes
print(X_train.shape)
print(y_train.shape)
print(X_val.shape)
print(y_val.shape)
print(X_test.shape)
print(y_test.shape)

In [None]:
X_train.head()

In [None]:
X_train.info()

# Baseline

In [None]:
# Baseline

y_train.value_counts(normalize=True)

### Baseline for Churn
- From the above baseline we can see our mean values for predicting the positive charatertic of Churn is 73.5%. Our Goal is to create a model in which we can predict to a higher level of certaintity if a customer will decide to terminate service. As well as what features are indicative of this.


# Fitting Different Model Types

### Initial Simple Model: Logistic Regression 

In [None]:
lin_model = make_pipeline(
    ce.OneHotEncoder(), 
    SimpleImputer(strategy='median'), 
    LogisticRegression()
)

lin_model.fit(X_train, y_train)

In [None]:
print('Training Accuracy: ', lin_model.score(X_train, y_train))
print('Validation Accuracy:', lin_model.score(X_val, y_val))


### Hyper Parameter Tuning for Logistic Regression Model

In [None]:
# pipeline for model
lin_model_tuned = make_pipeline(
    ce.OneHotEncoder(), 
    SimpleImputer(), 
    LogisticRegression()
)

# Params for grid search
params = {
    'logisticregression__penalty' : ['l1', 'l2'],
    'logisticregression__C' : np.logspace(-4, 4, 20), 
    'logisticregression__solver': ['lbfgs', 'liblinear'], 
    'logisticregression__max_iter': range(50,150, 25), 
}

# Grid Search object
gridcv = GridSearchCV(
    lin_model_tuned,
    param_grid = params, 
    n_jobs=-1, 
    cv=5, 
    scoring='accuracy', 
    verbose= True, 
    return_train_score= True,

)

# fit on data
best_gridcv = gridcv.fit(X_train, y_train)

# Print Best accuracy score
print(best_gridcv.score(X_val, y_val))

- After Tuning the model, it looks like the accuracy has improved just slightly. This leads us 

### Random Forest Model

In [None]:
# basic pipeline model
RF_model = make_pipeline(
    ce.OneHotEncoder(), 
    SimpleImputer(), 
    RandomForestClassifier()
)

RF_model.fit(X_train, y_train)
RF_model.score(X_val, y_val)

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score
y_pred_prob_rf = RF_model.predict_proba(X_val)[:, -1]

roc_auc_score(y_val, y_pred_prob_rf)

- Due to initual Validation score being very low compared to other models, Not going to further tune this model

### Tree Based Model + Hyper Param tuning

In [None]:
tree_model = make_pipeline(
    ce.OneHotEncoder(),
    XGBClassifier(n_estimators=100, random_state=42, n_jobs=6)
)

param_distributions= {
    'xgbclassifier__max_depth': range(4,6,1), 
}

search = RandomizedSearchCV(
    tree_model,
    param_distributions=param_distributions, 
    n_iter=10, 
    cv=5, 
    scoring='accuracy', 
    verbose= 5, 
    return_train_score= True,

)
search.fit(X_train, y_train)
tree_model_best = search.best_estimator_

In [None]:
tree_model_best.score(X_val, y_val)

- Sticking with Logistic Regression seems like the way to go

# Plotting Results

In [None]:
# Classification Report
from sklearn.metrics import classification_report

y_pred_lin = best_gridcv.predict(X_val)

report1 = classification_report(y_val, y_pred_lin)
print(report1)

In [None]:
# Classification Report
from sklearn.metrics import classification_report

y_pred_XG = tree_model_best.predict(X_val)

report2 = classification_report(y_val, y_pred_XG)
print(report2)

In [None]:
# Classification Report
from sklearn.metrics import classification_report

y_pred_RF = RF_model.predict(X_val)

report3 = classification_report(y_val, y_pred_RF)
print(report3)

### ROC - AUC Curves + Comparison

In [None]:
# Looking into ROC-AUC score for Linear Regression
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt

y_pred_prob = best_gridcv.predict_proba(X_val)[:, -1]

fpr, tpr, thresholds = roc_curve(y_val, y_pred_prob)

dfroc = pd.DataFrame({'False Positive Rate': fpr, 
                     'True Positive Rate': tpr,
                    'Threshold': np.round(thresholds, 2)})

plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
print('LR Model ROC-AUC Score:', roc_auc_score(y_val, y_pred_prob))

In [None]:
# Obtaining ROC + COmparing it to LR model
y_pred_prob_t = tree_model_best.predict_proba(X_val)[:, -1]

# Creating ROC-AUC Curve
fpr_t, tpr_t, thresholds_t = roc_curve(y_val, y_pred_prob_t)

#DF for ROC-AUC information 
dfroc_t = pd.DataFrame({'False Positive Rate': fpr, 
                     'True Positive Rate': tpr,
                    'Threshold': np.round(thresholds, 2)})


#Plotting both lines on the same graph
plt.plot(fpr, tpr, label='Logistic Regression')
plt.plot(fpr_t, tpr_t, label='Boosting Model')
plt.legend()
plt.title('ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')

In [None]:
# ROC_AUC for both models
print('LR Model ROC-AUC Score:', roc_auc_score(y_val, y_pred_prob))
print('XG Model ROC-AUC Score:', roc_auc_score(y_val, y_pred_prob_t))

### Permutation Dependence

In [None]:
from sklearn.inspection import permutation_importance

In [None]:
result = permutation_importance(best_gridcv, X_val, y_val, 
                                n_repeats=5, random_state=0)

In [None]:
df1 = pd.DataFrame({'Feature': X_val.columns,
                   'Importances': np.round(result['importances_mean'], 3),
                   'importances_std': result['importances_std']})

In [None]:
df1.sort_values(by='Importances', ascending=False)

In [None]:
px.bar(df1, x='Feature', y='Importances')

### PDP Plots : Islolate + Interact

In [None]:
# PDP looking directly at Tenure and its interaction with the target Churn

from pdpbox.pdp import pdp_isolate, pdp_plot
feature = 'tenure'

isolated = pdp_isolate(
    model=best_gridcv, 
    dataset=X_val, 
    model_features=X_val.columns, 
    feature=feature
)

pdp_plot(isolated, feature_name=feature, plot_lines=True);


In [None]:
# PDP on Tenure and Monthly Charges 
from pdpbox.pdp import pdp_interact, pdp_interact_plot

features = ['tenure', 'MonthlyCharges']

interaction = pdp_interact(
    model=best_gridcv, 
    dataset=X_val, 
    model_features=X_val.columns, 
    features=features
)

pdp_interact_plot(interaction, plot_type='grid', feature_names=features);

# Running Model on Test Data

In [None]:
best_gridcv.score(X_test, y_test)

In [None]:
y_pred_prob_test = best_gridcv.predict_proba(X_test)[:, -1]



roc_auc_score(y_test, y_pred_prob_test)


In [None]:
from joblib import dump

dump(best_gridcv, 'model.joblib', compress=True)

In [None]:
import joblib
import sklearn
import category_encoders as ce
import xgboost
print(f'joblib=={joblib.__version__}')
print(f'scikit-learn=={sklearn.__version__}')
print(f'category_encoders=={ce.__version__}')
print(f'xgboost=={xgboost.__version__}')
