In [None]:
# TELCO CUSTOMER CHURN BUSINESS CASE

# Exploratory analysis w/ pandas & seaborn

import pandas as pd

# NOTE(review): `telco` is not created anywhere in the cells shown here;
# presumably it is loaded into a DataFrame before this cell runs -- confirm and
# add the load step so the notebook survives Restart Kernel -> Run All.

# Class balance: number of churners vs. non-churners
print(telco['Churn'].value_counts())

# Per-class mean and standard deviation, restricted to the numeric columns:
# non-numeric columns cannot be averaged and make current pandas raise a
# TypeError instead of being silently dropped.
telco_numeric = telco.select_dtypes(include='number')
print(telco_numeric.groupby(telco['Churn']).mean())
print(telco_numeric.groupby(telco['Churn']).std())

# Count the number of churners and non-churners by State
print(telco.groupby('State')['Churn'].value_counts())

# Distribution visualization
import matplotlib.pyplot as plt
import seaborn as sns #seaborn is built on top of matplotlib

# Create the distribution plot
# NOTE(review): sns.distplot is deprecated in recent seaborn releases;
# sns.histplot(..., kde=True) is the documented replacement once the
# installed seaborn version is confirmed to provide it.
sns.distplot(telco['Account_length']) #check the distribution ie normal
plt.show()

# Create the box plot of customer service calls per churn class.
sns.boxplot(x = 'Churn',
            y = 'CustServ_Calls',
            data = telco,
            showfliers = False, # hide the outliers (portable spelling of sym="")
            hue = 'Intl_Plan') # hue: 3rd variable, to visualize whether or not having
# an international plan affects the number of customer service calls or churn.

# Display the plot
plt.show()

In [None]:
# Preprocessing (feature scaling / standardization follows further down)
#
# Standardization centers each distribution around its mean and expresses
# every point as the number of standard deviations it lies from that mean.


# ENCODING BINARY FEATURES
telco.info()  # column/dtype summary: the object columns are the ones to encode

# Shared yes/no encoding: 'yes' becomes 1 and 'no' becomes 0
yes_no_codes = {'yes': 1, 'no': 0}

# Encode 'Vmail_Plan'
telco['Vmail_Plan'] = telco['Vmail_Plan'].replace(yes_no_codes)

# Encode 'Churn'
telco['Churn'] = telco['Churn'].replace(yes_no_codes)

# Show the first rows of both columns to verify the encoding
print(telco['Vmail_Plan'].head())
print(telco['Churn'].head())


# DUMMIES
# One-hot encode 'State': one indicator column per distinct value
telco_state = pd.get_dummies(telco['State'])

# Show the first rows of the indicator frame
print(telco_state.head())


# FEATURE SCALING
# Import StandardScaler
from sklearn.preprocessing import StandardScaler

# Standardize only the two columns the result frame is labelled with.
# Fitting the scaler on the whole of `telco` would feed it every column
# (including non-numeric ones) and produce an array whose width does not match
# the two column names assigned below.
scaled_columns = ["Intl_Calls", "Night_Mins"]
telco_scaled = StandardScaler().fit_transform(telco[scaled_columns])

# Add column names (and the original row index) back for readability
telco_scaled_df = pd.DataFrame(telco_scaled, columns=scaled_columns, index=telco.index)
print(telco_scaled_df.describe())

# FEATURES ENGINEERING
# Remove the columns that are not wanted as features
telco = telco.drop(columns=['Area_Code', 'Phone'])

# Derived column: night minutes divided by the number of night calls
telco['Avg_Night_Calls'] = telco['Night_Mins'].div(telco['Night_Calls'])
print(telco['Avg_Night_Calls'])

In [None]:
# Dirty models
#
# Three quick, untuned classifiers, each trained on the same feature columns
# and asked for a prediction on one new customer.
# NOTE(review): `features` and `new_customer` are not defined in the cells shown
# here; presumably `features` is a list of column names and `new_customer` a
# frame with those columns -- confirm and define them above.

# Logistic Regression
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression() # Instantiate the classifier
clf.fit(telco[features], telco['Churn']) # Train the classifier
print(clf.predict(new_customer)) #print the prediction

# Decision Tree Classifier
# DecisionTreeClassifier lives in sklearn.tree, not sklearn.linear_model
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier() # Instantiate the classifier
clf.fit(telco[features], telco['Churn']) # Train the classifier
print(clf.predict(new_customer)) #print the prediction

# Support Vector Classifier
from sklearn.svm import SVC

svc = SVC() #instantiate the model
svc.fit(telco[features], telco['Churn']) #train the model
print(svc.predict(new_customer)) #print the prediction

In [None]:
# RandomForestClassifier

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier 
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_curve 
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score 

X = telco.drop('Churn', axis=1) # drop the target variable
y = telco['Churn'] # Create target variable

# Create training and testing sets (70% / 30%). The split and the forest are
# both seeded so that re-running the notebook reproduces the same numbers.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

clf = RandomForestClassifier(random_state = 42) #instantiate the model
clf.fit(X_train,y_train) #train the model

print(clf.score(X_test, y_test)) # Compute accuracy on the held-out set

# <script.py> output (recorded from an earlier, unseeded run; a seeded run may
# differ slightly):
#  0.934

# Evaluating Model Performance
#
# Accuracy alone can mislead when the classes are unbalanced, so also report:
#   precision = true positives / (true positives + false positives)
#   recall    = true positives / (true positives + false negatives)
#               (synonymous with sensitivity)

y_pred = clf.predict(X_test)

print(confusion_matrix(y_test, y_pred))
# <script.py> output:
#[[842  13]
# [ 53  92]]

# Precision first, then recall
for metric in (precision_score, recall_score):
    print(metric(y_test, y_pred))
# <script.py> output:
# 0.9176470588235294
# 0.7513761467889908
# NOTE(review): these recorded values do not follow from the matrix above
# (92 / (92 + 13) ~ 0.876 and 92 / (92 + 53) ~ 0.634), so they were presumably
# captured from a different run -- re-record after a clean run.

# ROC curve from the predicted probability of the positive class (column 1)
y_pred_prob = clf.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)

plt.plot(fpr, tpr)
plt.plot([0, 1], [0, 1], "k--")  # dashed black diagonal for reference
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.show()

# Area Under the Curve (AUC)
print(roc_auc_score(y_test, y_pred_prob))
# <script.py> output:
#    0.8938011695906432

# F1 score
print(f1_score(y_test, y_pred))
# <script.py> output:
#    0.723404255319149

In [None]:
# Model tuning

# Hyper parameters
from scipy.stats import randint  # integer distribution sampled by RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Create the hyperparameter grid.
# 'auto' is left out: for classifiers it was an alias of 'sqrt', and current
# scikit-learn releases reject it as an invalid value.
param_grid = {'max_features': ['sqrt', 'log2']}

grid_search = GridSearchCV(clf, param_grid) # Call GridSearchCV
grid_search.fit(X, y) # Fit the model

print(grid_search.best_params_) # Print the optimal parameters
#{'max_features': 'log2'}

# Create the hyperparameter distributions for the randomized search.
# randint(1, 11) draws integers from 1 to 10 inclusive.
param_dist = {"max_depth": [3, None],
              "max_features": randint(1, 11),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# Seeded so the sampled candidates are the same on every run
random_search = RandomizedSearchCV(clf, param_dist, random_state=42)
random_search.fit(X,y)

print(random_search.best_params_) 
#<script.py> output (recorded from an earlier, unseeded run):
#    {'bootstrap': True, 'criterion': 'entropy', 'max_depth': None, 'max_features': 10}

# Feature importances

# > Which features are important to predict churn? Which ones can be removed from the model?

import numpy as np  # needed for argsort below; not imported elsewhere in the cells shown

importances = clf.feature_importances_ # Importances from the fitted classifier `clf`

# First pass: unlabeled bars, in column order
plt.barh(range(X.shape[1]), importances) #horizontal bar plot
plt.show()

# Second pass: sort ascending and label each bar with its column name
sorted_index = np.argsort(importances) # Sort importances
labels = X.columns[sorted_index]

plt.clf() # Clear current plot
plt.barh(range(X.shape[1]), importances[sorted_index], tick_label=labels) # new plot
plt.show()

In [None]:
# Adding new features

# 6 new features have been added to the telco DataFrame:

# Region_Code
# Cost_Call
# Total_Charge
# Total_Minutes
# Total_Calls
# Min_Call
# NOTE(review): the code that adds these columns is not in the cells shown
# here -- confirm it runs before this cell.

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

# Rebuild the feature matrix and target from the current `telco`; reusing the
# X / y created earlier would leave the newly added columns out of the model.
X = telco.drop('Churn', axis=1)
y = telco['Churn']

# Seeded split and model so the comparison with the earlier run is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
clf = RandomForestClassifier(random_state=42) #instantiate the model
clf.fit(X_train,y_train) #train the model

print(clf.score(X_test, y_test)) # Compute accuracy
# <script.py> output (recorded from an earlier, unseeded run):
#  0.954

y_pred = clf.predict(X_test)

print(f1_score(y_test, y_pred))
# <script.py> output (recorded from an earlier, unseeded run):
#  0.8130081300813008