In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score
import sklearn.impute
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

import env
import acquire
import prepare

pd.set_option('display.max_columns', None)

In [2]:
# NOTE(review): the saved output of this cell is
# "AttributeError: module 'acquire' has no attribute 'get_telco_data'", so the
# notebook does not survive Restart & Run All. Confirm the loader's actual name
# in acquire.py (not visible here) before relying on any later cell.
df = acquire.get_telco_data()

AttributeError: module 'acquire' has no attribute 'get_telco_data'

In [None]:
# Split data into train, validate and test.
# The returned X_* frames still include the `churn` column: the exploration cells
# below read churn from X_train, and it is only dropped right before modeling.
X_train, y_train, X_validate, y_validate, X_test, y_test = prepare.split_telco(df)

In [None]:
# (rows, columns) of the training split.
X_train.shape

In [None]:
# Preview the first rows of the training split (rich display rather than print).
X_train.head()

In [None]:
# Column dtypes of the training split.
X_train.dtypes

In [None]:
from scipy import stats

In [None]:
# Relationship between churn rate and tenure in years
fig, ax = plt.subplots(figsize=(15, 10))
sns.lineplot(data=X_train, x="tenure_years", y="churn", ax=ax)
ax.set_title("Churn Rate VS Tenure Year")
ax.set_xlabel("Tenure Year")
ax.set_ylabel("Churn Rate")
plt.show()

Takeaway from Churn Rate VS Tenure Year: Generally speaking, the longer the tenure, the lower the churn rate

In [None]:
# Churn rate by partner/dependents status, with bars split by the senior_citizen flag.
ax = sns.barplot(x="partner_and_dependents", y="churn", hue="senior_citizen", ci=None, data=X_train)
ax.set_title("partner_and_dependents VS churn rate")  # title previously read "churn rare"
ax.set_ylabel("Churn Rate")
plt.show()  # render the figure instead of echoing the last expression's repr

Takeaways: It seems that customers who are senior citizens always have a lower churn rate, regardless of whether they have a partner and dependents.

In [None]:
# Churn rate for each internet-service indicator column, one panel per column.
plt.figure(figsize=(16, 12))
plt.suptitle('Churn Rate VS Internet Service Type', fontsize=15)

# The panels fill positions 1-3 (the first row) of a 3x3 subplot grid.
for position, service_col in enumerate(["DSL", "Fiber optic", "None"], start=1):
    plt.subplot(3, 3, position)
    ax = sns.barplot(x=service_col, y="churn", ci=None, data=X_train)
    ax.set_ylabel("Churn Rate")
    plt.ylim(0, .5)

plt.show()


Takeaways: Compared with the other internet service types, fiber optic has the highest churn rate, while customers with no internet service have the lowest churn rate.

## Hypothesis Testing
### T_Test for Tenure vs Churn
$H_0$: there is no difference in tenure between customers who are still with telco and who have churned

$H_a$: there is a difference in tenure between customers who are still with telco and who have churned

In [None]:
# churn is categorical (two groups) and tenure_years is continuous, so the two
# groups' tenure is compared with a two-sample t-test.
alpha = 0.05
x1 = X_train.loc[X_train["churn"] == 1, "tenure_years"]
x2 = X_train.loc[X_train["churn"] == 0, "tenure_years"]

In [None]:
# Two-sample t-test on tenure_years: x1 = rows with churn == 1, x2 = rows with churn == 0.
# NOTE(review): ttest_ind defaults to equal_var=True (pooled variance); consider
# equal_var=False (Welch) if the two groups' variances differ — TODO confirm.
tstat, p = stats.ttest_ind(x1, x2)
tstat, p 

In [None]:
# Report the t-test decision from the computed p-value instead of asserting
# rejection unconditionally; `p` and `alpha` come from the t-test cells above.
if p < alpha:
    print(f'''
Because p ({p:.4f}) < alpha ({alpha}), reject the null hypothesis, which means there is a statistically significant
difference in tenure between customers who are still with telco and who have churned.
''')
else:
    print(f'''
Because p ({p:.4f}) >= alpha ({alpha}), fail to reject the null hypothesis, which means there is no statistically significant
difference in tenure between customers who are still with telco and who have churned.
''')

In [None]:
# Mean tenure (in years) for each churn group, drawn as a bar chart.
mean_tenure_by_churn = X_train.groupby('churn')['tenure_years'].mean()
ax = mean_tenure_by_churn.plot.bar(rot=0)
ax.set_xlabel('')
ax.set_ylabel('Average tenure in years')
ax.set_title('Is tenure different among customers who are churned or not?')

### Pearson's correlation coefficient test

$H_0$: There is no linear correlation between monthly charges and Fiber optic (one type of internet service)

$H_a$: There is a linear correlation between monthly charges and Fiber optic (one type of internet service)

In [None]:
# Significance level for this test: .01
alpha = .01
r, p = stats.pearsonr(X_train["monthly_charges"], X_train["Fiber optic"])

# Reject only when p is strictly below alpha.
decision = "Reject the null hypothesis" if p < alpha else "Fail to reject our null hypothesis"
print(decision)

print("R is", r)
print("p is", p)

Takeaways: There is a strong linear correlation between monthly charges and Fiber optic (one type of internet service).

### Chi squared test

Since the vast majority of variables in the dataset are categorical, the chi-squared test is a good fit for testing independence between variables.

In [None]:
# Chi-squared test of independence between each feature and churn.
# NOTE(review): columns with many distinct values (e.g. charge amounts) produce
# sparse contingency tables where the chi-squared approximation is unreliable —
# TODO confirm which columns are truly categorical.
alpha = 0.05
target = "churn"

for col in X_train.columns:
    # Testing churn against itself is meaningless (it is always "dependent"), so skip it.
    if col == target:
        continue

    observed = pd.crosstab(X_train[col], X_train[target])
    # Only the p-value is needed; the statistic, dof and expected table are discarded.
    _, p, _, _ = stats.chi2_contingency(observed)

    if p < alpha:
        # Reject the null hypothesis
        print("({} and churn) are  dependent of each other. (p = {})".format(col, p))
    else:
        # Failed to reject the null hypothesis
        print("({} and churn) are  independent of each other. (p = {})".format(col, p))

Takeaways: The vast majority of variables are dependent on churn; only gender and total_charges appear to be independent of churn.

In [None]:
# Cramer's V is a statistic that measures the strength of association between two nominal variables.
# It takes values from 0 to 1: values close to 0 indicate a weak association between the variables
# and values close to 1 indicate a strong association between the variables.
import itertools
def cramers_corrected_stat(confusion_matrix):
    """
    Calculates the bias-corrected Cramer's V statistic.

    Args:
        confusion_matrix: Contingency table of observed counts for the two
            variables (2-D array-like, e.g. ``pd.crosstab(a, b).values``).

    Returns:
        The corrected Cramer's V as a float. Degenerate tables (fewer than two
        observations, or a variable with a single category) carry no
        association information and yield 0.0 instead of NaN or a division
        error.
    """
    observed = np.asarray(confusion_matrix)
    n = observed.sum()
    r, k = observed.shape

    # With n <= 1 the (n - 1) correction terms are undefined; with a single
    # row or column there is nothing to associate.
    if n <= 1 or min(r, k) < 2:
        return 0.0

    # NOTE(review): for 2x2 tables chi2_contingency applies Yates' continuity
    # correction by default; left as-is so previously reported values do not change.
    chi2, _, _, _ = stats.chi2_contingency(observed)
    phi2 = chi2 / n

    # Bias correction: shrink phi^2 and the table dimensions by their (n - 1) terms,
    # clamping phi^2 at zero.
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)

    denominator = min(kcorr - 1, rcorr - 1)
    if denominator <= 0:
        # Very small samples can shrink a corrected dimension to 1 or below.
        return 0.0

    return float(np.sqrt(phi2corr / denominator))

cols = list(X_train.columns.values)
corrM = np.zeros((len(cols), len(cols)))

# Calculate Cramer's V for every unordered pair of variables and mirror it so
# corrM is symmetric. Positions come from enumerate rather than cols.index(),
# which is a linear lookup and returns the wrong slot for duplicate column names.
for (idx1, col1), (idx2, col2) in itertools.combinations(enumerate(cols), 2):
    dfObserved = pd.crosstab(X_train[col1], X_train[col2])
    corrM[idx1, idx2] = cramers_corrected_stat(dfObserved.values)
    corrM[idx2, idx1] = corrM[idx1, idx2]

corr = pd.DataFrame(corrM, index=cols, columns=cols)

# Mask the upper triangle (diagonal included) so only the lower triangle is drawn.
# The np.bool alias was removed in NumPy 1.24; the builtin bool is the supported dtype.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

cmap = sns.cubehelix_palette(light=1, as_cmap=True)

# Draw the heatmap with the mask
fig = plt.figure(figsize=(20, 20))

sns.heatmap(corr, mask=mask, cmap=cmap, square=True, linewidths=.5, cbar_kws={"shrink": .5}, annot=True)
plt.title('Cramer’s V calculated for the telco churn dataset')
plt.show()

Takeaway from the above heatmap: Except for gender and total_charges, all features have some association with churn, although none of these associations is strong. The strongest association overall is between Month_to_month (contract_type) and tenure, with a Cramer's V of 0.41.

## Modeling

## Logistic regression model

In [None]:
# Remove the target from the training features before modeling. errors='ignore'
# keeps the cell idempotent: re-running it once churn is gone is a no-op
# instead of a KeyError.
X_train = X_train.drop(columns=['churn'], errors='ignore')

In [None]:
# Remove the target from the validation features before modeling. errors='ignore'
# keeps the cell idempotent: re-running it once churn is gone is a no-op
# instead of a KeyError.
X_validate = X_validate.drop(columns=['churn'], errors='ignore')

In [None]:
# Remove the target from the test features before modeling. errors='ignore'
# keeps the cell idempotent: re-running it once churn is gone is a no-op
# instead of a KeyError.
X_test = X_test.drop(columns=['churn'], errors='ignore')

In [None]:
# Fit a default-parameter logistic regression on the training split,
# then predict churn for the validation split.
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_validate)

In [None]:
# Classification report for the current y_pred against the validation labels,
# returned as a dict and wrapped in a DataFrame for tabular display.
pd.DataFrame(classification_report(y_validate, y_pred, output_dict=True))

In [None]:
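# Disabled experiment (kept for reference, not executed): refit the logistic
# regression without `gender` and `total_charges`.
# X_train = X_train.drop(['gender','total_charges'],axis=1)
# X_validate = X_validate.drop(['gender','total_charges'],axis=1)
# model = LogisticRegression().fit(X_train, y_train)
# y_pred = model.predict(X_validate)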

In [None]:
# Predict on the test split with the logistic regression model.
# y_pred is rebound here: it now holds test predictions, not validation ones.
y_pred = model.predict(X_test)

In [None]:
# Plain-text classification report for the current y_pred against the test labels.
print(classification_report(y_test, y_pred))

## Decision tree model

In [None]:
# Shallow decision tree: entropy split criterion, depth capped at 3, fixed seed.
clf = DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=123)
clf

In [None]:
# Fit the decision tree on the training split.
clf.fit(X_train, y_train)

In [None]:
# Predict churn for the validation split with the decision tree (rebinds y_pred).
y_pred = clf.predict(X_validate)

In [None]:
# Classification report for the current y_pred against the validation labels,
# returned as a dict and wrapped in a DataFrame for tabular display.
pd.DataFrame(classification_report(y_validate, y_pred, output_dict=True))

## Random Forest Model

In [None]:
# Random forest with at least 5 samples per leaf, depth capped at 10, fixed seed.
rf = RandomForestClassifier(min_samples_leaf=5, max_depth=10,random_state=123)

In [None]:
# Fit the random forest on the training split.
rf.fit(X_train, y_train)

In [None]:
# Predict churn for the validation split with the random forest (rebinds y_pred).
y_pred = rf.predict(X_validate)

In [None]:
# Classification report for the current y_pred against the validation labels,
# returned as a dict and wrapped in a DataFrame for tabular display.
pd.DataFrame(classification_report(y_validate, y_pred, output_dict=True))

In [None]:
# Predict on the test split with the random forest.
# y_pred is rebound here: it now holds test predictions, not validation ones.
y_pred = rf.predict(X_test)

In [None]:
# Plain-text classification report for the current y_pred against the test labels.
print(classification_report(y_test, y_pred))

## K-Nearest Neighbors Model

In [None]:
# k-nearest-neighbours classifier: 3 neighbours, each with equal (uniform) vote weight.
knn = KNeighborsClassifier(n_neighbors=3, weights='uniform')

In [None]:
# Fit the KNN classifier on the training split.
knn.fit(X_train, y_train)

In [None]:
# Predict churn for the validation split with the KNN classifier (rebinds y_pred).
y_pred = knn.predict(X_validate)

In [None]:
# Classification report for the current y_pred against the validation labels,
# returned as a dict and wrapped in a DataFrame for tabular display.
pd.DataFrame(classification_report(y_validate, y_pred, output_dict=True))