In [None]:
%pylab inline
import pandas as pd
from gower import gower_matrix

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Load the churn data set and prepare it in a single chain:
#   * 'Phone' is dropped,
#   * 'Area Code' is stored as a string so it is treated as a category,
#   * the target 'Churn?' becomes 1 for "True." and 0 for anything else.
df = (
    pd.read_csv('churn.csv')
    .drop(columns=['Phone'])
    .assign(**{
        'Area Code': lambda frame: frame['Area Code'].astype(str),
        'Churn?': lambda frame: frame['Churn?'].eq('True.').astype('int64'),
    })
)
df.head()

In [None]:
# Summary statistics of df using pandas' default describe() settings.
df.describe()

In [None]:
# Share of rows in each 'Churn?' class (class balance of the target).
df['Churn?'].value_counts().div(len(df))

In [None]:
%%time 

# Select the clustering inputs. The original positional slice df.columns[:-2]
# changed meaning once the 'cluster' column was appended further down: re-running
# this cell then fed a different set of columns into the distance computation.
# Dropping 'cluster' first (if present) keeps the selection identical to a
# fresh-kernel run, however often the cell is executed.
# NOTE(review): the slice removes the last two columns -- presumably the target
# 'Churn?' plus the feature column right before it. Confirm that leaving that
# feature out of the clustering is intentional.
feature_frame = df.drop(columns=['cluster'], errors='ignore')
feature_frame = feature_frame[feature_frame.columns[:-2]]

# Pairwise Gower distances between rows of the selected columns.
distances = gower_matrix(feature_frame)
# Exponential kernel: converts distances (scaled by their standard deviation)
# into similarities; 0.2 sets how fast similarity decays with distance.
similarity = np.exp(-0.2 * distances / distances.std())

In [None]:
from sklearn.cluster import SpectralClustering

# Spectral clustering on the precomputed similarity matrix. n_clusters is
# written out explicitly (8 is scikit-learn's default).
clstr = SpectralClustering(n_clusters=8, affinity='precomputed', random_state=123456)
cluster_labels = clstr.fit_predict(similarity)
df['cluster'] = cluster_labels

# Cluster sizes, ordered by cluster label.
df['cluster'].value_counts().sort_index()

In [None]:
# Per-cluster summary tables: for each label in range(n_clusters) keep only the
# 'unique', 'top' and 'mean' rows of describe(include='all').
n_clusters = df['cluster'].nunique()
descriptions = [
    df.loc[df['cluster'] == label]
      .describe(include='all')
      .loc[['unique', 'top', 'mean']]
    for label in range(n_clusters)
]


In [None]:
def build_feature_box_plot(df, feature_name, ax):
    """Draw a per-cluster box plot of one feature on ``ax``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'cluster'`` column. Clusters are selected with
        ``range(n_clusters)``, so labels are expected to be ``0 .. k-1``.
    feature_name : str
        Column to plot.

        * ``'Area Code'``: each code is replaced by its position in order of
          first appearance, and every cluster contributes a single value (the
          mean of those positions).
        * any other object/string column: treated as a yes/no flag
          (``'yes'`` -> 1, everything else -> 0); every cluster contributes
          the mean of the flag.
        * otherwise: the raw values of each cluster are plotted.
    ax : matplotlib.axes.Axes
        Axes to draw on.

    A dashed red horizontal line marks the mean over all rows. Box positions
    1..k are labelled with the cluster ids 0..k-1.
    """
    n_clusters = len(df['cluster'].unique())
    cluster_ids = range(n_clusters)
    column = df[feature_name]

    if feature_name == 'Area Code':
        codes = {code: position for position, code in enumerate(column.unique().tolist())}
        encoded = column.map(codes)
    elif pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
        # Uses the pandas dtype helpers instead of the bare `dtype` name, which
        # was only in scope through the `%pylab` star import.
        encoded = (column == 'yes').astype(int)
    else:
        encoded = None

    if encoded is None:
        data = [column[df['cluster'] == i].values for i in cluster_ids]
        avg = column.mean()
    else:
        # One aggregated value per cluster, wrapped in a list for boxplot().
        data = [[encoded[df['cluster'] == i].mean()] for i in cluster_ids]
        avg = encoded.mean()

    # Whiskers at the 2.5th and 97.5th percentiles.
    ax.boxplot(data, whis=[2.5, 97.5])
    ax.hlines(avg, 1, n_clusters, colors='red', linestyles='--')
    ax.set_title(feature_name)
    ax.set_xlabel("Cluster #")
    ax.set_ylabel("Value")
    # Set tick positions and labels separately: on Matplotlib < 3.5 the second
    # positional argument of set_xticks() is `minor`, so passing the labels
    # there moved the ticks to the minor axis instead of labelling them.
    ax.set_xticks(list(range(1, n_clusters + 1)))
    ax.set_xticklabels(list(range(n_clusters)))
       
# Features to plot: every column except the 'State' label and the cluster id.
features = df.columns.tolist()
features.remove('State')
features.remove('cluster')

# One subplot per feature on an n_rows x n_cols grid.
n_cols = 1
n_rows = -(-len(features) // n_cols)  # integer ceiling division

fig, axs = subplots(n_rows, n_cols, figsize=(20,40))

for (panel, feature) in zip(axs.ravel(), features):
    build_feature_box_plot(df, feature, panel)

In [None]:
# 'unique', 'top' and 'mean' summary rows for the members of cluster 0.
df.loc[df['cluster'] == 0].describe(include='all').loc[['unique', 'top', 'mean']]

In [None]:
# 'unique', 'top' and 'mean' summary rows for the members of cluster 1.
df.loc[df['cluster'] == 1].describe(include='all').loc[['unique', 'top', 'mean']]

In [None]:
# 'unique', 'top' and 'mean' summary rows for the members of cluster 2.
df.loc[df['cluster'] == 2].describe(include='all').loc[['unique', 'top', 'mean']]

In [None]:
# 'unique', 'top' and 'mean' summary rows for the members of cluster 3.
df.loc[df['cluster'] == 3].describe(include='all').loc[['unique', 'top', 'mean']]

In [None]:
# 'unique', 'top' and 'mean' summary rows for the members of cluster 4.
df.loc[df['cluster'] == 4].describe(include='all').loc[['unique', 'top', 'mean']]

In [None]:
# 'unique', 'top' and 'mean' summary rows for the members of cluster 5.
df.loc[df['cluster'] == 5].describe(include='all').loc[['unique', 'top', 'mean']]

In [None]:
# 'unique', 'top' and 'mean' summary rows for the members of cluster 6.
df.loc[df['cluster'] == 6].describe(include='all').loc[['unique', 'top', 'mean']]

In [None]:
# 'unique', 'top' and 'mean' summary rows for the members of cluster 7.
df.loc[df['cluster'] == 7].describe(include='all').loc[['unique', 'top', 'mean']]

In [None]:
# Isolate cluster 0. reset_index() renumbers the rows from 0 and keeps the
# previous row labels in an 'index' column.
cluster_0 = df.loc[df['cluster'] == 0].reset_index()
# Count of each 'Churn?' value inside this cluster.
cluster_0['Churn?'].value_counts()

In [None]:
# One-hot encode cluster 0's 'Area Code' into dense indicator columns.
# scikit-learn renamed the `sparse` argument to `sparse_output` (1.2) and later
# removed the old name, so try the new spelling first and fall back to the old
# one on installations that do not know it yet.
try:
    ohe1 = OneHotEncoder(sparse_output=False)
except TypeError:
    ohe1 = OneHotEncoder(sparse=False)
res = ohe1.fit_transform(cluster_0[['Area Code']])

# categories_ holds one array per encoded input column, so [0] selects the
# 'Area Code' categories. Iterating over categories_ itself (as before) produced
# a single array-valued entry, i.e. a nested column header rather than flat
# string column names.
encoded_area_code = pd.DataFrame(
    data=res,
    columns=['Area Code ' + str(c) for c in ohe1.categories_[0]],
)


In [None]:
# Prepend the one-hot 'Area Code' columns. Both frames carry a fresh 0..n-1
# index (cluster_0 was reset_index()'d, encoded_area_code was built from a bare
# array), so the inner join on the index keeps every row.
cluster_0 = pd.concat([encoded_area_code, cluster_0], axis=1, join='inner')

# Binary-encode the plan flags from cluster_0's *own* columns. The previous code
# read them from `df`, whose index still holds the original row labels; pandas
# aligns assignments on index labels, so row i of cluster_0 received the flag of
# row i of the full data set -- generally a different customer.
for plan_column in ["Int'l Plan", 'VMail Plan']:
    cluster_0[plan_column] = (cluster_0[plan_column] == 'yes').astype(int)

# Remove the columns that are not model inputs: the raw categorical columns,
# the cluster id and the old row labels kept by reset_index().
cluster_0.drop(['State','Area Code','cluster','index'], axis=1, inplace=True)
cluster_0

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of cluster 0 for testing, stratified on the target. The target is
# taken to be the last column; every other column is used as a feature.
target_column = cluster_0.columns[-1]
feature_columns = cluster_0.columns[:-1]

X_train, X_test, y_train, y_test = train_test_split(
    cluster_0[feature_columns],
    cluster_0[target_column],
    test_size=0.2,
    stratify=cluster_0[target_column],
    random_state=12345,
)

X_train.shape, y_train.shape, X_test.shape, y_test.shape

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 6 trees and a fixed random_state, fitted on the training
# split of cluster 0.
rf = RandomForestClassifier(n_estimators=6, random_state=123456)
rf.fit(X_train, y_train)

In [None]:
from sklearn.metrics import classification_report, accuracy_score

# Predict the held-out split once and reuse the result for both reports; the
# original called rf.predict(X_test) a second time for the classification report.
test_preds = rf.predict(X_test)

print("Test Set Accuracy:  ", accuracy_score(y_test, test_preds), "\n\n")


print(classification_report(y_test, test_preds))

In [None]:
# plot_precision_recall_curve was deprecated in scikit-learn 1.0 and removed in
# 1.2; PrecisionRecallDisplay.from_estimator is its replacement and accepts the
# same (estimator, X, y, ax=..., name=...) call. Prefer the new API and fall back
# to the old helper on installations that predate from_estimator. The old name
# stays bound either way so existing calls keep working.
try:
    from sklearn.metrics import PrecisionRecallDisplay
    plot_precision_recall_curve = PrecisionRecallDisplay.from_estimator
except (ImportError, AttributeError):
    from sklearn.metrics import plot_precision_recall_curve

_, ax = subplots(1,1,figsize=(20,7))
plot_precision_recall_curve(rf, X_test, y_test, ax=ax, name='Churn Probability')
ax.set_xticks(np.arange(0,1.01,0.1))
# grid()'s first positional parameter is the on/off flag, so the former
# ax.grid('both') only worked because a non-empty string is truthy. Spell out
# the flag and the axis explicitly; the rendered grid is the same.
ax.grid(True, axis='both')