In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

# Notebook-wide settings; they affect every cell executed after this one.
plt.style.use('ggplot')  # draw all matplotlib figures with the 'ggplot' style sheet
# Turn off pandas' chained-assignment check (no SettingWithCopyWarning is raised).
pd.options.mode.chained_assignment = None
import warnings
# Suppress every Python warning, including library deprecation warnings.
warnings.filterwarnings("ignore")

In [None]:
# Load the churn dataset from a path relative to the notebook's working directory.
DATA_PATH = "data/churn_dataset.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# (rows, columns) of the freshly loaded dataset.
df.shape

In [None]:
# First five rows, transposed so that each column of the dataset is shown as a row.
df.head().transpose()

# Fill NA values

In [None]:
# Missing values per column; show only the affected columns, most missing first.
nans = df.isna().sum()
nans.loc[nans > 0].sort_values(ascending=False)

In [None]:
# Missing product entries become an empty string, so the column holds no NaN afterwards.
df['first_order_products'] = df['first_order_products'].fillna("")

In [None]:
# Missing refund values are filled with 0 in both refund columns.
refund_cols = ['refunds_unsuccess', 'refunds_success']
df[refund_cols] = df[refund_cols].fillna(0)

In [None]:
# Rating columns use -999 as the placeholder for "no value".
# NOTE(review): rated_orders is filled with -999 as well; it looks like a count,
# so 0 may have been intended -- confirm before relying on it numerically.
rating_cols = [
    'avg_rating',
    'rating_diff',
    'last_order_rating',
    'rated_orders',
    'first_order_rating',
]
df[rating_cols] = df[rating_cols].fillna(-999)

# Peek at the rating columns of customers with at least one rated order.
df.loc[df['rated_orders'] > 0, rating_cols].head()

In [None]:
# Defaults for missing customer attributes: 0 for aov, an explicit label for the categories.
df['aov'] = df['aov'].fillna(0)
df['segment'] = df['segment'].fillna('None')
df['gender'] = df['gender'].fillna('Unknown')

In [None]:
# Location fields: 'Unknown' for missing zip data, -999 as the missing-distance placeholder.
df['zip'] = df['zip'].fillna('Unknown')
df['zip_area'] = df['zip_area'].fillna('Unknown')
df['avg_fac_distance'] = df['avg_fac_distance'].fillna(-999)

In [None]:
# Re-check after filling: lists any column that still contains missing values.
nans = df.isna().sum()
nans.loc[nans > 0].sort_values(ascending=False)

# Bad vs Good Voucher Customers
Identify customers who used a voucher on their first order and split them into two groups: *bad* customers, who placed only that one valid order and have a recency above 365, and *good* customers, who placed more than one valid order.

In [None]:
def plot_bad_vs_good_by_column(column_name):
    """Plot how bad and good voucher customers are distributed over a column.

    For every value of ``column_name`` the share of customers is the number of
    distinct ``customer_db_id`` values with that value divided by the number of
    rows of the respective frame. Two bar charts are drawn: the bad and good
    shares side by side, and their difference (bad - good). Both are ordered
    by descending difference.

    Reads the notebook-level frames ``df_vou_bad`` and ``df_vou_good``, which
    must exist before the function is called.

    Parameters
    ----------
    column_name : str
        Name of the column to group both frames by.

    Returns
    -------
    None
        The function only draws the two charts.
    """
    bad_share = df_vou_bad.groupby(column_name)['customer_db_id'].nunique() / df_vou_bad.shape[0]
    good_share = df_vou_good.groupby(column_name)['customer_db_id'].nunique() / df_vou_good.shape[0]

    # A value that occurs in only one of the two groups has a share of 0 in the
    # other one. Without the fill its difference would be NaN and the value
    # would silently vanish from the difference chart.
    shares = pd.concat([bad_share, good_share], axis=1, keys=['bad', 'good'], sort=True).fillna(0)
    shares['diff'] = shares['bad'] - shares['good']
    shares = shares.sort_values('diff', ascending=False)

    shares[['bad', 'good']].plot(kind='bar', title=column_name)
    shares[['diff']].plot(kind='bar', title='diff bad - good', color='orange')

In [None]:
# Customers whose first order used a voucher and who have at least one valid order.
# The comparison must be parenthesised: `&` binds tighter than `>`, so
# `a & b > 0` is evaluated as `(a & b) > 0`, i.e. a bitwise AND with the order count.
df_vou = df.loc[df.first_order_voucher & (df.valid_orders > 0)]

# Good: more than one valid order. Bad: exactly one valid order and a recency above 365
# (presumably days since the last order -- TODO confirm the unit).
# .copy() makes both frames independent of `df`, so adding the label column
# below does not write into a slice of another frame.
df_vou_good = df_vou.loc[df_vou.valid_orders > 1].copy()
df_vou_bad = df_vou.loc[(df_vou.valid_orders == 1) & (df_vou.recency > 365)].copy()

print('Bad customers', df_vou_bad.shape)
print('Good customers', df_vou_good.shape)

# Label both groups and stack them into one frame for the comparisons that follow.
df_vou_bad['customer_type'] = 'bad'
df_vou_good['customer_type'] = 'good'
df_both = pd.concat([df_vou_bad, df_vou_good])

## Distance to Facility 

In [None]:
# Keep only rows with a real distance; -999 is the placeholder written for missing values.
has_distance = df_both['avg_fac_distance'] != -999
a = df_both.loc[has_distance, ['customer_type', 'avg_fac_distance']]

# One strip of points per customer type.
sns.stripplot(data=a, x='customer_type', y='avg_fac_distance')

In [None]:
# Two stacked distribution plots that share both axes, so the groups are directly comparable.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases; kept as is here.
f, (ax1, ax2) = plt.subplots(nrows=2, sharex=True, sharey=True, figsize=(6, 6))

bad_distances = a.loc[a['customer_type'] == 'bad', 'avg_fac_distance']
good_distances = a.loc[a['customer_type'] == 'good', 'avg_fac_distance']

sns.distplot(bad_distances, ax=ax1, color='red', bins=25)
ax1.set_title('Bad Customers')

sns.distplot(good_distances, ax=ax2, color='blue', bins=25)
ax2.set_title('Good Customers')

plt.tight_layout()

## Facility

In [None]:
# Bad vs. good shares per facility of the first order.
plot_bad_vs_good_by_column(column_name='first_order_fac_name')

## Service Class

In [None]:
# Bad vs. good shares per service class of the first order.
plot_bad_vs_good_by_column(column_name='first_order_service_class')

## Voucher Channel

In [None]:
# Bad vs. good shares per voucher channel of the first order.
plot_bad_vs_good_by_column(column_name='first_order_voucher_channel')

## Gender

In [None]:
# Bad vs. good shares per gender.
plot_bad_vs_good_by_column(column_name='gender')

## Zip Area

In [None]:
# Bad vs. good shares per zip area.
plot_bad_vs_good_by_column(column_name='zip_area')

## Product Segmentation

In [None]:
# Bad vs. good shares per product combination of the first order.
plot_bad_vs_good_by_column(column_name='first_order_products')

In [None]:
# Distinct gender values currently present in the combined frame.
df_both['gender'].unique()

In [None]:
# Numeric encoding of gender. Values that are not keys of the mapping become NaN,
# so running this cell a second time turns the whole column into NaN.
gender_codes = {'male': -1, 'female': 1, 'Unknown': 0}
df_both['gender'] = df_both['gender'].map(gender_codes)

In [None]:
# Numeric encoding of the label: bad -> -1, good -> 1. Values that are not keys of
# the mapping become NaN, so running this cell a second time empties the column.
customer_type_codes = {'bad': -1, 'good': 1}
df_both['customer_type'] = df_both['customer_type'].map(customer_type_codes)

In [None]:
# Distinct voucher channels of the first order in the combined frame.
df_both['first_order_voucher_channel'].unique()

In [None]:
# Integer-encode the voucher channel. The fitted encoder is kept so the codes
# can be translated back through vou_channel_le.classes_.
vou_channel_le = LabelEncoder()
df_both['first_order_voucher_channel_enc'] = vou_channel_le.fit_transform(
    df_both['first_order_voucher_channel']
)

In [None]:
# Integer-encode the service class. The fitted encoder is kept so the codes
# can be translated back through serv_class_le.classes_.
serv_class_le = LabelEncoder()
df_both['first_order_service_class_enc'] = serv_class_le.fit_transform(
    df_both['first_order_service_class']
)

In [None]:
# Integer-encode the product combination. The fitted encoder is kept so the codes
# can be translated back through products_le.classes_.
products_le = LabelEncoder()
df_both['first_order_products_enc'] = products_le.fit_transform(
    df_both['first_order_products']
)

In [None]:
# Columns that enter the correlation matrix: the label plus the candidate features.
corr_columns = [
    'customer_type', 'gender', 'referred', 'newsletter_optin',
    'product_LA', 'product_HH', 'product_DC', 'product_WF',
    'first_order_voucher_channel_enc', 'first_order_service_class_enc',
    'first_order_products_enc', 'avg_fac_distance',
]

# avg_fac_distance carries -999 wherever the distance was missing. Left in place,
# that placeholder would be treated as a real (huge negative) distance and distort
# every correlation with this column. Masking it back to NaN lets corr() skip those
# rows for the affected pairs only.
cor = (
    df_both[corr_columns]
    .assign(
        avg_fac_distance=lambda frame: frame['avg_fac_distance'].mask(
            frame['avg_fac_distance'] == -999
        )
    )
    .corr()
)

In [None]:
# Row of the correlation matrix for customer_type: its correlation with every selected column.
cor.loc['customer_type']

In [None]:
# Heatmap of the full correlation matrix, with the column names as tick labels on both axes.
sns.heatmap(
    data=cor,
    xticklabels=cor.columns.values,
    yticklabels=cor.columns.values,
)