# Client Retention Demo
Simple demo to show Anaconda functionality on the mainframe accessing mainframe data with the Optimized Data Layer client, dsdbc.

In [None]:
# Data-source switch. True reads the bundled CSV extracts; set it to False to
# query the same data through ODL with dsdbc (the files must already be
# virtualized into ODL). Do not comment the assignment out: the cells below
# test `csv` and would raise NameError if it were undefined.
csv = True

if not csv:
    import dsdbc  # This package is required to interface with ODL

import pandas as pd
import numpy as np
# Remove font warnings (note: the first filter silences every warning category)
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter("ignore", category=PendingDeprecationWarning)

## Setup Mainframe Data Connections
This step sets up the Optimized Data Layer client connection used to access mainframe data and load it into pandas DataFrames.

In [None]:
# Open the ODL connection only when the CSV fallback is switched off.
# `conn` is the handle the later pd.read_sql calls pass to pandas; `cursor`
# is created here but not used by any cell visible in this notebook.
if not csv:
    conn = dsdbc.connect()
    cursor = conn.cursor()

***Client Data***

Load client data into a Pandas DataFrame.

In [None]:
# Build the client DataFrame directly from the "DB" query (really a physical
# sequential dataset), or from its CSV extract when running without ODL,
# and key it by CONT_ID.
client_df = (
    pd.read_csv("data/CLIENT_INFO_VSAMKSDS.csv")
    if csv
    else pd.read_sql('SELECT * FROM CLIENT_INFO_VSAMKSDS', conn)
).set_index("CONT_ID")
client_df

***Credit transactions***

Load credit card transactions into a Pandas DataFrame.

In [None]:
if csv:
    # The CSV extract of the transaction table is split across three files.
    # DataFrame.append was removed in pandas 2.0, so the parts are stacked
    # with pd.concat; like the chained appends it replaces, this keeps each
    # file's own row index.
    txn_df = pd.read_csv("data/SPPAYTB_VSAM-1.csv")
    txn_df2 = pd.read_csv("data/SPPAYTB_VSAM-2.csv")
    txn_df3 = pd.read_csv("data/SPPAYTB_VSAM-3.csv")
    txn_df = pd.concat([txn_df, txn_df2, txn_df3])
else:
    txn_df = pd.read_sql('SELECT * FROM SPPAYTB_VSAM', conn)

In [None]:
# Coerce the raw transaction columns to usable types: the amount becomes
# numeric (it is summed and averaged per customer below), CONT_ID becomes
# int64, and the HDR_CREDTT timestamp becomes a datetime (the DATE column is
# derived from it in the next cell).
# NOTE(review): the int64 cast presumably makes CONT_ID line up with
# client_df's CONT_ID index for the joins - confirm against the data.
txn_df['AUREQ_TX_DT_TTLAMT'] = pd.to_numeric(txn_df['AUREQ_TX_DT_TTLAMT'])
txn_df['CONT_ID'] = txn_df['CONT_ID'].astype('int64')
txn_df['HDR_CREDTT'] = pd.to_datetime(txn_df['HDR_CREDTT'])

In [None]:
# Calendar date of each transaction. The vectorized .dt accessor yields the
# same datetime.date objects as calling .date() on every row, without a
# Python-level function call per element.
txn_df['DATE'] = txn_df['HDR_CREDTT'].dt.date

In [None]:
# Display the transaction table.
txn_df

## Aggregate statistics
Calculate a few aggregate statistics based on credit transactions and join the results to the client data DataFrame.

In [None]:
# Number of transactions recorded for each customer, joined onto the client
# table by its CONT_ID index.
total_txns_df = (
    txn_df.groupby('CONT_ID')
    .size()
    .to_frame(name="TOTAL_TXNS")
)
client_df = client_df.join(total_txns_df)

In [None]:
# Sum of all transaction amounts for each customer, joined onto the client
# table by its CONT_ID index.
total_txn_amount_df = (
    txn_df.groupby('CONT_ID')['AUREQ_TX_DT_TTLAMT']
    .sum()
    .to_frame(name="TOTAL_TXN_AMOUNT")
)
client_df = client_df.join(total_txn_amount_df)

In [None]:
# Mean transaction amount for each customer, joined onto the client table by
# its CONT_ID index.
avg_txn_amount_df = (
    txn_df.groupby('CONT_ID')['AUREQ_TX_DT_TTLAMT']
    .mean()
    .to_frame(name="AVG_TXN_AMOUNT")
)
client_df = client_df.join(avg_txn_amount_df)

In [None]:
# Average daily txns per customer
# First step: count the transactions in every (DATE, CONT_ID) group. The
# result is a Series indexed by those two keys; only pairs that actually
# occur in txn_df are present.
daily_txns = txn_df.groupby(['DATE', 'CONT_ID']).size()

In [None]:
# A (date, customer) pair that is absent from daily_txns means the customer
# made no transactions that day. Pivot CONT_ID into columns, filling those
# gaps with 0 so the quiet days count towards each customer's mean.
avg_daily_txns_df = (
    daily_txns.unstack(fill_value=0)
    .mean()
    .to_frame(name="AVG_DAILY_TXNS")
)
client_df = client_df.join(avg_daily_txns_df)

## Show Results

In [None]:
# Display the client table.
client_df

***Note: importing the Python plotting packages in the next cell will fail if they are not installed. To install them, run the following command:***

```conda install matplotlib seaborn```

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")
%matplotlib inline

### Plot X vs. Y
We begin our exploration of the data set by creating some scatterplots of each column vs. the others.

In [None]:
def jointplot(x, y, data, **kwargs):
    """Draw a seaborn joint plot of column *x* against column *y*.

    Args:
        x: Name of the column plotted on the horizontal axis.
        y: Name of the column plotted on the vertical axis.
        data: DataFrame holding both columns.
        **kwargs: Forwarded to ``sns.jointplot``. ``alpha`` defaults to 0.3
            and the figure height to 10; the height may be given as
            ``height`` or under its legacy name ``size``.

    Returns:
        The grid object returned by ``sns.jointplot``.
    """
    # seaborn renamed jointplot's ``size`` argument to ``height``. Keep
    # accepting ``size`` from callers, but always pop it so it can never leak
    # through **kwargs to the underlying plotting function.
    legacy_size = kwargs.pop('size', 10)
    height = kwargs.pop('height', legacy_size)
    alpha = kwargs.pop('alpha', 0.3)
    return sns.jointplot(x=x, y=y, data=data,
                         alpha=alpha,
                         height=height,
                         **kwargs)

# for widget
def w_jointplot(x, y):
    """Return the figure of a joint plot of *x* vs. *y* over client_df.

    Rows whose *y* value is an outlier are removed first. The current
    pyplot figure is closed before the figure object is returned.
    """
    trimmed = filter_outliers(client_df, by_col=y)
    grid = jointplot(x, y, trimmed)
    plt.close()
    return grid.fig

In [None]:
# Legend text for the two churn classes. plot_churn_by indexes this list with
# the CHURN value itself, so position 0 labels CHURN == 0 and position 1
# labels CHURN == 1.
churn_labels = ['Did not churn', 'Did churn']

def filter_outliers(d, by_col=None):
    """Drop entries lying more than three standard deviations from the mean.

    Args:
        d: A pandas Series or DataFrame.
        by_col: Label of the column whose values decide which DataFrame rows
            are kept. Required for a DataFrame, ignored for a Series.

    Returns:
        An object of the same type as *d* holding only the values (Series) or
        rows (DataFrame) within three standard deviations of the mean.

    Raises:
        ValueError: *d* is a DataFrame and *by_col* was not given.
        TypeError: *d* is neither a Series nor a DataFrame.
    """
    if isinstance(d, pd.Series):
        values = d
    elif isinstance(d, pd.DataFrame):
        # Compare against None rather than truthiness: falsy labels such as
        # the integer 0 are perfectly valid column names.
        if by_col is None:
            raise ValueError('by_col is required for DataFrame')
        values = d[by_col]
    else:
        # Fail loudly instead of silently returning None.
        raise TypeError(
            'filter_outliers expects a pandas Series or DataFrame, got '
            '{}'.format(type(d).__name__))
    within_bounds = (values - values.mean()).abs() <= 3 * values.std()
    return d[within_bounds]

In [None]:
# Age vs. annual income, after dropping rows whose ANNUAL_INCOME is an outlier.
# Note: despite its name, `ax` holds the object returned by sns.jointplot.
ax = jointplot('AGE_YEARS', 'ANNUAL_INCOME', filter_outliers(client_df, by_col='ANNUAL_INCOME'))

### Correlations
Next, we compute the correlation coefficients between each variable.

In [None]:
# Pairwise correlation coefficients between the columns of client_df.
corr = client_df.corr()

# only show lower triangle
# The mask starts as all zeros with corr's shape; the upper triangle
# (diagonal included) is then set so the heatmap hides those cells.
mask = np.zeros_like(corr)
mask[np.triu_indices_from(mask)] = True

# Draw the annotated heatmap on a 12x12 figure, two decimals per cell.
f, ax = plt.subplots(figsize=(12,12))
ax = sns.heatmap(corr, mask=mask, square=True, annot=True, fmt='.2f',
                 cbar=True,
                 ax=ax)
title = ax.set_title('Correlations', size=14)

## Churn
We plot the distributions of clients who churned and those that did not on the same axes.

In [None]:
def plot_churn_by(df, col, **kwargs):
    """Overlay the distribution of *col* for each churn class on one axes.

    Args:
        df: DataFrame containing a ``CHURN`` column and the column *col*.
        col: Name of the column whose distribution is plotted.
        **kwargs: Forwarded to ``sns.distplot`` (for example ``hist``,
            ``kde``, ``norm_hist``, ``kde_kws``).

    Returns:
        The ``(figure, axes)`` pair the distributions were drawn on.
    """
    f, ax = plt.subplots(figsize=(12,10), sharex=True)
    # Fall back to distplot's own defaults (hist=True, kde=True) so the shade
    # decision below matches what distplot will actually draw when the caller
    # leaves either option out.
    kde = kwargs.get('kde', True)
    hist = kwargs.get('hist', True)
    # Merge any caller-supplied kde_kws instead of colliding with it; the KDE
    # is shaded by default only when it is drawn without a histogram.
    kde_kws = dict(kwargs.pop('kde_kws', None) or {})
    kde_kws.setdefault('shade', kde and not hist)
    for churn in df.CHURN.unique():
        sns.distplot(df[df.CHURN == churn][col],
                     # the CHURN value doubles as the index into churn_labels
                     label=churn_labels[churn],
                     kde_kws=dict(kde_kws),
                     ax=ax,
                     **kwargs)

    ax.set_title('Client Churn by {}'.format(col))
    ax.set_xlabel('{}'.format(col))
    return f, ax

def w_plot_churn_by(column, hist=True, kde=False, norm_hist=False):
    """Return the figure of a churn distribution plot of *column* over client_df.

    Rows whose *column* value is an outlier are removed first. A legend is
    added and the current pyplot figure is closed before the figure object
    is returned.
    """
    trimmed = filter_outliers(client_df, by_col=column)
    fig, _ = plot_churn_by(trimmed, column,
                           hist=hist, kde=kde, norm_hist=norm_hist)
    plt.legend()
    plt.close()
    return fig

# Age distribution per churn class over the full client table; no extra
# plotting options are passed.
f, ax = plot_churn_by(client_df, 'AGE_YEARS')
# Note: this rebinds `ax` to the value returned by plt.legend().
ax = plt.legend()

The two features that showed a negative correlation with churn were age and activity level. Here we generate a boxplot with those two features as the axes, and churn as the category.
The plot shows that clients that churn tend to be younger across all levels of activity.

In [None]:
# Box plot of age per activity level, split by churn.
col = 'AGE_YEARS'
# Drop age outliers first; `data` is also reused by the swarm plot cell below.
data = filter_outliers(client_df, by_col=col)

f, ax = plt.subplots(figsize=(12,8))
ax = sns.boxplot(x='ACTIVITY_LEVEL', y=col, hue="CHURN", data=data, 
                 palette='muted', ax=ax)
title = ax.set_title('Client Churn by Activity Level')
label = ax.set_ylabel('Age (Years)')
label = ax.set_xlabel('Activity Level')
# Rebuild the legend with the readable churn labels in place of the text
# seaborn generated from the hue column.
handles, labels = ax.get_legend_handles_labels()
legend = ax.legend(handles, churn_labels)

This beeswarm plot shows clients binned by the level of activity they maintain with the bank. Clients that churned maintained lower levels of activity (0-2). And of clients within these lower activity levels, younger clients churned more than others.

In [None]:
f, ax = plt.subplots(figsize=(10,8))
# Plot a reproducible random sample of at most 2000 clients. The size is
# capped at len(data) because DataFrame.sample raises ValueError when asked
# for more rows than exist (sampling is without replacement).
ax = sns.swarmplot(x='ACTIVITY_LEVEL', y='AGE_YEARS', hue='CHURN',
                   data=data.sample(n=min(2000, len(data)), random_state=51),
                   palette='muted', ax=ax)
title = ax.set_title('Client Churn by Activity Level')
label = ax.set_ylabel('Age (Years)')
label = ax.set_xlabel('Activity Level')
# Rebuild the legend with the readable churn labels in place of the text
# seaborn generated from the hue column.
handles, labels = ax.get_legend_handles_labels()
legend = ax.legend(handles, churn_labels)

## Train churn model
Train a churn classifier, which we'll use to predict the probability that a client will churn.
To keep things simple, we use a single data set, which we split into training and test data sets. We use the training data to train the model, and the test data to make projections about lost revenue to the bank.

In [None]:
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.preprocessing import StandardScaler
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module has been removed from scikit-learn.
from sklearn.model_selection import train_test_split

In [None]:
def make_feature_space(df):
    '''Create the feature space required by our classifier.

    The CHURN and CUSTOMER_ID columns are dropped when present, the remaining
    columns are converted to a float matrix, and every feature is
    standardized with a freshly fitted StandardScaler.

    NOTE(review): the scaler is re-fit on every call, so each frame passed in
    (training data, test data) is scaled by its own statistics - confirm this
    is intended.

    Returns:
        The standardized feature matrix, one row per row of *df*.
    '''
    # drop columns/features we don't want/need for the classifier
    features_df = df.drop(['CHURN', 'CUSTOMER_ID'], axis=1, errors='ignore')
    # DataFrame.as_matrix() and the np.float alias were both removed from
    # their libraries; .values and the builtin float are the equivalents that
    # work on old and new versions alike.
    X = features_df.values.astype(float)
    # normalize feature values
    return StandardScaler().fit_transform(X)

def predict_churn(X):
    '''Predict the probability of churn for every row of feature matrix X.

    Uses the module-level classifier `clf` and returns the second column of
    its predict_proba output.
    '''
    class_probabilities = clf.predict_proba(X)
    return class_probabilities[:, 1]

def train_model(X, y):
    '''Fit a 100-tree random forest on features X and target variable y.

    Returns whatever the classifier's fit() call returns.
    '''
    forest = RF(n_estimators=100)
    return forest.fit(X, y)

def init_model(df):
    '''Split *df* into train/test rows and train the churn classifier.

    Args:
        df: DataFrame with a CHURN target column plus the feature columns.

    Returns:
        A ``(clf, test_df)`` pair: the classifier fitted on the training
        rows, and the held-out test rows of *df*.
    '''
    # split data into train, test sets (fixed seed for a repeatable split)
    train_index, test_index = train_test_split(df.index, random_state=99)
    # Slice the frame that was passed in rather than the global client_df, so
    # the split always matches the index it was computed from. Label-based
    # .loc replaces the removed .ix indexer.
    train_df = df.loc[train_index]
    test_df = df.loc[test_index]

    # target variable
    y = np.array(train_df['CHURN'])

    # extract features
    X = make_feature_space(train_df)

    # train classifier
    clf = train_model(X, y)

    return clf, test_df

After training the model, we are left with the churn classifier and the test data set, which we'll use for our churn predictions.

In [None]:
# Train on client_df and keep the fitted classifier plus the held-out rows.
# `clf` must stay a module-level name: predict_churn reads it as a global.
clf, test_df = init_model(client_df)

## Calculate business loss
In this simple example, we calculate the projected loss of business (revenue) to BigBank for all clients in the test data set. We calculate BigBank's revenue from each client, and multiply that by the churn probability to determine the predicted loss.

In [None]:
def calc_business_loss(df, deposit_rate=0.02, credit_rate=0.015, mgmt_rate=0.02):
    '''Estimate the revenue at risk from churn for every client in *df*.

    Args:
        df: Client DataFrame with the ANNUAL_INCOME, ANNUAL_INVEST and
            TOTAL_TXN_AMOUNT columns plus the classifier's feature columns.
        deposit_rate: Interest made on deposits.
        credit_rate: Fee collected for each credit txn.
        mgmt_rate: Assumed return from trading fees and/or portfolio
            management.

    Returns:
        A copy of *df* with `churn_probability`, `worth` and
        `predicted_loss` columns added, sorted by `predicted_loss` in
        descending order. *df* itself is not modified.
    '''
    data = df.copy()

    # extract features
    X = make_feature_space(df)

    # predict churn
    data['churn_probability'] = predict_churn(X)

    # TODO: avg_daily_balance would be a nice feature to have here
    # for now, we'll just use fraction of income
    avg_daily_balance = df['ANNUAL_INCOME'] / 6

    # How much is each customer worth to the business?
    worth = (deposit_rate * avg_daily_balance
             + mgmt_rate * df['ANNUAL_INVEST']
             + credit_rate * df['TOTAL_TXN_AMOUNT'])
    data['worth'] = worth

    # How much would we lose per annum?
    data['predicted_loss'] = data['churn_probability'] * worth

    return data.sort_values(by='predicted_loss', ascending=False)

In [None]:
# Score the held-out clients and preview the first rows; calc_business_loss
# returns them sorted by predicted_loss, largest first.
churn_df = calc_business_loss(test_df)
churn_df.head()

## Loss by Age Group
In this section, we calculate and plot the projected loss of revenue by age group. In our data set, age is an important feature in predicting if a client will churn.
First we create a DataFrame containing the cumulative predicted loss by age group.

In [None]:
def group_by_age(df, bins=None, bin_size=5):
    """Group the rows of *df* into age brackets based on AGE_YEARS.

    Args:
        df: DataFrame with an ``AGE_YEARS`` column.
        bins: Explicit bin edges passed to ``pd.cut``. When omitted, edges
            are generated every *bin_size* years, starting at the youngest
            age and extending far enough to cover the oldest.
        bin_size: Width in years of the generated bins; ignored when *bins*
            is given.

    Returns:
        The groupby object produced by grouping *df* on the age brackets.
    """
    include_lowest = False
    if bins is None:
        # Floor/ceil (not truncation) so fractional ages stay inside the range.
        youngest = int(np.floor(df.AGE_YEARS.min()))
        oldest = int(np.ceil(df.AGE_YEARS.max()))
        # Guarantee at least two edges when every client has the same age.
        oldest = max(oldest, youngest + 1)
        bins = list(range(youngest, oldest + bin_size, bin_size))
        # pd.cut intervals are right-closed, so without this the youngest
        # clients (sitting exactly on the first edge) would fall outside
        # every bin and be dropped from the groups.
        include_lowest = True
    brackets = pd.cut(df.AGE_YEARS, bins=bins, include_lowest=include_lowest)
    return df.groupby(brackets)

# Group the scored clients into age brackets using group_by_age's default bins.
data_by_age = churn_df.pipe(group_by_age)
# Preview: total predicted loss per age bracket.
data_by_age['predicted_loss'].sum().reset_index()

In [None]:
# Total predicted loss per age bracket, as a flat two-column DataFrame.
loss_by_age_df = data_by_age['predicted_loss'].sum().reset_index()
# Convert the bracket labels to plain strings before plotting.
loss_by_age_df['AGE_YEARS'] = loss_by_age_df['AGE_YEARS'].astype(str)

# One marker per age bracket ('o' style).
loss_by_age_df.plot(x='AGE_YEARS', y='predicted_loss', style='o')