# Predicting Customer Churn

**Credit:** http://blog.yhathq.com/posts/predicting-customer-churn-with-sklearn.html

We first import the required package: Pandas, for data processing.

In [None]:
import pandas as pd

We then import the Scikit-Learn package. In this demo, we will try three machine learning algorithms:

1. *k*-Nearest Neighbors;
2. Naive Bayes;
3. Support Vector Machine.

In [None]:
from sklearn import neighbors, naive_bayes, svm

Moreover, we import the modules for evaluating the algorithms and preprocessing data.

In [None]:
from sklearn.metrics import accuracy_score, classification_report

## Preprocessing Data

We use Pandas to load the data from a CSV file.

In [None]:
# Load the churn dataset; the path is relative to the current working directory.
churn_df = pd.read_csv('data/churn.csv')
# Preview the first rows (shown as the cell output).
churn_df.head()

In [None]:
# Summary statistics for the rows whose 'Churn?' value is the string 'True.'.
churn_df[churn_df['Churn?'] == 'True.'].describe()

In [None]:
# Summary statistics for the rows whose 'Churn?' value is the string 'False.'.
churn_df[churn_df['Churn?'] == 'False.'].describe()

In [None]:
import seaborn as sns

In [None]:
# Show the current column labels before they are replaced.
churn_df.columns

In [None]:
# Replace all column labels positionally; the list must have exactly one entry
# per existing column. The last label becomes 'churn'.
# NOTE(review): presumably the last column is the one read as 'Churn?' in the
# earlier cells, which would then no longer run after this rename - confirm.
churn_df.columns = ['State', 'Account Length', 'Area Code', 'Phone', "Int'l Plan",
       'VMail Plan', 'VMail Message', 'Day Mins', 'Day Calls', 'Day Charge',
       'Eve Mins', 'Eve Calls', 'Eve Charge', 'Night Mins', 'Night Calls',
       'Night Charge', 'Intl Mins', 'Intl Calls', 'Intl Charge',
       'CustServ Calls', 'churn']

In [None]:
def convert(value):
    """Return 'yes' when value equals 'True.', and 'no' for anything else."""
    return 'yes' if value == 'True.' else 'no'

In [None]:
# Add a 'preprocessed_churn' column with 'yes'/'no' labels derived from 'churn'.
churn_df['preprocessed_churn'] = churn_df.churn.apply(convert)

In [None]:
# Show the first two rows to check the new column.
churn_df.head(2)

In [None]:
# Compare the 'Eve Charge' distribution between the 'yes' and 'no' churn groups.
sns.boxplot(x='preprocessed_churn', y='Eve Charge', data=churn_df)

In [None]:
# Compare the 'Night Calls' distribution between the 'yes' and 'no' churn groups.
sns.boxplot(x='preprocessed_churn', y='Night Calls', data=churn_df)

In [None]:
# Compare the 'Day Mins' distribution between the 'yes' and 'no' churn groups.
sns.boxplot(x='preprocessed_churn', y='Day Mins', data=churn_df)

In [None]:
# Print column dtypes and non-null counts to see which columns are non-numeric.
churn_df.info()

Separate the classification target from the data.

In [None]:
def convert_churn_into_yes_or_no(churn):
    """Translate the raw churn flag into a 'yes'/'no' label.

    Returns 'yes' for 'True.', 'no' for 'False.', and None for any other value.
    """
    for raw_flag, label in (('True.', 'yes'), ('False.', 'no')):
        if churn == raw_flag:
            return label
    return None

In [None]:
# (Re)build the 'preprocessed_churn' column; values other than 'True.'/'False.'
# become None with this helper.
churn_df['preprocessed_churn'] = churn_df['churn'].map(convert_churn_into_yes_or_no)

In [None]:
# Preview the frame including the 'preprocessed_churn' column.
churn_df.head()

In [None]:
# Classification target: the 'yes'/'no' churn labels.
y = churn_df['preprocessed_churn']

Remove some unnecessary data.

In [None]:
# Columns left out of the feature matrix, including both versions of the target.
to_drop = ['State', 'Area Code', 'Phone', 'churn', 'preprocessed_churn']
churn_feature_space = churn_df.drop(columns=to_drop)

Since some columns contain 'yes' or 'no' data, we should convert them into floating point (1.0 for 'yes', 0.0 otherwise). This will make them easier to process later.

In [None]:
# Preview the feature columns before the 'yes'/'no' columns are converted.
churn_feature_space.head()

In [None]:
def convert_to_float(value):
    """Return 1.0 when value equals 'yes', and 0.0 for anything else."""
    return 1.0 if value == 'yes' else 0.0

In [None]:
# Encode the "Int'l Plan" column as 1.0 ('yes') / 0.0 (anything else).
churn_feature_space["Int'l Plan"] = churn_feature_space["Int'l Plan"].map(convert_to_float)

In [None]:
# Encode the 'VMail Plan' column as 1.0 ('yes') / 0.0 (anything else).
churn_feature_space['VMail Plan'] = churn_feature_space['VMail Plan'].map(convert_to_float)

In [None]:
# Preview the feature columns after the plan columns were encoded as floats.
churn_feature_space.head()

In [None]:
# Feature matrix; this binds X to the same DataFrame object (no copy is made).
X = churn_feature_space

It is important to standardize the features when building a machine learning model. This puts them on a comparable scale (zero mean and unit variance), which can improve the results.

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize every feature column. X is rebound to the transformed output,
# which by default is a NumPy array rather than a DataFrame.
# NOTE(review): the scaler is fit on all rows before the train/test split
# further down, so test rows contribute to the scaling statistics - consider
# fitting on the training split only.
scaler = StandardScaler()
X = scaler.fit_transform(X)

# X.shape supplies the two values for the two %d placeholders.
print('Feature space holds %d observations and %d features' % X.shape)
print('Unique targets:', y.unique())

In [None]:
# Display the scaled feature matrix.
X

## Preparing Training and Testing Data

Prepare the training and testing data.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 40% of the rows for testing; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=4)

## Building Predictive Model

Define the class names shown in the classification reports later.

In [None]:
# Display names for classification_report. They are matched positionally to
# the sorted labels found in y ('no' < 'yes'), so they must be listed in that
# order; otherwise each report row is printed under the wrong class name.
target_names = ['no', 'yes']

### k-Nearest Neighbors

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Candidate neighbour counts 1 through 30, keyed by the estimator parameter name.
k_range = [*range(1, 31)]
param_grid = {'n_neighbors': k_range}

In [None]:
# Search n_neighbors over param_grid with 10-fold cross-validation on the
# training split, scoring each candidate by accuracy.
knn = neighbors.KNeighborsClassifier()
grid = GridSearchCV(knn, param_grid, cv=10, scoring='accuracy')
grid.fit(X_train, y_train)
# Best cross-validated score, the parameters that achieved it, and the
# corresponding estimator.
print(grid.best_score_)
print(grid.best_params_)
print(grid.best_estimator_)

In [None]:
# Predict the test split with the fitted grid-search object.
y_pred = grid.predict(X_test)

In [None]:
# Test-set accuracy, followed by the per-class classification report.
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=target_names))

### Naive Bayes

In [None]:
# Fit a Gaussian Naive Bayes classifier on the training split.
gnb = naive_bayes.GaussianNB()
gnb.fit(X_train, y_train)

# Predict the test split; y_pred from the previous model is overwritten.
y_pred = gnb.predict(X_test)

# Test-set accuracy, followed by the per-class classification report.
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=target_names))

### Support Vector Machines (SVMs)

In [None]:
# Fit a support vector classifier with default hyperparameters on the training split.
clf = svm.SVC()
clf.fit(X_train, y_train)

# Predict with the SVM itself (not the earlier Naive Bayes model `gnb`) so the
# metrics printed below actually describe `clf`.
y_pred = clf.predict(X_test)

# Test-set accuracy, followed by the per-class classification report.
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=target_names))

## Challenge

เราสามารถทำให้ดีขึ้นได้อย่างไรบ้าง? ลองโมเดลอื่นๆ ลองทำ Cross-Validation เพิ่ม