# Music Box Churn Prediction and Recommendation using Spark

# Using scikit-learn to train models

# Goal:

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## 1. Load data

In [None]:
# Read data/model_final.csv (path relative to the working directory) into
# the DataFrame that every later cell works from.
df = pd.read_csv('data/model_final.csv')

#### Inspect dataset

In [None]:
# Preview the first rows of the loaded data.
df.head()

In [None]:
# Print the per-column summary (dtypes and non-null counts) to spot missing values.
df.info()

In [None]:
# Descriptive statistics of the columns, as a quick check of value ranges.
df.describe()

In [None]:
# Column labels available in the frame; the feature list is derived from these.
df.columns

#### One-hot encode `device_type`

In [None]:
# Add one 0/1 indicator column per handled device_type code (1 and 2).
# Rows with any other code get 0 in both indicator columns.
for device_code in (1, 2):
    is_this_device = df['device_type'] == device_code
    df['device_type_%d' % device_code] = is_this_device.astype(int)

## 2. Define features and targets

In [None]:
# Start from every column label; non-feature columns are removed in the next cell.
selected_features = list(df.columns.values)

In [None]:
# Drop the columns that must not be model inputs:
#   'uid'         - presumably a user identifier (verify against the data dictionary)
#   'label'       - the prediction target
#   'device_type' - replaced by the device_type_1 / device_type_2 indicators
for excluded_column in ('uid', 'label', 'device_type'):
    selected_features.remove(excluded_column)
selected_features

In [None]:
# Feature matrix as a NumPy array; column order follows selected_features.
X = df[selected_features].values
# Target vector taken from the 'label' column.
y = df['label'].values

In [None]:
# (rows, features) of the feature matrix.
X.shape

In [None]:
# First ten target values, as a quick sanity check of the label encoding.
y[:10]

## 3. Build models

#### Train test split

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
# Shapes of the four split arrays, to confirm the partition sizes line up.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

#### Define metric

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score

def get_performance_metrics(y_train, p_train_pred, y_test, p_test_pred, threshold=0.5):
    """Print a table of train/test classification metrics.

    AUC is computed from the raw scores. Accuracy, precision, recall and
    F1 are computed from hard labels obtained as ``score > threshold``.

    Args:
        y_train: True labels of the training split.
        p_train_pred: Predicted scores for the training split. Must support
            element-wise comparison with ``threshold`` (e.g. a NumPy array).
        y_test: True labels of the test split.
        p_test_pred: Predicted scores for the test split (same requirements
            as ``p_train_pred``).
        threshold: Scores strictly greater than this value are counted as
            positive predictions.

    Returns:
        None. The metrics table is printed, indexed by metric name.
    """
    metric_names = ['AUC', 'Accuracy', 'Precision', 'Recall', 'f1-score']

    def _split_metrics(y_true, p_pred):
        # Threshold once per split and reuse the labels for all four
        # label-based metrics.
        y_pred = p_pred > threshold
        return [roc_auc_score(y_true, p_pred),
                accuracy_score(y_true, y_pred),
                precision_score(y_true, y_pred),
                recall_score(y_true, y_pred),
                f1_score(y_true, y_pred)]

    all_metrics = pd.DataFrame({'metrics': metric_names,
                                'train': _split_metrics(y_train, p_train_pred),
                                'test': _split_metrics(y_test, p_test_pred)},
                               columns=['metrics', 'train', 'test'])

    # set_index returns a new frame; the result has to be kept, otherwise
    # the table is still printed with a meaningless 0..4 integer index.
    all_metrics = all_metrics.set_index('metrics')
    print(all_metrics)

#### Define plotting function

In [None]:
from sklearn.metrics import roc_curve, auc

def plot_roc_curve(y_train, p_train_pred, y_test, p_test_pred):
    """Draw the train and test ROC curves on one figure and show it.

    Args:
        y_train: True labels of the training split.
        p_train_pred: Predicted scores for the training split.
        y_test: True labels of the test split.
        p_test_pred: Predicted scores for the test split.

    Each curve's legend entry carries its AUC to four decimals; a dashed
    diagonal from (0, 0) to (1, 1) is drawn alongside the two curves.
    """
    # Score both splits before any drawing starts.
    train_auc = roc_auc_score(y_train, p_train_pred)
    train_fpr, train_tpr, _ = roc_curve(y_train, p_train_pred)

    test_auc = roc_auc_score(y_test, p_test_pred)
    test_fpr, test_tpr, _ = roc_curve(y_test, p_test_pred)

    curves = (
        (train_fpr, train_tpr, 'green', 'ROC Train (AUC = %0.4f)' % train_auc),
        (test_fpr, test_tpr, 'darkorange', 'ROC Test (AUC = %0.4f)' % test_auc),
    )

    line_width = 2
    plt.figure()
    for fpr, tpr, colour, legend_text in curves:
        plt.plot(fpr, tpr, color=colour, linewidth=line_width, label=legend_text)
    # Dashed diagonal for visual reference.
    plt.plot([0, 1], [0, 1], color='navy', linewidth=line_width, linestyle='--')

    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc='lower right')
    plt.show()

In [None]:
def plot_feature_importance(model):
    """Show a horizontal bar chart of ``model.feature_importances_``.

    Feature names come from the notebook-level ``selected_features`` list,
    so ``model`` must have been fitted on columns in that same order.
    Bars are ordered by ascending importance.

    Args:
        model: Fitted estimator exposing ``feature_importances_``.
    """
    importance_table = pd.DataFrame({
        'feature': selected_features,
        'importance': model.feature_importances_,
    }).sort_values('importance')

    axis = importance_table.plot(kind='barh', figsize=(20, 10))
    # Label each bar with its feature name instead of the row index.
    axis.set_yticks(np.arange(len(importance_table)))
    axis.set_yticklabels(importance_table['feature'])
    plt.show()

#### Define model and model performance function

In [None]:
def train_test_model(clf, X_train, y_train, X_test, y_test):
    """Fit ``clf`` on the training split, then report metrics and ROC curves.

    Args:
        clf: Unfitted (or refittable) classifier exposing ``fit`` and
            ``predict_proba``. It is fitted in place, so the caller can
            inspect the fitted estimator afterwards.
        X_train: Training feature matrix.
        y_train: Training labels.
        X_test: Test feature matrix.
        y_test: Test labels.

    Returns:
        None. The metrics table is printed and the ROC figure is shown.
    """
    clf.fit(X_train, y_train)

    # Only the scores are needed: the metrics helper derives hard labels
    # from them itself, so separate clf.predict() passes would be discarded
    # work (and are slow for neighbour-based models).
    # Column 1 is used as the positive-class score; this assumes a binary
    # target.
    y_train_prob = clf.predict_proba(X_train)[:, 1]
    y_test_prob = clf.predict_proba(X_test)[:, 1]

    get_performance_metrics(y_train, y_train_prob, y_test, y_test_prob)
    plot_roc_curve(y_train, y_train_prob, y_test, y_test_prob)

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
# L2-penalised logistic regression; C is scikit-learn's inverse
# regularisation strength, so smaller values regularise more strongly.
lr = LogisticRegression(C=0.3, penalty='l2')
# Fits lr in place and reports train/test metrics plus ROC curves; the
# fitted lr is read again below for its coefficients.
train_test_model(lr, X_train, y_train, X_test, y_test)

#### Estimated coefficients

In [None]:
# Pair each feature name with its fitted logistic-regression coefficient.
coef_values = list(zip(selected_features, lr.coef_.flatten()))
# Tabulate the pairs, largest coefficient first.
df_coeffs = (pd.DataFrame(coef_values, columns=['feature', 'coeff'])
             .sort_values(by='coeff', ascending=False))
df_coeffs

In [None]:
# Horizontal bar chart of the coefficients, one bar per row of df_coeffs.
ax = df_coeffs.plot.barh(figsize=(10,10))
# One tick per feature (X has one column per entry of df_coeffs).
t = np.arange(X.shape[1])
ax.set_yticks(t)
# Label the bars with feature names, in df_coeffs' sorted order.
ax.set_yticklabels(df_coeffs['feature'])
plt.show()

### Single Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Single decision tree, capped at depth 8 with at least 20 samples per leaf.
dt = DecisionTreeClassifier(max_depth=8, min_samples_leaf=20)

# Use the shared helper like every other model in this notebook instead of
# repeating fit / predict_proba / metrics / ROC by hand. The helper fits
# `dt` in place, so the fitted tree stays available to the cells below.
train_test_model(dt, X_train, y_train, X_test, y_test)

In [None]:
# Feature importances of the fitted single decision tree.
plot_feature_importance(dt)

In [None]:
import graphviz
from sklearn import tree

# Export the fitted tree `dt` as DOT source. With out_file=None the DOT
# text is returned as a string rather than written to a file.
export_options = dict(
    out_file=None,
    feature_names=selected_features,
    filled=True,
    rounded=True,
    special_characters=True,
)
dot_data = tree.export_graphviz(dt, **export_options)

# Wrap the DOT text so the notebook renders it as the cell output.
graph = graphviz.Source(dot_data)
graph

### Bagged Trees

In [None]:
from sklearn.ensemble import BaggingClassifier

# Ensemble settings only. The base learner is passed positionally below
# because its keyword was renamed from `base_estimator` to `estimator` in
# scikit-learn 1.2 and the old name was removed in 1.4; the first
# positional slot is the base learner in both old and new releases.
parameters = {'n_estimators': 100,
              'n_jobs': -1}

# Bag 100 trees configured like `dt`, using all available cores.
bagged_trees = BaggingClassifier(dt, **parameters)

train_test_model(bagged_trees, X_train, y_train, X_test, y_test)

### Single KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier: vote over 10 neighbours, with a leaf
# size of 100 for the underlying neighbour-search structure.
params = dict(n_neighbors=10, leaf_size=100)
knn = KNeighborsClassifier(**params)

# Fit and report train/test metrics plus ROC curves.
train_test_model(knn, X_train, y_train, X_test, y_test)

### Bagged KNN

In [None]:
from sklearn.ensemble import BaggingClassifier

# Ensemble settings only. The base learner is passed positionally below
# because its keyword was renamed from `base_estimator` to `estimator` in
# scikit-learn 1.2 and the old name was removed in 1.4; the first
# positional slot is the base learner in both old and new releases.
params = {'n_estimators': 30,
          'n_jobs': -1}

# Bag 30 KNN models configured like `knn`, using all available cores.
bagged_knn = BaggingClassifier(knn, **params)

train_test_model(bagged_knn, X_train, y_train, X_test, y_test)

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# 'sqrt' is what 'auto' meant for classifiers; 'auto' was deprecated in
# scikit-learn 1.1 and removed in 1.3, while 'sqrt' is accepted by every
# release, so the model is unchanged and the cell keeps running.
parameters = {'n_estimators': 50,
              'max_features': 'sqrt',
              'criterion': 'gini',
              'max_depth': 10,
              'min_samples_split': 2,
              'min_samples_leaf': 20,
              'random_state': 0,
              'n_jobs': -1}
rf = RandomForestClassifier(**parameters)

# Fit the forest and report train/test metrics plus ROC curves.
train_test_model(rf, X_train, y_train, X_test, y_test)

In [None]:
# Feature importances of the fitted random forest.
plot_feature_importance(rf)

### Gradient Boosting Trees

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Alternative configuration kept for reference:
#   n_estimators=50, max_depth=5, learning_rate=0.2,
#   subsample=0.7, max_features=0.8, random_state=42
parameters = dict(
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    random_state=42,
)

gbt = GradientBoostingClassifier(**parameters)

# Fit the boosted trees and report train/test metrics plus ROC curves.
train_test_model(gbt, X_train, y_train, X_test, y_test)

In [None]:
# Feature importances of the fitted gradient-boosting model.
plot_feature_importance(gbt)

### Neural Network

In [None]:
from sklearn.neural_network import MLPClassifier

# Settings for a small multi-layer perceptron: two hidden layers of five
# units each, ReLU activations, trained with the adam solver.
parameters = {
    'solver': 'adam',
    'activation': 'relu',
    'alpha': 1e-5, # L2 penalty strength: increasing alpha increases the penalty
    'hidden_layer_sizes': (5, 5),
    # NOTE(review): scikit-learn documents `learning_rate` as used only when
    # solver='sgd'; with 'adam' this entry is probably ignored - confirm.
    'learning_rate': 'adaptive',
    'random_state': 1
}

nn = MLPClassifier(**parameters)

# Fit the network and report train/test metrics plus ROC curves.
train_test_model(nn, X_train, y_train, X_test, y_test)

### Hyperparameter tuning: Grid search

In [None]:
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV

# Fixed settings belong on the estimator rather than in the grid:
# parallelism is not a hyperparameter, and a fixed seed (0, the same seed
# as the stand-alone random forest above) makes the search repeatable.
clf = RandomForestClassifier(n_jobs=-1, random_state=0)

# 'sqrt' is what 'auto' meant for classifiers; 'auto' was removed in
# scikit-learn 1.3.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_features': ['sqrt'],
    'criterion': ['gini'],
    'max_depth': [5, 15, 20, 25],
    'min_samples_split': [2],
    'min_samples_leaf': [2, 5, 10, 20],
}

# Use the built-in 'roc_auc' scorer, which ranks candidates on the model's
# continuous scores. make_scorer(roc_auc_score) with default arguments
# feeds hard predict() labels into the AUC, which collapses the ROC curve
# to a single operating point and misranks the candidates.
grid_obj = GridSearchCV(clf, param_grid, cv=5, scoring='roc_auc')
grid_obj = grid_obj.fit(X_train, y_train)

# With GridSearchCV's default refit=True, best_estimator_ is already fitted
# on the whole training split, and train_test_model fits it again anyway,
# so no extra fit call is needed here.
clf = grid_obj.best_estimator_

train_test_model(clf, X_train, y_train, X_test, y_test)