# Music Box Churn Prediction and Recommendation using Spark

# Using Spark to train the model

# Goal:

In [None]:
from pyspark import SparkContext
from pyspark.sql.session import SparkSession

In [None]:
# sc = SparkContext('local')
# spark = SparkSession(sc)

In [None]:
# Obtain the notebook's Spark session: getOrCreate() hands back an already
# running session when there is one and otherwise starts a new one.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

In [None]:
from pyspark.sql.types import *
from pyspark.mllib.linalg import Vectors, DenseVector
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier, RandomForestClassifier, GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler, StringIndexer, VectorIndexer

## 1. Load data

In [None]:
# Load the modelling table from CSV. The first line of the file is treated as
# the header and column types are inferred from the data. The result is cached
# because the DataFrame is reused by several later cells.
df = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .csv('data/model_final.csv')
    .cache()
)

In [None]:
# Preview the loaded data as a text table to sanity-check the CSV import.
df.show()

In [None]:
# Bare expression: the notebook echoes the DataFrame's repr as the cell output
# (presumably to inspect the inferred column names and types).
df

## 2. Select features

In [None]:
# Start from every column of the loaded table and drop the ones that must not
# be fed to the model:
#   - 'label' is the prediction target (the classifiers below train on it),
#   - 'uid' is presumably a per-user identifier -- TODO confirm,
#   - 'device_type' is left out here; the reason is not recorded in this
#     notebook (presumably it is not a numeric column -- TODO confirm).
# list.remove raises ValueError if one of these columns is missing, which
# makes an unexpected input schema fail loudly at this point.
non_feature_columns = ('uid', 'label', 'device_type')
selected_features = df.columns
for column_name in non_feature_columns:
    selected_features.remove(column_name)

# Echo the final feature list as the cell output.
selected_features

## 3. Build model

### Training dataset

In [None]:
# Pack all selected feature columns into a single vector column named
# 'features', which is the column the classifiers below are trained on.
assembler = VectorAssembler(
    inputCols=selected_features,
    outputCol='features',
)

# `data` is `df` plus the assembled 'features' column.
data = assembler.transform(df)

### Train test split

In [None]:
# Split the rows roughly 70% / 30% into training and test sets. The fixed seed
# keeps the split the same every time the cell is re-run.
split_weights = [0.7, 0.3]
train, test = data.randomSplit(split_weights, seed=1)

### Define prediction function

In [None]:
def predictions(model, data):
    """Score ``data`` with a fitted model and return the result as pandas.

    Args:
        model: Fitted model exposing ``transform(data)``; the transformed
            output must contain 'probability', 'prediction', 'label' and
            'features' columns.
        data: Dataset to score (it is passed straight to ``model.transform``).

    Returns:
        The 'probability' and 'label' columns of the scored data, converted
        with ``toPandas()``.

    Side effects:
        Prints the first 5 scored rows via ``show(5)``.
    """
    scored = model.transform(data)

    # Print a small sample so the raw model output can be eyeballed.
    preview_columns = ('probability', 'prediction', 'label', 'features')
    scored.select(*preview_columns).show(5)

    # Only the probabilities and the true labels are brought back to the
    # driver; the remaining columns are not needed by the callers.
    return scored.select('probability', 'label').toPandas()

In [None]:
#### Predict 

# # Predict train data
# predictions_train = model.transform(train)

# # Select example rows to display
# predictions_train.select('probability', 'prediction', 'label', 'features').show(5)
# res_train = predictions_train.select('probability', 'label').toPandas()

# # Predict test data
# predictions_test = model.transform(test)

# # Select example rows to display
# predictions_test.select('probability', 'prediction', 'label', 'features').show(5)
# res_test = predictions_test.select('probability', 'label').toPandas()

### Define plotting function

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import auc, roc_curve, roc_auc_score

def plot_roc_curve(y_train, p_train_pred, y_test, p_test_pred):
    """Plot the train and test ROC curves on one figure and display it.

    Args:
        y_train: True labels of the training rows.
        p_train_pred: Predicted scores for the training rows, in the same
            order as ``y_train``.
        y_test: True labels of the test rows.
        p_test_pred: Predicted scores for the test rows, in the same order
            as ``y_test``.

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    # Compute AUC and curve points for both splits before any figure is
    # created, so a scoring error does not leave an empty figure behind.
    curves = []
    for label_template, line_color, y_true, y_score in (
        ('ROC Train (AUC = %0.4f)', 'green', y_train, p_train_pred),
        ('ROC Test (AUC = %0.4f)', 'darkorange', y_test, p_test_pred),
    ):
        area = roc_auc_score(y_true, y_score)
        false_pos_rate, true_pos_rate, _ = roc_curve(y_true, y_score)
        curves.append(
            (false_pos_rate, true_pos_rate, line_color, label_template % area)
        )

    line_width = 2
    plt.figure()
    for false_pos_rate, true_pos_rate, line_color, legend_text in curves:
        plt.plot(false_pos_rate, true_pos_rate, color=line_color,
                 linewidth=line_width, label=legend_text)

    # Dashed diagonal for visual reference.
    plt.plot([0, 1], [0, 1], color='navy', linewidth=line_width, linestyle='--')

    plt.xlim([0.0, 1.0])
    # A little headroom above 1.0 so the top of the curves is not clipped.
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc='lower right')
    plt.show()

### Define predict and evaluate performance function

In [None]:
def predict_and_evaluate_performance(model, train, test):
    """Score both splits with ``model`` and plot their ROC curves.

    Args:
        model: Fitted model, passed on to ``predictions``.
        train: Training dataset to score.
        test: Test dataset to score.

    Returns:
        None. A sample of each scored split is printed by ``predictions`` and
        the ROC figure is shown by ``plot_roc_curve``.
    """
    def second_component(scored):
        # Take element 1 of every probability vector (presumably the
        # probability of the positive class -- TODO confirm label encoding).
        return [vector[1] for vector in scored['probability']]

    # Score the training split first, then the test split.
    scored_train = predictions(model, train)
    scored_test = predictions(model, test)

    plot_roc_curve(
        scored_train['label'],
        second_component(scored_train),
        scored_test['label'],
        second_component(scored_test),
    )

### Logistic regression

In [None]:
# Regularised logistic regression baseline. The feature and label columns are
# spelled out (they equal the library defaults) so this cell reads the same
# way as the random forest configuration further down.
# NOTE(review): maxIter=3 is a very small iteration cap -- confirm that this
# is intentional and not a leftover from a quick test run.
lr = LogisticRegression(
    featuresCol='features',
    labelCol='label',
    maxIter=3,
    regParam=0.01,
)
lr_model = lr.fit(train)

#### Predict and evaluate performance

In [None]:
# Score the train and test splits with the logistic regression model and plot
# both ROC curves.
predict_and_evaluate_performance(lr_model, train, test)

In [None]:
# Predict
# lr_pred_train = predictions(lr_model, train)
# lr_pred_test = predictions(lr_model, test)

# y_train = lr_pred_train['label']
# p_train_pred = [v[1] for v in lr_pred_train['probability']]

# y_test = lr_pred_test['label']
# p_test_pred = [v[1] for v in lr_pred_test['probability']]

# # Evaluate
# plot_roc_curve(y_train, p_train_pred, y_test, p_test_pred)

### Random Forest

In [None]:
# Random forest classifier trained on the assembled 'features' column against
# 'label'.
#   - numTrees=50, maxDepth=12: size of the ensemble and of each tree.
#   - minInstancesPerNode=10: a split is only kept if each child receives at
#     least 10 rows.
#   - maxBins=16: number of bins used when discretising continuous features.
#   - seed=1: fixes the forest's random number generator so that re-running
#     the notebook reproduces the same model and AUC. The train/test split
#     above is already seeded; without a seed here the forest's randomness is
#     presumably free to differ between sessions -- TODO confirm.
rf = RandomForestClassifier(
    labelCol='label',
    featuresCol='features',
    numTrees=50,
    featureSubsetStrategy='auto',
    impurity='gini',
    maxDepth=12,
    minInstancesPerNode=10,
    maxBins=16,
    seed=1,
)
rf_model = rf.fit(train)

#### Predict and evaluate performance

In [None]:
# Score the train and test splits with the random forest model and plot both
# ROC curves.
predict_and_evaluate_performance(rf_model, train, test)

In [None]:
# # Predict
# rf_pred_train = predictions(rf_model, train)
# rf_pred_test = predictions(rf_model, test)

# y_train = rf_pred_train['label']
# p_train_pred = [v[1] for v in rf_pred_train['probability']]

# y_test = rf_pred_test['label']
# p_test_pred = [v[1] for v in rf_pred_test['probability']]

# # Evaluate
# plot_roc_curve(y_train, p_train_pred, y_test, p_test_pred)