Train gradient boosting models on the data.

# 1. Load data and packages

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
from handyspark import *
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.sql import functions as F

Load the modeling helper functions

In [5]:
%run Users/tw2567@columbia.edu/Udacity/churn_prediction/util

In [6]:
# Read the feature table, expose the churn flag under the column name `label`
# that Spark ML estimators use by default, and drop the user identifier so it
# is not fed to the model as a feature.
raw_data = (
    spark.read.parquet('s3://tianyi-wang-data-science-projects/churn-prediction-2020/features1')
    .withColumnRenamed('last_status_cancelled', 'label')
    .drop('userId')
)

# Every remaining column other than the target is a model input.
features_col = [name for name in raw_data.columns if name != 'label']

# Pack the input columns into a single vector column and keep only what the
# classifier needs.
assembler = VectorAssembler(inputCols=features_col, outputCol="features")
data = assembler.transform(raw_data).select('features', 'label')

In [7]:
# Keep the assembler's input column names; they are passed to the
# feature-importance plot at the end of the notebook.
feature_names = assembler.getInputCols()

In [8]:
# Class balance: number of rows for each value of `label`.
display(data.groupby('label').count())

label,count
1,5005
0,17272


# 2. Check whether it's better to upsample the positive group

### 2.1 Triple the size of positive samples

In [11]:
# Evaluate a default GBTClassifier with num_folds=3 for the "triple" setting.
# NOTE(review): presumably the positional args `3, 1` are the sampling
# multipliers for the positive and negative class — confirm against the
# `util` notebook where tune_sampling_parameters is defined.
auc = tune_sampling_parameters(data, GBTClassifier(), 3, 1, num_folds=3)
auc

### 2.2 Double the size of positive samples

In [13]:
# Same evaluation with a default GBTClassifier and num_folds=3, for the
# "double" setting.
# NOTE(review): presumably `2, 1` are the positive/negative sampling
# multipliers — confirm against the `util` notebook.
# Note that `auc` is overwritten by each experiment cell.
auc = tune_sampling_parameters(data, GBTClassifier(), 2, 1, num_folds=3)
auc

### 2.3 Multiply the size of the positive group by 1.5

In [15]:
# Same evaluation with a default GBTClassifier and num_folds=3, for the
# "x 1.5" setting.
# NOTE(review): presumably `1.5, 1` are the positive/negative sampling
# multipliers — confirm against the `util` notebook.
auc = tune_sampling_parameters(data, GBTClassifier(), 1.5, 1, num_folds=3)
auc

We decide to triple the size of the positive group.

### 2.4 Don't upsample

In [18]:
# Baseline: same evaluation with both sampling arguments set to 1.
# NOTE(review): presumably `1, 1` means neither class is resampled — confirm
# against the `util` notebook.
auc = tune_sampling_parameters(data, GBTClassifier(), 1, 1, num_folds=3)
auc

# 3. Tune model parameters

In [20]:
# Hyper-parameter search for the gradient-boosted trees classifier.
# The grid has 2 depths x 3 step sizes x 2 subsampling rates = 12 candidates.
gbt = GBTClassifier()
paramGrid = (
    ParamGridBuilder()
    .addGrid(gbt.maxDepth, [5, 8])
    .addGrid(gbt.stepSize, [0.01, 0.05, 0.1])
    # subsamplingRate is a float-valued param: spell the upper bound as 1.0
    # rather than the int 1 so every grid value has the param's type and does
    # not depend on the installed Spark version coercing it.
    .addGrid(gbt.subsamplingRate, [0.8, 1.0])
    .build()
)
# NOTE(review): presumably `3, 1` are the same positive/negative sampling
# multipliers used in the sampling experiments — confirm against the `util`
# notebook where tune_model_parameter is defined.
cv_model = tune_model_parameter(data, 3, 1, gbt, paramGrid)

In [21]:
# Average evaluation metric per candidate in the parameter grid.
# NOTE(review): assumes tune_model_parameter returns a CrossValidatorModel,
# in which case the entries follow the order of `paramGrid` — confirm.
cv_model.avgMetrics

In [22]:
# Show the full parameter map of the best model found by the search.
cv_model.bestModel.extractParamMap()

# 4. Train and test the model with the "best parameters" multiple times to get a distribution of the test results

In [24]:
# Re-evaluate with explicitly tuned settings (maxDepth=8, stepSize=0.1) and
# num_folds=10.
# NOTE(review): presumably these are the best parameters reported by the
# grid search — confirm. `auc_list` is not displayed in any cell shown here.
auc_list = tune_sampling_parameters(data, GBTClassifier(maxDepth=8, stepSize=0.1), 3, 1, num_folds=10)

In [25]:
# Same evaluation (num_folds=10, same sampling arguments) with a default
# GBTClassifier, for comparison with the explicitly parameterised run.
auc_list1 = tune_sampling_parameters(data, GBTClassifier(), 3, 1, num_folds=10)

In [26]:
# Show the results of the default-parameter run.
auc_list1

# 5. Feature importance

In [28]:
# Plot feature importances of the best model from the grid search, labelled
# with the assembler's input column names.
plot_important_features(cv_model.bestModel, feature_names)

In [29]:
# Compare the per-class mean of selected features. `label` is the renamed
# `last_status_cancelled` column, so each output row is one class.
# NOTE(review): presumably these six columns are the top-ranked features from
# the importance plot — confirm.
# `F` is pyspark.sql.functions. Passing the column name directly to F.mean is
# equivalent to wrapping it in F.col and yields the same `avg(<col>)` output
# column names.
display(
    raw_data.groupby('label').agg(
        F.mean('customer_age'),
        F.mean('perc_1_6'),
        F.mean('perc_addlist'),
        F.mean('perc_12_18'),
        F.mean('time_per_day'),
        F.mean('perc_popular_songs'),
    )
)

label,avg(customer_age),avg(perc_1_6),avg(perc_addlist),avg(perc_12_18),avg(time_per_day),avg(perc_popular_songs)
1,97.61618381618382,0.2050976390256041,0.0219253475938177,0.323630285849633,16884148.56692241,0.0912948182657
0,96.87129458082444,0.2084381712418975,0.0225480495006848,0.3092848474119766,14675028.636884484,0.0910269004052061
