Train random forest models on the data.

# 1. Load data and packages

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
from handyspark import *
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.classification import RandomForestClassifier

Load the modeling helper functions

In [5]:
%run Users/tw2567@columbia.edu/Udacity/churn_prediction/util

In [6]:
# Read the stored feature table, rename `last_status_cancelled` to `label`,
# and drop the `userId` column so it is not used as a model input.
raw_data = (
    spark.read.parquet('s3://tianyi-wang-data-science-projects/churn-prediction-2020/features1')
    .withColumnRenamed('last_status_cancelled', 'label')
    .drop('userId')
)

# Every remaining column except the label becomes a model input.
features_col = [name for name in raw_data.columns if name != 'label']

# Pack the input columns into a single vector column named "features" and
# keep only that vector together with the label.
assembler = VectorAssembler(inputCols=features_col, outputCol="features")
data = assembler.transform(raw_data).select('features', 'label')

In [7]:
# Keep the input column names configured on the assembler (the list that was
# passed to it as `inputCols`).
feature_names = assembler.getInputCols()

In [8]:
# Show the number of rows for each value of `label`.
display(data.groupby('label').count())

label,count
1,5005
0,17272


# 2. Check whether it's better to upsample the positive group

### 2.1 Triple the size of positive samples

In [11]:
# Run the sampling experiment with a default random forest, sampling
# arguments (3, 1), and num_folds=3.
# Per the section heading, 3 is the multiplier for the positive group; the
# second value is presumably the ratio for the negative group, but that is
# defined in the util helper — TODO confirm.
auc = tune_sampling_parameters(
    data,
    RandomForestClassifier(),
    3,
    1,
    num_folds=3,
)
auc

### 2.2 Double the size of positive samples

In [13]:
# Run the sampling experiment with a default random forest, sampling
# arguments (2, 1), and num_folds=3.
# Per the section heading, 2 is the multiplier for the positive group; the
# second value is presumably the ratio for the negative group, but that is
# defined in the util helper — TODO confirm.
auc = tune_sampling_parameters(
    data,
    RandomForestClassifier(),
    2,
    1,
    num_folds=3,
)
auc

### 2.3 Multiply the size of the positive group by 1.5

In [15]:
# Run the sampling experiment with a default random forest, sampling
# arguments (1.5, 1), and num_folds=3.
# Per the section heading, 1.5 is the multiplier for the positive group; the
# second value is presumably the ratio for the negative group, but that is
# defined in the util helper — TODO confirm.
auc = tune_sampling_parameters(
    data,
    RandomForestClassifier(),
    1.5,
    1,
    num_folds=3,
)
auc

We decided to triple the size of the positive group (ratio 3); this is the setting used when tuning the model in Section 3.

### 2.4 Don't upsample

In [18]:
# Baseline: run the sampling experiment with a default random forest,
# sampling arguments (1, 1), and num_folds=3.
# Per the section heading, a multiplier of 1 means the positive group is not
# upsampled; the second value is presumably the ratio for the negative group,
# but that is defined in the util helper — TODO confirm.
auc = tune_sampling_parameters(
    data,
    RandomForestClassifier(),
    1,
    1,
    num_folds=3,
)
auc

# 3. Tune model parameters

In [20]:
# Grid-search a random forest over tree depth and subsampling rate.
rf = RandomForestClassifier()

# subsamplingRate is a float param, so its grid values are written as floats
# (1.0, not 1). On PySpark versions whose ParamGridBuilder does not convert
# grid values, an int literal reaches the JVM as an Integer and fails the
# Double validation when the model is fitted.
paramGrid = (
    ParamGridBuilder()
    .addGrid(rf.maxDepth, [5, 8])
    .addGrid(rf.subsamplingRate, [0.8, 1.0])
    .build()
)

# The positional arguments 3 and 1 match the "triple the positive group"
# setting from section 2 (presumably the positive/negative sampling ratios;
# they are defined in the util helper — TODO confirm).
cv_model = tune_model_parameter(data, 3, 1, rf, paramGrid)

In [21]:
# NOTE(review): this cell repeats the grid search from In [20] with the same
# estimator, grid, and sampling arguments, and overwrites `cv_model`.
# Confirm whether the rerun is intentional (e.g. to check run-to-run variance)
# or whether the cell can be removed.
rf = RandomForestClassifier()

# subsamplingRate is a float param, so its grid values are written as floats
# (1.0, not 1). On PySpark versions whose ParamGridBuilder does not convert
# grid values, an int literal reaches the JVM as an Integer and fails the
# Double validation when the model is fitted.
paramGrid = (
    ParamGridBuilder()
    .addGrid(rf.maxDepth, [5, 8])
    .addGrid(rf.subsamplingRate, [0.8, 1.0])
    .build()
)

# The positional arguments 3 and 1 match the "triple the positive group"
# setting from section 2 (presumably the positive/negative sampling ratios;
# they are defined in the util helper — TODO confirm).
cv_model = tune_model_parameter(data, 3, 1, rf, paramGrid)

In [22]:
# NOTE(review): this cell repeats the grid search from In [20] and In [21]
# with the same estimator, grid, and sampling arguments, and overwrites
# `cv_model`. Confirm whether the rerun is intentional or whether the cell
# can be removed. The `cv_model` bound here is the one inspected in In [23].
rf = RandomForestClassifier()

# subsamplingRate is a float param, so its grid values are written as floats
# (1.0, not 1). On PySpark versions whose ParamGridBuilder does not convert
# grid values, an int literal reaches the JVM as an Integer and fails the
# Double validation when the model is fitted.
paramGrid = (
    ParamGridBuilder()
    .addGrid(rf.maxDepth, [5, 8])
    .addGrid(rf.subsamplingRate, [0.8, 1.0])
    .build()
)

# The positional arguments 3 and 1 match the "triple the positive group"
# setting from section 2 (presumably the positive/negative sampling ratios;
# they are defined in the util helper — TODO confirm).
cv_model = tune_model_parameter(data, 3, 1, rf, paramGrid)

In [23]:
# Show the parameter map of the best model held by `cv_model` (the result of
# the most recent search above).
cv_model.bestModel.extractParamMap()