Train gradient boosting models on the data.

# 1. Load data and packages

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
from handyspark import *
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.classification import LogisticRegression

In [4]:
%run Users/tw2567@columbia.edu/Udacity/churn_prediction/util

In [5]:
# Read the feature table from S3, expose the churn flag under the name
# 'label', and drop the user identifier so it is not used as a feature.
raw_data = (
    spark.read.parquet('s3://tianyi-wang-data-science-projects/churn-prediction-2020/features1')
    .withColumnRenamed('last_status_cancelled', 'label')
    .drop('userId')
)

# Every remaining column other than the label is a model input.
features_col = [name for name in raw_data.columns if name != 'label']

# Pack the input columns into a single vector column named "features" and
# keep only the two columns needed downstream.
assembler = VectorAssembler(inputCols=features_col, outputCol="features")
data = assembler.transform(raw_data).select('features', 'label')

In [6]:
# Names of the columns fed into the assembler; their order matches the
# positions inside the assembled 'features' vector.
feature_names = assembler.getInputCols()

In [7]:
# Show the number of rows per label value to check the class balance.
display(data.groupby('label').count())

label,count
1,5005
0,17272


# 2. Check whether it's better to upsample the positive group

### 2.1 Triple the size of positive samples

In [10]:
# Run the tune_sampling_parameters helper on the assembled data with a
# default LogisticRegression estimator.
# NOTE(review): the helper is presumably defined in the util notebook; the
# positional arguments 3 and 1 look like the sampling ratios for the positive
# and negative groups (3 = triple the positives), and num_folds=3 presumably
# selects 3-fold cross-validation — confirm against the helper's signature.
auc = tune_sampling_parameters(data, LogisticRegression(), 3, 1, num_folds=3)
# Bare expression so the notebook displays the returned value.
auc

### 2.2 Double the size of positive samples

In [12]:
# Run the tune_sampling_parameters helper on the assembled data with a
# default LogisticRegression estimator.
# NOTE(review): the helper is presumably defined in the util notebook; the
# positional arguments 2 and 1 look like the sampling ratios for the positive
# and negative groups (2 = double the positives), and num_folds=3 presumably
# selects 3-fold cross-validation — confirm against the helper's signature.
auc = tune_sampling_parameters(data, LogisticRegression(), 2, 1, num_folds=3)
# Bare expression so the notebook displays the returned value.
auc

### 2.3 Positive group size * 1.5

In [14]:
# Run the tune_sampling_parameters helper on the assembled data with a
# default LogisticRegression estimator.
# NOTE(review): the helper is presumably defined in the util notebook; the
# positional arguments 1.5 and 1 look like the sampling ratios for the
# positive and negative groups (1.5 = positives grown by half), and
# num_folds=3 presumably selects 3-fold cross-validation — confirm against
# the helper's signature.
auc = tune_sampling_parameters(data, LogisticRegression(), 1.5, 1, num_folds=3)
# Bare expression so the notebook displays the returned value.
auc

### 2.4 Don't upsample

In [16]:
# Baseline run of the tune_sampling_parameters helper on the assembled data
# with a default LogisticRegression estimator.
# NOTE(review): the helper is presumably defined in the util notebook; the
# positional arguments 1 and 1 look like the sampling ratios for the positive
# and negative groups (1 = no resampling), and num_folds=3 presumably selects
# 3-fold cross-validation — confirm against the helper's signature.
auc = tune_sampling_parameters(data, LogisticRegression(), 1, 1, num_folds=3)
# Bare expression so the notebook displays the returned value.
auc