In [70]:
from pyspark.sql import SparkSession
from utility_functions import *
import pandas as pd
import seaborn as sns
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer,MinMaxScaler, VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.classification import LinearSVC
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.feature import OneHotEncoder
import numpy as np

pd.set_option('max_colwidth', None)
pd.set_option("display.max_rows", None)

In [2]:
train_file_path = "train_data.parquet"
test_file_path = "test_data.parquet"

train_data = import_data_into_dataframe(train_file_path, 'parquet')
test_data = import_data_into_dataframe(test_file_path, 'parquet')

train_data.persist()
test_data.persist()

DataFrame[userId: string, churned: double, avg_num_of_add_to_playlist_per_session: double, avg_num_of_addfriends_per_session: double, avg_num_of_adverts_per_session: double, avg_num_of_artists_per_session: double, avg_num_of_songs_per_session: double, avg_num_of_thumbs_down_per_session: double, avg_num_of_thumbs_up_per_session: double, avg_num_of_times_settings_changed_per_session: double, average_number_of_visits_to_the_about_page_per_session: double, average_number_of_visits_to_the_help_page_per_session: double, avg_num_of_visits_to_home_per_session: double, avg_num_of_visits_to_the_settings_page_per_session: double, avg_num_of_visits_to_upgrade_page: double, avg_number_of_errors_per_session: double, avg_number_of_visits_to_downgrade_page: double, num_times_user_changed_levels: bigint, num_of_downgrades_submitted: bigint, num_of_upgrades_submitted: bigint, gender: string]

In [3]:
print("Checking Train data set for correctness:")
print(count_column_types(train_data).iloc[:, :2])
print('-'*40)
print("Checking Test data set for correctness:")
print(count_column_types(test_data).iloc[:, :2])

Checking Train data set for correctness:
     type  count
0  bigint      3
1  double     16
2  string      2
----------------------------------------
Checking Test data set for correctness:
     type  count
0  bigint      3
1  double     16
2  string      2


In [59]:
# create list of numeric column names, have to remove the churned column itself
numeric_column_names = get_columns_of_type(train_data, "bigint")
numeric_column_names.extend(get_columns_of_type(train_data, "double"))
numeric_column_names.remove('churned') #remove churned column
numeric_column_names

['num_times_user_changed_levels',
 'num_of_downgrades_submitted',
 'num_of_upgrades_submitted',
 'avg_num_of_add_to_playlist_per_session',
 'avg_num_of_addfriends_per_session',
 'avg_num_of_adverts_per_session',
 'avg_num_of_artists_per_session',
 'avg_num_of_songs_per_session',
 'avg_num_of_thumbs_down_per_session',
 'avg_num_of_thumbs_up_per_session',
 'avg_num_of_times_settings_changed_per_session',
 'average_number_of_visits_to_the_about_page_per_session',
 'average_number_of_visits_to_the_help_page_per_session',
 'avg_num_of_visits_to_home_per_session',
 'avg_num_of_visits_to_the_settings_page_per_session',
 'avg_num_of_visits_to_upgrade_page',
 'avg_number_of_errors_per_session',
 'avg_number_of_visits_to_downgrade_page']

In [5]:
# create a list of categoric column names and remove userId from it 
categoric_column_names = get_columns_of_type(train_data, "string")
categoric_column_names.remove('userId')
categoric_column_names

['gender']

In [6]:
# create pipeline
pipeline_stages = []

In [7]:
# scaling numeric columns
numeric_vec_assembler = VectorAssembler(inputCols=numeric_column_names, outputCol="numeric_features") # create a vector of all the numeric columns
pipeline_stages.append(numeric_vec_assembler)

minmax_scaler = MinMaxScaler(inputCol="numeric_features", outputCol="numeric_features_scaled") # minmax scale all numeric columns
pipeline_stages.append(minmax_scaler)

In [8]:
# Label encoding the categorical column as gender is binary in this case
str_indexer = StringIndexer(inputCols=categoric_column_names, outputCols=[name+'_indexed' for name in categoric_column_names], handleInvalid='skip') # encode all categorical features
pipeline_stages.append(str_indexer)

In [9]:
# creating a vector of all the features
feature_columns = ["numeric_features_scaled", "gender_indexed"]
feature_assembler = VectorAssembler(inputCols=feature_columns, outputCol="features") # create the combined features vector
pipeline_stages.append(feature_assembler)

output_label_indexer = StringIndexer(inputCol='churned', outputCol='label') # encode the churned column
pipeline_stages.append(output_label_indexer)

In [14]:
# tranforming the train data set using the pipeline
data_pipeline = Pipeline(stages=pipeline_stages)
data_pipeline_model = data_pipeline.fit(train_data)
transformed_data = temp_model.transform(train_data)
transformed_data.select("churned", "features", "label").show(5, truncate=False)
transformed_data.persist()

+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+
|churned|features                                                                                                                                                                                                                                                                                                   |label|
+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+
|1.0    |[0.0,0.0,0.0,0.25,0.3213793103448276,0.6769

DataFrame[userId: string, churned: double, avg_num_of_add_to_playlist_per_session: double, avg_num_of_addfriends_per_session: double, avg_num_of_adverts_per_session: double, avg_num_of_artists_per_session: double, avg_num_of_songs_per_session: double, avg_num_of_thumbs_down_per_session: double, avg_num_of_thumbs_up_per_session: double, avg_num_of_times_settings_changed_per_session: double, average_number_of_visits_to_the_about_page_per_session: double, average_number_of_visits_to_the_help_page_per_session: double, avg_num_of_visits_to_home_per_session: double, avg_num_of_visits_to_the_settings_page_per_session: double, avg_num_of_visits_to_upgrade_page: double, avg_number_of_errors_per_session: double, avg_number_of_visits_to_downgrade_page: double, num_times_user_changed_levels: bigint, num_of_downgrades_submitted: bigint, num_of_upgrades_submitted: bigint, gender: string, numeric_features: vector, numeric_features_scaled: vector, gender_indexed: double, features: vector, label: doubl

## Model selection

In [17]:
# Instantiate different algorithms
model_names = ['logistic regression', 'random forest', 'gradient-boosted tree', 'linear svc',
               'decision tree', 'naive bayes']
estimators = []

lr = LogisticRegression()
estimators.append(lr)
rf = RandomForestClassifier(seed = 42)
estimators.append(rf)
gbt = GBTClassifier(seed = 42)
estimators.append(gbt)
svc = LinearSVC()
estimators.append(svc)
dt = DecisionTreeClassifier(seed = 42)
estimators.append(dt)
nb = NaiveBayes()
estimators.append(nb)

In [20]:
best_metric_value = -99
best_model = None

for model_name, est in zip(model_names,estimators):
    
    evaluator = BinaryClassificationEvaluator() # using a binary classification evalution with metric as AUCROC
    
    grid = ParamGridBuilder().build() # using an empty grid
    
    crossval = CrossValidator(estimator = est,
                            estimatorParamMaps=grid,
                            evaluator = evaluator) # using the default value for number of folds: 3
    
    cvmodel = crossval.fit(transformed_data)
    
    metric_val = evaluator.evaluate(cvmodel.transform(transformed_data))
    
    print(f"Algorithm: {model_name}")
    print(f"Cross validation score: {cvmodel.avgMetrics[0]}")
    print(f"AUCROC: {metric_val}")
    print()
    
    if metric_val > best_metric_value:
        best_metric_value = metric_val
        best_model = cvmodel.bestModel

Algorithm: logistic regression
Cross validation score: 0.6065959019718983
AUCROC: 0.7153784219001608

Algorithm: random forest
Cross validation score: 0.5990755120479732
AUCROC: 0.9315172660583285

Algorithm: gradient-boosted tree
Cross validation score: 0.5207247125103985
AUCROC: 0.9997316156736447

Algorithm: linear svc
Cross validation score: 0.6033980599829254
AUCROC: 0.6823224190373949

Algorithm: decision tree
Cross validation score: 0.5244351974131821
AUCROC: 0.40915190552871716

Algorithm: naive bayes
Cross validation score: 0.500353935878279
AUCROC: 0.5078278761853641



As it can be seen, logistic regression had the best average cross validation score. Followed by Linear SVC.
We will tune hyperparameters for the logistic regression model.

In [32]:
def train_classifier(estimator, evaluator, paramGrid, data):
    crossval = CrossValidator(estimator = est,
                            estimatorParamMaps=grid,
                            evaluator = evaluator) # using the default value for number of folds: 3
    
    cvmodel = crossval.fit(data)
    
    return cvmodel

In [33]:
est = LogisticRegression()

grid = ParamGridBuilder().addGrid(est.maxIter, [100, 200, 300]).addGrid(est.regParam, [0.001, 0.01, 0.1, 1, 3, 5]).addGrid(est.threshold, [0.3, 0.4, 0.5, 0.6, 0.7, 0.8]).build()

evaluator = BinaryClassificationEvaluator()

lr = train_classifier(est, evaluator, grid, transformed_data)

In [34]:
print(f"Cross validation score: {lr.avgMetrics[0]}")

Cross validation score: 0.6184790292341781


In [37]:
print(lr.bestModel.getElasticNetParam())

0.0


In [39]:
lr.bestModel.explainParams()

"aggregationDepth: suggested depth for treeAggregate (>= 2). (default: 2)\nelasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. (default: 0.0)\nfamily: The name of family which is a description of the label distribution to be used in the model. Supported options: auto, binomial, multinomial (default: auto)\nfeaturesCol: features column name. (default: features)\nfitIntercept: whether to fit an intercept term. (default: True)\nlabelCol: label column name. (default: label)\nlowerBoundsOnCoefficients: The lower bounds on coefficients if fitting under bound constrained optimization. The bound matrix must be compatible with the shape (1, number of features) for binomial regression, or (number of classes, number of features) for multinomial regression. (undefined)\nlowerBoundsOnIntercepts: The lower bounds on intercepts if fitting under bound constrained optimization. The bounds vector size must be

In [74]:
print("maxIter: ",lr.bestModel.getMaxIter())
print("threshold: ", lr.bestModel.getThreshold())
print("regParam: ", lr.bestModel.getRegParam())

maxIter:  100
threshold:  0.3
regParam:  0.01


In [49]:
lr.bestModel.coefficients.values

array([-0.24864945, -0.10793773, -0.25570171,  1.03269487, -0.49079552,
        2.30837552,  0.46266292,  0.79770161,  1.70916452, -1.33829473,
       -1.29602326, -0.72130076,  0.92615866, -0.20009427,  0.05294991,
       -0.04719589, -1.50679953,  1.85935604,  0.29654608])

In [60]:
all_features = numeric_column_names + categoric_column_names

In [62]:
feature_coefficient = pd.DataFrame.from_dict({'feature':all_features,'LR coefficient': lr.bestModel.coefficients.values})
feature_coefficient

Unnamed: 0,feature,LR coefficient
0,num_times_user_changed_levels,-0.248649
1,num_of_downgrades_submitted,-0.107938
2,num_of_upgrades_submitted,-0.255702
3,avg_num_of_add_to_playlist_per_session,1.032695
4,avg_num_of_addfriends_per_session,-0.490796
5,avg_num_of_adverts_per_session,2.308376
6,avg_num_of_artists_per_session,0.462663
7,avg_num_of_songs_per_session,0.797702
8,avg_num_of_thumbs_down_per_session,1.709165
9,avg_num_of_thumbs_up_per_session,-1.338295


Add the logistic regression model as the final stage

In [75]:
pipeline_stages.append(LogisticRegression(maxIter=100, regParam=0.01, threshold=0.3))

Transform and evaluate performance on Test data

In [77]:
evaluator = BinaryClassificationEvaluator()

training_pipeline = Pipeline(stages=pipeline_stages)
training_pipeline_model = training_pipeline.fit(train_data)

transformed_test_data = training_pipeline_model.transform(test_data)

print("AUCROC score on test data: ", evaluator.evaluate(transformed_test_data))

AUCROC score on test data:  0.5144596651445966
