In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# WARNING: this will allocate 8 gigabytes of RAM to the spark driver, it was necessary as out of memory exceptions would
#  occur during cross validation model training with parallel processes
spark = SparkSession.builder.master("local[*]").config("spark.driver.memory", "8g").getOrCreate()

In [2]:
# !pip install numpy

## Reading and analyzing the data

### The 'Kaggle Telco Customer Churn' data is used: [link](https://www.kaggle.com/blastchar/telco-customer-churn)

In [3]:
from pyspark.sql.functions import col
from pyspark.sql.types import *

churn_schema = StructType([
    StructField("customerID", StringType(), True),
    StructField("gender", StringType(), True),
    StructField("SeniorCitizen", IntegerType(), True),
    StructField("Partner", StringType(), True),
    StructField("Dependents", StringType(), True),
    StructField("tenure", IntegerType(), True),
    StructField("PhoneService", StringType(), True),
    StructField("MultipleLines", StringType(), True),
    StructField("InternetService", StringType(), True),
    StructField("OnlineSecurity", StringType(), True),
    StructField("OnlineBackup", StringType(), True),
    StructField("DeviceProtection", StringType(), True),
    StructField("TechSupport", StringType(), True),
    StructField("StreamingTV", StringType(), True),
    StructField("StreamingMovies", StringType(), True),
    StructField("Contract", StringType(), True),
    StructField("PaperlessBilling", StringType(), True),
    StructField("PaymentMethod", StringType(), True),
    StructField("MonthlyCharges", FloatType(), True),
    StructField("TotalCharges", FloatType(), True),
    StructField("Churn", StringType(), True)
])

In [4]:
churn_df = spark.read.csv("WA_Fn-UseC_-Telco-Customer-Churn.xls", header=True, schema=churn_schema)

In [5]:
churn_df.printSchema()

root
 |-- customerID: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- SeniorCitizen: integer (nullable = true)
 |-- Partner: string (nullable = true)
 |-- Dependents: string (nullable = true)
 |-- tenure: integer (nullable = true)
 |-- PhoneService: string (nullable = true)
 |-- MultipleLines: string (nullable = true)
 |-- InternetService: string (nullable = true)
 |-- OnlineSecurity: string (nullable = true)
 |-- OnlineBackup: string (nullable = true)
 |-- DeviceProtection: string (nullable = true)
 |-- TechSupport: string (nullable = true)
 |-- StreamingTV: string (nullable = true)
 |-- StreamingMovies: string (nullable = true)
 |-- Contract: string (nullable = true)
 |-- PaperlessBilling: string (nullable = true)
 |-- PaymentMethod: string (nullable = true)
 |-- MonthlyCharges: float (nullable = true)
 |-- TotalCharges: float (nullable = true)
 |-- Churn: string (nullable = true)



In [6]:
churn_df.show()

+----------+------+-------------+-------+----------+------+------------+----------------+---------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+--------------+----------------+--------------------+--------------+------------+-----+
|customerID|gender|SeniorCitizen|Partner|Dependents|tenure|PhoneService|   MultipleLines|InternetService|     OnlineSecurity|       OnlineBackup|   DeviceProtection|        TechSupport|        StreamingTV|    StreamingMovies|      Contract|PaperlessBilling|       PaymentMethod|MonthlyCharges|TotalCharges|Churn|
+----------+------+-------------+-------+----------+------+------------+----------------+---------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+--------------+----------------+--------------------+--------------+------------+-----+
|7590-VHVEG|Female|            0|    Yes|        No|     1|  

In [7]:
# Counting number of missing (nan) values in each column, used this blogpost as a reference:
#  https://www.datasciencemadesimple.com/count-of-missing-nanna-and-null-values-in-pyspark/

churn_df.select([F.count(F.when(F.isnan(col), col)).alias(col) for col in churn_df.columns]).show()

+----------+------+-------------+-------+----------+------+------------+-------------+---------------+--------------+------------+----------------+-----------+-----------+---------------+--------+----------------+-------------+--------------+------------+-----+
|customerID|gender|SeniorCitizen|Partner|Dependents|tenure|PhoneService|MultipleLines|InternetService|OnlineSecurity|OnlineBackup|DeviceProtection|TechSupport|StreamingTV|StreamingMovies|Contract|PaperlessBilling|PaymentMethod|MonthlyCharges|TotalCharges|Churn|
+----------+------+-------------+-------+----------+------+------------+-------------+---------------+--------------+------------+----------------+-----------+-----------+---------------+--------+----------------+-------------+--------------+------------+-----+
|         0|     0|            0|      0|         0|     0|           0|            0|              0|             0|           0|               0|          0|          0|              0|       0|               0| 

In [8]:
# Counting number of null values in each column, used this blogpost as a reference:
#  https://www.datasciencemadesimple.com/count-of-missing-nanna-and-null-values-in-pyspark/

churn_df.select([F.count(F.when(F.isnull(col), col)).alias(col) for col in churn_df.columns]).show()

+----------+------+-------------+-------+----------+------+------------+-------------+---------------+--------------+------------+----------------+-----------+-----------+---------------+--------+----------------+-------------+--------------+------------+-----+
|customerID|gender|SeniorCitizen|Partner|Dependents|tenure|PhoneService|MultipleLines|InternetService|OnlineSecurity|OnlineBackup|DeviceProtection|TechSupport|StreamingTV|StreamingMovies|Contract|PaperlessBilling|PaymentMethod|MonthlyCharges|TotalCharges|Churn|
+----------+------+-------------+-------+----------+------+------------+-------------+---------------+--------------+------------+----------------+-----------+-----------+---------------+--------+----------------+-------------+--------------+------------+-----+
|         0|     0|            0|      0|         0|     0|           0|            0|              0|             0|           0|               0|          0|          0|              0|       0|               0| 

In [9]:
# Checking number of duplicate rows
#  Used this stackoverflow answer: https://stackoverflow.com/a/48554666
churn_df.groupBy(churn_df.columns)\
    .count()\
    .where(F.col('count') > 1)\
    .select(F.sum('count'))\
    .show()

+----------+
|sum(count)|
+----------+
|      null|
+----------+



## Imputing null values with mean value 

In [10]:
churn_df = churn_df.na.fill(
    {
        'MonthlyCharges': churn_df.agg({"MonthlyCharges": "avg"}).collect()[0][0],
        'TotalCharges': churn_df.agg({"TotalCharges": "avg"}).collect()[0][0]
    }
)

## Data preparation

In [11]:
from pyspark.ml.feature import StringIndexer, VectorIndexer, OneHotEncoder

In [12]:
# Change the label column to label
churn_df = churn_df.withColumnRenamed('Churn','label')

In [13]:
# List of columns need to be indexed and featurized
col_list = [
    "gender",
    "SeniorCitizen", 
    "Partner",
    "Dependents",
    "PhoneService", 
    "MultipleLines",
    "InternetService", 
    "OnlineSecurity",
    "OnlineBackup",
    "DeviceProtection",
    "TechSupport", 
    "StreamingTV",
    "StreamingMovies",
    "Contract",
    "PaperlessBilling", 
    "PaymentMethod"
]

featurized_col_list = col_list + ["tenure", "MonthlyCharges", "TotalCharges"]

In [14]:
# List of features and label indexers 
indexers = [
    StringIndexer(inputCol=c, outputCol=f'{c}_indexed')
    for c in col_list
]

label_indexer = StringIndexer(inputCol="label", outputCol="labelIndex")
indexers.append(label_indexer)

In [15]:
# One hot encode the categorical columns
encoder = OneHotEncoder(
    inputCols = [f'{c}_indexed' for c in col_list], 
    outputCols=[f'{c}_vector' for c in col_list],
    dropLast=True
)

In [16]:
# Vectorizing the features
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

vector_assembler = VectorAssembler(inputCols=[f'{c}_vector' for c in col_list], outputCol="features")

### Pipeline preparation

In [17]:
# Define the pipline
from pyspark.ml import Pipeline

pipeline = Pipeline(stages=indexers + [encoder, vector_assembler])

In [18]:
# fit and transform the pipeline
pipeline_model = pipeline.fit(churn_df)
df = pipeline_model.transform(churn_df)

In [19]:
df.printSchema()

root
 |-- customerID: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- SeniorCitizen: integer (nullable = true)
 |-- Partner: string (nullable = true)
 |-- Dependents: string (nullable = true)
 |-- tenure: integer (nullable = true)
 |-- PhoneService: string (nullable = true)
 |-- MultipleLines: string (nullable = true)
 |-- InternetService: string (nullable = true)
 |-- OnlineSecurity: string (nullable = true)
 |-- OnlineBackup: string (nullable = true)
 |-- DeviceProtection: string (nullable = true)
 |-- TechSupport: string (nullable = true)
 |-- StreamingTV: string (nullable = true)
 |-- StreamingMovies: string (nullable = true)
 |-- Contract: string (nullable = true)
 |-- PaperlessBilling: string (nullable = true)
 |-- PaymentMethod: string (nullable = true)
 |-- MonthlyCharges: float (nullable = false)
 |-- TotalCharges: float (nullable = false)
 |-- label: string (nullable = true)
 |-- gender_indexed: double (nullable = false)
 |-- SeniorCitizen_indexed: double 

In [20]:
df.select("StreamingMovies", "StreamingMovies_indexed", "StreamingMovies_vector").show()

+-------------------+-----------------------+----------------------+
|    StreamingMovies|StreamingMovies_indexed|StreamingMovies_vector|
+-------------------+-----------------------+----------------------+
|                 No|                    0.0|         (2,[0],[1.0])|
|                 No|                    0.0|         (2,[0],[1.0])|
|                 No|                    0.0|         (2,[0],[1.0])|
|                 No|                    0.0|         (2,[0],[1.0])|
|                 No|                    0.0|         (2,[0],[1.0])|
|                Yes|                    1.0|         (2,[1],[1.0])|
|                 No|                    0.0|         (2,[0],[1.0])|
|                 No|                    0.0|         (2,[0],[1.0])|
|                Yes|                    1.0|         (2,[1],[1.0])|
|                 No|                    0.0|         (2,[0],[1.0])|
|                 No|                    0.0|         (2,[0],[1.0])|
|No internet service|             

In [21]:
cols_drop = [f'{c}_indexed' for c in col_list] + [f'{c}_vector' for c in col_list] + [f'{c}' for c in featurized_col_list] + ['customerID']
df = df.drop(*cols_drop)

In [22]:
df.printSchema()

root
 |-- label: string (nullable = true)
 |-- labelIndex: double (nullable = false)
 |-- features: vector (nullable = true)



In [23]:
df.show()

+-----+----------+--------------------+
|label|labelIndex|            features|
+-----+----------+--------------------+
|   No|       0.0|(27,[1,3,8,9,12,1...|
|   No|       0.0|(27,[0,1,2,3,4,5,...|
|  Yes|       1.0|(27,[0,1,2,3,4,5,...|
|   No|       0.0|(27,[0,1,2,3,8,10...|
|  Yes|       1.0|(27,[1,2,3,4,5,7,...|
|  Yes|       1.0|(27,[1,2,3,4,6,7,...|
|   No|       0.0|(27,[0,1,2,4,6,7,...|
|   No|       0.0|(27,[1,2,3,8,10,1...|
|  Yes|       1.0|(27,[1,3,4,6,7,9,...|
|   No|       0.0|(27,[0,1,2,4,5,8,...|
|   No|       0.0|(27,[0,1,4,5,8,10...|
|   No|       0.0|(27,[0,1,2,3,4,5,...|
|   No|       0.0|(27,[0,1,3,4,6,7,...|
|  Yes|       1.0|(27,[0,1,2,3,4,6,...|
|   No|       0.0|(27,[0,1,2,3,4,5,...|
|   No|       0.0|(27,[1,4,6,7,10,1...|
|   No|       0.0|(27,[1,2,3,4,5,25...|
|   No|       0.0|(27,[0,1,2,4,6,7,...|
|  Yes|       1.0|(27,[1,4,5,8,9,11...|
|   No|       0.0|(27,[1,2,3,4,5,7,...|
+-----+----------+--------------------+
only showing top 20 rows



### Splitting data to train/test splits

In [24]:
(train_data, test_data) = df.randomSplit(weights=[0.7, 0.3], seed=420)

In [25]:
train_data.describe().show()

+-------+-----+-------------------+
|summary|label|         labelIndex|
+-------+-----+-------------------+
|  count| 4840|               4840|
|   mean| null|0.26776859504132233|
| stddev| null| 0.4428420632218157|
|    min|   No|                0.0|
|    max|  Yes|                1.0|
+-------+-----+-------------------+



In [26]:
test_data.describe().show()

+-------+-----+-------------------+
|summary|label|         labelIndex|
+-------+-----+-------------------+
|  count| 2203|               2203|
|   mean| null|0.26009986382206085|
| stddev| null|0.43878847015331973|
|    min|   No|                0.0|
|    max|  Yes|                1.0|
+-------+-----+-------------------+



### Classifier model training (Grid Search with Cross Validation)

### Model Evaluation 

#### Link to MulticlassClassificationEvaluator list of metric names:
[link](https://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.evaluation.MulticlassClassificationEvaluator.metricName)

In [27]:
# Test set evaluation code

from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator

def evaluate_model(predictions):
    # 1- Confusion Matrix calculation
    #  taken from this stackoverflow post: https://stackoverflow.com/a/58405759/10086080
    # select only prediction and label columns
    preds_and_labels = predictions.select(['prediction', 'labelIndex'])
    metrics = MulticlassMetrics(preds_and_labels.rdd.map(tuple))
    print("Confusion Matrix:\n{}".format(metrics.confusionMatrix().toArray()))
    
    # 2- Accuracy metric calculation
    evaluator = MulticlassClassificationEvaluator(labelCol="labelIndex", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Accuracy = {} ".format(accuracy))
    
    # 3- Weighted Precision metric calculation
    evaluator = MulticlassClassificationEvaluator(labelCol="labelIndex", predictionCol="prediction", metricName="weightedPrecision")
    precision = evaluator.evaluate(predictions)
    print("Weighted Precision = {}".format(precision))
    
    
    # 4- Weighted Recall metric calculation
    evaluator = MulticlassClassificationEvaluator(labelCol="labelIndex", predictionCol="prediction", metricName="weightedRecall")
    recall = evaluator.evaluate(predictions)
    print("Weighted Recall = {}".format(recall))


    # 5- F1-score metric calculation
    evaluator = MulticlassClassificationEvaluator(labelCol="labelIndex", predictionCol="prediction", metricName="f1")
    f1 = evaluator.evaluate(predictions)
    print("F1-score = {}".format(f1))


    # 6- Receiver Operating Characteristic Area Under Curve metric calculation
    evaluator = BinaryClassificationEvaluator(labelCol="labelIndex", rawPredictionCol="prediction", metricName="areaUnderROC")
    roc_auc = evaluator.evaluate(predictions)
    print("ROC AUC: {}".format(roc_auc))

### 1- Decision Tree Classifier Training

In [28]:
from pyspark.ml.classification import DecisionTreeClassifier

model = DecisionTreeClassifier(
    labelCol="labelIndex",
    featuresCol="features",
    maxDepth=5,
    maxBins=32
)

In [29]:
# Cross Validator takes a pipeline object containing the model not the model object itself
pipeline = Pipeline(stages=[model])

In [30]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

param_grid = ParamGridBuilder() \
    .baseOn({model.labelCol: 'labelIndex'}) \
    .baseOn([model.predictionCol, 'prediction']) \
    .addGrid(model.maxDepth, [i for i in range(2, 10)]) \
    .addGrid(model.maxBins, [i for i in range(2, 60, 10)]) \
    .build()

In [31]:
len(param_grid)

48

In [32]:
evaluator = MulticlassClassificationEvaluator(labelCol="labelIndex", predictionCol="prediction", metricName="accuracy")

In [33]:
import os

cross_validator = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
    numFolds=10,
    parallelism=os.cpu_count() # To parallelize over all available cpu cores
)

In [34]:
# Run cross validations
cv_model = cross_validator.fit(train_data)
print(cv_model)

CrossValidatorModel_46feeeb7232e


In [35]:
len(cv_model.avgMetrics)

48

In [36]:
cv_model.avgMetrics

[0.7664735363651638,
 0.7664735363651638,
 0.7664735363651638,
 0.7664735363651638,
 0.7664735363651638,
 0.7664735363651638,
 0.7714399024606208,
 0.7714399024606208,
 0.7714399024606208,
 0.7714399024606208,
 0.7714399024606208,
 0.7714399024606208,
 0.7728447372784946,
 0.7728447372784946,
 0.7728447372784946,
 0.7728447372784946,
 0.7728447372784946,
 0.7728447372784946,
 0.7766535588657811,
 0.7766535588657811,
 0.7766535588657811,
 0.7766535588657811,
 0.7766535588657811,
 0.7766535588657811,
 0.7753615369370381,
 0.7753615369370381,
 0.7753615369370381,
 0.7753615369370381,
 0.7753615369370381,
 0.7753615369370381,
 0.7754599723387284,
 0.7754599723387284,
 0.7754599723387284,
 0.7754599723387284,
 0.7754599723387284,
 0.7754599723387284,
 0.762426313561421,
 0.762426313561421,
 0.762426313561421,
 0.762426313561421,
 0.762426313561421,
 0.762426313561421,
 0.7564728144943885,
 0.7564728144943885,
 0.7564728144943885,
 0.7564728144943885,
 0.7564728144943885,
 0.7564728144943885

In [37]:
# Use test set here so we can measure the accuracy of our model on new data
predictions = cv_model.transform(test_data)

In [38]:
evaluate_model(predictions)

Confusion Matrix:
[[1412.  218.]
 [ 277.  296.]]
Accuracy = 0.7753064003631411 
Weighted Precision = 0.7683398971512245
Weighted Recall = 0.7753064003631411
F1-score = 0.771205585985192
ROC AUC: 0.6914185376717095


### Extacting best decision tree model from cross validator object

In [39]:
best_model = cv_model.bestModel

In [40]:
best_model.stages

[DecisionTreeClassificationModel: uid=DecisionTreeClassifier_00dd760c28c2, depth=5, numNodes=17, numClasses=2, numFeatures=27]

In [41]:
best_model = best_model.stages[0]

In [42]:
print(best_model.toDebugString)

DecisionTreeClassificationModel: uid=DecisionTreeClassifier_00dd760c28c2, depth=5, numNodes=17, numClasses=2, numFeatures=27
  If (feature 21 in {0.0})
   Predict: 0.0
  Else (feature 21 not in {0.0})
   If (feature 7 in {0.0})
    If (feature 11 in {0.0})
     Predict: 0.0
    Else (feature 11 not in {0.0})
     If (feature 24 in {0.0})
      Predict: 0.0
     Else (feature 24 not in {0.0})
      If (feature 1 in {0.0})
       Predict: 1.0
      Else (feature 1 not in {0.0})
       Predict: 0.0
   Else (feature 7 not in {0.0})
    If (feature 9 in {0.0})
     Predict: 0.0
    Else (feature 9 not in {0.0})
     If (feature 24 in {0.0})
      If (feature 11 in {0.0})
       Predict: 0.0
      Else (feature 11 not in {0.0})
       Predict: 1.0
     Else (feature 24 not in {0.0})
      Predict: 1.0



In [43]:
best_model.extractParamMap()

{Param(parent='DecisionTreeClassifier_00dd760c28c2', name='featuresCol', doc='features column name.'): 'features',
 Param(parent='DecisionTreeClassifier_00dd760c28c2', name='labelCol', doc='label column name.'): 'labelIndex',
 Param(parent='DecisionTreeClassifier_00dd760c28c2', name='predictionCol', doc='prediction column name.'): 'prediction',
 Param(parent='DecisionTreeClassifier_00dd760c28c2', name='probabilityCol', doc='Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities.'): 'probability',
 Param(parent='DecisionTreeClassifier_00dd760c28c2', name='rawPredictionCol', doc='raw prediction (a.k.a. confidence) column name.'): 'rawPrediction',
 Param(parent='DecisionTreeClassifier_00dd760c28c2', name='seed', doc='random seed.'): -2345538198137950020,
 Param(parent='DecisionTreeClassifier_00dd760c28c2', name='cacheNodeIds', doc='If false

In [44]:
best_model.save("decision-tree-classifier-model")

### 2- Random Forest Classifier Training

In [45]:
from pyspark.ml.classification import RandomForestClassifier

model = RandomForestClassifier(
    labelCol="labelIndex",
    featuresCol="features",
    maxDepth=5,
    maxBins=32,
    impurity='gini',
    numTrees=20
)


In [46]:
# Cross Validator takes a pipeline object containing the model not the model object itself
pipeline = Pipeline(stages=[model])

In [47]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

param_grid = ParamGridBuilder() \
    .baseOn({model.labelCol: 'labelIndex'}) \
    .baseOn([model.predictionCol, 'prediction']) \
    .addGrid(model.maxDepth, [i for i in range(2, 10)]) \
    .addGrid(model.numTrees, [i for i in range(20, 1000, 100)]) \
    .build()

In [48]:
len(param_grid)

80

In [49]:
evaluator = MulticlassClassificationEvaluator(labelCol="labelIndex", predictionCol="prediction", metricName="accuracy")

In [50]:
import os

cross_validator = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
    numFolds=10,
    parallelism=os.cpu_count() # To parallelize over all available cpu cores
)

In [51]:
# Run cross validations
cv_model = cross_validator.fit(train_data)
print(cv_model)

CrossValidatorModel_719ca9d5b3d8


In [52]:
len(cv_model.avgMetrics)

80

In [53]:
cv_model.avgMetrics

[0.7323734482225532,
 0.7323734482225532,
 0.7323734482225532,
 0.7323734482225532,
 0.7323734482225532,
 0.7323734482225532,
 0.7323734482225532,
 0.7323734482225532,
 0.7323734482225532,
 0.7323734482225532,
 0.7728318880195116,
 0.7731528129755629,
 0.7727082710040151,
 0.7757599091000309,
 0.7727838388869768,
 0.7735200520551334,
 0.7737545461825858,
 0.773984156990186,
 0.7743626765801586,
 0.7758040570350005,
 0.7792724806030709,
 0.7778668919799536,
 0.778445049786138,
 0.7795448442606953,
 0.7788498251526307,
 0.7782568849630561,
 0.7778441825896664,
 0.7782287235562558,
 0.777624213248263,
 0.7770127312196897,
 0.7785014995697841,
 0.7805833535723318,
 0.7822411291693209,
 0.7835382018356385,
 0.7820158430757321,
 0.7812193451321917,
 0.7810458214954286,
 0.7818363618224641,
 0.7817614782837156,
 0.7826856137814546,
 0.7768876655205772,
 0.7810785448150421,
 0.7834984922365217,
 0.7820609055872171,
 0.7822740981413822,
 0.7818216392035418,
 0.7822489069947696,
 0.7818324322121

In [54]:
# Use test set here so we can measure the accuracy of our model on new data
predictions = cv_model.transform(test_data)

In [55]:
evaluate_model(predictions)

Confusion Matrix:
[[1481.  149.]
 [ 324.  249.]]
Accuracy = 0.7852927825692237 
Weighted Precision = 0.7698128311418008
Weighted Recall = 0.7852927825692237
F1-score = 0.771414062098251
ROC AUC: 0.6715719654386021


### Extacting best random forest model from cross validator object

In [56]:
best_model = cv_model.bestModel

In [57]:
best_model.stages

[RandomForestClassificationModel: uid=RandomForestClassifier_6c4625416f03, numTrees=320, numClasses=2, numFeatures=27]

In [58]:
best_model = best_model.stages[0]

In [59]:
best_model.extractParamMap()

{Param(parent='RandomForestClassifier_6c4625416f03', name='featuresCol', doc='features column name.'): 'features',
 Param(parent='RandomForestClassifier_6c4625416f03', name='labelCol', doc='label column name.'): 'labelIndex',
 Param(parent='RandomForestClassifier_6c4625416f03', name='predictionCol', doc='prediction column name.'): 'prediction',
 Param(parent='RandomForestClassifier_6c4625416f03', name='probabilityCol', doc='Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities.'): 'probability',
 Param(parent='RandomForestClassifier_6c4625416f03', name='rawPredictionCol', doc='raw prediction (a.k.a. confidence) column name.'): 'rawPrediction',
 Param(parent='RandomForestClassifier_6c4625416f03', name='seed', doc='random seed.'): -4381064176000401727,
 Param(parent='RandomForestClassifier_6c4625416f03', name='bootstrap', doc='Whether boo

In [60]:
best_model.save("random-forest-classifier-model")

### 3- Gradient-Boosted Tree Classifier Training

In [61]:
from pyspark.ml.classification import GBTClassifier

model = GBTClassifier(
    labelCol="labelIndex",
    featuresCol="features",
    maxDepth=5,
    maxBins=32,
    maxIter=20
)

In [62]:
# Cross Validator takes a pipeline object containing the model not the model object itself
pipeline = Pipeline(stages=[model])

In [63]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

param_grid = ParamGridBuilder() \
    .baseOn({model.labelCol: 'labelIndex'}) \
    .baseOn([model.predictionCol, 'prediction']) \
    .addGrid(model.maxDepth, [i for i in range(2, 10)]) \
    .addGrid(model.maxBins, [i for i in range(2, 60, 10)]) \
    .addGrid(model.maxIter, [i for i in range(20, 50, 10)]) \
    .build()

In [64]:
len(param_grid)

144

In [65]:
evaluator = MulticlassClassificationEvaluator(labelCol="labelIndex", predictionCol="prediction", metricName="accuracy")

In [66]:
import os

cross_validator = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
    numFolds=10,
    parallelism=os.cpu_count() # To parallelize over all available cpu cores
)

In [67]:
# Run cross validations
cv_model = cross_validator.fit(train_data)
print(cv_model)

CrossValidatorModel_efa748a43ee9


In [68]:
len(cv_model.avgMetrics)

144

In [69]:
cv_model.avgMetrics

[0.7814114005374189,
 0.783502483275784,
 0.7836887934941204,
 0.7814114005374189,
 0.783502483275784,
 0.7836887934941204,
 0.7814114005374189,
 0.783502483275784,
 0.7836887934941204,
 0.7814114005374189,
 0.783502483275784,
 0.7836887934941204,
 0.7814114005374189,
 0.783502483275784,
 0.7836887934941204,
 0.7814114005374189,
 0.783502483275784,
 0.7836887934941204,
 0.7830796548105233,
 0.7826617467166068,
 0.7808372884775212,
 0.7830796548105233,
 0.7826617467166068,
 0.7808372884775212,
 0.7830796548105233,
 0.7826617467166068,
 0.7808372884775212,
 0.7830796548105233,
 0.7826617467166068,
 0.7808372884775212,
 0.7830796548105233,
 0.7826617467166068,
 0.7808372884775212,
 0.7830796548105233,
 0.7826617467166068,
 0.7808372884775212,
 0.7833885782196073,
 0.7837921816550276,
 0.7808616943953015,
 0.7833885782196073,
 0.7837921816550276,
 0.7808616943953015,
 0.7833885782196073,
 0.7837921816550276,
 0.7808616943953015,
 0.7833885782196073,
 0.7837921816550276,
 0.7808616943953015

In [70]:
# Use test set here so we can measure the accuracy of our model on new data
predictions = cv_model.transform(test_data)

In [71]:
evaluate_model(predictions)

Confusion Matrix:
[[1435.  195.]
 [ 282.  291.]]
Accuracy = 0.7834770767135724 
Weighted Precision = 0.7741177796364098
Weighted Recall = 0.7834770767135724
F1-score = 0.7773971620100744
ROC AUC: 0.6941107506504353


### Extacting best gradient-boosted tree model from cross validator object

In [72]:
best_model = cv_model.bestModel

In [73]:
best_model.stages

[GBTClassificationModel: uid = GBTClassifier_367b4a24c384, numTrees=30, numClasses=2, numFeatures=27]

In [74]:
best_model = best_model.stages[0]

In [75]:
print(best_model.toDebugString)

GBTClassificationModel: uid = GBTClassifier_367b4a24c384, numTrees=30, numClasses=2, numFeatures=27
  Tree 0 (weight 1.0):
    If (feature 21 in {0.0})
     If (feature 7 in {0.0})
      If (feature 9 in {0.0})
       If (feature 13 in {0.0})
        Predict: -0.9683426443202979
       Else (feature 13 not in {0.0})
        Predict: -0.9278350515463918
      Else (feature 9 not in {0.0})
       If (feature 22 in {1.0})
        Predict: -0.9368421052631579
       Else (feature 22 not in {1.0})
        Predict: -0.7215189873417721
     Else (feature 7 not in {0.0})
      If (feature 22 in {1.0})
       If (feature 9 in {0.0})
        Predict: -0.9090909090909091
       Else (feature 9 not in {0.0})
        Predict: -0.7297297297297297
      Else (feature 22 not in {1.0})
       If (feature 19 in {1.0})
        Predict: -0.8823529411764706
       Else (feature 19 not in {1.0})
        Predict: -0.5140845070422535
    Else (feature 21 not in {0.0})
     If (feature 7 in {0.0})
      If (fe

In [76]:
best_model.extractParamMap()

{Param(parent='GBTClassifier_367b4a24c384', name='featuresCol', doc='features column name.'): 'features',
 Param(parent='GBTClassifier_367b4a24c384', name='labelCol', doc='label column name.'): 'labelIndex',
 Param(parent='GBTClassifier_367b4a24c384', name='predictionCol', doc='prediction column name.'): 'prediction',
 Param(parent='GBTClassifier_367b4a24c384', name='probabilityCol', doc='Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities.'): 'probability',
 Param(parent='GBTClassifier_367b4a24c384', name='rawPredictionCol', doc='raw prediction (a.k.a. confidence) column name.'): 'rawPrediction',
 Param(parent='GBTClassifier_367b4a24c384', name='seed', doc='random seed.'): -4982131694872646615,
 Param(parent='GBTClassifier_367b4a24c384', name='cacheNodeIds', doc='If false, the algorithm will pass trees to executors to match instances

In [77]:
best_model.save("gbt-classifier-model")

In [85]:
!zip -r decision-tree-classifier-model.zip decision-tree-classifier-model/

  adding: decision-tree-classifier-model/ (stored 0%)
  adding: decision-tree-classifier-model/data/ (stored 0%)
  adding: decision-tree-classifier-model/data/.part-00000-ea094244-a67f-4d02-ae5c-e270012bf099-c000.snappy.parquet.crc (stored 0%)
  adding: decision-tree-classifier-model/data/.part-00002-ea094244-a67f-4d02-ae5c-e270012bf099-c000.snappy.parquet.crc (stored 0%)
  adding: decision-tree-classifier-model/data/part-00010-ea094244-a67f-4d02-ae5c-e270012bf099-c000.snappy.parquet (deflated 68%)
  adding: decision-tree-classifier-model/data/part-00003-ea094244-a67f-4d02-ae5c-e270012bf099-c000.snappy.parquet (deflated 69%)
  adding: decision-tree-classifier-model/data/_SUCCESS (stored 0%)
  adding: decision-tree-classifier-model/data/part-00009-ea094244-a67f-4d02-ae5c-e270012bf099-c000.snappy.parquet (deflated 69%)
  adding: decision-tree-classifier-model/data/.part-00006-ea094244-a67f-4d02-ae5c-e270012bf099-c000.snappy.parquet.crc (stored 0%)
  adding: decision-tree-classifier-model

In [88]:
!zip -r gbt-classifier-model.zip gbt-classifier-model/

updating: gbt-classifier-model/ (stored 0%)
  adding: gbt-classifier-model/data/ (stored 0%)
  adding: gbt-classifier-model/data/.part-00013-857a03d6-3fd9-4a0f-9355-14cf2b41c035-c000.snappy.parquet.crc (stored 0%)
  adding: gbt-classifier-model/data/part-00012-857a03d6-3fd9-4a0f-9355-14cf2b41c035-c000.snappy.parquet (deflated 40%)
  adding: gbt-classifier-model/data/part-00015-857a03d6-3fd9-4a0f-9355-14cf2b41c035-c000.snappy.parquet (deflated 40%)
  adding: gbt-classifier-model/data/.part-00004-857a03d6-3fd9-4a0f-9355-14cf2b41c035-c000.snappy.parquet.crc (stored 0%)
  adding: gbt-classifier-model/data/.part-00003-857a03d6-3fd9-4a0f-9355-14cf2b41c035-c000.snappy.parquet.crc (stored 0%)
  adding: gbt-classifier-model/data/.part-00008-857a03d6-3fd9-4a0f-9355-14cf2b41c035-c000.snappy.parquet.crc (stored 0%)
  adding: gbt-classifier-model/data/_SUCCESS (stored 0%)
  adding: gbt-classifier-model/data/.part-00000-857a03d6-3fd9-4a0f-9355-14cf2b41c035-c000.snappy.parquet.crc (stored 0%)
  addin

In [89]:
!zip -r random-forest-classifier-model.zip random-forest-classifier-model/

updating: random-forest-classifier-model/ (stored 0%)
  adding: random-forest-classifier-model/data/ (stored 0%)
  adding: random-forest-classifier-model/data/.part-00015-8469182a-bc09-4f44-9f77-bce9f0f4d62c-c000.snappy.parquet.crc (stored 0%)
  adding: random-forest-classifier-model/data/.part-00008-8469182a-bc09-4f44-9f77-bce9f0f4d62c-c000.snappy.parquet.crc (stored 0%)
  adding: random-forest-classifier-model/data/part-00002-8469182a-bc09-4f44-9f77-bce9f0f4d62c-c000.snappy.parquet (deflated 28%)
  adding: random-forest-classifier-model/data/.part-00010-8469182a-bc09-4f44-9f77-bce9f0f4d62c-c000.snappy.parquet.crc (stored 0%)
  adding: random-forest-classifier-model/data/_SUCCESS (stored 0%)
  adding: random-forest-classifier-model/data/.part-00001-8469182a-bc09-4f44-9f77-bce9f0f4d62c-c000.snappy.parquet.crc (stored 0%)
  adding: random-forest-classifier-model/data/.part-00007-8469182a-bc09-4f44-9f77-bce9f0f4d62c-c000.snappy.parquet.crc (stored 0%)
  adding: random-forest-classifier-m