In [22]:
import os 
# from utils.utils import *

# from fog.code.utils.utils import *
from pyspark.sql.functions import lit, col, when
from pyspark.sql.types import StringType, BooleanType, IntegerType, FloatType, DateType
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from google.cloud import storage

import matplotlib.pyplot as plt
from pyspark.ml.feature import VectorAssembler, StandardScaler, StringIndexer, OneHotEncoder
from pyspark.ml import Pipeline

from pyspark.ml.classification import MultilayerPerceptronClassifier, DecisionTreeClassifier, RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
from sklearn.metrics import confusion_matrix

In [2]:
# Reuse the active Spark session if there is one; otherwise create one named "Model Test".
spark = (
    SparkSession.builder
    .appName("Model Test")
    .getOrCreate()
)

## Functions for data loading and assembling

In [3]:
def feed_files(top_bucket_name, prefix, suffix):
    """
    Load every matching file under a GCS prefix into one Spark DataFrame.

    top_bucket_name: name of the GCS bucket to list
    prefix: only blobs whose name starts with this prefix are listed
    suffix: only blobs whose name ends with this suffix are read; it must
            end in ".parquet" or ".csv", which selects the reader

    Every listed blob name is printed. Returns the union (by column
    position) of all files read, or None when no blob matches.
    Raises ValueError when a blob matches but the suffix does not name a
    supported format.
    """
    client = storage.Client()
    blobs = client.list_blobs(top_bucket_name, prefix=prefix)

    processed = None

    for blob in blobs:
        print(blob.name)
        if not blob.name.endswith(suffix):
            continue

        path = f"gs://{top_bucket_name}/{blob.name}"
        if suffix.endswith(".parquet"):
            df = spark.read.parquet(path)
        elif suffix.endswith(".csv"):
            df = spark.read.csv(path)
        else:
            # Without this branch `df` would be unbound here and the first
            # matching blob would fail with an unhelpful UnboundLocalError.
            raise ValueError(
                f"Unsupported suffix {suffix!r}: expected a suffix ending in '.parquet' or '.csv'"
            )

        processed = df if processed is None else processed.union(df)
    return processed

In [4]:
# GCS bucket to read from, and the folder (blob-name prefix) inside it to load.
top_bucket_name = "msca-bdp-student-gcs"
data_path = "parkinsons_data/train/processed/defog_tasks_lagging"

In [5]:
# Read every parquet part under the prefix and union them into a single DataFrame.
df = feed_files(
    top_bucket_name=top_bucket_name,
    prefix=data_path,
    suffix=".parquet",
)

parkinsons_data/train/processed/defog_tasks_lagging/
parkinsons_data/train/processed/defog_tasks_lagging/_SUCCESS
parkinsons_data/train/processed/defog_tasks_lagging/part-00000-eae2acef-cda6-49a5-b43f-ded309b7bad5-c000.snappy.parquet


                                                                                

parkinsons_data/train/processed/defog_tasks_lagging/part-00001-eae2acef-cda6-49a5-b43f-ded309b7bad5-c000.snappy.parquet
parkinsons_data/train/processed/defog_tasks_lagging/part-00002-eae2acef-cda6-49a5-b43f-ded309b7bad5-c000.snappy.parquet
parkinsons_data/train/processed/defog_tasks_lagging/part-00003-eae2acef-cda6-49a5-b43f-ded309b7bad5-c000.snappy.parquet
parkinsons_data/train/processed/defog_tasks_lagging/part-00004-eae2acef-cda6-49a5-b43f-ded309b7bad5-c000.snappy.parquet
parkinsons_data/train/processed/defog_tasks_lagging/part-00005-eae2acef-cda6-49a5-b43f-ded309b7bad5-c000.snappy.parquet
parkinsons_data/train/processed/defog_tasks_lagging/part-00006-eae2acef-cda6-49a5-b43f-ded309b7bad5-c000.snappy.parquet
parkinsons_data/train/processed/defog_tasks_lagging/part-00007-eae2acef-cda6-49a5-b43f-ded309b7bad5-c000.snappy.parquet
parkinsons_data/train/processed/defog_tasks_lagging/part-00008-eae2acef-cda6-49a5-b43f-ded309b7bad5-c000.snappy.parquet
parkinsons_data/train/processed/defog_ta

In [6]:
# Display the column names of the loaded DataFrame.
df.columns

['Subject',
 'Visit',
 'Id',
 'Time',
 'AccV',
 'AccML',
 'AccAP',
 'StartHesitation',
 'Turn',
 'Walking',
 'Valid',
 'Task',
 'SourceDefog',
 'Medication',
 'Age',
 'Sex',
 'YearsSinceDx',
 'UPDRSIII_On',
 'UPDRSIII_Off',
 'NFOGQ',
 'Test',
 'TimeSeconds',
 'Begin',
 'End',
 'TaskType',
 'MB9',
 'Rest1',
 'MB6-L',
 'MB6-R',
 'Turning-C',
 'MB2a',
 'MB3-L',
 'MB12',
 'MB5',
 'MB3-R',
 'MB13',
 'TUG-DT',
 'Turning-ST',
 'TUG-ST',
 '4MW-C',
 'Hotspot2',
 'MB6',
 'TUG-C',
 '4MW',
 'Hotspot1-C',
 'Hotspot2-C',
 'MB8',
 'Hotspot1',
 'MB4',
 'MB1',
 'MB7',
 'Rest2',
 'MB2b',
 'MB10',
 'Turning-DT',
 'MB11',
 'target',
 'features',
 'standardized',
 'prediction_67',
 'prediction4',
 'AccV_lag1',
 'AccV_lag2',
 'AccV_lag3',
 'AccV_lag4',
 'AccV_lag5',
 'AccV_lag6',
 'AccV_lag7',
 'AccV_lag8',
 'AccV_lag9',
 'AccV_lag10',
 'AccML_lag1',
 'AccML_lag2',
 'AccML_lag3',
 'AccML_lag4',
 'AccML_lag5',
 'AccML_lag6',
 'AccML_lag7',
 'AccML_lag8',
 'AccML_lag9',
 'AccML_lag10',
 'AccAP_lag1',
 'AccAP_

## Update target columns
The targets (Turn, Walking, StartHesitation) are only accurate when the columns "Valid" and "Task" are both true.

In [7]:
# An annotation is only trusted on rows where both Valid and Task are true.
valid_and_task = (F.col("Valid") == True) & (F.col("Task") == True)

df = (
    df
    # One 0/1 indicator per event type, forced to 0 wherever the annotation is not trusted.
    .withColumn("StartHesTrue", when((F.col("StartHesitation") == 1) & valid_and_task, 1).otherwise(0))
    .withColumn("TurnTrue", when((F.col("Turn") == 1) & valid_and_task, 1).otherwise(0))
    .withColumn("WalkTrue", when((F.col("Walking") == 1) & valid_and_task, 1).otherwise(0))
    # "No event" indicator: none of the three indicators above is set while Task is true.
    .withColumn(
        "NoTrue",
        when(
            (F.col("StartHesTrue") == 0)
            & (F.col("TurnTrue") == 0)
            & (F.col("WalkTrue") == 0)
            & (F.col("Task") == True),
            1,
        ).otherwise(0),
    )
    # Multiclass target: codes 1-3 survive only on trusted rows; everything else becomes 0.
    .withColumn(
        "target",
        when((F.col("target") == 1) & valid_and_task, 1)
        .when((F.col("target") == 2) & valid_and_task, 2)
        .when((F.col("target") == 3) & valid_and_task, 3)
        .otherwise(0),
    )
)

In [8]:
# Row count per value of the masked multiclass target.
df.groupBy("Target").count().show()

                                                                                

+------+--------+
|Target|   count|
+------+--------+
|     1|      88|
|     3|   70521|
|     2|  414380|
|     0|13040714|
+------+--------+



In [9]:
# Show the 0/1 row counts for each masked event indicator.
for flag_name in ("WalkTrue", "StartHesTrue", "TurnTrue"):
    df.select(flag_name).groupBy(flag_name).count().show()

                                                                                

+--------+--------+
|WalkTrue|   count|
+--------+--------+
|       1|   70521|
|       0|13455182|
+--------+--------+



                                                                                

+------------+--------+
|StartHesTrue|   count|
+------------+--------+
|           1|      88|
|           0|13525615|
+------------+--------+



                                                                                

+--------+--------+
|TurnTrue|   count|
+--------+--------+
|       1|  414380|
|       0|13111323|
+--------+--------+



## Balancing the dataframe
Downsample the majority class so that the target class counts are more balanced, with the aim of improving classification accuracy.

In [10]:
def resample(large_dataframe, ratio, class_field, base_class, seed=None):
    """
    Downsample the majority class so the dataset is more balanced.

    large_dataframe: df with all rows for all labels
    ratio: ratio:1 == new_num_majority_samples:num_minority_samples
    class_field: target class field name
    base_class: majority class
    seed: optional seed forwarded to DataFrame.sample so the draw can be
          repeated; None (the default) leaves the choice to Spark

    Prints the minority count, the majority count and the ratio, then
    returns the sampled majority rows unioned with every non-majority row.
    Sampling is without replacement, so the result never holds more
    majority rows than the input does.
    """
    pos = large_dataframe.filter(large_dataframe[class_field] != base_class)
    neg = large_dataframe.filter(large_dataframe[class_field] == base_class)
    total_pos = pos.count()
    total_neg = neg.count()
    print(total_pos)
    print(total_neg)
    print(ratio)

    if total_neg == 0:
        # Nothing to downsample; this also avoids dividing by zero below.
        return neg.union(pos)

    # Sampling without replacement needs a fraction in [0, 1]. Cap it so that
    # asking for more majority rows than exist keeps all of them instead of failing.
    fraction = min(1.0, float(total_pos * ratio) / float(total_neg))
    sampled = neg.sample(withReplacement=False, fraction=fraction, seed=seed)

    return sampled.union(pos)

In [11]:
# Target about 4 majority-class (target == 0) rows per non-majority row, then re-check the class counts.
dfsamp = resample(df, ratio=4, class_field="target", base_class=0)
dfsamp.groupBy("target").count().show()

                                                                                

484989
13040714
4




+------+-------+
|target|  count|
+------+-------+
|     1|     88|
|     3|  70521|
|     2| 414380|
|     0|1939568|
+------+-------+



                                                                                

In [12]:
# Columns carried into modelling, grouped by role; the order is the column order of df2.
df2 = dfsamp.select(
    # raw accelerometer axes
    'AccV', 'AccML', 'AccAP',
    # subject / visit columns
    'Medication', 'Age', 'Sex', 'YearsSinceDx', 'UPDRSIII_On', 'UPDRSIII_Off', 'NFOGQ',
    'TimeSeconds',
    # per-task columns
    'MB9', 'Rest1', 'MB6-L', 'MB6-R', 'Turning-C', 'MB2a', 'MB3-L', 'MB12',
    'MB5', 'MB3-R', 'MB13', 'TUG-DT', 'Turning-ST', 'TUG-ST', '4MW-C', 'Hotspot2',
    'MB6', 'TUG-C', '4MW', 'Hotspot1-C', 'Hotspot2-C', 'MB8', 'Hotspot1', 'MB4',
    'MB1', 'MB7', 'Rest2', 'MB2b', 'MB10', 'Turning-DT', 'MB11',
    # multiclass target and the two prediction columns that are one-hot encoded later
    'target', 'prediction_67', 'prediction4',
    # lags 1-10 of each accelerometer axis
    *[f"{axis}_lag{k}" for axis in ('AccV', 'AccML', 'AccAP') for k in range(1, 11)],
    # masked 0/1 indicator columns
    'StartHesTrue', 'TurnTrue', 'WalkTrue', 'NoTrue',
)

In [13]:
# Accelerometer columns to cast to double: the three raw axes,
# followed by lags 1-10 of each axis.
floats = ['AccV', 'AccML', 'AccAP'] + [
    f"{axis}_lag{k}"
    for axis in ('AccV', 'AccML', 'AccAP')
    for k in range(1, 11)
]

In [14]:
# Need to cast to "double" so vectorAssembler will work.
# The loop variable is deliberately not named `col`: at notebook scope that would
# rebind the `col` function imported from pyspark.sql.functions for every later cell.
for float_col in floats:
    df2 = df2.withColumn(float_col, F.col(float_col).cast("double"))

In [15]:
# Replace missing values with 0 (DataFrame.na.fill is the alias of DataFrame.fillna).
df2 = df2.na.fill(0)

## Pipeline Creation

In [16]:
# Index the two string categoricals so they can be one-hot encoded.
sex_indexer, med_indexer = (
    StringIndexer(inputCol=source, outputCol=indexed)
    for source, indexed in (("Sex", "sexIndex"), ("Medication", "medIndex"))
)

# One-hot encode the two prediction columns and the two indexed categoricals.
ohe_clust67, ohe_clust4, ohe_sex, ohe_med = (
    OneHotEncoder(inputCol=source, outputCol=encoded)
    for source, encoded in (
        ("prediction_67", "pred67ohe"),
        ("prediction4", "pred4ohe"),
        ("sexIndex", "SexOhe"),
        ("medIndex", "MedOhe"),
    )
)

# Pack the continuous inputs into one vector, then run StandardScaler over that vector.
assemble = VectorAssembler(inputCols=['AccV', 'AccML', 'AccAP', 'Age'], outputCol='assemblefeats')
scaleML = StandardScaler(inputCol='assemblefeats', outputCol='scalefeats')

# Stages run in list order: indexers come before the encoders that read their output,
# and the assembler comes before the scaler.
pipeline = Pipeline(
    stages=[
        sex_indexer, med_indexer,
        ohe_clust67, ohe_clust4, ohe_sex, ohe_med,
        assemble, scaleML,
    ]
)

In [17]:
# Fit every stage on df2, then apply the fitted stages to that same DataFrame.
fitted_pipeline = pipeline.fit(df2)
df2 = fitted_pipeline.transform(df2)
df2.show()

23/05/22 15:19:43 WARN org.apache.spark.sql.catalyst.util.package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
23/05/22 15:19:47 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.2 MiB
23/05/22 15:19:48 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.2 MiB
23/05/22 15:19:48 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.2 MiB

+-------------------+--------------------+--------------------+----------+---+---+------------+-----------+------------+-----+-----------+---+-----+-----+-----+---------+----+-----+----+---+-----+----+------+----------+------+-----+--------+---+-----+---+----------+----------+---+--------+---+---+---+-----+----+----+----------+----+------+-------------+-----------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------------+

                                                                                

In [18]:
# Masked 0/1 indicator columns: one per event type plus the "no event" flag.
labels = ['StartHesTrue', 'TurnTrue', 'WalkTrue', 'NoTrue']

# Full feature set, in assembly order: pipeline outputs, lags 1-10 of each
# accelerometer axis, the per-task columns, then the remaining subject columns.
feats = [
    'scalefeats', 'pred67ohe', 'pred4ohe', 'SexOhe', 'MedOhe',
    *[f"{axis}_lag{k}" for axis in ('AccV', 'AccML', 'AccAP') for k in range(1, 11)],
    'MB9', 'Rest1', 'MB6-L', 'MB6-R', 'Turning-C', 'MB2a', 'MB3-L', 'MB12',
    'MB5', 'MB3-R', 'MB13', 'TUG-DT', 'Turning-ST', 'TUG-ST', '4MW-C', 'Hotspot2',
    'MB6', 'TUG-C', '4MW', 'Hotspot1-C', 'Hotspot2-C', 'MB8', 'Hotspot1', 'MB4',
    'MB1', 'MB7', 'Rest2', 'MB2b', 'MB10', 'Turning-DT', 'MB11',
    'YearsSinceDx', 'UPDRSIII_On', 'UPDRSIII_Off', 'NFOGQ',
]

## Model Creation and Fitting

In [46]:
def fit_models(model_name, model, train_data, test_data):
    """
    Fit one classifier and report how it performs on held-out data.

    model_name: label used in the printed progress and metric lines
    model: unfitted estimator; its fit() result must provide transform()
    train_data: DataFrame passed to fit()
    test_data: DataFrame to score; after transform() it must contain
               "label" and "prediction" columns

    Prints accuracy, weighted F1 and the confusion matrix. Returns
    (cm, mod_metrics): the confusion matrix computed from the collected
    labels and predictions, and a one-element list holding the
    MulticlassMetrics object.
    """
    print("Begin training {}".format(model_name))
    fitted = model.fit(train_data)

    print("Begin evaluation on test data")
    pred = fitted.transform(test_data)

    # Cast both columns to double before handing them to MulticlassMetrics.
    for metric_col in ['label', 'prediction']:
        pred = pred.withColumn(metric_col, F.col(metric_col).cast("double"))

    metrics = MulticlassMetrics(pred.select("prediction", "label").rdd)
    mod_metrics = [metrics]
    print("{} Accuracy = {}".format(model_name, metrics.accuracy))
    print("{} Weighted F1 = {}".format(model_name, metrics.weightedFMeasure()))

    # Collect both columns in one action so each label stays paired with its own
    # prediction. Two separate collect() calls would run the scoring job twice
    # and depend on both runs returning rows in the same order.
    scored = pred.select("label", "prediction").collect()
    y_orig = [row["label"] for row in scored]
    y_pred = [row["prediction"] for row in scored]

    cm = confusion_matrix(y_orig, y_pred)
    print("{} Confustion Matrix".format(model_name))
    print(cm)

    return (cm, mod_metrics)


In [20]:
# Small feature set: scaled continuous vector, encoded categoricals and subject columns.
feats_sm = [
    'scalefeats', 'SexOhe', 'MedOhe',
    'YearsSinceDx', 'UPDRSIII_On', 'UPDRSIII_Off', 'NFOGQ',
]

# Medium feature set: the small set with the per-task columns inserted
# between the encoded categoricals and the subject columns.
feats_med = feats_sm[:3] + [
    'MB9', 'Rest1', 'MB6-L', 'MB6-R', 'Turning-C', 'MB2a', 'MB3-L', 'MB12',
    'MB5', 'MB3-R', 'MB13', 'TUG-DT', 'Turning-ST', 'TUG-ST', '4MW-C', 'Hotspot2',
    'MB6', 'TUG-C', '4MW', 'Hotspot1-C', 'Hotspot2-C', 'MB8', 'Hotspot1', 'MB4',
    'MB1', 'MB7', 'Rest2', 'MB2b', 'MB10', 'Turning-DT', 'MB11',
] + feats_sm[3:]

In [None]:
matrices = []
evals = []
acc_labels = []

# Train and score a decision tree and a random forest once per feature set.
for name, feat_list in (("Small", feats_sm), ("Medium", feats_med), ("Full", feats)):
    # Pack this feature set into the single vector column the classifiers read.
    featassemble_tmp = VectorAssembler(inputCols=feat_list, outputCol='features')
    assembled_data_tmp = featassemble_tmp.transform(df2).select(
        "features", 'StartHesTrue', 'TurnTrue', 'WalkTrue', 'NoTrue', 'target'
    )
    # fit_models scores against a column named "label", so copy the target into it.
    data_tmp = assembled_data_tmp.withColumn("label", assembled_data_tmp.target)

    # 80/20 train/test split with a fixed seed of 1.
    train_df, test_df = data_tmp.randomSplit([0.8, 0.2], 1)

    dtc = DecisionTreeClassifier(featuresCol="features", labelCol="target")
    rf = RandomForestClassifier(featuresCol='features', labelCol='target')

    dtc_cm, dtc_metrics = fit_models(
        "Decision Tree - {} Features".format(name), dtc, train_df, test_df
    )
    rf_cm, rf_metrics = fit_models(
        "Random Forest - {} Features".format(name), rf, train_df, test_df
    )

    # Keep results in [decision tree, random forest] order for each feature set.
    matrices.append([dtc_cm, rf_cm])
    evals.append([dtc_metrics, rf_metrics])

Begin training Decision Tree - Small Features


23/05/22 16:26:06 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 1908.0 KiB
23/05/22 16:26:34 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 1873.4 KiB
23/05/22 16:26:35 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 1873.4 KiB
23/05/22 16:26:36 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 1873.4 KiB
23/05/22 16:26:39 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 1873.6 KiB
23/05/22 16:26:53 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 1880.9 KiB
23/05/22 16:27:05 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 1880.6 KiB
23/05/22 16:27:15 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 1881.4 KiB
23/05/22 16:27:17 WARN org.apache.spark.scheduler.DAGScheduler: Broadcas

Begin evaluation on test data


23/05/22 16:27:28 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 1887.7 KiB
23/05/22 16:27:29 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 1887.7 KiB
23/05/22 16:27:30 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 1887.7 KiB
23/05/22 16:27:32 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 1900.5 KiB
                                                                                

Decision Tree - Small Features Accuracy = 0.8403349204285363
Decision Tree - Small Features Weighted F1 = 0.8251963771240978


23/05/22 16:27:45 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 1921.8 KiB
23/05/22 16:27:59 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 1899.6 KiB
                                                                                

Decision Tree - Small Features Confustion Matrix
[[359877      0  27685      0]
 [     3      0     16      0]
 [ 35516      0  47293      0]
 [ 12248      0   1895      0]]
Begin training Random Forest - Small Features


23/05/22 16:28:37 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 1908.0 KiB
23/05/22 16:28:48 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 1873.4 KiB
23/05/22 16:28:48 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 1873.4 KiB
23/05/22 16:28:48 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 1873.4 KiB
23/05/22 16:28:50 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 1873.6 KiB
23/05/22 16:28:58 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 1880.9 KiB
23/05/22 16:29:06 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 1882.4 KiB
23/05/22 16:29:16 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 1890.3 KiB
23/05/22 16:29:20 WARN org.apache.spark.scheduler.DAGScheduler: Broadcas

Begin evaluation on test data


23/05/22 16:29:43 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 1974.1 KiB
23/05/22 16:29:43 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 1974.1 KiB
23/05/22 16:29:44 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 1974.1 KiB
23/05/22 16:29:45 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 1986.9 KiB
                                                                                

Random Forest - Small Features Accuracy = 0.8307359870225557
Random Forest - Small Features Weighted F1 = 0.7892815567108403


23/05/22 16:29:56 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 2008.2 KiB
23/05/22 16:30:08 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 1899.6 KiB
                                                                                

Random Forest - Small Features Confustion Matrix
[[380612      0   6950      0]
 [    19      0      0      0]
 [ 60902      0  21907      0]
 [ 13554      0    589      0]]
Begin training Decision Tree - Medium Features


23/05/22 16:30:48 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 3.0 MiB
23/05/22 16:31:09 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 2.9 MiB
23/05/22 16:31:09 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 2.9 MiB
23/05/22 16:31:09 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 2.9 MiB
23/05/22 16:31:12 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 2.9 MiB
23/05/22 16:31:26 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 2.9 MiB
23/05/22 16:31:40 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 2.9 MiB
23/05/22 16:31:53 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 2.9 MiB
23/05/22 16:31:57 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary w

Begin evaluation on test data


23/05/22 16:32:10 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 3.0 MiB
23/05/22 16:32:10 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 3.0 MiB
23/05/22 16:32:10 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 3.0 MiB
23/05/22 16:32:13 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 3.0 MiB
                                                                                

Decision Tree - Medium Features Accuracy = 0.9220713553050051
Decision Tree - Medium Features Weighted F1 = 0.9226610614774615


23/05/22 16:32:31 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 3.0 MiB
23/05/22 16:32:48 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 3.0 MiB
                                                                                

Decision Tree - Medium Features Confustion Matrix
[[366789      0  15959   4814]
 [     0      0      0     11]
 [  9899      0  71802   1160]
 [  5824      0     92   8183]]
Begin training Random Forest - Medium Features


23/05/22 16:33:32 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 3.0 MiB
23/05/22 16:33:49 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 2.9 MiB
23/05/22 16:33:49 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 2.9 MiB
23/05/22 16:33:49 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 2.9 MiB
23/05/22 16:33:51 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 2.9 MiB
23/05/22 16:34:04 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 2.9 MiB
23/05/22 16:34:15 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 2.9 MiB
23/05/22 16:34:28 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 3.0 MiB
23/05/22 16:34:33 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary w

Begin evaluation on test data


23/05/22 16:34:58 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 3.0 MiB
23/05/22 16:34:58 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 3.0 MiB
23/05/22 16:34:58 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 3.0 MiB
23/05/22 16:35:00 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 3.1 MiB
                                                                                

Random Forest - Medium Features Accuracy = 0.9102723653497286
Random Forest - Medium Features Weighted F1 = 0.8957838581930846


23/05/22 16:35:17 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 3.1 MiB
23/05/22 16:35:33 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 3.0 MiB
                                                                                

Random Forest - Medium Features Confustion Matrix
[[375637      0  11925      0]
 [    11      0      0      0]
 [ 17441      0  65420      0]
 [ 14007      0     92      0]]
Begin training Decision Tree - Full Features


23/05/22 16:36:19 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.2 MiB
23/05/22 16:37:05 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.1 MiB
23/05/22 16:37:05 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.1 MiB
23/05/22 16:37:06 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.1 MiB
23/05/22 16:37:13 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.1 MiB
23/05/22 16:37:46 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.1 MiB
23/05/22 16:38:23 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.1 MiB
23/05/22 16:38:58 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.1 MiB
23/05/22 16:39:05 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary w

Begin evaluation on test data


23/05/22 16:39:27 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.1 MiB
23/05/22 16:39:27 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.1 MiB
23/05/22 16:39:28 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.1 MiB
23/05/22 16:39:33 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.2 MiB
                                                                                

Decision Tree - Full Features Accuracy = 0.9217555873387365
Decision Tree - Full Features Weighted F1 = 0.922368707361091


23/05/22 16:40:03 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.2 MiB
23/05/22 16:40:38 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.2 MiB
                                                                                

Decision Tree - Full Features Confustion Matrix
[[366533      0  16259   4770]
 [     0      0      0     11]
 [  9793      0  71894   1148]
 [  5839      0     92   8194]]
Begin training Random Forest - Full Features


23/05/22 16:41:44 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.2 MiB
23/05/22 16:42:14 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.1 MiB
23/05/22 16:42:14 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.1 MiB
23/05/22 16:42:15 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.1 MiB
23/05/22 16:42:20 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.1 MiB
23/05/22 16:42:49 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.1 MiB
23/05/22 16:43:15 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.1 MiB
23/05/22 16:43:56 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.2 MiB
23/05/22 16:44:04 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary w

Begin evaluation on test data


23/05/22 16:44:42 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.2 MiB
23/05/22 16:44:42 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.2 MiB
23/05/22 16:44:43 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.2 MiB
23/05/22 16:44:47 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.2 MiB
                                                                                

Random Forest - Full Features Accuracy = 0.8297763000662494
Random Forest - Full Features Weighted F1 = 0.776492812467173


23/05/22 16:45:13 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.3 MiB
23/05/22 16:45:47 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.2 MiB
                                                                                

Random Forest - Full Features Confustion Matrix
[[386377      0   1185      0]
 [    11      0      0      0]
 [ 67158      0  15677      0]
 [ 14125      0      0      0]]


In [48]:
# One entry per feature set (Small, Medium, Full); each entry holds the
# [decision tree, random forest] metrics lists returned by fit_models.
evals

[[[<pyspark.mllib.evaluation.MulticlassMetrics at 0x7fa567f293a0>],
  [<pyspark.mllib.evaluation.MulticlassMetrics at 0x7fa576d26e80>]],
 [[<pyspark.mllib.evaluation.MulticlassMetrics at 0x7fa58213c250>],
  [<pyspark.mllib.evaluation.MulticlassMetrics at 0x7fa581c7f9a0>]],
 [[<pyspark.mllib.evaluation.MulticlassMetrics at 0x7fa581fd53a0>],
  [<pyspark.mllib.evaluation.MulticlassMetrics at 0x7fa58291fa60>]]]

In [54]:
# Print the true positive rate of classes 0-3 for every fitted model, in training order.
# The loop names avoid `eval`, which would rebind the Python builtin for the rest of the session.
for feature_set_evals in evals:
    for model_metrics in feature_set_evals:
        # fit_models returns its MulticlassMetrics object wrapped in a one-element list.
        class_metrics = model_metrics[0]
        for class_id in range(4):
            print(str(class_id), class_metrics.truePositiveRate(float(class_id)))

0 0.9285662681067803
1 0.0
2 0.57110942047362
3 0.0
0 0.9820673853473767
1 0.0
2 0.26454853940996753
3 0.0
0 0.9464008339310872
1 0.0
2 0.866535523346327
3 0.5803957727498404
0 0.9692307295348873
1 0.0
2 0.789514970854805
3 0.0
0 0.9457402944561129
1 0.0
2 0.8679181505402306
3 0.5801061946902655
0 0.9969424246959196
1 0.0
2 0.1892557493813002
3 0.0
