In [83]:
import os 
# from utils.utils import *

# from fog.code.utils.utils import *
from pyspark.sql.functions import lit, col, when
from pyspark.sql.types import StringType, BooleanType, IntegerType, FloatType, DateType
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from google.cloud import storage

import matplotlib.pyplot as plt
from pyspark.ml.feature import VectorAssembler, StandardScaler, StringIndexer, OneHotEncoder
from pyspark.ml import Pipeline

from pyspark.ml.classification import MultilayerPerceptronClassifier, DecisionTreeClassifier, RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from sklearn.metrics import confusion_matrix

In [3]:
spark = SparkSession.builder.appName("Model Test").getOrCreate()

## Functions for data loading and assembling

In [4]:
def feed_files(top_bucket_name, prefix, suffix):
    client = storage.Client()
    blobs = client.list_blobs(top_bucket_name, prefix=prefix)

    processed = None

    for i, blob in enumerate(blobs):
        print(blob.name)
        if blob.name.endswith(suffix):
            
            if suffix == ".parquet":
                df = spark.read.parquet(f"gs://{top_bucket_name}/{blob.name}")
            elif suffix == ".csv":
                df = spark.read.csv(f"gs://{top_bucket_name}/{blob.name}")
            if processed is None:
                processed = df
            else:
                processed = processed.union(df)
    return processed

In [5]:
data_path = "parkinsons_data/train/processed/defog_tasks_lagging"
top_bucket_name = "msca-bdp-student-gcs"

In [80]:
df = feed_files(top_bucket_name, data_path, ".parquet")

parkinsons_data/train/processed/defog_tasks_lagging/
parkinsons_data/train/processed/defog_tasks_lagging/_SUCCESS
parkinsons_data/train/processed/defog_tasks_lagging/part-00000-eae2acef-cda6-49a5-b43f-ded309b7bad5-c000.snappy.parquet
parkinsons_data/train/processed/defog_tasks_lagging/part-00001-eae2acef-cda6-49a5-b43f-ded309b7bad5-c000.snappy.parquet
parkinsons_data/train/processed/defog_tasks_lagging/part-00002-eae2acef-cda6-49a5-b43f-ded309b7bad5-c000.snappy.parquet
parkinsons_data/train/processed/defog_tasks_lagging/part-00003-eae2acef-cda6-49a5-b43f-ded309b7bad5-c000.snappy.parquet
parkinsons_data/train/processed/defog_tasks_lagging/part-00004-eae2acef-cda6-49a5-b43f-ded309b7bad5-c000.snappy.parquet
parkinsons_data/train/processed/defog_tasks_lagging/part-00005-eae2acef-cda6-49a5-b43f-ded309b7bad5-c000.snappy.parquet
parkinsons_data/train/processed/defog_tasks_lagging/part-00006-eae2acef-cda6-49a5-b43f-ded309b7bad5-c000.snappy.parquet
parkinsons_data/train/processed/defog_tasks_la

In [164]:
df.columns

['Subject',
 'Visit',
 'Id',
 'Time',
 'AccV',
 'AccML',
 'AccAP',
 'StartHesitation',
 'Turn',
 'Walking',
 'Valid',
 'Task',
 'SourceDefog',
 'Medication',
 'Age',
 'Sex',
 'YearsSinceDx',
 'UPDRSIII_On',
 'UPDRSIII_Off',
 'NFOGQ',
 'Test',
 'TimeSeconds',
 'Begin',
 'End',
 'TaskType',
 'MB9',
 'Rest1',
 'MB6-L',
 'MB6-R',
 'Turning-C',
 'MB2a',
 'MB3-L',
 'MB12',
 'MB5',
 'MB3-R',
 'MB13',
 'TUG-DT',
 'Turning-ST',
 'TUG-ST',
 '4MW-C',
 'Hotspot2',
 'MB6',
 'TUG-C',
 '4MW',
 'Hotspot1-C',
 'Hotspot2-C',
 'MB8',
 'Hotspot1',
 'MB4',
 'MB1',
 'MB7',
 'Rest2',
 'MB2b',
 'MB10',
 'Turning-DT',
 'MB11',
 'target',
 'features',
 'standardized',
 'prediction_67',
 'prediction4',
 'AccV_lag1',
 'AccV_lag2',
 'AccV_lag3',
 'AccV_lag4',
 'AccV_lag5',
 'AccV_lag6',
 'AccV_lag7',
 'AccV_lag8',
 'AccV_lag9',
 'AccV_lag10',
 'AccML_lag1',
 'AccML_lag2',
 'AccML_lag3',
 'AccML_lag4',
 'AccML_lag5',
 'AccML_lag6',
 'AccML_lag7',
 'AccML_lag8',
 'AccML_lag9',
 'AccML_lag10',
 'AccAP_lag1',
 'AccAP_

## Update target columns
The targets (Turn, Walking, StartHesitation) are only accurate when the columns "Valid" and "Task" are both true.

In [81]:
df = df.withColumn("StartHesTrue", when((df.StartHesitation == 1) & (df.Valid == True) & (df.Task == True), 1).otherwise(0))
df = df.withColumn("TurnTrue", when((df.Turn == 1) & (df.Valid == True) & (df.Task == True), 1)
      .otherwise(0))
df = df.withColumn("WalkTrue", when((df.Walking == 1) & (df.Valid == True) & (df.Task == True), 1)
      .otherwise(0))
df = df.withColumn("NoTrue", when((df.StartHesTrue == 0) & (df.TurnTrue == 0)  & (df.WalkTrue == 0) & (df.Task == True), 1)
      .otherwise(0))
df = df.withColumn("target", when((df.target == 1) & (df.Valid == True) & (df.Task == True), 1)
                   .when((df.target == 2) & (df.Valid == True) & (df.Task == True), 2)
                   .when((df.target == 3) & (df.Valid == True) & (df.Task == True), 3)
                   .otherwise(0))

In [82]:
df.select("target").groupBy("Target").count().show()



+------+--------+
|Target|   count|
+------+--------+
|     1|      88|
|     3|   70521|
|     2|  414380|
|     0|13040714|
+------+--------+



                                                                                

In [47]:
df.select("WalkTrue").groupBy("WalkTrue").count().show()
df.select("StartHesTrue").groupBy("StartHesTrue").count().show()
df.select("TurnTrue").groupBy("TurnTrue").count().show()

                                                                                

+--------+--------+
|WalkTrue|   count|
+--------+--------+
|       1|   70521|
|       0|13455182|
+--------+--------+



                                                                                

+------------+--------+
|StartHesTrue|   count|
+------------+--------+
|           1|      88|
|           0|13525615|
+------------+--------+



                                                                                

+--------+--------+
|TurnTrue|   count|
+--------+--------+
|       1|  414380|
|       0|13111323|
+--------+--------+



## Balancing the dataframe
Code to balance the target class counts to improve classification accuracy

In [58]:
def resample(large_dataframe, ratio, class_field, base_class):
    """
    Resample the majority class so your dataset is more balanced.
    large_dataframe: df with all rows for all labels
    ratio: ratio:1 == new_num_majority_samples:num_minority samples
    class_field: target class field name
    base_class: majority class
    """
    pos = large_dataframe.filter(large_dataframe[class_field] != base_class)
    neg = large_dataframe.filter(large_dataframe[class_field] == base_class)
    total_pos = pos.count()
    total_neg = neg.count()
    print(total_pos)
    print(total_neg)
    print(ratio)
    fraction=float(total_pos*ratio)/float(total_neg)
    sampled = neg.sample(False,fraction)
    
    return sampled.union(pos)

In [59]:
dfsamp = resample(df, 4, "target", 0)
dfsamp.select("target").groupBy("target").count().show()

                                                                                

484989
13040714
4


                                                                                

+------+-------+
|target|  count|
+------+-------+
|     1|     88|
|     3|  70521|
|     2| 414380|
|     0|1938205|
+------+-------+



In [60]:
df2 = dfsamp.select('AccV',
 'AccML',
 'AccAP',
 'Medication',
 'Age',
 'Sex',
 'YearsSinceDx',
 'UPDRSIII_On',
 'UPDRSIII_Off',
 'NFOGQ',
 'TimeSeconds',
 'MB9',
 'Rest1',
 'MB6-L',
 'MB6-R',
 'Turning-C',
 'MB2a',
 'MB3-L',
 'MB12',
 'MB5',
 'MB3-R',
 'MB13',
 'TUG-DT',
 'Turning-ST',
 'TUG-ST',
 '4MW-C',
 'Hotspot2',
 'MB6',
 'TUG-C',
 '4MW',
 'Hotspot1-C',
 'Hotspot2-C',
 'MB8',
 'Hotspot1',
 'MB4',
 'MB1',
 'MB7',
 'Rest2',
 'MB2b',
 'MB10',
 'Turning-DT',
 'MB11',
 'target',
'prediction_67',
 'prediction4',
 'AccV_lag1',
 'AccV_lag2',
 'AccV_lag3',
 'AccV_lag4',
 'AccV_lag5',
 'AccV_lag6',
 'AccV_lag7',
 'AccV_lag8',
 'AccV_lag9',
 'AccV_lag10',
 'AccML_lag1',
 'AccML_lag2',
 'AccML_lag3',
 'AccML_lag4',
 'AccML_lag5',
 'AccML_lag6',
 'AccML_lag7',
 'AccML_lag8',
 'AccML_lag9',
 'AccML_lag10',
 'AccAP_lag1',
 'AccAP_lag2',
 'AccAP_lag3',
 'AccAP_lag4',
 'AccAP_lag5',
 'AccAP_lag6',
 'AccAP_lag7',
 'AccAP_lag8',
 'AccAP_lag9',
 'AccAP_lag10',
 'StartHesTrue',
 'TurnTrue',
 'WalkTrue',
 'NoTrue')

In [61]:
floats = ['AccV',
 'AccML',
 'AccAP','AccV_lag1',
 'AccV_lag2',
 'AccV_lag3',
 'AccV_lag4',
 'AccV_lag5',
 'AccV_lag6',
 'AccV_lag7',
 'AccV_lag8',
 'AccV_lag9',
 'AccV_lag10',
 'AccML_lag1',
 'AccML_lag2',
 'AccML_lag3',
 'AccML_lag4',
 'AccML_lag5',
 'AccML_lag6',
 'AccML_lag7',
 'AccML_lag8',
 'AccML_lag9',
 'AccML_lag10',
 'AccAP_lag1',
 'AccAP_lag2',
 'AccAP_lag3',
 'AccAP_lag4',
 'AccAP_lag5',
 'AccAP_lag6',
 'AccAP_lag7',
 'AccAP_lag8',
 'AccAP_lag9',
 'AccAP_lag10']

In [62]:
# Need to cast to "double" so vectorAssembler will work
for col in floats:
    df2 = df2.withColumn(
    col,
    F.col(col).cast("double"))

In [63]:
df2 = df2.fillna(0)

## Pipeline Creation

In [64]:
sex_indexer = StringIndexer(inputCol="Sex", outputCol="sexIndex")
med_indexer = StringIndexer(inputCol="Medication", outputCol="medIndex")

ohe_clust67 = OneHotEncoder(inputCol="prediction_67", outputCol="pred67ohe")
ohe_clust4 = OneHotEncoder(inputCol="prediction4", outputCol="pred4ohe")
ohe_sex = OneHotEncoder(inputCol="sexIndex", outputCol="SexOhe")
ohe_med = OneHotEncoder(inputCol="medIndex", outputCol="MedOhe")

assemble=VectorAssembler(inputCols=['AccV', 'AccML', 'AccAP', 'Age'], outputCol = 'assemblefeats')

scaleML = StandardScaler(inputCol='assemblefeats', outputCol = 'scalefeats')

#Create pipeline and pass all stages
pipeline = Pipeline(stages=[sex_indexer,
                            med_indexer,
                            ohe_clust67,
                            ohe_clust4,
                            ohe_sex,
                            ohe_med,
                            assemble,
                            scaleML])

In [65]:
df2 = pipeline.fit(df2).transform(df2)
df2.show()

23/05/21 20:22:01 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 3.8 MiB
23/05/21 20:22:02 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 3.8 MiB
23/05/21 20:22:03 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 3.8 MiB

+-------------------+--------------------+--------------------+----------+---+---+------------+-----------+------------+-----+-----------+---+-----+-----+-----+---------+----+-----+----+---+-----+----+------+----------+------+-----+--------+---+-----+---+----------+----------+---+--------+---+---+---+-----+----+----+----------+----+------+-------------+-----------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------------+

                                                                                

In [99]:
labels = ['StartHesTrue',
 'TurnTrue',
 'WalkTrue',
 'NoTrue']

feats = ['scalefeats',
    'pred67ohe',
 'pred4ohe',
 'SexOhe',
 'MedOhe', 
    'AccV_lag1',
 'AccV_lag2',
 'AccV_lag3',
 'AccV_lag4',
 'AccV_lag5',
 'AccV_lag6',
 'AccV_lag7',
 'AccV_lag8',
 'AccV_lag9',
 'AccV_lag10',
 'AccML_lag1',
 'AccML_lag2',
 'AccML_lag3',
 'AccML_lag4',
 'AccML_lag5',
 'AccML_lag6',
 'AccML_lag7',
 'AccML_lag8',
 'AccML_lag9',
 'AccML_lag10',
 'AccAP_lag1',
 'AccAP_lag2',
 'AccAP_lag3',
 'AccAP_lag4',
 'AccAP_lag5',
 'AccAP_lag6',
 'AccAP_lag7',
 'AccAP_lag8',
 'AccAP_lag9',
 'AccAP_lag10', 'MB9',
 'Rest1',
 'MB6-L',
 'MB6-R',
 'Turning-C',
 'MB2a',
 'MB3-L',
 'MB12',
 'MB5',
 'MB3-R',
 'MB13',
 'TUG-DT',
 'Turning-ST',
 'TUG-ST',
 '4MW-C',
 'Hotspot2',
 'MB6',
 'TUG-C',
 '4MW',
 'Hotspot1-C',
 'Hotspot2-C',
 'MB8',
 'Hotspot1',
 'MB4',
 'MB1',
 'MB7',
 'Rest2',
 'MB2b',
 'MB10',
 'Turning-DT',
 'MB11',
'YearsSinceDx',
 'UPDRSIII_On',
 'UPDRSIII_Off',
 'NFOGQ']

## Model Creation and Fitting

In [88]:
def fit_models(model_name, model, train_data, test_data):
    print("Begin training {}".format(model_name))
    model = model.fit(train_data)
    print("Begin evaluation on test data")
    pred = model.transform(test_data)
    
    evaluator=MulticlassClassificationEvaluator(predictionCol="prediction")
    acc = evaluator.evaluate(pred)
    print("{} Accuracy = {}".format(model_name, acc))
    
    y_pred=pred.select("prediction").collect()
    y_orig=pred.select("label").collect()

    cm = confusion_matrix(y_orig, y_pred)
    print("{} Confustion Matrix".format(model_name))
    print(cm)
    
    labacc = []
    for lab in [0, 1.0, 2.0, 3.0]:
        labacc.append(evaluator.evaluate(pred, {evaluator.metricName: "truePositiveRateByLabel",
            evaluator.metricLabel: lab}))
    
    return (cm, evaluator, labacc)




In [109]:
feats_sm = ['scalefeats',
 'SexOhe',
 'MedOhe', 
'YearsSinceDx',
 'UPDRSIII_On',
 'UPDRSIII_Off',
 'NFOGQ']

feats_med = ['scalefeats',
 'SexOhe',
 'MedOhe', 
    'MB9',
 'Rest1',
 'MB6-L',
 'MB6-R',
 'Turning-C',
 'MB2a',
 'MB3-L',
 'MB12',
 'MB5',
 'MB3-R',
 'MB13',
 'TUG-DT',
 'Turning-ST',
 'TUG-ST',
 '4MW-C',
 'Hotspot2',
 'MB6',
 'TUG-C',
 '4MW',
 'Hotspot1-C',
 'Hotspot2-C',
 'MB8',
 'Hotspot1',
 'MB4',
 'MB1',
 'MB7',
 'Rest2',
 'MB2b',
 'MB10',
 'Turning-DT',
 'MB11',
'YearsSinceDx',
 'UPDRSIII_On',
 'UPDRSIII_Off',
 'NFOGQ']

In [None]:
matrices = []
evals = []
acc_labels = []
for feat_list in [feats_sm, feats_med, feats]:
    if feat_list == feats_sm:
        name = "Small"
    elif feat_list == feats_med:
        name = "Medium"
    else:
        name = "Full"
    featassemble_tmp = VectorAssembler(inputCols=feat_list, outputCol = 'features')
    assembled_data_tmp = featassemble_tmp.transform(df2)
    assembled_data_tmp = assembled_data_tmp.select("features",'StartHesTrue','TurnTrue',
                                                   'WalkTrue','NoTrue','target')
    data_tmp = assembled_data_tmp.withColumn("label", assembled_data_tmp.target)
    
    splits = data_tmp.randomSplit([0.8,0.2],1)
    train_df = splits[0]
    test_df = splits[1]
    
    dtc = DecisionTreeClassifier(featuresCol="features", labelCol="target")
    rf = RandomForestClassifier(featuresCol = 'features', labelCol = 'target')

    dtc_cm, dtc_eval, dtc_label_acc = fit_models("Decision Tree - {} Features".format(name)
                                                 , dtc, train_df, test_df)
    rf_cm, rf_eval, rf_label_acc = fit_models("Random Forest - {} Features".format(name), 
                                              rf, train_df, test_df)
    
    matrices.append([dtc_cm, rf_cm])
    evals.append([dtc_eval, rf_eval])
    acc_labels.append([dtc_label_acc, rf_label_acc])

Begin training Decision Tree - Small Features


23/05/22 02:55:58 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 1577.8 KiB
23/05/22 02:56:29 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 1548.2 KiB
23/05/22 02:56:30 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 1548.2 KiB
23/05/22 02:56:31 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 1548.2 KiB
23/05/22 02:56:34 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 1548.3 KiB
23/05/22 02:56:48 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 1555.6 KiB
23/05/22 02:56:59 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 1555.3 KiB
23/05/22 02:57:08 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 1556.1 KiB
23/05/22 02:57:11 WARN org.apache.spark.scheduler.DAGScheduler: Broadcas

Begin evaluation on test data


23/05/22 02:57:22 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 1571.2 KiB
                                                                                

Decision Tree - Small Features Accuracy = 0.8247968460751594


23/05/22 02:57:33 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 1591.6 KiB
23/05/22 02:57:46 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 1568.8 KiB
                                                                                

Decision Tree - Small Features Confustion Matrix
[[357511      0  29768      0]
 [     3      0     16      0]
 [ 34188      0  48621      0]
 [ 11817      0   2326      0]]


23/05/22 02:58:22 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 1571.4 KiB
23/05/22 02:58:32 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 1571.4 KiB
23/05/22 02:58:42 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 1571.4 KiB
23/05/22 02:58:52 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 1571.4 KiB
                                                                                

Begin training Random Forest - Small Features


23/05/22 02:59:04 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 1577.8 KiB
23/05/22 02:59:14 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 1548.2 KiB
23/05/22 02:59:14 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 1548.2 KiB
23/05/22 02:59:14 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 1548.2 KiB
23/05/22 02:59:15 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 1548.3 KiB
23/05/22 02:59:22 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 1555.6 KiB
23/05/22 02:59:30 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 1557.1 KiB
23/05/22 02:59:39 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 1565.0 KiB
23/05/22 02:59:43 WARN org.apache.spark.scheduler.DAGScheduler: Broadcas

Begin evaluation on test data


23/05/22 03:00:06 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 1658.8 KiB
                                                                                

Random Forest - Small Features Accuracy = 0.7894456564611825


23/05/22 03:00:17 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 1679.3 KiB
23/05/22 03:00:28 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 1568.8 KiB
                                                                                

Random Forest - Small Features Confustion Matrix
[[380158      0   7121      0]
 [    19      0      0      0]
 [ 60734      0  22075      0]
 [ 13526      0    617      0]]


23/05/22 03:01:05 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 1659.0 KiB
23/05/22 03:01:15 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 1659.0 KiB
23/05/22 03:01:25 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 1659.0 KiB
23/05/22 03:01:34 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 1659.0 KiB
                                                                                

Begin training Decision Tree - Medium Features


23/05/22 03:01:48 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 2.7 MiB
23/05/22 03:02:06 ERROR org.apache.spark.scheduler.AsyncEventQueue: Dropping event from queue dataprocEvent. This likely means one of the listeners is too slow and cannot keep up with the rate at which tasks are being started by the scheduler.
23/05/22 03:02:06 WARN org.apache.spark.scheduler.AsyncEventQueue: Dropped 1 events from dataprocEvent since the application started.
23/05/22 03:02:07 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 2.6 MiB
23/05/22 03:02:08 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 2.6 MiB
23/05/22 03:02:08 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 2.6 MiB
23/05/22 03:02:11 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 2.6 MiB
23/05/22 03:02:25 WARN org.apache.spark.scheduler.DAGSche

Begin evaluation on test data


23/05/22 03:03:06 WARN org.apache.spark.scheduler.AsyncEventQueue: Dropped 56 events from dataprocEvent since Mon May 22 03:02:06 UTC 2023.
23/05/22 03:03:07 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 2.6 MiB
                                                                                

Decision Tree - Medium Features Accuracy = 0.9224787280045985


23/05/22 03:03:24 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 2.7 MiB
23/05/22 03:03:41 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 2.7 MiB
                                                                                

Decision Tree - Medium Features Confustion Matrix
[[366441      0  16106   4732]
 [     0      0      0     11]
 [  9895      0  71806   1160]
 [  5824      0     92   8183]]


23/05/22 03:04:22 WARN org.apache.spark.scheduler.AsyncEventQueue: Dropped 152 events from dataprocEvent since Mon May 22 03:03:06 UTC 2023.
23/05/22 03:04:24 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 2.6 MiB
23/05/22 03:04:39 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 2.6 MiB
23/05/22 03:04:56 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 2.6 MiB
23/05/22 03:05:11 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 2.6 MiB
                                                                                

Begin training Random Forest - Medium Features


23/05/22 03:05:24 WARN org.apache.spark.scheduler.AsyncEventQueue: Dropped 232 events from dataprocEvent since Mon May 22 03:04:22 UTC 2023.
23/05/22 03:05:29 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 2.7 MiB
23/05/22 03:05:43 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 2.6 MiB
23/05/22 03:05:43 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 2.6 MiB
23/05/22 03:05:43 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 2.6 MiB
23/05/22 03:05:46 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 2.6 MiB
23/05/22 03:05:57 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 2.6 MiB
23/05/22 03:06:08 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 2.6 MiB
23/05/22 03:06:21 WARN org.apache.spark.scheduler.DAGScheduler: Broa

Begin evaluation on test data


23/05/22 03:06:51 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 2.7 MiB
                                                                                

Random Forest - Medium Features Accuracy = 0.8949546204016056


23/05/22 03:07:05 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 2.8 MiB
23/05/22 03:07:19 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 2.7 MiB
                                                                                

Random Forest - Medium Features Confustion Matrix
[[375466      0  11813      0]
 [    11      0      0      0]
 [ 17885      0  64976      0]
 [ 14013      0     86      0]]


23/05/22 03:07:57 WARN org.apache.spark.scheduler.AsyncEventQueue: Dropped 232 events from dataprocEvent since Mon May 22 03:06:47 UTC 2023.
23/05/22 03:07:58 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 2.7 MiB
23/05/22 03:08:12 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 2.7 MiB
23/05/22 03:08:26 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 2.7 MiB
23/05/22 03:08:39 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 2.7 MiB
                                                                                

Begin training Decision Tree - Full Features


23/05/22 03:08:58 WARN org.apache.spark.scheduler.AsyncEventQueue: Dropped 290 events from dataprocEvent since Mon May 22 03:07:57 UTC 2023.
23/05/22 03:08:59 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 3.9 MiB
23/05/22 03:09:44 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 3.8 MiB
23/05/22 03:09:44 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 3.8 MiB
23/05/22 03:09:45 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 3.8 MiB
23/05/22 03:09:51 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 3.8 MiB
23/05/22 03:10:27 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 3.8 MiB
23/05/22 03:11:04 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 3.8 MiB
23/05/22 03:11:51 WARN org.apache.spark.scheduler.DAGScheduler: Broa

Begin evaluation on test data


23/05/22 03:12:19 WARN org.apache.spark.scheduler.AsyncEventQueue: Dropped 116 events from dataprocEvent since Mon May 22 03:08:58 UTC 2023.
23/05/22 03:12:21 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 3.8 MiB
                                                                                

Decision Tree - Full Features Accuracy = 0.9221715788126652


23/05/22 03:12:54 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 3.9 MiB
23/05/22 03:13:26 WARN org.apache.spark.scheduler.AsyncEventQueue: Dropped 116 events from dataprocEvent since Mon May 22 03:12:19 UTC 2023.
23/05/22 03:13:28 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 3.9 MiB
                                                                                

Decision Tree - Full Features Confustion Matrix
[[366167      0  16360   4752]
 [     0      0      0     11]
 [  9787      0  71900   1148]
 [  5839      0     92   8194]]


23/05/22 03:14:24 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 3.8 MiB
23/05/22 03:14:53 WARN org.apache.spark.scheduler.AsyncEventQueue: Dropped 116 events from dataprocEvent since Mon May 22 03:13:26 UTC 2023.
23/05/22 03:14:55 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 3.8 MiB
23/05/22 03:15:26 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 3.8 MiB
23/05/22 03:15:58 WARN org.apache.spark.scheduler.AsyncEventQueue: Dropped 116 events from dataprocEvent since Mon May 22 03:14:53 UTC 2023.
23/05/22 03:15:59 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 3.8 MiB
                                                                                

Begin training Random Forest - Full Features


23/05/22 03:16:35 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 3.9 MiB
23/05/22 03:17:00 WARN org.apache.spark.scheduler.AsyncEventQueue: Dropped 174 events from dataprocEvent since Mon May 22 03:15:58 UTC 2023.
23/05/22 03:17:02 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 3.8 MiB
23/05/22 03:17:02 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 3.8 MiB
23/05/22 03:17:03 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 3.8 MiB
23/05/22 03:17:10 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 3.8 MiB
23/05/22 03:17:35 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 3.8 MiB
23/05/22 03:18:05 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 3.8 MiB
23/05/22 03:18:39 WARN org.apache.spark.scheduler.DAGScheduler: Broa

Begin evaluation on test data


23/05/22 03:19:30 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 3.9 MiB

Random Forest - Full Features Accuracy = 0.7676724921861194


23/05/22 03:19:56 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.0 MiB
23/05/22 03:20:28 WARN org.apache.spark.scheduler.AsyncEventQueue: Dropped 174 events from dataprocEvent since Mon May 22 03:19:25 UTC 2023.
23/05/22 03:20:30 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 3.9 MiB
                                                                                

Random Forest - Full Features Confustion Matrix
[[386443      0    836      0]
 [    11      0      0      0]
 [ 69580      0  13255      0]
 [ 14125      0      0      0]]


23/05/22 03:21:26 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 3.9 MiB
23/05/22 03:21:59 WARN org.apache.spark.scheduler.AsyncEventQueue: Dropped 116 events from dataprocEvent since Mon May 22 03:20:28 UTC 2023.
23/05/22 03:22:01 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 3.9 MiB
23/05/22 03:22:28 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 3.9 MiB
23/05/22 03:22:59 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 3.9 MiB
                                                                                

In [114]:
acc_labels

[[[0.9231355172885697, 0.0, 0.5871463246748542, 0.0],
  [0.9816127391363849, 0.0, 0.2665773043992803, 0.0]],
 [[0.9461938292548783, 0.0, 0.8665837969611759, 0.5803957727498404],
  [0.9694974424123177, 0.0, 0.78415659960657, 0.0]],
 [[0.9454863289773006, 0.0, 0.867990583690469, 0.5801061946902655],
  [0.9978413495180477, 0.0, 0.16001690106838898, 0.0]]]