In [1]:
import os 
#from fog.code.utils.utils import *

import pyspark.sql.functions as F
#from pyspark.sql.types import StringType, BooleanType, IntegerType, FloatType, DateType
from pyspark.sql import SparkSession
from google.cloud import storage

# import Spark stuff
from pyspark import SparkFiles
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, IndexToString, VectorAssembler
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [2]:
spark = SparkSession.builder.appName('defog-labeled').getOrCreate()

In [3]:
defog_path = "parkinsons_data/train/processed/defog_tasks_lagging"
top_bucket_name = "msca-bdp-student-gcs"

In [4]:
def feed_files(top_bucket_name, prefix, suffix):
    client = storage.Client()
    blobs = client.list_blobs(top_bucket_name, prefix=prefix)

    processed = None

    for i, blob in enumerate(blobs):
        print(blob.name)
        if blob.name.endswith(suffix):
            
            if suffix == ".parquet":
                df = spark.read.parquet(f"gs://{top_bucket_name}/{blob.name}")
            elif suffix == ".csv":
                df = spark.read.csv(f"gs://{top_bucket_name}/{blob.name}")
            if processed is None:
                processed = df
            else:
                processed = processed.union(df)
    return processed



In [5]:
defog = feed_files(top_bucket_name, defog_path, ".parquet")

parkinsons_data/train/processed/defog_tasks_lagging/
parkinsons_data/train/processed/defog_tasks_lagging/_SUCCESS
parkinsons_data/train/processed/defog_tasks_lagging/part-00000-eae2acef-cda6-49a5-b43f-ded309b7bad5-c000.snappy.parquet


                                                                                

parkinsons_data/train/processed/defog_tasks_lagging/part-00001-eae2acef-cda6-49a5-b43f-ded309b7bad5-c000.snappy.parquet
parkinsons_data/train/processed/defog_tasks_lagging/part-00002-eae2acef-cda6-49a5-b43f-ded309b7bad5-c000.snappy.parquet
parkinsons_data/train/processed/defog_tasks_lagging/part-00003-eae2acef-cda6-49a5-b43f-ded309b7bad5-c000.snappy.parquet


                                                                                

parkinsons_data/train/processed/defog_tasks_lagging/part-00004-eae2acef-cda6-49a5-b43f-ded309b7bad5-c000.snappy.parquet
parkinsons_data/train/processed/defog_tasks_lagging/part-00005-eae2acef-cda6-49a5-b43f-ded309b7bad5-c000.snappy.parquet
parkinsons_data/train/processed/defog_tasks_lagging/part-00006-eae2acef-cda6-49a5-b43f-ded309b7bad5-c000.snappy.parquet
parkinsons_data/train/processed/defog_tasks_lagging/part-00007-eae2acef-cda6-49a5-b43f-ded309b7bad5-c000.snappy.parquet
parkinsons_data/train/processed/defog_tasks_lagging/part-00008-eae2acef-cda6-49a5-b43f-ded309b7bad5-c000.snappy.parquet
parkinsons_data/train/processed/defog_tasks_lagging/part-00009-eae2acef-cda6-49a5-b43f-ded309b7bad5-c000.snappy.parquet
parkinsons_data/train/processed/defog_tasks_lagging/part-00010-eae2acef-cda6-49a5-b43f-ded309b7bad5-c000.snappy.parquet
parkinsons_data/train/processed/defog_tasks_lagging/part-00011-eae2acef-cda6-49a5-b43f-ded309b7bad5-c000.snappy.parquet
parkinsons_data/train/processed/defog_ta

In [6]:
# drop Test, Visit, Medication cols because they were for tdcsfog
defog = defog.drop("Test", "Visit", "Medication")

# keep only when valid = True
defog = defog.filter(F.col("Valid")==True).filter(F.col("Task")==True)

# convert Time to float
defog = defog.withColumn("Time", F.col("Time").cast("float"))
defog.printSchema()

root
 |-- Subject: string (nullable = true)
 |-- Id: string (nullable = true)
 |-- Time: float (nullable = true)
 |-- AccV: float (nullable = true)
 |-- AccML: float (nullable = true)
 |-- AccAP: float (nullable = true)
 |-- StartHesitation: integer (nullable = true)
 |-- Turn: integer (nullable = true)
 |-- Walking: integer (nullable = true)
 |-- Valid: boolean (nullable = true)
 |-- Task: boolean (nullable = true)
 |-- SourceDefog: integer (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Sex: string (nullable = true)
 |-- YearsSinceDx: integer (nullable = true)
 |-- UPDRSIII_On: integer (nullable = true)
 |-- UPDRSIII_Off: integer (nullable = true)
 |-- NFOGQ: integer (nullable = true)
 |-- TimeSeconds: double (nullable = true)
 |-- Begin: double (nullable = true)
 |-- End: double (nullable = true)
 |-- TaskType: string (nullable = true)
 |-- MB9: integer (nullable = true)
 |-- Rest1: integer (nullable = true)
 |-- MB6-L: integer (nullable = true)
 |-- MB6-R: integer (nulla

In [7]:
# create sex binary cols
defog = defog.withColumn('SexInd', 
                  F.when((F.col("Sex") == "F"), 1) \
                    .when((F.col("Sex") == "M"), 0)
                  )

In [8]:
# convert task types to multilabels
# keep null tasks for now
from pyspark.ml.feature import StringIndexer

indexer = StringIndexer(inputCol='TaskType', outputCol='TaskTypeInd', handleInvalid="keep")
model = indexer.fit(defog)
defog = model.transform(defog)

# use model.labels to see the order of the labels

                                                                                

In [9]:
# remove unnecessary cols
defog = defog.drop("Subject", "Sex", "Id", "TaskType")

In [10]:
defog.groupBy("target").count().show()

                                                                                

+------+-------+
|target|  count|
+------+-------+
|     1|     88|
|     3|  70521|
|     2| 414380|
|     0|3626334|
+------+-------+



In [20]:
# feature selection
# ignore standardized for now
def vectorize():
    """
    Creates vectorized dataframe.
    """
    """
    feature_cols = ["AccV", "AccML", "AccAP", "TaskTypeInd",
        "AccV_lag1", "AccV_lag2", "AccV_lag3", "AccV_lag4", "AccV_lag5",
        "AccV_lag6", "AccV_lag7", "AccV_lag8", "AccV_lag9", "AccV_lag10",
        "AccML_lag1", "AccML_lag2", "AccML_lag3", "AccML_lag4", "AccML_lag5",
        "AccML_lag6", "AccML_lag7", "AccML_lag8", "AccML_lag9", "AccML_lag10",
        "AccAP_lag1", "AccAP_lag2", "AccAP_lag3", "AccAP_lag4", "AccAP_lag5",
        "AccAP_lag6", "AccAP_lag7", "AccAP_lag8", "AccAP_lag9", "AccAP_lag10",
        "prediction4"]
    """
    #feature_cols = ['standardized'] + ["TaskTypeInd", "prediction4"]
    label_cols = ['target', 'StartHesitation', 'Turn', 'Walking']
    feature_cols = ["AccV", "AccML", "AccAP", "TaskTypeInd", "prediction4"]

    defog_sel = defog.select(feature_cols + label_cols)
    assembler = VectorAssembler(inputCols=feature_cols, outputCol='features', handleInvalid="skip")
    vectorized = assembler.transform(defog_sel)
    return vectorized

In [23]:
def run_logreg(train, test, labelCol):
    # create model
    lgr = LogisticRegression(maxIter=10, featuresCol = 'features', labelCol=labelCol)

    # fit model
    lgrm = lgr.fit(train)

    # make predictions
    predictions = lgrm.transform(test)

    return predictions

def evaluate(predictions):
    evaluator = MulticlassClassificationEvaluator(labelCol="target", predictionCol="prediction")

    print("Accuracy", evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"}))
    print("F1", evaluator.evaluate(predictions, {evaluator.metricName: "f1"}))

Lag features, not standardized:  
> Accuracy 0.8811464693942825  
F1 0.8278086328796053

No lag, standardized:  
>Accuracy 0.8818123580030658  
F1 0.8396314248066408  

No lag, unstandardized:  
>Accuracy 0.8819386763598863  
F1 0.8404164161360337  

Use unstandardized, no lag


In [24]:
# no lag, unstandardized
vectorized = vectorize()
train, test = vectorized.randomSplit([0.8, 0.2],0.0)
predictions = run_logreg(train, test, 'target')
evaluate(predictions)

                                                                                

Accuracy 0.8815381570335543




F1 0.8398448335102066


                                                                                

In [25]:
predictions.select('prediction').groupBy('prediction').count().show()



+----------+------+
|prediction| count|
+----------+------+
|       0.0|807749|
|       3.0|     3|
|       2.0| 13585|
+----------+------+



                                                                                

Our algorithm can't predict labels 1 and 3.
We will try to treat it as a binary classification problem.  

Try classifying label 1 = Start Hesitation

In [26]:
vectorized.groupBy('StartHesitation').count().show()

                                                                                

+---------------+-------+
|StartHesitation|  count|
+---------------+-------+
|              1|     88|
|              0|4111235|
+---------------+-------+



In [28]:
# binary target=1
predictions = run_logreg(train, test, 'StartHesitation')
evaluate(predictions)

                                                                                

Accuracy 0.8815551570007687




F1 0.8260608167059249


                                                                                

In [29]:
# show predicted labels
predictions.groupBy("prediction").count().show()



+----------+------+
|prediction| count|
+----------+------+
|       0.0|823531|
+----------+------+



                                                                                

Treating label 1 as a binary problem doesn't improve classification. We will try to undersample the data instead.

In [30]:
def resample(large_dataframe, ratio, class_field, base_class):
    """
    Resamples non-minority label by a chosen factor
    """
    pos = large_dataframe.filter(large_dataframe[class_field] != base_class)
    neg = large_dataframe.filter(large_dataframe[class_field] == base_class)
    total_pos = pos.count()
    total_neg = neg.count()
    fraction=float(total_pos*ratio)/float(total_neg)
    sampled = neg.sample(False,fraction)
    
    return sampled.union(pos)
sampled = resample(vectorized, 5, 'StartHesitation', 0)
sampled.select("StartHesitation").groupBy("StartHesitation").count().show()



+---------------+-----+
|StartHesitation|count|
+---------------+-----+
|              1|   88|
|              0|  496|
+---------------+-----+



                                                                                

In [31]:
# binary target=1, undersampled majority
train, test = sampled.randomSplit([0.8, 0.2],0.0)
predictions = run_logreg(train, test, 'StartHesitation')
evaluate(predictions)

                                                                                

Accuracy 0.7542372881355932




F1 0.655858511422255


                                                                                

In [77]:
predictions.groupBy("prediction").count().show()

23/05/22 01:31:24 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 1975.5 KiB

+----------+-----+
|prediction|count|
+----------+-----+
|       0.0|  117|
|       1.0|   21|
+----------+-----+



                                                                                

This is better, but we've had to severely undersample the data.

Now treat label 3=walking as binary

In [32]:
vectorized.groupBy('Walking').count().show()



+-------+-------+
|Walking|  count|
+-------+-------+
|      1|  70521|
|      0|4040802|
+-------+-------+



                                                                                

In [33]:
# binary target=3
train, test = vectorized.randomSplit([0.8, 0.2],0.0)
predictions = run_logreg(train, test, 'Walking')
evaluate(predictions)

                                                                                

Accuracy 0.8815478713005339




F1 0.8260571882775194


                                                                                

In [34]:
predictions.groupBy("prediction").count().show()



+----------+------+
|prediction| count|
+----------+------+
|       0.0|823525|
|       1.0|     6|
+----------+------+



                                                                                

Now we try to undersampling

In [35]:
# undersample majority relative to "Walking"
sampled = resample(vectorized, 5, 'Walking', 0)
sampled.select("Walking").groupBy("Walking").count().show()



+-------+------+
|Walking| count|
+-------+------+
|      1| 70521|
|      0|352607|
+-------+------+



                                                                                

In [36]:
# binary target=3, undersampled majority
train, test = sampled.randomSplit([0.8, 0.2],0.0)
predictions = run_logreg(train, test, 'Walking')
evaluate(predictions)

                                                                                

Accuracy 0.7473675522067754




F1 0.6397606656325797


                                                                                

In [37]:
predictions.groupBy("prediction").count().show()



+----------+-----+
|prediction|count|
+----------+-----+
|       0.0|84761|
|       1.0|   46|
+----------+-----+



                                                                                

Even though some models produce higher accuracies and F1 scores, they may not be the most sensitive models because they end up not being able to predict all of our labels. This is likely due to our unbalanced dataset, that has a lot of rows with labels=0, 2 and a relatively small number with labels=1, 3. Some areas to explore in the future could be a combination of undersampling and oversampling, or appying a method such as SMOTE to generate new data points.