In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import gc
from pandas.plotting import scatter_matrix
%matplotlib inline
import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from scipy.optimize import minimize
from statsmodels.tsa.seasonal import seasonal_decompose
import re
import plotly.express as px
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

"""for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))"""

"for dirname, _, filenames in os.walk('/kaggle/input'):\n    for filename in filenames:\n        print(os.path.join(dirname, filename))"

In [None]:
# Earlier approach, kept for reference: install directly into a target directory.
## !pip install 'polars[numpy,pandas,pyarrow]' --target=/kaggle/working/mysitepackages

# Download polars (with the numpy, pandas and pyarrow extras) as package files
# into the polars_pkg folder instead of installing it.
!pip download polars[numpy,pandas,pyarrow] -d /kaggle/working/mysitepackages/polars_pkg

In [None]:
# Download the pinned duckdb 0.7.1 package files into the "duck" folder instead of installing them.
!pip download duckdb==0.7.1 -d /kaggle/working/mysitepackages/duck

#### Create the zip file with the code below, then download it to your local machine

In [None]:
from zipfile import ZipFile

dirName = "/kaggle/working/mysitepackages"
zipName = "packages.zip"

# Bundle every file found below dirName into one archive. A file that carries
# the archive's own name is left out. Each member is stored under the same
# path it was read from.
with ZipFile(zipName, 'w') as zipObj:
    for folderName, subfolders, filenames in os.walk(dirName):
        members = (
            os.path.join(folderName, entry)
            for entry in filenames
            if entry != zipName
        )
        for member in members:
            zipObj.write(member)

#### Download the zip file to your local machine
#### Then upload the downloaded zip file as a dataset and run the commands below

In [None]:
# List the contents of the uploaded package directory.
! ls /kaggle/input/duckdb-n-polars-offline/kaggle/working/mysitepackages

In [None]:
# Install polars from the local polars_pkg folder only: --no-index disables the
# package index and --find-links points pip at the uploaded files.
!pip install polars[numpy,pandas,pyarrow] --no-index --find-links=file:///kaggle/input/duckdb-n-polars-offline/kaggle/working/mysitepackages/polars_pkg

In [2]:
# Install duckdb from the local "duck" folder only: --no-index disables the
# package index and --find-links points pip at the uploaded files.
!pip install duckdb --no-index --find-links=file:///kaggle/input/duckdb-n-polars-offline/kaggle/working/mysitepackages/duck

Looking in links: file:///kaggle/input/duckdb-n-polars-offline/kaggle/working/mysitepackages/duck
Processing /kaggle/input/duckdb-n-polars-offline/kaggle/working/mysitepackages/duck/duckdb-0.7.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
Installing collected packages: duckdb
Successfully installed duckdb-0.7.1
[0m

In [3]:
import polars as pl
import duckdb as dd

In [4]:
# ---- DeFOG recording metadata -------------------------------------------
defog_metadata = pl.scan_csv(
    "/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/defog_metadata.csv",
    try_parse_dates=True,
).collect()
print(defog_metadata.shape)
for key_col in ('Subject', 'Id'):
    print(defog_metadata.select(key_col).n_unique())

# ---- Subject-level table ------------------------------------------------
subjects = pl.scan_csv(
    "/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/subjects.csv",
    try_parse_dates=True,
).collect()
print(subjects.shape)
print(subjects.select('Subject').n_unique())

# Keep a single UPDRSIII column: the "On" score when Medication is 'on',
# the "Off" score for every other row.
updrs_for_medication_state = (
    pl.when(pl.col('Medication') == 'on')
    .then(pl.col('UPDRSIII_On'))
    .otherwise(pl.col('UPDRSIII_Off'))
    .alias("UPDRSIII")
)
defog_subjects_metadata = (
    defog_metadata
    .join(subjects, on=["Subject", "Visit"], how="inner")
    .with_columns(updrs_for_medication_state)
    .drop(['UPDRSIII_On', 'UPDRSIII_Off'])
)

# The two source tables are no longer needed once they have been joined.
del defog_metadata, subjects

for label, value in (
    ('defog_subjects_metadata.shape : ', defog_subjects_metadata.shape),
    ('defog_subjects_metadata.unique.Subject : ', defog_subjects_metadata.select('Subject').n_unique()),
    ('defog_subjects_metadata.unique.Id : ', defog_subjects_metadata.select('Id').n_unique()),
):
    print(label, value)

# ---- tDCS FOG recording metadata ----------------------------------------
tdcs_metadata = pl.scan_csv(
    "/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/tdcsfog_metadata.csv",
    try_parse_dates=True,
).collect()
for label, value in (
    ('tdcs_metadata.shape : ', tdcs_metadata.shape),
    ('tdcs_metadata.unique.Subject : ', tdcs_metadata.select('Subject').n_unique()),
    ('tdcs_metadata.unique.Id : ', tdcs_metadata.select('Id').n_unique()),
):
    print(label, value)

(137, 4)
45
137
(173, 8)
136
defog_subjects_metadata.shape :  (137, 9)
defog_subjects_metadata.unique.Subject :  45
defog_subjects_metadata.unique.Id :  137
tdcs_metadata.shape :  (833, 5)
tdcs_metadata.unique.Subject :  62
tdcs_metadata.unique.Id :  833


In [5]:
events = pl.scan_csv(
    "/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/events.csv",
    try_parse_dates=True,
)

# Episode bounds as whole numbers: value * 100, rounded, stored as Int64.
events = events.with_columns(
    [
        (pl.col(bound) * 100).round(0).cast(pl.Int64).alias(bound + '_m')
        for bound in ('Init', 'Completion')
    ]
)

# One 0/1 indicator column per event type found in the Type column.
events = events.with_columns(
    [
        pl.when(pl.col('Type') == event_type)
        .then(pl.lit(1))
        .otherwise(pl.lit(0))
        .alias(event_type)
        for event_type in ('Turn', 'Walking', 'StartHesitation')
    ]
)

# Swap the raw bounds for the integer versions, drop Type, then order the rows.
events = (
    events
    .drop(['Init', 'Completion', 'Type'])
    .rename({'Init_m': 'Init', 'Completion_m': 'Completion'})
    .sort(['Id', 'Init', 'Completion'], nulls_last=True)
)

In [6]:
tasks = pl.scan_csv(
    "/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/tasks.csv",
    try_parse_dates=True,
)

# Task bounds as whole numbers: value * 100, rounded, stored as Int64.
tasks = tasks.with_columns(
    [
        (pl.col(bound) * 100).round(0).cast(pl.Int64).alias(bound + '_m')
        for bound in ('Begin', 'End')
    ]
)

# Replace the raw bounds with their integer counterparts.
tasks = (
    tasks
    .drop(['Begin', 'End'])
    .rename({'Begin_m': 'Begin', 'End_m': 'End'})
)

In [64]:
# Build the per-sample DeFOG training table.
#
# Every CSV under train/defog is read as one recording. For each file:
#   * only rows where both the Valid and Task flags are true are kept,
#   * the file name (without ".csv") becomes the Id column,
#   * AccV / AccML / AccAP are multiplied by 9.80665 (m/s^2 values) and
#     rounded to 5 decimals,
#   * the Kinetic flag of any event whose [Init, Completion] window contains
#     the sample is attached (-1 when there is none),
#   * the name of any task whose [Begin, End] window contains the sample is
#     attached ('No Task' when there is none).
# The per-file results are then stacked and joined with the subject metadata.
G_TO_MS2 = 9.80665  # multiplier applied to the three acceleration columns

# Collect the per-file frames and concatenate once after the loop. Growing a
# DataFrame with pl.concat on every iteration re-copies all earlier rows.
defog_train_frames = []

for dirname, _, filenames in os.walk('/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/train/defog'):
    # os.walk gives no ordering guarantee; sort so the row order is reproducible.
    for filename in sorted(filenames):
        if not filename.endswith('.csv'):
            continue  # skip stray non-CSV files rather than failing on them

        temp_defog = (
            pl.scan_csv(os.path.join(dirname, filename), try_parse_dates=True)
            .filter((pl.col("Valid")) & (pl.col("Task")))
            .with_columns(pl.lit(filename[0:-4]).alias('Id'))
            .with_columns(
                [
                    (pl.col(axis) * G_TO_MS2).round(5).cast(pl.Float64).alias(axis)
                    for axis in ('AccV', 'AccML', 'AccAP')
                ]
            )
            .sort(['Id', 'Time'], nulls_last=True)
        )

        # DuckDB resolves temp_defog, events and tasks from the Python variables
        # of the same name, so those variable names must stay as they are.
        # NOTE(review): both window joins are inclusive at both ends, so a sample
        # lying exactly on a shared boundary could match two rows — confirm.
        temp_final = dd.sql("SELECT td.Id, td.Time, td.AccV, td.AccML, td.AccAP, td.StartHesitation, td.Turn, td.Walking, coalesce(e.Kinetic,-1) as Kinetic \
        , coalesce(t.Task,'No Task') as Task \
        FROM temp_defog td \
        left join events e on td.Id = e.Id and td.Time >= e.Init and td.Time <= e.Completion \
        left join tasks t on td.Id = t.Id and td.Time >= t.Begin and td.Time <= t.End").pl()

        defog_train_frames.append(temp_final)

# An empty directory yields an empty frame, as the original loop did.
defog_train_data = pl.concat(defog_train_frames, how="vertical") if defog_train_frames else pl.DataFrame()
del defog_train_frames

defog_train_data_with_metadata = dd.sql('select t1.Id, t2.Subject, t2.Visit, t1.Time, t2.Medication, t2.Age, t2.Sex, t2.YearsSinceDx \
,t2.NFOGQ, t2.UPDRSIII, t1.Task, t1.AccV, t1.AccML, t1.AccAP, t1.Kinetic \
,t1.StartHesitation, t1.Turn, t1.Walking \
from defog_train_data t1 join defog_subjects_metadata t2 on t1.Id = t2.Id').pl()

# defog_subjects_metadata is deliberately NOT deleted here: the DeFOG test cell
# further down joins against it, and deleting it broke a top-to-bottom run.
del defog_train_data
gc.collect()

60392

In [70]:
# Build the per-sample tDCS FOG training table.
#
# Every CSV under train/tdcsfog is read as one recording, tagged with the Id
# taken from the file name, and given the Kinetic flag of any event whose
# window contains the sample (-1 when there is none). The per-file results
# are then stacked and joined with the tdcsfog metadata.

# `events` stores Init / Completion as the raw value * 100, while Time in these
# files is a sample index. The two only line up if the series is 100 Hz.
# NOTE(review): the competition data description gives 128 Hz for tdcsfog
# (100 Hz for defog) and event times in seconds — confirm before relying on it.
TDCS_SAMPLE_RATE_HZ = 128
EVENT_TICKS_PER_SECOND = 100

# Collect the per-file frames and concatenate once after the loop. Growing a
# DataFrame with pl.concat on every iteration re-copies all earlier rows.
tdcs_train_frames = []

for dirname, _, filenames in os.walk('/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/train/tdcsfog'):
    # os.walk gives no ordering guarantee; sort so the row order is reproducible.
    for filename in sorted(filenames):
        if not filename.endswith('.csv'):
            continue  # skip stray non-CSV files rather than failing on them

        temp_tdcs = (
            pl.scan_csv(os.path.join(dirname, filename), try_parse_dates=True)
            .with_columns(pl.lit(filename[0:-4]).alias('Id'))
            .sort(['Id', 'Time'], nulls_last=True)
        )

        # DuckDB resolves temp_tdcs and events from the Python variables of the
        # same name, so those variable names must stay as they are.
        # Both sides are brought to a common integer scale before comparing:
        #   Time / 128 >= Init / 100   <=>   Time * 100 >= Init * 128
        temp_final = dd.sql(f"""
            SELECT td.Id, td.Time, td.AccV, td.AccML, td.AccAP,
                   td.StartHesitation, td.Turn, td.Walking,
                   coalesce(e.Kinetic,-1) as Kinetic
            FROM temp_tdcs td
            left join events e
              on td.Id = e.Id
             and td.Time * {EVENT_TICKS_PER_SECOND} >= e.Init * {TDCS_SAMPLE_RATE_HZ}
             and td.Time * {EVENT_TICKS_PER_SECOND} <= e.Completion * {TDCS_SAMPLE_RATE_HZ}
        """).pl()

        tdcs_train_frames.append(temp_final)

# An empty directory yields an empty frame, as the original loop did.
tdcs_train_data = pl.concat(tdcs_train_frames, how="vertical") if tdcs_train_frames else pl.DataFrame()
del tdcs_train_frames

tdcs_train_data_with_metadata = dd.sql('select t1.Id, t2.Subject, t2.Visit, t1.Time, t2.Medication, t2.Test, t1.AccV, t1.AccML, t1.AccAP, t1.Kinetic \
,t1.StartHesitation, t1.Turn, t1.Walking \
from tdcs_train_data t1 join tdcs_metadata t2 on t1.Id = t2.Id').pl()

# Release the intermediates; only the merged table is used afterwards.
del tdcs_train_data, tdcs_metadata
gc.collect()

0

In [73]:
# (rows, columns) of the merged tdcsfog training table.
tdcs_train_data_with_metadata.shape

(7066211, 13)

In [74]:
# Optional preview of the first rows (disabled):
#tdcs_train_data_with_metadata.head(100)

# Number of distinct Id values present in the merged tdcsfog table.
print(tdcs_train_data_with_metadata.select('Id').n_unique())

833


In [20]:
# Save both merged training tables as parquet files in /kaggle/working.
defog_train_data_with_metadata.write_parquet('/kaggle/working/defog_train_data_with_metadata.parquet')
tdcs_train_data_with_metadata.write_parquet('/kaggle/working/tdcs_train_data_with_metadata.parquet')

In [6]:
# Load the merged defog training table from a parquet file attached as an input
# dataset; this replaces any in-memory defog_train_data_with_metadata.
# NOTE(review): presumably the file produced by write_parquet and re-uploaded — confirm.
defog_train_data_with_metadata = pl.read_parquet('/kaggle/input/defog-train-data-with-metadata/defog_train_data_with_metadata.parquet')

In [18]:
# Distinct Task values, transposed so they display as a single wide row.
defog_train_data_with_metadata.select(pl.col('Task')).unique().transpose()

column_0,column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9,column_10,column_11,column_12,column_13,column_14,column_15,column_16,column_17,column_18,column_19,column_20,column_21,column_22,column_23,column_24,column_25,column_26,column_27,column_28,column_29,column_30
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""Rest1""","""Rest2""","""4MW""","""MB2a""","""MB1""","""MB3-L""","""MB2b""","""MB3-R""","""MB4""","""MB7""","""MB6-R""","""MB6-L""","""MB5""","""MB8""","""MB9""","""MB10""","""MB13""","""MB12""","""MB11""","""TUG-ST""","""TUG-DT""","""Turning-ST""","""Turning-DT""","""Hotspot1""","""Hotspot2""","""TUG-C""","""Turning-C""","""Hotspot1-C""","""Hotspot2-C""","""4MW-C""","""MB6"""


In [8]:
# Inspect the rows of a single Id in time order.
defog_train_data_with_metadata.filter(pl.col('Id')=='77d7d95074').sort('Time') ## Ids tried here: 00c4c9313d, 77d7d95074

Id,Subject,Visit,Time,Medication,Age,Sex,YearsSinceDx,NFOGQ,UPDRSIII,Task,AccV,AccML,AccAP,Kinetic,StartHesitation,Turn,Walking
str,str,i64,i64,str,i64,str,f64,i64,i64,str,f64,f64,f64,i64,i64,i64,i64
"""77d7d95074""","""413532""",1,1000,"""on""",77,"""M""",21.0,24,37,"""Rest1""",-8.50311,-2.51419,4.61952,-1,0,0,0
"""77d7d95074""","""413532""",1,1001,"""on""",77,"""M""",21.0,24,37,"""Rest1""",-8.50025,-2.51818,4.61314,-1,0,0,0
"""77d7d95074""","""413532""",1,1002,"""on""",77,"""M""",21.0,24,37,"""Rest1""",-8.49875,-2.53144,4.60751,-1,0,0,0
"""77d7d95074""","""413532""",1,1003,"""on""",77,"""M""",21.0,24,37,"""Rest1""",-8.49458,-2.52311,4.59673,-1,0,0,0
"""77d7d95074""","""413532""",1,1004,"""on""",77,"""M""",21.0,24,37,"""Rest1""",-8.50451,-2.51152,4.60716,-1,0,0,0
"""77d7d95074""","""413532""",1,1005,"""on""",77,"""M""",21.0,24,37,"""Rest1""",-8.51063,-2.51322,4.6191,-1,0,0,0
"""77d7d95074""","""413532""",1,1006,"""on""",77,"""M""",21.0,24,37,"""Rest1""",-8.50097,-2.51853,4.62438,-1,0,0,0
"""77d7d95074""","""413532""",1,1007,"""on""",77,"""M""",21.0,24,37,"""Rest1""",-8.49265,-2.52463,4.63078,-1,0,0,0
"""77d7d95074""","""413532""",1,1008,"""on""",77,"""M""",21.0,24,37,"""Rest1""",-8.49973,-2.53138,4.62923,-1,0,0,0
"""77d7d95074""","""413532""",1,1009,"""on""",77,"""M""",21.0,24,37,"""Rest1""",-8.50288,-2.52905,4.63785,-1,0,0,0


In [10]:
# Build the per-sample DeFOG test table.
#
# Every CSV under test/defog is read as one recording. For each file:
#   * the file name (without ".csv") becomes the Id column,
#   * AccV / AccML / AccAP are multiplied by 9.80665 (m/s^2 values) and
#     rounded to 5 decimals,
#   * the Kinetic flag of any event whose [Init, Completion] window contains
#     the sample is attached (-1 when there is none),
#   * the name of any task whose [Begin, End] window contains the sample is
#     attached ('No Task' when there is none).
# The per-file results are then stacked and joined with the subject metadata.
G_TO_MS2 = 9.80665  # multiplier applied to the three acceleration columns

# Collect the per-file frames and concatenate once after the loop. Growing a
# DataFrame with pl.concat on every iteration re-copies all earlier rows.
defog_test_frames = []

for dirname, _, filenames in os.walk('/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/test/defog'):
    # os.walk gives no ordering guarantee; sort so the row order is reproducible.
    for filename in sorted(filenames):
        if not filename.endswith('.csv'):
            continue  # skip stray non-CSV files rather than failing on them

        temp_defog = (
            pl.scan_csv(os.path.join(dirname, filename), try_parse_dates=True)
            .with_columns(pl.lit(filename[0:-4]).alias('Id'))
            .with_columns(
                [
                    (pl.col(axis) * G_TO_MS2).round(5).cast(pl.Float64).alias(axis)
                    for axis in ('AccV', 'AccML', 'AccAP')
                ]
            )
            .sort(['Id', 'Time'], nulls_last=True)
        )

        # DuckDB resolves temp_defog, events and tasks from the Python variables
        # of the same name, so those variable names must stay as they are.
        temp_final = dd.sql("SELECT td.Id, td.Time, td.AccV, td.AccML, td.AccAP, coalesce(e.Kinetic,-1) as Kinetic, coalesce(t.Task,'No Task') as Task \
        FROM temp_defog td \
        left join events e on td.Id = e.Id and td.Time >= e.Init and td.Time <= e.Completion \
        left join tasks t on td.Id = t.Id and td.Time >= t.Begin and td.Time <= t.End").pl()

        defog_test_frames.append(temp_final)

# An empty directory yields an empty frame, as the original loop did.
defog_test_data = pl.concat(defog_test_frames, how="vertical") if defog_test_frames else pl.DataFrame()
del defog_test_frames

defog_test_data_with_metadata = dd.sql('select t1.Id, t2.Subject, t2.Visit, t1.Time, t2.Medication, t2.Age, t2.Sex, t2.YearsSinceDx \
,t2.NFOGQ, t2.UPDRSIII, t1.Task, t1.AccV, t1.AccML, t1.AccAP, t1.Kinetic \
from defog_test_data t1 join defog_subjects_metadata t2 on t1.Id = t2.Id').pl()

# Release the intermediates; only the merged table is used afterwards.
del defog_test_data, defog_subjects_metadata
gc.collect()

60

In [11]:
# Display the merged defog test rows ordered by Time.
defog_test_data_with_metadata.sort('Time')

Id,Subject,Visit,Time,Medication,Age,Sex,YearsSinceDx,NFOGQ,UPDRSIII,Task,AccV,AccML,AccAP,Kinetic
str,str,i64,i64,str,i64,str,f64,i64,i64,str,f64,f64,f64,i64
"""02ab235146""","""e1f62e""",2,0,"""on""",79,"""F""",8.0,28,38,"""No Task""",-8.96967,-2.95034,2.92391,-1
"""02ab235146""","""e1f62e""",2,1,"""on""",79,"""F""",8.0,28,38,"""No Task""",-8.96688,-2.95741,2.92556,-1
"""02ab235146""","""e1f62e""",2,2,"""on""",79,"""F""",8.0,28,38,"""No Task""",-7.64468,-3.36561,2.70359,-1
"""02ab235146""","""e1f62e""",2,3,"""on""",79,"""F""",8.0,28,38,"""No Task""",-9.73959,-2.65055,3.09669,-1
"""02ab235146""","""e1f62e""",2,4,"""on""",79,"""F""",8.0,28,38,"""No Task""",-9.30422,-3.01896,2.91445,-1
"""02ab235146""","""e1f62e""",2,5,"""on""",79,"""F""",8.0,28,38,"""No Task""",-8.90649,-2.92705,2.80188,-1
"""02ab235146""","""e1f62e""",2,6,"""on""",79,"""F""",8.0,28,38,"""No Task""",-9.01279,-2.94532,2.84569,-1
"""02ab235146""","""e1f62e""",2,7,"""on""",79,"""F""",8.0,28,38,"""No Task""",-8.99727,-2.94726,2.91122,-1
"""02ab235146""","""e1f62e""",2,8,"""on""",79,"""F""",8.0,28,38,"""No Task""",-8.99269,-2.92374,2.94614,-1
"""02ab235146""","""e1f62e""",2,9,"""on""",79,"""F""",8.0,28,38,"""No Task""",-8.99789,-2.89523,2.90047,-1


##### How to upsample

In [None]:
# Summary statistics for every column of the merged defog training table.
defog_train_data_with_metadata.describe()

In [None]:
# Correlation matrix over the selected numeric columns (signals, scores and labels).
defog_train_data_with_metadata.select('AccV','AccML','AccAP','Kinetic','Age','NFOGQ','UPDRSIII','StartHesitation','Turn','Walking').corr()

In [None]:
# Row counts per (Task, Visit, StartHesitation) combination; only the first 65 groups are shown.
dd.sql('select Task, Visit, StartHesitation, count(1) from defog_train_data_with_metadata group by Task, Visit, StartHesitation').pl().head(65)

In [None]:
# One scatter figure per task (Visit 1 rows only), stored by task name so a
# later cell can render them.
fig_dict = {}
defog_task_list = defog_train_data_with_metadata.get_column('Task').unique().to_list()

for task in defog_task_list:
    plot_title = f'AccV Vs Time for task : {task}'
    first_visit_rows_for_task = (pl.col('Visit') == 1) & (pl.col('Task') == task)

    # StartHesitation is turned into text so plotly colours it as a category;
    # the column order matches Time, AccV, Sex, StartHesitation.
    temp = (
        defog_train_data_with_metadata
        .filter(first_visit_rows_for_task)
        .select('Time', 'AccV', 'Sex', pl.col('StartHesitation').cast(pl.Utf8))
    )
    temp_pdf = temp.to_pandas()

    # Tasks without any Visit-1 rows get no figure.
    if temp.height > 0:
        print('For Task :', task)
        fig_dict[task] = px.scatter(
            temp_pdf,
            x="Time",
            y="AccV",
            color="StartHesitation",
            facet_col="Sex",
            title=plot_title,
        )

    # Free the per-task copies before moving on to the next task.
    del temp, temp_pdf
    gc.collect()

In [None]:
from plotly.offline import init_notebook_mode

init_notebook_mode(connected=True)

# Render each stored figure, printing its task name first.
for task_name, task_figure in fig_dict.items():
    print(task_name)
    task_figure.show()