# Preprocessing ALS Dataset

In [1]:
import os

import utilsp
import numpy as np
import pandas as pd
import warnings

# NOTE(review): `Warning` is the base class of every warning category, so this
# silences ALL warnings for the rest of the notebook (including pandas and
# deprecation warnings). Narrow the category if that is broader than intended.
warnings.filterwarnings('ignore', category=Warning)

In [2]:
# Load the raw PRO-ACT demographics table and preview its first rows.
df_demographics = pd.read_csv('Data/source/PROACT_DEMOGRAPHICS.csv')
df_demographics.head()

Unnamed: 0,subject_id,Demographics_Delta,Age,Date_of_Birth,Ethnicity,Race_Americ_Indian_Alaska_Native,Race_Asian,Race_Black_African_American,Race_Hawaiian_Pacific_Islander,Race_Unknown,Race_Caucasian,Race_Other,Race_Other_Specify,Sex
0,121,0.0,52.0,,,,,,,,1.0,,,Female
1,226,0.0,72.0,,,,,,,,1.0,,,Male
2,290,0.0,,,,,,,,,,,,Male
3,624,0.0,,,,,,,,,,,,Female
4,666,0.0,42.0,,,,,,,,1.0,,,Male


In [3]:
# Print, for every column, the non-null count, NaN count and number of uniques.
utilsp.show_columns_stats(df_demographics)

subject_id...................... = 12504 rows (100.0%)     0 with NaN (  0.0%) Uniques= 12504 
Demographics_Delta.............. = 12419 rows (99.32%)    85 with NaN ( 0.68%) Uniques=    12 
Age............................. =  9483 rows (75.84%)  3021 with NaN (24.16%) Uniques=   196 
Date_of_Birth................... =  1573 rows (12.58%) 10931 with NaN (87.42%) Uniques=  1487 
Ethnicity....................... =  3764 rows ( 30.1%)  8740 with NaN ( 69.9%) Uniques=     5 
Race_Americ_Indian_Alaska_Native =   352 rows ( 2.82%) 12152 with NaN (97.18%) Uniques=     3 
Race_Asian...................... =   422 rows ( 3.37%) 12082 with NaN (96.63%) Uniques=     3 
Race_Black_African_American..... =   484 rows ( 3.87%) 12020 with NaN (96.13%) Uniques=     3 
Race_Hawaiian_Pacific_Islander.. =   341 rows ( 2.73%) 12163 with NaN (97.27%) Uniques=     3 
Race_Unknown.................... =   365 rows ( 2.92%) 12139 with NaN (97.08%) Uniques=     3 
Race_Caucasian.................. =  8377 rows (66.

In [4]:
# Recover Age for the rows that lack it but do carry a Date_of_Birth value.
age_is_missing = df_demographics["Age"].isnull()
birth_is_known = df_demographics["Date_of_Birth"].notnull()
df_calc_age_from_birth = df_demographics.loc[age_is_missing & birth_is_known]
print(f"Age calculated for {df_calc_age_from_birth.shape[0]} samples")


def _age_from_birth_delta(birth_delta):
    """Convert one Date_of_Birth value to an age; the sign is dropped first."""
    return utilsp.calculate_age_from_birth_delta(np.abs(birth_delta))


ages_calculated = df_calc_age_from_birth["Date_of_Birth"].apply(_age_from_birth_delta)

# Write the derived ages back onto exactly the rows they were computed from.
df_demographics.loc[df_calc_age_from_birth.index, "Age"] = ages_calculated
utilsp.show_columns_stats(df_demographics, ["Age"])

Age calculated for 944 samples
Age = 10427 rows (83.39%)  2077 with NaN (16.61%) Uniques=   196 


In [5]:
# Drop subjects with no Age, then those with no Sex; each removal is logged
# under its own label by the helper.
for column, label in (("Age", "AGE"), ("Sex", "SEX")):
    to_delete = df_demographics.loc[df_demographics[column].isnull()]
    df_demographics = utilsp.remove_rows(
        df=df_demographics, to_delete=to_delete, info=label
    )

# Only the identifier and the two cleaned attributes are carried forward.
df_demographics = df_demographics[["subject_id", "Age", "Sex"]]
df_demographics.head()

  - AGE Previous=12504, To delete=2077, After=10427
  - SEX Previous=10427, To delete=0, After=10427


Unnamed: 0,subject_id,Age,Sex
0,121,52.0,Female
1,226,72.0,Male
4,666,42.0,Male
5,671,56.0,Female
6,935,45.0,Female


In [6]:
# Sanity check: the three retained columns should now have no NaN values.
utilsp.show_columns_stats(df_demographics, df_demographics.columns)

subject_id = 10427 rows (100.0%)     0 with NaN (  0.0%) Uniques= 10427 
Age....... = 10427 rows (100.0%)     0 with NaN (  0.0%) Uniques=   195 
Sex....... = 10427 rows (100.0%)     0 with NaN (  0.0%) Uniques=     2 


In [7]:
# Load the raw PRO-ACT ALS history table (a subject can appear on several rows).
df_als_history = pd.read_csv('Data/source/PROACT_ALSHISTORY.csv')
df_als_history.head()

Unnamed: 0,subject_id,Site_of_Onset___Bulbar,Site_of_Onset___Limb,Site_of_Onset___Limb_and_Bulbar,Site_of_Onset___Other,Site_of_Onset___Other_Specify,Site_of_Onset___Spine,Subject_ALS_History_Delta,Disease_Duration,Symptom,Symptom_Other_Specify,Location,Location_Other_Specify,Site_of_Onset,Onset_Delta,Diagnosis_Delta
0,121,,,,,,,0.0,,,,,,Onset: Limb,,
1,121,,,,,,,0.0,,Weakness,,UPPER LIMBS,,,-366.0,
2,226,,,,,,,0.0,,,,,,Onset: Bulbar,,
3,226,,,,,,,0.0,,Weakness,,"DIFFICULT PRONUNCIATION OF """"""""""""""""R"""""""""""""""" M...",,,-459.0,
4,290,,1.0,,,,,,,,,,,,,


In [8]:
# Per-column non-null / NaN / unique counts for the raw ALS history table.
utilsp.show_columns_stats(df_als_history, df_als_history.columns)

subject_id..................... = 13765 rows (100.0%)     0 with NaN (  0.0%) Uniques= 11100 
Site_of_Onset___Bulbar......... =  1713 rows (12.44%) 12052 with NaN (87.56%) Uniques=     3 
Site_of_Onset___Limb........... =  3929 rows (28.54%)  9836 with NaN (71.46%) Uniques=     3 
Site_of_Onset___Limb_and_Bulbar =   275 rows (  2.0%) 13490 with NaN ( 98.0%) Uniques=     3 
Site_of_Onset___Other.......... =    74 rows ( 0.54%) 13691 with NaN (99.46%) Uniques=     3 
Site_of_Onset___Other_Specify.. =     9 rows ( 0.07%) 13756 with NaN (99.93%) Uniques=     3 
Site_of_Onset___Spine.......... =   397 rows ( 2.88%) 13368 with NaN (97.12%) Uniques=     2 
Subject_ALS_History_Delta...... =  9923 rows (72.09%)  3842 with NaN (27.91%) Uniques=    23 
Disease_Duration............... =     0 rows (  0.0%) 13765 with NaN (100.0%) Uniques=     1 
Symptom........................ =  2656 rows ( 19.3%) 11109 with NaN ( 80.7%) Uniques=    11 
Symptom_Other_Specify.......... =    50 rows ( 0.36%) 13715 

In [9]:
# Collapse to one row per subject. GroupBy.first() takes, column by column, the
# first non-null value in each group, so values scattered across a subject's
# rows are merged into a single record.
df_als_history = df_als_history.groupby(['subject_id']).first().reset_index()
df_als_history.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11100 entries, 0 to 11099
Data columns (total 16 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   subject_id                       11100 non-null  int64  
 1   Site_of_Onset___Bulbar           1713 non-null   float64
 2   Site_of_Onset___Limb             3929 non-null   float64
 3   Site_of_Onset___Limb_and_Bulbar  275 non-null    float64
 4   Site_of_Onset___Other            74 non-null     float64
 5   Site_of_Onset___Other_Specify    9 non-null      object 
 6   Site_of_Onset___Spine            397 non-null    float64
 7   Subject_ALS_History_Delta        7258 non-null   float64
 8   Disease_Duration                 0 non-null      float64
 9   Symptom                          2221 non-null   object 
 10  Symptom_Other_Specify            48 non-null     object 
 11  Location                         2818 non-null   object 
 12  Location_Other_Spe

In [10]:
# Reduce the history table to subject_id, Symptoms_Onset_Delta, Diagnosis_Delta
# and a single Site_Onset label (columns as shown in the preview below).
df_als_history = utilsp.preprocess_als_history(df_als_history)
df_als_history.head()

Unnamed: 0,subject_id,Symptoms_Onset_Delta,Diagnosis_Delta,Site_Onset
0,121,-366.0,,Limb/Spinal
1,226,-459.0,,Bulbar
2,290,,,Limb/Spinal
3,624,,,Limb/Spinal
4,666,-575.0,-132.0,Limb/Spinal


In [11]:
# Left-join the onset/diagnosis fields onto the demographics rows by subject_id,
# so every demographics subject is kept even without a history record.
df_demographics = utilsp.join_datasets_by_key(
    df_main=df_demographics,
    df_to_join=df_als_history,
    key_name="subject_id",
    how="left",
)
df_demographics.head()

Unnamed: 0,subject_id,Age,Sex,Symptoms_Onset_Delta,Diagnosis_Delta,Site_Onset
0,121,52.0,Female,-366.0,,Limb/Spinal
1,226,72.0,Male,-459.0,,Bulbar
4,666,42.0,Male,-575.0,-132.0,Limb/Spinal
5,671,56.0,Female,-254.0,,Bulbar
6,935,45.0,Female,-814.0,-508.0,Limb/Spinal


In [12]:
# Adds the Diagnostic_Delay column; it stays NaN where Diagnosis_Delta is NaN.
# NOTE(review): the values look like months between symptom onset and
# diagnosis — confirm against utilsp.preprocess_diagnosis_delay.
df_demographics = utilsp.preprocess_diagnosis_delay(df_demographics)
df_demographics.head()

Unnamed: 0,subject_id,Age,Sex,Symptoms_Onset_Delta,Diagnosis_Delta,Site_Onset,Diagnostic_Delay
0,121,52.0,Female,-366.0,,Limb/Spinal,
1,226,72.0,Male,-459.0,,Bulbar,
4,666,42.0,Male,-575.0,-132.0,Limb/Spinal,14.0
5,671,56.0,Female,-254.0,,Bulbar,
6,935,45.0,Female,-814.0,-508.0,Limb/Spinal,10.0


In [13]:
# Adds the Age_at_Onset column.
# NOTE(review): presumably derived from Age and Symptoms_Onset_Delta — confirm
# against utilsp.preprocess_age_at_onset.
df_demographics = utilsp.preprocess_age_at_onset(df_demographics)
df_demographics.head()

Unnamed: 0,subject_id,Age,Sex,Symptoms_Onset_Delta,Diagnosis_Delta,Site_Onset,Diagnostic_Delay,Age_at_Onset
0,121,52.0,Female,-366.0,,Limb/Spinal,,51.0
1,226,72.0,Male,-459.0,,Bulbar,,71.0
4,666,42.0,Male,-575.0,-132.0,Limb/Spinal,14.0,41.0
5,671,56.0,Female,-254.0,,Bulbar,,56.0
6,935,45.0,Female,-814.0,-508.0,Limb/Spinal,10.0,43.0


In [14]:
# Drop subjects whose Age_at_Onset could not be determined. The `info` label
# tags the helper's log line, matching the earlier AGE / SEX removals.
to_delete = df_demographics.loc[df_demographics.Age_at_Onset.isnull()]
df_demographics = utilsp.remove_rows(
    df=df_demographics, to_delete=to_delete, info="AGE_AT_ONSET"
)

# Drop subjects with no recorded site of onset.
to_delete = df_demographics.loc[df_demographics.Site_Onset.isnull()]
df_demographics = utilsp.remove_rows(
    df=df_demographics, to_delete=to_delete, info="SITE_ONSET"
)

  -  Previous=10427, To delete=1847, After=8580
  -  Previous=8580, To delete=0, After=8580


In [15]:
# Output directory for every preprocessed artifact; created if it is missing.
PREPROCESS_DIR = 'Data/preprocessed'
os.makedirs(PREPROCESS_DIR, exist_ok=True)
# Intermediate snapshot of the demographics built so far.
df_demographics.to_csv(f'{PREPROCESS_DIR}/demographics.csv', index=False)

In [16]:
# Scans the source tables listed in the log output for each subject's last
# recorded visit and adds the Last_Visit_* columns shown in the preview.
df_demographics = utilsp.preprocess_last_visit(
    df_to_process=df_demographics, data_dir="Data/source"
)
df_demographics.head()

 - Get Last_Visit registered in PROACT_ALSFRS
 - Get Last_Visit registered in PROACT_FVC
 - Get Last_Visit registered in PROACT_DEATHDATA
 - Get Last_Visit registered in PROACT_LABS
 - Get Last_Visit registered in PROACT_RILUZOLE
 - Get Last_Visit registered in PROACT_SVC
 - Get Last_Visit registered in PROACT_VITALSIGNS
 - Get Last_Visit registered in PROACT_ALSHISTORY
 - Get Last_Visit registered in PROACT_DEMOGRAPHICS
 - Get Last_Visit registered in PROACT_ELESCORIAL
 - Get Last_Visit registered in PROACT_FAMILYHISTORY
 - Get Last_Visit registered in PROACT_HANDGRIPSTRENGTH
 - Get Last_Visit registered in PROACT_MUSCLESTRENGTH
 - Get Last_Visit registered in PROACT_TREATMENT
 - Get Last_Visit registered in PROACT_ADVERSEEVENTS
 - Get Last_Visit registered in PROACT_ADVERSEEVENTS
 - Get Last_Visit registered in PROACT_CONMEDS
 - Get Last_Visit registered in PROACT_CONMEDS


Unnamed: 0,subject_id,Age,Sex,Symptoms_Onset_Delta,Diagnosis_Delta,Site_Onset,Diagnostic_Delay,Age_at_Onset,Last_Visit_Delta,Last_Visit_from_First_Visit,Last_Visit_from_Onset
0,121,52.0,Female,-366.0,,Limb/Spinal,,51.0,504.0,16.0,28.0
1,226,72.0,Male,-459.0,,Bulbar,,71.0,393.0,12.0,28.0
4,666,42.0,Male,-575.0,-132.0,Limb/Spinal,14.0,41.0,486.0,16.0,34.0
5,671,56.0,Female,-254.0,,Bulbar,,56.0,107.0,3.0,11.0
6,935,45.0,Female,-814.0,-508.0,Limb/Spinal,10.0,43.0,133.0,4.0,31.0


In [17]:
# Adds Event_Dead plus the Event_Dead_Time_from_Onset and
# Event_Dead_Time_from_First_Visit columns (see preview); Last_Visit_Delta is
# no longer present afterwards.
df_demographics = utilsp.preprocess_death_data(
    df_to_process=df_demographics, data_dir="Data/source"
)
df_demographics.head(15)

Unnamed: 0,subject_id,Age,Sex,Symptoms_Onset_Delta,Diagnosis_Delta,Site_Onset,Diagnostic_Delay,Age_at_Onset,Last_Visit_from_First_Visit,Last_Visit_from_Onset,Event_Dead,Event_Dead_Time_from_Onset,Event_Dead_Time_from_First_Visit
0,121,52.0,Female,-366.0,,Limb/Spinal,,51.0,16.0,28.0,True,28.0,16.0
1,226,72.0,Male,-459.0,,Bulbar,,71.0,12.0,28.0,True,28.0,12.0
4,666,42.0,Male,-575.0,-132.0,Limb/Spinal,14.0,41.0,16.0,34.0,False,34.0,16.0
5,671,56.0,Female,-254.0,,Bulbar,,56.0,3.0,11.0,False,11.0,3.0
6,935,45.0,Female,-814.0,-508.0,Limb/Spinal,10.0,43.0,4.0,31.0,False,31.0,4.0
7,1009,51.0,Male,-324.0,-63.0,Other,8.0,51.0,13.0,24.0,False,24.0,13.0
9,1110,70.0,Male,-297.0,-73.0,Limb/Spinal,7.0,70.0,8.0,17.0,True,17.0,8.0
10,1125,58.0,Female,-1102.0,-98.0,Limb/Spinal,33.0,55.0,14.0,50.0,False,50.0,14.0
11,1137,62.0,Female,-642.0,-501.0,Limb/Spinal,4.0,61.0,9.0,30.0,False,30.0,9.0
13,1333,55.0,Female,-425.0,-93.0,Limb/Spinal,10.0,54.0,9.0,23.0,True,23.0,9.0


In [18]:
# Adds the boolean Riluzole column, built from the files under Data/source.
df_demographics = utilsp.preprocess_riluzole(
    df_to_process=df_demographics, data_dir="Data/source"
)
df_demographics.head()

Unnamed: 0,subject_id,Age,Sex,Symptoms_Onset_Delta,Diagnosis_Delta,Site_Onset,Diagnostic_Delay,Age_at_Onset,Last_Visit_from_First_Visit,Last_Visit_from_Onset,Event_Dead,Event_Dead_Time_from_Onset,Event_Dead_Time_from_First_Visit,Riluzole
0,121,52.0,Female,-366.0,,Limb/Spinal,,51.0,16.0,28.0,True,28.0,16.0,True
1,226,72.0,Male,-459.0,,Bulbar,,71.0,12.0,28.0,True,28.0,12.0,True
4,666,42.0,Male,-575.0,-132.0,Limb/Spinal,14.0,41.0,16.0,34.0,False,34.0,16.0,True
5,671,56.0,Female,-254.0,,Bulbar,,56.0,3.0,11.0,False,11.0,3.0,True
6,935,45.0,Female,-814.0,-508.0,Limb/Spinal,10.0,43.0,4.0,31.0,False,31.0,4.0,True


In [19]:
# Completeness check of the assembled per-patient table before it is saved.
utilsp.show_columns_stats(df_demographics, df_demographics.columns)

subject_id...................... =  8580 rows (100.0%)     0 with NaN (  0.0%) Uniques=  8580 
Age............................. =  8580 rows (100.0%)     0 with NaN (  0.0%) Uniques=   194 
Sex............................. =  8580 rows (100.0%)     0 with NaN (  0.0%) Uniques=     2 
Symptoms_Onset_Delta............ =  8580 rows (100.0%)     0 with NaN (  0.0%) Uniques=  1631 
Diagnosis_Delta................. =  6150 rows (71.68%)  2430 with NaN (28.32%) Uniques=  1049 
Site_Onset...................... =  8580 rows (100.0%)     0 with NaN (  0.0%) Uniques=     5 
Diagnostic_Delay................ =  6150 rows (71.68%)  2430 with NaN (28.32%) Uniques=    74 
Age_at_Onset.................... =  8580 rows (100.0%)     0 with NaN (  0.0%) Uniques=    69 
Last_Visit_from_First_Visit..... =  8580 rows (100.0%)     0 with NaN (  0.0%) Uniques=    62 
Last_Visit_from_Onset........... =  8580 rows (100.0%)     0 with NaN (  0.0%) Uniques=   142 
Event_Dead...................... =  8580 rows (100

In [20]:
# Overwrite demographics.csv with the enriched table (last visit, death and
# Riluzole columns added since the earlier snapshot at the same path).
df_demographics.to_csv(f'{PREPROCESS_DIR}/demographics.csv', index=False)

In [21]:
# Build the long-format ALSFRS table (one row per subject and visit, with the
# per-question scores, slopes and region-involvement flags shown below) and
# persist it as ALSFRS.csv.
df_alsfrs = utilsp.preprocess_alsfrs(df_to_process=df_demographics, data_dir='Data/source')
df_alsfrs.to_csv(f'{PREPROCESS_DIR}/ALSFRS.csv', index=False)
df_alsfrs.head()

  -  Previous=73845, To delete=188, After=73657


Unnamed: 0,subject_id,Delta_from_First_Visit,Delta_from_Symptoms_Onset,Q1_Speech,Q2_Salivation,Q3_Swallowing,Q4_Handwriting,Q5_Cutting,Q6_Dressing_and_Hygiene,Q7_Turning_in_Bed,...,Slope_from_First_Visit_Q6_Dressing_and_Hygiene,Slope_from_First_Visit_Q7_Turning_in_Bed,Slope_from_First_Visit_Q8_Walking,Slope_from_First_Visit_Q9_Climbing_Stairs,Slope_from_First_Visit_Q10_Respiratory,Region_Involved_Bulbar,Region_Involved_Upper_Limb,Region_Involved_Lower_Limb,Region_Involved_Respiratory,Qty_Regions_Involved
0,121,0.0,12.0,4.0,4.0,4.0,3.0,3.0,3.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,2.0
1,121,1.0,13.0,4.0,4.0,4.0,3.0,3.0,3.0,3.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
2,121,2.0,14.0,4.0,4.0,4.0,3.0,3.0,3.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,2.0
3,121,3.0,15.0,4.0,4.0,4.0,3.0,3.0,2.0,4.0,...,-0.333333,0.333333,0.0,0.0,0.0,0.0,1.0,1.0,0.0,2.0
4,121,5.0,17.0,4.0,4.0,4.0,2.0,3.0,2.0,3.0,...,-0.2,0.0,0.0,-0.2,0.0,0.0,1.0,1.0,0.0,2.0


In [22]:
# Returns the subject ids used to filter the ALSFRS table in the next cell.
# NOTE(review): presumably also writes the Disease_Duration series that a
# later cell reads from Data/preprocessed — confirm in utilsp.
subject_ids = utilsp.generate_time_series_disease_duration_based_on_alsfrs(
    df_temporal=df_alsfrs
)

In [23]:
# Keep only the ALSFRS rows of the subjects selected in the previous step, then
# generate one time series per ALSFRS feature (shapes are printed below).
is_selected_subject = df_alsfrs["subject_id"].isin(subject_ids)
df_alsfrs_filtered = df_alsfrs.loc[is_selected_subject].copy()
utilsp.generate_time_series_alsfrs(df_alsfrs_filtered)

Q1_Speech :: (8002, 122)
Q2_Salivation :: (8002, 122)
Q3_Swallowing :: (8002, 122)
Q4_Handwriting :: (8002, 122)
Q5_Cutting :: (8002, 122)
Q6_Dressing_and_Hygiene :: (8002, 122)
Q7_Turning_in_Bed :: (8002, 122)
Q8_Walking :: (8002, 122)
Q9_Climbing_Stairs :: (8002, 122)
Q10_Respiratory :: (8002, 122)
Region_Involved_Bulbar :: (8002, 122)
Region_Involved_Upper_Limb :: (8002, 122)
Region_Involved_Lower_Limb :: (8002, 122)
Region_Involved_Respiratory :: (8002, 122)
Qty_Regions_Involved :: (8002, 122)
Patient_with_Gastrostomy :: (8002, 122)
ALSFRS_Total :: (8002, 122)


In [24]:
col_qty = "Qty_Measurements_ALSFRS"

# Count the non-null entries per subject across every column after the first
# one (presumably subject_id — verify against the CSV layout).
df_qty = pd.read_csv("Data/preprocessed/Q1_Speech_RAW.csv")
value_columns = df_qty.columns[1:]
df_qty[col_qty] = df_qty[value_columns].count(axis=1)
df_qty = df_qty[["subject_id", col_qty]]

# Align the counts with the patient list; subjects without a match get 0.
df_measurements = utilsp.join_datasets_by_key(
    df_main=df_demographics[["subject_id"]],
    df_to_join=df_qty,
    key_name="subject_id",
    how="left",
)
df_measurements = df_measurements.fillna(0)
df_measurements[col_qty] = df_measurements[col_qty].astype(int)

# Attach the integer count to the per-patient table.
df_demographics = utilsp.join_datasets_by_key(
    df_main=df_demographics,
    df_to_join=df_measurements,
    key_name="subject_id",
    how="left",
)
df_demographics

Unnamed: 0,subject_id,Age,Sex,Symptoms_Onset_Delta,Diagnosis_Delta,Site_Onset,Diagnostic_Delay,Age_at_Onset,Last_Visit_from_First_Visit,Last_Visit_from_Onset,Event_Dead,Event_Dead_Time_from_Onset,Event_Dead_Time_from_First_Visit,Riluzole,Qty_Measurements_ALSFRS
0,121,52.0,Female,-366.0,,Limb/Spinal,,51.0,16.0,28.0,True,28.0,16.0,True,9
1,226,72.0,Male,-459.0,,Bulbar,,71.0,12.0,28.0,True,28.0,12.0,True,8
4,666,42.0,Male,-575.0,-132.0,Limb/Spinal,14.0,41.0,16.0,34.0,False,34.0,16.0,True,5
5,671,56.0,Female,-254.0,,Bulbar,,56.0,3.0,11.0,False,11.0,3.0,True,3
6,935,45.0,Female,-814.0,-508.0,Limb/Spinal,10.0,43.0,4.0,31.0,False,31.0,4.0,True,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12496,999492,43.0,Male,-296.0,-207.0,Limb/Spinal,2.0,43.0,10.0,20.0,False,20.0,10.0,True,10
12499,999649,56.2,Male,-441.0,-252.0,Bulbar,6.0,55.0,5.0,19.0,False,19.0,5.0,True,5
12500,999836,59.0,Male,-282.0,-178.0,Limb/Spinal,3.0,59.0,12.0,21.0,False,21.0,12.0,True,11
12501,999850,67.0,Male,-541.0,-224.0,Limb/Spinal,10.0,66.0,9.0,27.0,True,25.0,7.0,True,7


In [25]:
# Work on a copy, then keep the subjects with at least three ALSFRS measurements.
df_measurements = df_demographics.copy()
has_enough_measurements = df_measurements["Qty_Measurements_ALSFRS"] >= 3
df_plot = df_measurements.loc[has_enough_measurements]
df_plot

Unnamed: 0,subject_id,Age,Sex,Symptoms_Onset_Delta,Diagnosis_Delta,Site_Onset,Diagnostic_Delay,Age_at_Onset,Last_Visit_from_First_Visit,Last_Visit_from_Onset,Event_Dead,Event_Dead_Time_from_Onset,Event_Dead_Time_from_First_Visit,Riluzole,Qty_Measurements_ALSFRS
0,121,52.0,Female,-366.0,,Limb/Spinal,,51.0,16.0,28.0,True,28.0,16.0,True,9
1,226,72.0,Male,-459.0,,Bulbar,,71.0,12.0,28.0,True,28.0,12.0,True,8
4,666,42.0,Male,-575.0,-132.0,Limb/Spinal,14.0,41.0,16.0,34.0,False,34.0,16.0,True,5
5,671,56.0,Female,-254.0,,Bulbar,,56.0,3.0,11.0,False,11.0,3.0,True,3
6,935,45.0,Female,-814.0,-508.0,Limb/Spinal,10.0,43.0,4.0,31.0,False,31.0,4.0,True,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12496,999492,43.0,Male,-296.0,-207.0,Limb/Spinal,2.0,43.0,10.0,20.0,False,20.0,10.0,True,10
12499,999649,56.2,Male,-441.0,-252.0,Bulbar,6.0,55.0,5.0,19.0,False,19.0,5.0,True,5
12500,999836,59.0,Male,-282.0,-178.0,Limb/Spinal,3.0,59.0,12.0,21.0,False,21.0,12.0,True,11
12501,999850,67.0,Male,-541.0,-224.0,Limb/Spinal,10.0,66.0,9.0,27.0,True,25.0,7.0,True,7


In [26]:
# Select patients over a 15-month analysis window, save that subset, then save
# its numerically coded counterpart.
raw_speech_csv = "Data/preprocessed/Q1_Speech_RAW.csv"
window_csv = "Data/preprocessed/15m_data.csv"
window_coded_csv = "Data/preprocessed/15m_data_coded.csv"

df_q1_speech_raw = pd.read_csv(raw_speech_csv)
df_demographics_to_analyze = utilsp.get_first_last_visits(
    df_patients=df_demographics,
    df_alsfrs=df_q1_speech_raw,
    max_months_to_analyze=15,
)
df_demographics_to_analyze.to_csv(window_csv, index=False)

df_coded = utilsp.perform_data_codification(df_patients=df_demographics_to_analyze)
df_coded.to_csv(window_coded_csv, index=False)
df_coded.head()

Unnamed: 0,subject_id,Age,Sex_Male,Symptoms_Onset_Delta,Diagnosis_Delta,Site_Onset,Diagnostic_Delay,Age_at_Onset,Last_Visit_from_First_Visit,Last_Visit_from_Onset,Event_Dead,Event_Dead_Time_from_Onset,Event_Dead_Time_from_First_Visit,Riluzole,Qty_Measurements_ALSFRS
16,1492,42.0,1,-286.0,-223.0,1.0,2.0,42.0,29.0,38.0,1,38.0,29.0,1,15
17,1524,50.0,1,-958.0,,1.0,,48.0,19.0,50.0,0,50.0,19.0,1,12
21,2018,40.0,1,-192.0,,1.0,,40.0,18.0,24.0,0,24.0,18.0,1,12
22,2025,58.0,1,-758.0,,1.0,,56.0,18.0,43.0,0,43.0,18.0,1,12
23,2194,56.0,1,-1267.0,,1.0,,53.0,18.0,59.0,0,59.0,18.0,1,11


In [27]:
# Assemble the final dataset: one row per (subject, feature) holding that
# feature's time series, with the static patient attributes appended as
# additional series, and write it to Data/preprocessed/data.csv.

# Temporal features; each one is read from Data/preprocessed/<feature>.csv.
features = [
    "Q1_Speech",
    "Q2_Salivation",
    "Q3_Swallowing",
    "Q4_Handwriting",
    "Q5_Cutting",
    "Q6_Dressing_and_Hygiene",
    "Q7_Turning_in_Bed",
    "Q8_Walking",
    "Q9_Climbing_Stairs",
    "Q10_Respiratory",
    "ALSFRS_Total",
    "Qty_Regions_Involved",
    "Region_Involved_Bulbar",
    "Region_Involved_Lower_Limb",
    "Region_Involved_Upper_Limb",
    "Region_Involved_Respiratory",
    "Patient_with_Gastrostomy",
    "Disease_Duration",
]

# Static per-patient attributes taken from the coded 15-month dataset.
patients_cols = [
    "subject_id",
    "Sex_Male",
    "Site_Onset",
    "Age_at_Onset",
    "Riluzole",
]

# Keep only patients whose static attributes are all present. Chaining
# .dropna() avoids an in-place mutation of a column slice.
df_demographics = pd.read_csv("Data/preprocessed/15m_data_coded.csv")
df_demographics = df_demographics[patients_cols].dropna()
subject_ids_to_filter = df_demographics.subject_id.unique()

# Collect the per-feature frames and concatenate once at the end, instead of
# re-copying the accumulated frame on every loop iteration.
feature_frames = []
for feature in features:
    df_ts = pd.read_csv(f"Data/preprocessed/{feature}.csv")
    df_ts = df_ts.loc[df_ts.subject_id.isin(subject_ids_to_filter)].copy()
    df_ts = df_ts.sort_values(by=["subject_id"])
    df_ts.insert(1, "feature", feature)
    feature_frames.append(df_ts)

# `df_ts` is deliberately the LAST feature frame loaded above; the helper
# receives it as `df_temporal`.
# NOTE(review): presumably used as the layout template for the static series
# — confirm in utilsp.
df_static_as_ts = utilsp.generate_time_series_static_features_based_on_alsfrs(
    df_temporal=df_ts,
    df_patients=df_demographics,
)
df_time_series = pd.concat([*feature_frames, df_static_as_ts], ignore_index=True)

# Series.replace returns a new Series, so the result must be assigned back.
# The original call discarded it, leaving the feature names unchanged in the
# saved file.
df_time_series["feature"] = df_time_series["feature"].replace(
    {
        "Qty_Regions_Involved": "Regions_Involved",
        "Region_Involved_Bulbar": "Bulbar_Involved",
        "Region_Involved_Lower_Limb": "Lower_Limb_Involved",
        "Region_Involved_Upper_Limb": "Upper_Limb_Involved",
        "Region_Involved_Respiratory": "Respiratory_Involved",
        "Patient_with_Gastrostomy": "Gastrostomy",
        "Site_Onset": "Site_at_Onset",
        "Sex_Male": "Sex",
    }
)
df_time_series.to_csv("Data/preprocessed/data.csv", index=False)

In [28]:
# Display the assembled (subject, feature) time-series table for inspection.
df_time_series

Unnamed: 0,subject_id,feature,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,...,111.0,112.0,113.0,114.0,115.0,116.0,117.0,118.0,119.0,120.0
0,1492,Q1_Speech,4.0,4.0,4.0,4.0,4.0,4.0,3.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1524,Q1_Speech,4.0,3.0,4.0,4.0,4.0,4.0,3.0,4.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
2,2018,Q1_Speech,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,2025,Q1_Speech,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
4,2194,Q1_Speech,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,...,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38935,996984,Riluzole,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
38936,997561,Riluzole,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
38937,997649,Riluzole,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
38938,998278,Riluzole,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
