# PROACT Dataset

In [1]:
import config as cfg
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# --- Plot styling ------------------------------------------------------------
# Start from the chosen matplotlib style sheet, then layer rc overrides on top.
# NOTE: 'text.usetex': True makes matplotlib render text through LaTeX, so a
# working LaTeX installation is needed when figures are drawn.
import matplotlib.pyplot as plt

matplotlib_style = 'default'
plt.style.use(matplotlib_style)

rc_overrides = {
    'axes.labelsize': 'medium',
    'axes.titlesize': 'medium',
    'font.size': 14.0,
    'text.usetex': True,
    # Extra LaTeX packages made available to text rendered in figures.
    'text.latex.preamble': r'\usepackage{amsfonts} \usepackage{bm}',
}
plt.rcParams.update(rc_overrides)

# Project helper from utility.plot; presumably returns the colour palette used
# by later plots -- verify against utility/plot.py.
from utility.plot import load_tf_color
TFColor = load_tf_color()

# --- Load and filter the processed PROACT table --------------------------------
# Path(...) / name works whether cfg.PROACT_DATA_DIR is a str or a Path.
proact_csv = Path(cfg.PROACT_DATA_DIR) / 'proact_processed.csv'
df = pd.read_csv(proact_csv, index_col=0)  # first CSV column becomes the index

event_names = ['Speech', 'Swallowing', 'Handwriting', 'Walking', 'Dyspnea']
MAX_TTE = 500  # upper bound (inclusive) on every time-to-event column

# Build a single row mask across all events and apply it once, instead of
# re-filtering (and copying) the frame twice per event. A row is kept only if,
# for every event:
#   * the event indicator is 0 or 1 (per the original note, any other value
#     marks an event that had already occurred), and
#   * 0 < TTE <= MAX_TTE.
keep_rows = np.ones(len(df), dtype=bool)
for event_name in event_names:
    event_flag = df[f'Event_{event_name}']
    tte = df[f'TTE_{event_name}']
    keep_rows &= (event_flag.isin([0, 1]) & (tte > 0) & (tte <= MAX_TTE)).to_numpy()
df = df.loc[keep_rows]

# Remove columns that are not used downstream, in one drop call.
dropped_columns = [
    *df.filter(like='_Strength').columns,  # strength tests
    'Race_Caucasian',                      # race information
    'El_escorial',                         # El Escorial column
    'Height', 'Weight', 'BMI',             # height / weight / BMI
]
df = df.drop(columns=dropped_columns)

In [2]:
# Summary statistics of the time-to-event values, restricted to observed events.
# Column lists follow the order of `event_names`.
tte_columns = [f"TTE_{name}" for name in event_names]
indicator_columns = [f"Event_{name}" for name in event_names]

event_times = df[tte_columns].to_numpy()
event_indicators = df[indicator_columns].to_numpy()

# Entries whose indicator is not 1 are replaced by NaN so the nan-aware
# reductions below skip them.
is_observed = event_indicators == 1
masked_event_times = np.where(is_observed, event_times, np.nan)

min_time = np.nanmin(masked_event_times)
max_time = np.nanmax(masked_event_times)
mean_time = np.nanmean(masked_event_times)

# Printed in order: minimum, maximum, mean.
for statistic in (min_time, max_time, mean_time):
    print(statistic)

1.0
498.0
130.69117647058823


In [3]:
# Censoring summary: for each event, report the sample count, the
# censored/uncensored split and the uncensored percentage.
for event_col in event_names:
    obs_arr = df[f"Event_{event_col}"].to_numpy()
    n_total = obs_arr.shape[0]
    # The indicator sum counts uncensored records; the remainder is censored.
    n_censored = n_total - obs_arr.sum()
    n_uncensored = n_total - n_censored
    pct_uncensored = round(n_uncensored / n_total * 100, 2)
    print(f"Number of samples: {len(df)}")
    print(f"Number of censored/uncensored: {n_censored}/{n_uncensored}")
    print(f"{pct_uncensored}% of records are uncensored\n")

Number of samples: 3053
Number of censored/uncensored: 1962.0/1091.0
35.74% of records are uncensored

Number of samples: 3053
Number of censored/uncensored: 2148.0/905.0
29.64% of records are uncensored

Number of samples: 3053
Number of censored/uncensored: 1572.0/1481.0
48.51% of records are uncensored

Number of samples: 3053
Number of censored/uncensored: 1234.0/1819.0
59.58% of records are uncensored

Number of samples: 3053
Number of censored/uncensored: 2229.0/824.0
26.99% of records are uncensored

