# Data preprocessing

## Importing functions and data

In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import seaborn as sns
from scipy import stats
from scipy import special
import networkx as nx
import itertools
import updatedBoltzmannclean
import pickle
import math
# import boltzmannclean
# from fancyimpute import IterativeImputer as fancyIterativeImputer
# import pandas_bokeh
# pandas_bokeh.output_notebook()

import bokeh
from bokeh.plotting import figure, output_file, show
from bokeh.io import output_notebook
from bokeh.layouts import column
from bokeh.models import CustomJS, ColumnDataSource, Slider, HoverTool
# import holoviews as hv
# from holoviews import dim, opts
# hv.extension('bokeh', 'matplotlib')
# output_notebook()

from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge
from sklearn.manifold import TSNE
from sklearn import cluster
from sklearn.metrics import f1_score, average_precision_score, multilabel_confusion_matrix, recall_score, roc_auc_score
pd.set_option('display.max_columns', None)

from umap import UMAP

from pymongo import MongoClient
import matplotlib.lines as mlines

## Loading data and selecting columns

In [38]:
# Importing data
# '.' in the raw file is read as a missing value.
df = pd.read_csv('data/dataRaw2.csv', na_values='.') # 2 is the second round of data where APOE count exists
print('df size is ', df.shape)

# list out colnames
colNames = ['summary_id', 'collectiondate_median', 'visit', 'led', 
            'site', 'agevisit', 'race', 'gender', 'ethnicity', 'education',  #demographic
            'handedness', 'Firstdeg_PD', 'Seconddeg_PD', 'veteran_status', #demographic
            'cognitive_status', 'updrs3_total_combo', 'modified_hy', #disease/clinical characteristics
            'disease_duration_onset', 'gds_total', 'disease_duration_dx', #disease/clinical characteristics 
            'dx_tremor', 'dx_rigidity', 'dx_bradykinesia', 'dx_instability', 'dx_dominant_side',
            'mayo_1_recent', 'mayo_2_recent', 'mayo_3_recent', 'mayo_4_recent', 
            'mayo_5_recent', 'mayo_6_recent', 'mayo_7_recent', 'mayo_8_3weeks', #sleep questions
            'npi_A_total', 'npi_B_total', 'npi_C_total', 'npi_D_total', 
            'npi_E_total', 'npi_F_total', 'npi_G_total', 'npi_H_total', 
            'npi_I_total', 'npi_J_total', 'npi_K_total', 'npi_L_total', 'visual_halluc', 
            'GBA_carrier', 'ApoE', 'LRRK2_carrier',  'MAPT', 'SNCA_rs356219', #genetic
            'animals', 'letter_fluency', # Cognitive-verbal fluency
            'hvlt_total_recall', 'hvlt_RDI', 'hvlt_trial4_correct', #cognitive - learning/memory
            'jolo_total_correct', #cognitive - visuospatial
            'wais_digit_symbol_score', 'letter_number_sequencing_total',
            'trails_a_seconds_utc150', 'trails_b_seconds_utc300', 
            'trailsbminusa', #cognitive - attention
            'moca_score_unadjusted' #cognitive - global
             ]

# Store ApoE as a string label so it can be treated as a category; missing values stay NaN.
# np.nan is used because the np.NaN alias no longer exists in NumPy 2.0, and pd.isna
# (unlike math.isnan) does not raise if a value is not a float.
df['ApoE'] = df['ApoE'].apply(lambda genotype: np.nan if pd.isna(genotype) else str(genotype))

# Categorical columns, split by number of levels; `categ` is their union.
multipleCateg = ['race', 'handedness', 'dx_tremor', 'dx_rigidity', 'dx_bradykinesia', 'dx_instability', 'dx_dominant_side', 'SNCA_rs356219', 'ApoE']
binaryCateg = ['gender', 'ethnicity', 'Firstdeg_PD', 'Seconddeg_PD', 'GBA_carrier', 'LRRK2_carrier', 'MAPT']
categ = multipleCateg + binaryCateg

# Remove minor cognitive statuses (<5 cases).
# Every row of a subject who has at least one 'Unknown'/'Other' status is removed,
# not only the rows carrying that status.
df = df[colNames]
otherStatus = df.loc[df.cognitive_status.isin(['Unknown', 'Other']), 'summary_id'].values
df = df.loc[~df.summary_id.isin(otherStatus)].reset_index(drop=True)
# NOTE: colNames is captured before the two trails columns are dropped below,
# so it still lists them.
colNames = df.columns

# Remove trails_a and trails_b because trailsbminusa is trails_b_seconds_utc300 minus trails_a_seconds_utc150
df = df.drop(['trails_a_seconds_utc150', 'trails_b_seconds_utc300'], axis=1)

df size is  (1836, 64)


In [40]:
# Count missing values per column once, then show only the columns that have any.
naCounts = df.isna().sum()
colNA = naCounts.index[naCounts > 0].tolist()  # names of columns containing at least one NA
naCounts[colNA]

led                                326
handedness                          14
Firstdeg_PD                         57
Seconddeg_PD                       106
veteran_status                     584
updrs3_total_combo                  70
modified_hy                          6
gds_total                           42
dx_tremor                           22
dx_rigidity                         34
dx_bradykinesia                     27
dx_instability                      37
dx_dominant_side                    30
mayo_1_recent                     1172
mayo_2_recent                     1181
mayo_3_recent                     1175
mayo_4_recent                     1176
mayo_5_recent                     1185
mayo_6_recent                     1185
mayo_7_recent                     1177
mayo_8_3weeks                     1180
npi_A_total                       1092
npi_B_total                       1094
npi_C_total                       1094
npi_D_total                       1094
npi_E_total              

## Imputing NAs with RBM

In [42]:
# Get veteran_status, visual_halluc, npi, and mayo out of the data for now
npiMayo = [col for col in df.columns.tolist()
           if any(tag in col for tag in ('npi', 'mayo', 'visual_halluc', 'veteran_status'))]
df_vis = df.drop(npiMayo, axis=1).copy()
# Identifier / label columns that are kept aside and re-attached after imputation.
notForImpute = ['summary_id', 'collectiondate_median', 'site', 'cognitive_status', 'visit']
df_vis = df_vis.drop(notForImpute, axis=1)

# start imputing
# Column lists are built in DataFrame column order: a set difference would hand the
# imputer a different column order in every kernel session. Restricting `categ` to
# columns that exist keeps this cell re-runnable after 'APOE_E4' is appended below.
categForImpute = [col for col in categ if col in df_vis.columns]
numForImpute = [col for col in df_vis.columns if col not in categ]
df_vis = updatedBoltzmannclean.clean(dataframe=df_vis, numerical_columns=numForImpute,
                                     categorical_columns=categForImpute, tune_rbm=True)
# Re-attach the set-aside columns; relies on df and df_vis sharing the same row index.
df_vis = pd.concat([df[notForImpute], df_vis], axis=1)

# Manage ApoE a bit (into columns of counts of each allele)
# The label is split on '.' into two parts; each ApoE<n> column counts how many
# of the two parts equal n (0, 1 or 2).
df_vis['ApoE'] = df_vis['ApoE'].apply(str)
ApoE = df_vis.ApoE.str.split(pat='.', expand=True)
for allele in ('2', '3', '4'):
    df_vis['ApoE' + allele] = (ApoE[0] == allele).astype(int) + (ApoE[1] == allele).astype(int)

# Encoding y and remaining category
# NOTE(review): LabelEncoder numbers the labels in sorted order, so the remap below
# only yields the documented coding for the label spellings present in this data set
# -- confirm if the raw labels ever change.
df_vis['cognitive_status'] = preprocessing.LabelEncoder().fit_transform(df_vis['cognitive_status'])
df_vis['cognitive_status'] = df_vis.cognitive_status.replace({1: 2, 0: 1, 2: 0}) # 0 = No cognitive impairment, 1 = Cognitive impairment but no dementia, 2 = dementia
encodeCol = ['race', 'gender', 'SNCA_rs356219', 'site', 'dx_tremor', 'dx_rigidity', 'dx_bradykinesia', 'dx_instability', 'ApoE']
df_vis[encodeCol] = df_vis[encodeCol].astype(str).apply(preprocessing.LabelEncoder().fit_transform)

# Add APOE4 column (of having or not having)
df_vis['APOE_E4'] = (df_vis['ApoE4'] > 0).astype(int)
if 'APOE_E4' not in categ:  # do not append again when the cell is re-run
    categ = categ + ['APOE_E4']

# One of the patient visit number has a typo
# (every visit number of this subject is shifted down by one)
typoRows = df_vis.summary_id == 'PWA10-0183'
df_vis.loc[typoRows, 'visit'] = df_vis.loc[typoRows, 'visit'] - 1

# #dump df_vis
# NOTE: the next section reloads data/pickled_df_vis.p; while the dump below stays
# commented out, that file is NOT refreshed by this cell.
# pickle.dump(df_vis, open("data/pickled_df_vis.p", "wb"), protocol=4)   # uncomment if you need to save

# Data Wrangling

## Creating df_vis_byID (grouped by ID)

In [56]:
# load df_vis
# (pickle can execute arbitrary code on load -- only open files written by this notebook)
with open("data/pickled_df_vis.p", "rb") as fh:
    df_vis = pickle.load(fh)

# Revised adjustment
df_vis.loc[df_vis.summary_id=='PWA13-0570', 'trailsbminusa'] = 95

# remove those that have negative disease_duration_dx
# df_vis = df_vis.loc[~df_vis.disease_duration_dx.isin([-803, -2]), ]

# Create dataframe grouped by id and visit, also adding interval between each visit, and type of disease progression
# numeric_only=True is the behaviour older pandas applied by default; pandas >= 2.0
# raises on non-numeric columns without it.
df_vis_byID = df_vis.groupby(['summary_id', 'visit']).mean(numeric_only=True)
df_vis_byID['collectiondate_median'] = pd.to_datetime(df_vis_byID['collectiondate_median'], yearfirst=True, format='%Y%m%d')

# Days since the same subject's previous visit (0 for the subject's first row).
# The difference is taken within each subject, so a subject whose first recorded
# visit is not numbered 1 is never compared with the previous subject's last visit.
visitDates = df_vis_byID['collectiondate_median']
df_vis_byID['interval'] = visitDates.groupby(level='summary_id').diff().dt.days.fillna(0).astype(int)
df_vis_byID['interval(mth)'] = df_vis_byID['interval'].apply(lambda x: round(x/30))
df_vis_byID['interval(yr)'] = df_vis_byID['interval'].apply(lambda x: round(x/365))
df_vis_byID['days_since_1st_visit'] = df_vis_byID['interval'].groupby(level=[0]).cumsum()
df_vis_byID['years_since_1st_visit'] = df_vis_byID['days_since_1st_visit'].apply(lambda x: round(x/365))

# Highest visit number of each subject, repeated on every row of that subject.
visitNumbers = pd.Series(np.asarray(df_vis_byID.index.get_level_values('visit')), index=df_vis_byID.index)
df_vis_byID['n_visit'] = visitNumbers.groupby(level='summary_id').transform('max')


def _progression_label(status):
    """Summarise one subject's sequence of cognitive_status values as a string label.

    - a single distinct status           -> that status as a string
    - non-decreasing sequence            -> the distinct statuses concatenated
    - a 0 (or 1) occurring after a 2     -> '*20*' (or '*21*'); these rows are dropped later
    - anything else                      -> '9999'

    NOTE(review): the statuses are floats after the groupby mean, so the labels are
    built from their float representation; this formatting is kept unchanged.
    """
    observed = np.unique(status)
    if len(observed) == 1:
        return str(observed[0])
    if (status == sorted(status)).all():
        # if status is developmental (0 --> 1 --> 2)
        return ''.join(str(x) for x in observed)
    # The 0/2 pair is checked before the 1/2 pair, and the first matching pair decides.
    for earlier, flag in ((0, '*20*'), (1, '*21*')):
        if {earlier, 2}.issubset(status):
            lastEarlier = np.flatnonzero(status == earlier)[-1]
            firstDementia = np.flatnonzero(status == 2)[0]
            # removing status that is unlikely
            return flag if lastEarlier > firstDementia else '9999'
    return '9999'


progression = []
firstPDYear = []
# sort=False keeps subjects in index order so the per-row lists line up with the frame.
for _, visits in df_vis_byID.groupby(level='summary_id', sort=False):
    status = visits.cognitive_status.values
    PDYears = visits.disease_duration_onset.values
    firstPDYear.extend([PDYears[0]] * len(PDYears))
    progression.extend([_progression_label(status)] * len(status))

df_vis_byID['progression'] = progression
df_vis_byID['firstPDYear'] = firstPDYear
# disease_duration_onset at the subject's first row plus whole years elapsed since then
df_vis_byID['disease_duration_onset_calculated'] = df_vis_byID['firstPDYear'] + df_vis_byID['years_since_1st_visit']
df_vis_byID = df_vis_byID.drop(['firstPDYear'], axis=1)

# Status at the same subject's next row; '9999' when the next row belongs to another subject
# (or there is no next row).
oriID = list(df_vis_byID.index.get_level_values('summary_id'))
shiftedID = oriID[1:] + ['0']
df_vis_byID['next_cognitive_status'] = np.where([oriID[x] == shiftedID[x] for x in range(len(shiftedID))], 
                                    df_vis_byID.cognitive_status.shift(-1).apply(lambda x: x if np.isnan(x) else str(int(x))), '9999')

# binning PD onset
bins = np.linspace(0, 44, 45)
df_vis_byID['PD_onset_binned'] =  np.digitize(df_vis_byID['disease_duration_onset_calculated'], bins)

# Get rid of the *21* or *20*
df_vis_byID = df_vis_byID.loc[~(df_vis_byID.progression.isin(['*21*', '*20*'])), :].copy()

# Move the 10 columns created above (the last 10) right after the first column.
# NOTE(review): both the 10 and the 13 below are positional and must be updated if
# derived columns are added or removed, or if the leading column order changes.
colName = df_vis_byID.columns.tolist()
colName = [colName[0]] + colName[-10:] + colName[1:-10]
df_vis_byID = df_vis_byID[colName]
featureColumns  = colName[13:] # excluding all these progression, intervals, and cognitive_status

# saving
for picklePath, payload in (('data/pickled_df_vis_byID.p', df_vis_byID),
                            ('data/pickled_featureColumns.p', featureColumns),
                            ('data/pickled_multipleCateg.p', multipleCateg),
                            ('data/pickled_categ.p', categ)):
    # the context manager closes (and flushes) the file before the next cell reads it
    with open(picklePath, 'wb') as fh:
        pickle.dump(payload, fh)

## Exporting to MATLAB

In [57]:
# To matlab
def _load_pickle(path):
    """Unpickle `path`, closing the file afterwards (only use on files written by this notebook)."""
    with open(path, 'rb') as fh:
        return pickle.load(fh)


df_vis_byID = _load_pickle('data/pickled_df_vis_byID.p')
featureColumns = _load_pickle('data/pickled_featureColumns.p')
multipleCateg = _load_pickle('data/pickled_multipleCateg.p')
categ = _load_pickle('data/pickled_categ.p')

featureColumnsMATLAB = [i for i in featureColumns if not(i in ['ApoE', 'ApoE2', 'ApoE3', 'ApoE4', 'SNCA_rs356219', 'LRRK2_carrier', 'disease_duration_onset'])]
multipleCategMATLAB = [i for i in multipleCateg if not(i in ['ApoE', 'SNCA_rs356219'])]

Xs = dict()
ys = dict()
scaling = preprocessing.StandardScaler()  #MinMaxScaler #StandardScaler

# One row per subject and year since first visit (visits in the same year are averaged).
# numeric_only=True skips the string/datetime columns, which pandas >= 2.0 would
# otherwise raise on.
df_vis_byYear = df_vis_byID.reset_index().groupby(['summary_id', 'years_since_1st_visit']).mean(numeric_only=True)
X = df_vis_byYear[featureColumnsMATLAB]
y = df_vis_byYear['cognitive_status']

# One-hot encode the multi-level categorical columns and swap them in for the originals.
dummies = pd.get_dummies(X[multipleCategMATLAB], columns=X[multipleCategMATLAB].columns)
X = X.drop(multipleCategMATLAB, axis=1)
X = pd.concat([X, dummies], axis=1)

X = X.rename(columns={'handedness_1.0':'handedness_1', 'handedness_2.0':'handedness_2', 'handedness_3.0':'handedness_3',
                  'dx_dominant_side_1.0':'dx_dominant_side_1', 'dx_dominant_side_2.0':'dx_dominant_side_2',
                  'dx_dominant_side_3.0':'dx_dominant_side_3'})
# Standardise every column whose name is not in `categ`.
# NOTE(review): the one-hot columns created above carry '<column>_<level>' names that
# are not in `categ`, so they are standardised too -- confirm this is intended.
toScale = X.columns[~X.columns.isin(categ)]
X[toScale] = scaling.fit_transform(X[toScale])
# X[X.columns[~X.columns.isin(categ)]] = X[X.columns[~X.columns.isin(categ)]].apply(lambda x: x + abs(min(x)))

# For each follow-up year i: ys[i] is the status at year i, Xs[i] holds the year-0
# features of the subjects that have a row at year i.
for i in range(0, 6):
    ys[i] = y.loc[(y.index.get_level_values('years_since_1st_visit')==i), ]
    ys_index = ys[i].index.get_level_values('summary_id')
    Xs[i] = X.loc[(X.index.get_level_values('years_since_1st_visit')==0) & (X.index.get_level_values('summary_id').isin(ys_index)), ]

for year in Xs:
    Xs[year].reset_index(drop=True).to_csv('data/X_yearsince1stvisit_stdScaled.{}.csv'.format(year))
    ys[year].reset_index(drop=True).to_csv('data/y_yearsince1stvisit_stdScaled.{}.csv'.format(year), header=False)

In [53]:
# Summary statistics of disease_duration_dx in the grouped table
df_vis_byID['disease_duration_dx'].describe()

count    1795.000000
mean        7.611699
std         6.150036
min         0.000000
25%         3.000000
50%         6.000000
75%        11.000000
max        41.000000
Name: disease_duration_dx, dtype: float64

## Export to R

In [18]:
# Uses df_vis_byID, featureColumns, multipleCateg and categ as they currently exist in
# the kernel. To start from the saved pickles instead, reload them first:
# df_vis_byID = pickle.load(open('data/pickled_df_vis_byID.p', 'rb'))
# featureColumns = pickle.load(open('data/pickled_featureColumns.p', 'rb'))
# multipleCateg = pickle.load(open('data/pickled_multipleCateg.p', 'rb'))
# categ = pickle.load(open('data/pickled_categ.p', 'rb'))

# Full grouped table, index included
df_vis_byID.to_csv('data/csv_df_vis_by_ID.csv')

# Column-name lists, one name per row, without a header line
for nameList, csvPath in ((featureColumns, 'data/csv_featureColumns.csv'),
                          (multipleCateg, 'data/csv_multipleCateg.csv'),
                          (categ, 'data/csv_categ.csv')):
    pd.Series(nameList).to_csv(csvPath, header=False)