In [None]:
#imports
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# render matplotlib figures inline in the notebook output
%matplotlib inline
# make seaborn's "Set2" palette the default colour cycle for the plots below
sns.set_palette("Set2")

In [None]:
#load data
# Forward slashes resolve on Windows, macOS and Linux alike; the previous
# backslash-separated path could only be found on Windows.
raw_ds=pd.read_csv("../data/raw/investigator_nacc50.csv")


In [None]:
# preview the first five rows of the raw table
raw_ds.head(n=5)

In [None]:
# list every column label of the raw table
raw_ds.columns.tolist()

In [None]:
# (number of rows, number of columns) of the raw table
raw_ds.shape

In [None]:
# histogram of the NACCALZD codes over all rows of the raw table
fig, ax = plt.subplots()
ax.hist(raw_ds['NACCALZD']);

In [None]:
# NACCALZD == 1: presumptive etiologic diagnosis (could be contributing)
n_presumptive_ad = (raw_ds['NACCALZD'] == 1).sum()
# NACCETPR == 1: primary etiologic diagnosis
n_primary_ad = (raw_ds['NACCETPR'] == 1).sum()
print(n_presumptive_ad)
print(n_primary_ad)

In [None]:
# NACCUDSD codes: 3 = MCI, 4 = dementia. NACCETPR == 1 marks AD as the primary etiology;
# it is not equivalent to NACCALZD == 1 (compare the two counts printed above).
is_mci = raw_ds['NACCUDSD'] == 3
is_dementia = raw_ds['NACCUDSD'] == 4
etiology_is_ad = raw_ds['NACCETPR'] == 1
etiology_is_99 = raw_ds['NACCETPR'] == 99

# unique subject ids with an MCI row whose primary etiology is coded 1 or 99
MCI_index = np.unique(raw_ds.loc[is_mci & (etiology_is_ad | etiology_is_99), ['NACCID']])
# unique subject ids with a dementia row whose primary etiology is coded 1
Dem_index = np.unique(raw_ds.loc[is_dementia & etiology_is_ad, ['NACCID']])

# subjects present in both groups
combi_i = set(MCI_index) & set(Dem_index)
len(combi_i)


In [None]:
# all rows of the etiologic-AD subjects that had MCI at one point and developed dementia
is_converter_row = raw_ds['NACCID'].isin(combi_i)
AD_MCI = raw_ds.loc[is_converter_row]


In [None]:
#add the subjects in MCI_index that are not in combi_i (MCI at one point, no AD-dementia row)
# NOTE(review): nod_index is MCI_index minus combi_i, so a subject whose dementia
# rows all have NACCETPR != 1 stays in this group -- confirm that is intended.
nod_index=set(MCI_index).difference(combi_i)
# Build the frame from raw_ds on every run instead of appending to the existing
# AD_MCI: re-running this cell then no longer duplicates the non-converter rows.
AD_MCI=pd.concat([
    raw_ds.loc[raw_ds['NACCID'].isin(combi_i)],
    raw_ds.loc[raw_ds['NACCID'].isin(nod_index)],
])
len(nod_index)

In [None]:
# number of subject ids in combi_i
len(combi_i)

In [None]:
# build a datetime column from the separate year / month / day columns
AD_MCI['visitdate'] = pd.to_datetime(
    {'year': AD_MCI.VISITYR, 'month': AD_MCI.VISITMO, 'day': AD_MCI.VISITDAY}
)

# target table: one row per subject, 1.0 for ids in combi_i, 0.0 for ids in nod_index
idi = list(combi_i) + list(nod_index)
scores = np.repeat([1.0, 0.0], [len(combi_i), len(nod_index)])
op = {'NACCID': idi, 'target': scores}
output = pd.DataFrame(op)

In [None]:
#add interval (in days) between each visit and the subject's next visit, with last visit being NaN
# Vectorised: work on a positionally indexed copy so the result can be put back
# into AD_MCI's own row order, whatever order the subjects appear in.
visits = AD_MCI[['NACCID', 'NACCVNUM', 'NACCAVST', 'visitdate']].reset_index(drop=True)
visits_sorted = visits.sort_values(['NACCID', 'NACCVNUM'])
by_subject = visits_sorted.groupby('NACCID', sort=False)

# date and visit number of the following row within the same subject (NaT/NaN on the last row)
next_date = by_subject['visitdate'].shift(-1)
next_num = by_subject['NACCVNUM'].shift(-1)
days_to_next = (next_date - visits_sorted['visitdate']).dt.days

# keep the interval only when the following row really is visit n+1 and the
# current visit is not the one numbered NACCAVST; everything else becomes NaN
has_next_visit = (
    next_num.eq(visits_sorted['NACCVNUM'] + 1)
    & visits_sorted['NACCVNUM'].ne(visits_sorted['NACCAVST'])
)

# sort_index() restores the original row positions, so this array lines up
# one-to-one with the rows of AD_MCI when it is assigned as a column
visit_interval = days_to_next.where(has_next_visit).sort_index().to_numpy(dtype=float)

In [None]:
# Positional assignment of an array: visit_interval must hold exactly one value
# per AD_MCI row, in AD_MCI's current row order.
AD_MCI['visit_interval']=visit_interval

In [None]:
#faster solution with no loops? (same idea as pct_change = close[1:]/close[:-1])

#make sure the df is sorted by NACCID and then by NACCVNUM
#create a column with visitdate shifted up by one row
#subtract visitdate from it
#find the rows where NACCVNUM == NACCAVST and replace the result with NaN



In [None]:
# attach each subject's target value to every one of its rows
target_by_subject = output.set_index('NACCID')
AD_MCI = AD_MCI.join(target_by_subject, on='NACCID')

In [None]:
# display the combined table, now including the visit_interval and target columns
AD_MCI

In [None]:
#plot the interval between visits and the number of visits for the two categories
f, axes = plt.subplots(1, 2, figsize=(12, 5))

# left panel: interval to the next visit, grouped by NACCAVST and split by target
sns.boxenplot(x="NACCAVST", y="visit_interval", hue="target", showfliers=False,
              data=AD_MCI, ax=axes[0])
axes[0].set(ylim=(200, 1500))

# right panel: one row per subject. Average only the two columns that are plotted:
# a mean over every column raises TypeError on non-numeric columns in pandas >= 2.0.
per_subject = AD_MCI.groupby('NACCID')[['NACCAVST', 'target']].mean()
sns.countplot(x="NACCAVST", hue="target", data=per_subject, ax=axes[1])

# the per-subject means are floats; re-format the tick labels with "{:g}" (e.g. "3.0" -> "3")
labels = [label.get_text() for label in axes[1].get_xticklabels()]
axes[1].set_xticklabels(["{:g}".format(float(x)) for x in labels]);



In [None]:
# drop everything that is not MCI & dem (we won't take into account healthy history.
#Reconsider this step if building historic model)
clean_AD_MCI=AD_MCI.loc[(AD_MCI['NACCUDSD']>=3)]
print(clean_AD_MCI.shape)
#drop NACCIDs with just one visit
rows_per_id=clean_AD_MCI['NACCID'].map(clean_AD_MCI['NACCID'].value_counts())
clean_AD_MCI=clean_AD_MCI[rows_per_id >= 2]
print(clean_AD_MCI.shape)
# drop everything that is not MCI
clean_AD_MCI=clean_AD_MCI.loc[(clean_AD_MCI['NACCUDSD']==3)]
print(clean_AD_MCI.shape)

#check again whether any subject in nod_index is left with a single row after
#the MCI-only filter (such "orphaned" subjects might need to be dropped).
# The earlier test `count < 2 & count > 0` parsed as `count < (2 & count) > 0`,
# because & binds tighter than the comparisons, and so never matched; count the
# subjects with exactly one remaining row directly instead.
mci_rows_per_id=clean_AD_MCI['NACCID'].value_counts()
nod_rows_per_id=mci_rows_per_id[mci_rows_per_id.index.isin(nod_index)]
c=int((nod_rows_per_id == 1).sum())

print(c)


In [None]:
#recheck distros on the cleaned table
f, axes = plt.subplots(1, 2, figsize=(12, 5))

# left panel: interval to the next visit, grouped by NACCAVST and split by target
sns.boxenplot(x="NACCAVST", y="visit_interval", hue="target", showfliers=False,
              data=clean_AD_MCI, ax=axes[0])
axes[0].set(ylim=(200, 1500))

# right panel: one row per subject. Average only the two columns that are plotted:
# a mean over every column raises TypeError on non-numeric columns in pandas >= 2.0.
per_subject_clean = clean_AD_MCI.groupby('NACCID')[['NACCAVST', 'target']].mean()
sns.countplot(x="NACCAVST", hue="target", data=per_subject_clean, ax=axes[1])

# the per-subject means are floats; re-format the tick labels with "{:g}" (e.g. "3.0" -> "3")
labels = [label.get_text() for label in axes[1].get_xticklabels()]
axes[1].set_xticklabels(["{:g}".format(float(x)) for x in labels]);

In [None]:
# remove every column whose values are all NaN
clean_AD_MCI = clean_AD_MCI.dropna(axis='columns', how='all')

In [None]:
#save this dataframe (forward slashes avoid invalid escape sequences and work on every OS)
#clean_AD_MCI.to_csv('../data/interim/AD_MCI.csv')