In [None]:
#imports
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import feature_selection
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from pivottablejs import pivot_ui

In [None]:
%matplotlib inline
# Use seaborn's "Set2" colour palette for the plots drawn below.
sns.set_palette("Set2")

In [None]:
#load data
# Forward slashes keep this relative path valid on Windows, Linux and macOS;
# the previous backslash-separated literal only resolved on Windows.
raw_ds=pd.read_csv("../data/interim/AD_MCI.csv")


In [None]:
#In this notebook: focus first on the most recent visit from the demented diagnosis, and drop NaN columns
#start variable cleaning / one-hot encoding (probably not efficient?)

In [None]:
#drop rows with visit_interval == nan;
#drop every row per id but the one with the highest NACCVNUM
# The frame shape is printed before and after each filtering step.
print(raw_ds.shape)
one_ts=raw_ds.dropna(subset=['visit_interval'])
print(one_ts.shape)
grouped=one_ts.groupby('NACCID')
max_i = grouped['NACCVNUM'].idxmax()
one_ts=one_ts.loc[one_ts.index.isin(max_i.values)]
print(one_ts.shape)


In [None]:
# Show the filtered frame as the cell output.
one_ts

In [None]:
#sanity check: True when the number of distinct NACCID values equals the number of rows
one_ts['NACCID'].size == one_ts['NACCID'].nunique()

In [None]:
#recheck distros
f, axes = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: visit_interval per NACCAVST value, split by target, y-axis clipped to 200-1500.
sns.boxenplot(x="NACCAVST", y="visit_interval", hue="target",showfliers=0,
            data=one_ts,ax=axes[0])
axes[0].set(ylim=(200, 1500))

# Right panel: count of NACCAVST values per target on the per-NACCID means.
# numeric_only=True: one_ts may still contain non-numeric columns at this point, and
# current pandas raises on them instead of silently leaving them out of the mean.
sns.countplot(x="NACCAVST", hue="target",
            data=one_ts.groupby(['NACCID']).mean(numeric_only=True),ax=axes[1]);
# The means are floats, so reformat the tick labels with "{:g}" to drop the trailing ".0".
labels = [label.get_text() for label in axes[1].get_xticklabels()]
axes[1].set_xticklabels(["{:g}".format(float(x)) for x in labels]);

In [None]:
# Drop the columns that contain no values at all.
one_ts=one_ts.dropna(axis=1,how='all')
# describe has to be called; the bare attribute only displayed the bound method's repr.
one_ts.describe()

In [None]:
#nan time
def summary(df):
    """Build a per-column overview of ``df``.

    Returns a DataFrame with one row per column of ``df`` and the columns
    ``Name``, ``dtypes``, ``Missing`` (null count), ``Missing%`` (null count
    as a percentage of the row count) and ``Uniques`` (``nunique`` per column),
    sorted by ``Missing`` and then ``Uniques``, both descending.
    """
    missing = df.isnull().sum().values
    report = pd.DataFrame({
        'Name': df.columns,
        'dtypes': df.dtypes.values,
        'Missing': missing,
        'Missing%': (missing / df.shape[0]) * 100,
        'Uniques': df.nunique().values,
    })
    return report.sort_values(by=['Missing', 'Uniques'], ascending=[False, False])

In [None]:
#what are the nulls
pd.set_option('display.min_rows', 500)
# min_rows is capped by max_rows once a frame is truncated, so max_rows has to be
# raised as well; otherwise the summary is still cut at the default row limit.
pd.set_option('display.max_rows', 500)
summary(one_ts)

In [None]:
#handle the drugs
# Every column whose name contains 'DRUG' is moved into `drugs`; one_ts keeps only
# `drug_count`, the number of non-null DRUG entries per row.
drug_mask=one_ts.columns.str.contains('DRUG')
drugs=one_ts.loc[:,drug_mask]
#and drop the drugs
# count(axis=1) replaces the row-by-row apply, and assign() avoids writing into a slice.
one_ts=one_ts.loc[:,~drug_mask].assign(drug_count=drugs.count(axis=1))

drugs.shape

In [None]:
#drop all other cols (ending with X)
# NOTE(review): endswith('X') matches every column name ending in X, whatever its
# content - confirm no wanted variable is caught by this filter.
one_ts = one_ts.loc[:, ~one_ts.columns.str.endswith('X')]
#then drop all columns whose share of nan is 20% or more
one_ts = one_ts.loc[:, one_ts.isnull().mean() < 0.2]
#and keep only columns with fewer than 1000 distinct values
one_ts = one_ts.loc[:, one_ts.nunique() < 1000]

In [None]:
def plt_cat(df,target):
    """Plot every column of ``df`` except ``target`` against ``target``.

    Columns are laid out three per figure. For each column the top row shows the
    within-class proportion of every value (bar plot, one hue per target class)
    and the bottom row shows a boxen plot of the column per target class.

    Parameters
    ----------
    df : DataFrame holding the feature columns and the ``target`` column.
    target : name of the grouping column; it is never plotted itself.

    Returns
    -------
    None; figures are created through matplotlib.
    """
    # Filtering the target out up front keeps it from using up a subplot slot.
    features = [c for c in df.columns if c != target]
    # One figure per chunk of three features; a figure is only opened when there
    # is something to draw, so no empty trailing figure is produced.
    for start in range(0, len(features), 3):
        fig = plt.figure(figsize = (15,10))
        for slot, c in enumerate(features[start:start + 3], start=1):
            ax1 = fig.add_subplot(2,3,slot)
            df1 = df.groupby(target)[c].value_counts(normalize=True)
            df1 = df1.rename('norm').reset_index()
            sns.barplot(x=c,y='norm',hue=target,data=df1,ax = ax1)

            ax2 = fig.add_subplot(2,3,slot+3)
            sns.boxenplot(data=df, x=target, y=c, ax=ax2)

In [None]:
#lets plot the remaining variables with nans
# Select the columns that contain at least one null, attach the target, and plot them.
has_nan=one_ts.isnull().mean() > 0
t=one_ts.loc[:,has_nan].assign(target=one_ts["target"])
plt_cat(t,"target")

In [None]:
#useless drop function
def drop_useless(df, cols):
    """Remove from ``df``, in place, every column named in ``cols``.

    Names in ``cols`` that are not columns of ``df`` are skipped silently.
    Returns None.
    """
    # dict.fromkeys de-duplicates while keeping order, so a repeated name is dropped once.
    present = [c for c in dict.fromkeys(cols) if c in df.columns]
    df.drop(columns=present, inplace=True)

In [None]:
#All these nan columns have the majority of data unavailable (-4) as they come from a merged table of brain autopsy details and majority of patients are alive or didn't donate their brain.
#Therefore discard and let's look at the rest
one_ts=one_ts[one_ts.columns[one_ts.isnull().mean() == 0]]
#from here we can take out those with -4 and 0, vars with one number only, and vars that are part of the neuropathology database and most of genomic
one_ts=one_ts.loc[:,~one_ts.columns.str.startswith(('ADGC','NGDS'))]
# Every column positioned from 'NPFORMVER' through 'NACCINT' (label slice, inclusive), plus two named extras.
cols=one_ts.loc[:,'NPFORMVER':'NACCINT'].columns
useless=pd.Index(['NCDSWEAC', 'NACCNCRD'])
cols=cols.append(useless)
drop_useless(one_ts,cols)

# Columns holding a single distinct value.
one_ts.drop(one_ts.columns[one_ts.nunique() == 1],axis=1,inplace=True)
#where all is -4 and another number
# Combine the two per-column boolean Series before indexing: `Index & Index` is no
# longer a set intersection in current pandas, so the old expression stopped working.
binary_with_missing = (one_ts.nunique()==2) & one_ts.isin([-4]).any()
one_ts.drop(one_ts.columns[binary_with_missing],axis=1,inplace=True)

In [None]:
#where there's only -4 unique per target?
# For each target class, collect the distinct values of every other column so that
# columns which only ever take -4 within a class can be spotted in the output.
g = one_ts.groupby('target').agg(['unique'])
g

In [None]:
#useless/dangerous by inspection
# Hand-picked column names to remove from one_ts.
# NOTE(review): 'MACCFFTD' and 'LOFIYR' look like typos (every neighbour starts with
# NACC / LOGI, and 'LOGIYR' appears later in the list). drop_useless skips names that
# are not columns without any error, so a misspelt column stays in one_ts - confirm.
useless=['NACCADC','PACKET','FORMVER','VISITMO','VISITDAY','VISITYR','NACCVNUM','NACCCORE','NACCREAS','BIRTHMO','INBIRMO','INHISPOR','INRACE', 'INRASEC','INRATER','INEDUC','INKNOWN','NACCAMS','NACCFM','NACCFMS','NACCOM','NACCOMS','MACCFFTD','HATTMULT','HATTYEAR','NACCSTYR','TIAMULT','NACCTIYR','PDYR','PDOTHRYR','TBIYEAR','COGFLAGO','BEVHAGO','BEREMAGO', 'BEAGE','PARKAGE','MOAGE','ALSAGE','LOGIMO','LOGIDAY','LOFIYR','LOGIPREV','MOCAREAS','NORMCOG','DEMENTED','AMNDEM','PCA','NACCPPAG','PSPIF','NACCBEHF','NACCNRDY','NACCNRMO','NACCDSYR','NACCDSMO','NACCDSDY','NACCYOD','NACCMOD','LOGIYR','NACCIDEM','NACCDIED','NACCAUTP']

drop_useless(one_ts,useless)


In [None]:
#numerical cols: columns with more than 10 distinct values, plus a few named by hand
#numerical_cols=['NACCAVST','NACCNVST','BRTHYR','EDUC', 'INBIRYR','SMOKYRS','QUITSMOK','HEIGHT','WEIGHT']
others=['MMSEORDA','MMSEORLO','PENTAGON','NACCNRYR']
numerical_cols=one_ts.columns[one_ts.nunique().gt(10)]
numerical_cols=numerical_cols.append(pd.Index(others))

In [None]:
# Show which columns ended up in the numerical set.
numerical_cols

In [None]:
#and plot
# Everything that is not in numerical_cols is plotted against the target.
not_numerical=~one_ts.columns.isin(numerical_cols)
plt_cat(one_ts.loc[:,not_numerical],"target")

In [None]:
#histogram of visit_interval, one distribution per target class
# NOTE(review): distplot is deprecated in recent seaborn releases - confirm the installed
# version before switching to histplot/kdeplot.
sns.distplot(one_ts["visit_interval"][one_ts["target"]==0])
sns.distplot(one_ts["visit_interval"][one_ts["target"]==1])
#check if everything is numeric: the numeric selection must keep every column.
# (len() of a frame is its row count, so the previous len(...) == shape[0] test was always True.)
one_ts.select_dtypes(include=['int64','float64']).shape[1]==one_ts.shape[1]

In [None]:
#pearson correl for visualization
# .copy(): n_ds is modified in place further down, so it must be an independent frame
# rather than a slice of one_ts.
n_ds =one_ts[numerical_cols].copy()
# Order the columns by the sum of their correlations so related variables sit together.
sum_corr = n_ds.corr().sum().sort_values(ascending=False).index.values
plt.figure(figsize=(13, 13))
sns.heatmap(n_ds[sum_corr].corr())
#some variables are coding the same, we will have to drop: examples age & birthyear, drugnumber, bmi and weight


In [None]:
#split numerical in training sets
X_train, X_test, y_train, y_test = train_test_split(n_ds, one_ts['target'], test_size=0.35, random_state=1)

def seeded_mutual_info(X, y):
    """Mutual information of each feature with ``y``, computed with a fixed seed."""
    # mutual_info_classif is randomised internally; without a seed the scores (and
    # therefore which of two correlated features gets dropped later) change per run.
    return feature_selection.mutual_info_classif(X, y, random_state=1)

# configure to select all features
fs = feature_selection.SelectKBest(score_func=seeded_mutual_info, k='all')
# learn relationship from training data
fs.fit(X_train, y_train)
# one score per column of n_ds, in column order
target_vec=fs.scores_

In [None]:
def feature_drop(corr_matrix, target_vec):
    """List the columns to discard from strongly correlated pairs.

    Every pair of columns below the diagonal of ``corr_matrix`` whose absolute
    correlation exceeds 0.85 contributes one name: the member of the pair with
    the lower entry in ``target_vec`` (on a tie, the later column). A name is
    repeated once for every pair it loses.

    Parameters
    ----------
    corr_matrix : square DataFrame of pairwise correlations.
    target_vec : sequence of scores, positionally aligned with the columns.

    Returns
    -------
    list of column names, in the order the pairs are visited.
    """
    names = corr_matrix.columns
    losers = []
    for row in range(len(names)):
        for col in range(row):
            # `not (... > 0.85)` keeps NaN correlations out, exactly like a plain `>` test.
            if not abs(corr_matrix.iloc[row, col]) > 0.85:
                continue
            weaker = col if target_vec[row] > target_vec[col] else row
            losers.append(names[weaker])
    return losers


In [None]:
# Names flagged by feature_drop (with repeats), removed from both frames.
to_drop_num=feature_drop(n_ds.corr(), target_vec)
unique_drops=np.unique(to_drop_num)
for frame in (one_ts, n_ds):
    drop_useless(frame, unique_drops)
up_numerical_cols = set(numerical_cols).difference(unique_drops)

# number of flagged pairs (repeats included)
len(to_drop_num)

In [None]:
#plot numerical
# assign() returns a new frame, so n_ds itself is left without the target column.
temp=n_ds.assign(target=one_ts['target'])
plt_cat(temp,"target")


In [None]:
#other exclusions


In [None]:
#quick tree
# The label must not be part of the feature matrix: with 'target' left in one_ts the
# tree simply reads the answer back and the reported accuracy is meaningless.
X_train, X_test, y_train, y_test = train_test_split(one_ts.drop(columns='target'), one_ts['target'], test_size=0.35, random_state=1)
# random_state makes the fitted tree, and so the printed score, repeatable.
model = DecisionTreeClassifier(random_state=1)
model.fit(X_train,y_train)
print(model.score(X_test, y_test))

In [None]:
#feature selection
y= one_ts['target']
X = one_ts.drop('target', axis=1)
# Seeded so the importance ranking - and the top-50 selection built from it in later
# cells - is the same on every run.
# NOTE(review): the model is fitted on all rows, including those later used as test data.
model = ExtraTreesClassifier(random_state=1)
model.fit(X,y)
#plot graph of feature importances for better visualization
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
feat_importances.nlargest(50).plot(kind='barh')
plt.show()


In [None]:
c='INHISP'#'NACCAM'#NACCALZD
target='target'
# Share of each value of column c within every target class, drawn as grouped bars.
df1 = (
    one_ts.groupby(target)[c]
    .value_counts(normalize=True)
    .rename('norm')
    .reset_index()
)
sns.barplot(data=df1,x=c,y='norm',hue=target)

In [None]:
c='NACCADMD'
target='target'
# Share of each value of column c within every target class, drawn as grouped bars.
df1 = (
    one_ts.groupby(target)[c]
    .value_counts(normalize=True)
    .rename('norm')
    .reset_index()
)
sns.barplot(data=df1,x=c,y='norm',hue=target)

In [None]:
# Names of the 50 features with the highest importance.
res=feat_importances.nlargest(50).index

In [None]:
#quick tree restricted to the 50 highest-ranked features
X_train, X_test, y_train, y_test = train_test_split(one_ts[res.to_list()], one_ts['target'], test_size=0.35, random_state=1)
# random_state makes the fitted tree, and so the three metrics below, repeatable.
model = DecisionTreeClassifier(random_state=1)
model.fit(X_train,y_train)
y_pred=model.predict(X_test)
# accuracy, precision, then recall (the last one is the cell output)
print(model.score(X_test, y_test))
print(precision_score(y_test,y_pred))
recall_score(y_test,y_pred)

In [None]:
# res taken
# errors='ignore': the listed names are only dropped when they are actually among the
# selected features; a name missing from `res` would otherwise raise a KeyError.
taken=one_ts[res.to_list()].drop(columns=['NACCADMD','NACCAVST', 'NACCAM', 'NACCALZD','NACCALZP','INHISP','NEWINF'], errors='ignore')
X_train, X_test, y_train, y_test = train_test_split(taken, one_ts['target'], test_size=0.5, random_state=1)
# random_state makes the fitted tree, and so the three metrics below, repeatable.
model = DecisionTreeClassifier(random_state=1)
model.fit(X_train,y_train)
y_pred=model.predict(X_test)
# accuracy, precision, then recall (the last one is the cell output)
print(model.score(X_test, y_test))
print(precision_score(y_test,y_pred))
recall_score(y_test,y_pred)

In [None]:
# Class balance of the target. The column is named through x=/data= (as in the earlier
# countplot call) because seaborn no longer treats a positional Series as the x variable.
sns.countplot(x='target', data=one_ts)

In [None]:
# KMO statistic on one_ts; kmo_model is displayed as the cell output.
# NOTE(review): presumably kmo_all holds the per-variable values and kmo_model the
# overall one - confirm against the factor_analyzer docs.
# NOTE(review): one_ts still contains the 'target' column here - confirm that is intended.
from factor_analyzer.factor_analyzer import calculate_kmo
kmo_all,kmo_model=calculate_kmo(one_ts)
kmo_model


In [None]:
# FactorAnalyzer is the class instantiated below; it was never imported in this notebook
# (only an unused `analyze` name was), so this cell raised NameError on a fresh kernel.
from factor_analyzer import FactorAnalyzer

# Unrotated fit, used only to read off the eigenvalues.
fa = FactorAnalyzer(rotation=None)
fa.fit(one_ts)
# Check Eigenvalues
ev, v = fa.get_eigenvalues()
ev

In [None]:
# Eigenvalues against their 1-based position, with the x-axis limited to 0-20.
positions=range(1,one_ts.shape[1]+1)
plt.plot(positions,ev)
plt.xlim(0, 20)

In [None]:
# Refit with 10 factors and a varimax rotation, then print the loadings.
fa = FactorAnalyzer(n_factors=10, rotation='varimax')
fa.fit(one_ts)
loads = fa.loadings_
print(loads)

In [None]:
# Dimensions of the loadings array.
loads.shape

In [None]:
# Index positions of every loading greater than 0.5.
# NOTE(review): the comparison is signed, so large negative loadings never match - confirm.
filt_loads=np.argwhere(loads>.5)

In [None]:
# Number of loadings above the threshold (rows) and index dimensions (columns).
filt_loads.shape

In [None]:
# Position along axis 0 of the largest loading in each column of `loads`.
# NOTE(review): argmax is signed, so a strongly negative loading is never picked - confirm abs() is not wanted.
a=np.argmax(loads, axis=0)
a

In [None]:
# Translate those positions into column names of one_ts.
one_ts.columns[a]

In [None]:
# Forward slashes keep this relative path valid on Windows, Linux and macOS;
# the previous backslash-separated literal only resolved on Windows.
dd=pd.read_csv("../docs/rdd_datadictionary_uds.csv")

In [None]:
# Show the loaded data-dictionary frame.
dd

In [None]:
# Distinct values of the DataType column.
dd.DataType.unique()

In [None]:
# Distinct values of the Form column.
dd.Form.unique()

In [None]:
# Print every row of AllowableCodes. The row limit is lifted only inside this block,
# so the display settings of the rest of the notebook are left as they were.
with pd.option_context('display.max_rows', dd.shape[0]+1):
    print(dd.AllowableCodes)

In [None]:
# Final number of rows and columns of one_ts.
one_ts.shape

In [None]:
# Open one_ts in the pivottablejs pivot-table UI.
pivot_ui(one_ts)