In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm

from itertools import cycle

from sklearn import svm, datasets
from sklearn.metrics import *
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.linear_model import LogisticRegression
from scipy import interp
from sklearn.metrics import roc_auc_score

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
#Set two decimal points for display
pd.options.display.float_format = "{:.2f}".format

# Load the Data

In [4]:
import pyreadstat
uds, meta = pyreadstat.read_sas7bdat("C:/Users/tsb7592/Downloads/armada_uds_sep21_di.sas7bdat")
uds.shape

(1046, 652)

In [5]:
uds =uds[uds.assmnt ==1]

In [25]:
uds.ADCNAME.unique()

array(['Emory', 'Mayo', 'Northwestern',
       'University of Pittsburgh - ARMADA', 'U California, San Diego',
       'Massachusetts - ARMADA', 'U Wisconsin', 'U Michigan ADC',
       'Columbia WHICAP'], dtype=object)

In [31]:
uds[uds.NACCMOCA.isnull()]['ADCNAME'].unique()

array(['University of Pittsburgh - ARMADA', 'Massachusetts - ARMADA',
       'U Wisconsin', 'U California, San Diego', 'Columbia WHICAP'],
      dtype=object)

In [32]:
uds[uds.ADCNAME=='Emory']['MOCATOTS'].describe()

count   57.00
mean    22.88
std      6.61
min     -4.00
25%     21.00
50%     24.00
75%     27.00
max     29.00
Name: MOCATOTS, dtype: float64

In [33]:
uds[uds.ADCNAME=='U Wisconsin']['MOCATOTS'].describe()

count   110.00
mean     24.17
std       8.43
min      -4.00
25%      22.00
50%      25.00
75%      28.00
max      88.00
Name: MOCATOTS, dtype: float64

In [34]:
missing = uds[(uds.MOCATOTS==-4)|(uds.MOCATOTS==88)|(uds.MOCATOTS.isnull())|(uds.NACCMOCA==-4)|(uds.NACCMOCA==88)|(uds.NACCMOCA.isnull())]
missing[['ptid','ADCNAME','MOCATOTS','NACCMOCA']].head()

Unnamed: 0,ptid,assmnt,ADCNAME,MOCATOTS,NACCMOCA
10,1.48786,1.0,Emory,-4.0,-4.0
44,101149492.0,1.0,Mayo,88.0,88.0
70,142.0,1.0,University of Pittsburgh - ARMADA,,
101,2.0,1.0,University of Pittsburgh - ARMADA,,
136,2090.0,1.0,Northwestern,-4.0,-4.0


In [41]:
missing.MOCATOTS.isnull().groupby(missing['ADCNAME']).sum().astype(int)

ADCNAME
Columbia WHICAP                       0
Emory                                 0
Massachusetts - ARMADA                0
Mayo                                  0
Northwestern                          0
U California, San Diego               0
U Wisconsin                           1
University of Pittsburgh - ARMADA    14
Name: MOCATOTS, dtype: int32

In [45]:
missing.NACCMOCA.isnull().groupby(missing['ADCNAME']).sum().astype(int)

ADCNAME
Columbia WHICAP                      180
Emory                                  0
Massachusetts - ARMADA               105
Mayo                                   0
Northwestern                           0
U California, San Diego               30
U Wisconsin                           25
University of Pittsburgh - ARMADA     36
Name: NACCMOCA, dtype: int32

In [48]:
# Create a Pandas Excel writer using XlsxWriter as the engine.
writer = pd.ExcelWriter('C:/Users/tsb7592/Downloads/MISSING_MOCA.xlsx', engine='xlsxwriter')

#a. rows with missing moca
missing[['ptid','ADCNAME','MOCATOTS','NACCMOCA']].to_excel(writer, sheet_name='Missing_data', index=False)

#b. summary of missing MOCATOTS
missing.MOCATOTS.isnull().groupby(missing['ADCNAME']).sum().astype(int).to_excel(writer, sheet_name='missing_counts_MOCATOTS')

#c. summary of missing NACCMOCA
missing.NACCMOCA.isnull().groupby(missing['ADCNAME']).sum().astype(int).to_excel(writer, sheet_name='missing_counts_NACCMOCA')

# Close the Pandas Excel writer and output the Excel file.
writer.save()

In [36]:
#load the missing info
mdf = pd.read_csv("C:/Users/tsb7592/Downloads/UDS_MOCA_MISSING2[72153].csv")

In [37]:
mdf.shape

(277, 5)

In [38]:
mdf.columns=['ADCID', 'ptid', 'MOCATOTS', 'Reason', 'ADCNAME']

In [39]:
df = pd.merge(mdf,uds[['ptid','ADCID','MOCATOTS','MMSECOMP','NACCMMSE']], on=['ptid','ADCID'],how='left')

In [41]:
df.shape

(277, 8)

# 2. Verify that all the ptids are indeed missing: that is, all the missing IDs that are listed have the values -4, 88, or NA

In [31]:
#Inconsistancy in MOCATOTS
df[df.MOCATOTS_x != df.MOCATOTS_y]

Unnamed: 0,ADCID,ptid,MOCATOTS_x,Reason,ADCNAME,MOCATOTS_y,MMSECOMP,NACCMMSE
0,6,1.46421,-4.0,Telephone packet,Emory,10.0,-4.0,-4.0
1,6,1.46422,-4.0,Telephone packet,Emory,22.0,-4.0,-4.0
3,6,713886,-4.0,Telephone packet,Emory,21.0,-4.0,-4.0
9,37,ARMADA007,,Matches data submitted,U Wisconsin,,-4.0,-4.0
10,43,UM00000700,-4.0,Telephone packet,U Michigan ADC,20.0,-4.0,-4.0
11,43,UM00001444,-4.0,Telephone packet,U Michigan ADC,30.0,-4.0,-4.0
12,73,142,,Center does not upload C2,University of Pittsburgh - ARMADA,,-4.0,-4.0
13,73,2,,Center does not upload C2,University of Pittsburgh - ARMADA,,-4.0,-4.0
14,73,2177,,Center does not upload C2,University of Pittsburgh - ARMADA,,-4.0,-4.0
15,73,2188,,Center does not upload C2,University of Pittsburgh - ARMADA,,-4.0,-4.0


# 3. For these ptids, look at their values for MMSECOMP. What is their value? (should only have values 0, 1, or -4). Please provide a table of counts by cohort

# 4. For these ptids, look at their value for NACCMMSE. What is their value? Please provide a table of counts by cohort (can submit in the email).

In [33]:
df.MMSECOMP.value_counts()

-4.00    276
Name: MMSECOMP, dtype: int64

In [34]:
df.NACCMMSE.value_counts()

-4.00    276
Name: NACCMMSE, dtype: int64

In [42]:
df.to_csv('C:/Users/tsb7592/Downloads/UDS_MOCA_MISSING2_new.csv', index=False)

In [103]:
marital ={1:"Married", 2:"Widowed", 3:"Divorced",
4:"Separated", 5:"Never married (or marriage was annulled) ",
6:"Living as married/domestic partner", 9:"Other  or  unknown"}

In [104]:
#Rename the value under column cohort2
uds.MARISTAT = uds.MARISTAT.map(marital)