In [11]:
import pandas as pds
import numpy as np
import seaborn as sns
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report

NHANES cognitive data downloaded from https://www.icpsr.umich.edu/web/about/cms/3625/

In [3]:
demo_df = pds.read_table('../refined_data/demographics.tsv',  index_col='SEQN')
nhanescog_df = pds.read_csv('../data/nhanescog_2011_12_CSV.csv', index_col='seqn')
nhanescog_df.index.name = 'SEQN' # change index name for merging
perio_df = pds.read_table('../refined_data/perio_summary.tsv', index_col='SEQN')

In [4]:
# merge demographics, cognitive functioning scores, and perio
merged_df = \
    demo_df.merge(
        nhanescog_df.merge(perio_df, how='inner', on='SEQN'),
        how='inner',
        on='SEQN'
    )
len(merged_df)

837

In [5]:
list(nhanescog_df.columns)

['year',
 'ridstatr',
 'riagendr',
 'female',
 'ridageyr',
 'age_cat',
 'ridreth1',
 'race',
 'ridreth3',
 'dmdeduc2',
 'edu_cat',
 'cfq_present',
 'cfastat',
 'cfdccs',
 'cfdcst1',
 'cfdcst2',
 'cfdcst3',
 'cfdcsr',
 'cfdast',
 'cfdds',
 'cerad_sum',
 'z_cerad_re',
 'z_animal_re',
 'z_digit_re',
 'z_delayed_re',
 'z_global_re',
 'low_cerad_re',
 'low_animal_re',
 'low_digit_re',
 'low_delayed_re',
 'low_global_re',
 'z_cerad_edu',
 'z_animal_edu',
 'z_digit_edu',
 'z_delayed_edu',
 'z_global_edu',
 'low_cerad_edu',
 'low_animal_edu',
 'low_digit_edu',
 'low_delayed_edu',
 'low_global_edu',
 'z_cerad_age',
 'z_animal_age',
 'z_digit_age',
 'z_delayed_age',
 'z_global_age',
 'low_cerad_age',
 'low_animal_age',
 'low_digit_age',
 'low_delayed_age',
 'low_global_age',
 'wtint2yr',
 'wtmec2yr',
 'sdmvpsu',
 'sdmvstra']

In [24]:
train_df = merged_df.dropna()[['pct_teeth_gt_3', 'pct_teeth_gt_4', 'pct_teeth_gt_5', 'pct_teeth_gt_6']]
y = merged_df.dropna()['low_cerad_re'].values

In [27]:
y

array([0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 0.,
       1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 1., 1., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
       0., 0., 0., 0., 0., 1., 0., 1., 1., 1., 0., 0., 0., 0., 0., 1., 0.,
       0., 0., 0., 0., 0., 1., 0., 0., 1., 1., 0., 0., 0., 0., 1., 0., 0.,
       0., 0., 0., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
       0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 1., 0., 0., 0., 1., 1., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
       0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1.,
       0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
       0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 1., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 1., 1., 0., 1., 0.,
       1., 0., 1., 0., 1., 0., 0., 0., 1., 1., 0., 0., 0., 1., 0., 0., 0.,
       0., 1., 0., 0., 0.

In [13]:
perio_df.columnsvalues

Index(['num_teeth', 'max_CAL', 'num_teeth_gt_3', 'num_teeth_gt_4',
       'num_teeth_gt_5', 'num_teeth_gt_6', 'pct_teeth_gt_3', 'pct_teeth_gt_4',
       'pct_teeth_gt_5', 'pct_teeth_gt_6'],
      dtype='object')

In [6]:
merged_df.low_cerad_re.count()

328

add single low score flag

In [7]:
for idx in merged_df.index:
    if (
        merged_df.loc[idx, 'low_animal_age'] == 1
        or merged_df.loc[idx, 'low_animal_edu'] == 1
        or merged_df.loc[idx, 'low_animal_re'] == 1
        or merged_df.loc[idx, 'low_cerad_age'] == 1
        or merged_df.loc[idx, 'low_cerad_edu'] == 1
        or merged_df.loc[idx, 'low_cerad_re'] == 1
        or merged_df.loc[idx, 'low_delayed_age'] == 1
        or merged_df.loc[idx, 'low_delayed_edu'] == 1
        or merged_df.loc[idx, 'low_delayed_re'] == 1
        or merged_df.loc[idx, 'low_digit_age'] == 1
        or merged_df.loc[idx, 'low_digit_edu'] == 1
        or merged_df.loc[idx, 'low_digit_re'] == 1
       ):
        merged_df.loc[idx, 'low_score'] = 1
    else:
        merged_df.loc[idx, 'low_score'] = 1

In [8]:
merged_df.low_score.value_counts()

1.0    837
Name: low_score, dtype: int64

In [33]:
246 + 46

292

In [12]:
lin_regr = LinearRegression()