Import libraries

In [3]:
import warnings
import os
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import r2_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import KFold

import statsmodels.api as sm
from statsmodels.regression.linear_model import OLS

from pandas.core import datetools
%matplotlib inline



Initialize data structures

In [4]:
# Load the ADNI data dictionary (DATADIC.csv) into a DataFrame.
# define_terms() below looks fields up in it via the FLDNAME column and
# reports the matching TYPE, TEXT, and CODE entries.
adni_dict_df = pd.read_csv("study info/DATADIC.csv")

In [6]:
# Load the medical-history table from Medical_History/INITHEALTH.csv
# and preview its first rows.
init_health_df = pd.read_csv("Medical_History/INITHEALTH.csv")
init_health_df.head()

Unnamed: 0,Phase,ID,RID,SITEID,VISCODE,USERDATE,USERDATE2,IHNUM,IHSYMPTOM,IHDESC,IHSURG,IHSURGDATE,IHPRESENT,IHCHRON,IHSEVER,IHDTONSET,IHONGOING,IHCEASE,IHCOMM,update_stamp
0,ADNI3,415,74,20,init,2017-01-24,2017-01-24,12,18,bowel resection,1,1975-07-02,,,,,,,,2018-07-15 22:56:34.0
1,ADNI3,5898,679,28,init,2017-06-19,2017-06-19,2,17,Skin cancer on forehead,0,,0.0,,,2002-07-02,0.0,2002-07-02,,2018-07-15 22:56:34.0
2,ADNI3,8035,677,23,init,2017-07-19,2017-07-19,14,8,osteoarthritis - right hand,0,,1.0,3.0,1.0,2017-04-15,1.0,,,2018-07-15 22:56:34.0
3,ADNI3,4169,626,16,init,2017-05-25,2017-05-25,28,18,shingles,0,,0.0,,,2001-04-11,0.0,2001-07-02,,2018-07-15 22:56:34.0
4,ADNI3,9189,6058,59,sc,2017-08-01,2017-08-01,8,12,Prolapsed bladder,0,,1.0,3.0,1.0,2007-07-02,1.0,,,2018-07-15 22:56:38.0


In [90]:
def define_terms(data_df, dict_df, columns=None):
    """Print the dictionary type, description, and code for each column.

    For every column name, the rows of ``dict_df`` whose ``FLDNAME`` equals
    that name are selected, and the first distinct value of each of the
    ``TYPE``, ``TEXT``, and ``CODE`` columns is printed. A field with no
    matching dictionary row is reported as NaN.

    Args:
        data_df: DataFrame whose column names are looked up when
            ``columns`` is not given.
        dict_df: Data-dictionary DataFrame; it is accessed through its
            ``FLDNAME``, ``TYPE``, ``TEXT``, and ``CODE`` columns.
        columns: Optional iterable of column names to describe. Defaults
            to all columns of ``data_df``.

    Returns:
        None. The summary is written to standard output.
    """
    fields = ["TYPE", "TEXT", "CODE"]
    names = data_df.columns if columns is None else columns

    for name in names:
        # Dictionary rows describing this column, restricted to the fields
        # that get reported.
        rows = dict_df.loc[dict_df.FLDNAME == name][fields]

        found = []
        for field in fields:
            distinct = rows[field].unique()
            # First distinct entry, or NaN when there is no entry at all.
            found.append(distinct[0] if distinct.shape[0] else float('nan'))

        print("Name: %s,\nType: %s,\nDesc: %s,\nCode:%s\n" % (name, *found))
        

In [None]:
def paths_with_ext(directory=None, extension=".csv"):
    """Collect the paths of all files under a directory with a given suffix.

    Walks ``directory`` recursively with ``os.walk`` and keeps every file
    whose name ends with ``extension``.

    Args:
        directory: Root directory to search. Defaults to the current
            working directory when ``None``.
        extension: Filename suffix to match. It is compared with
            ``str.endswith``, so the match is case-sensitive.

    Returns:
        A list of matching paths, each formed by joining the directory
        being walked with the filename, in ``os.walk`` traversal order.
        The list is empty when nothing matches.
    """
    if directory is None:
        directory = os.getcwd()

    # Build a fresh list on every call so results never depend on (or leak
    # into) a name defined outside this function.
    matches = []
    for root, _dirnames, filenames in os.walk(directory):
        for filename in filenames:
            if filename.endswith(extension):
                matches.append(os.path.join(root, filename))

    return matches

In [91]:
# Print the data-dictionary type, description, and code for every column
# of init_health_df.
define_terms(init_health_df, adni_dict_df)

Name: Phase,
Type: nan,
Desc: nan,
Code:nan

Name: ID,
Type: N,
Desc: Record ID,
Code:"crfname","","indexes","adni_aal_idx=TBLID,FLDID,TRANSNID","notnullcols","ID,USERID,USERDATE,USERNUM,TBLID,FLDID,TRANSNID,REASONID,CHANGE"

Name: RID,
Type: N,
Desc: Participant roster ID,
Code:nan

Name: SITEID,
Type: N,
Desc: Site ID,
Code:nan

Name: VISCODE,
Type: T,
Desc: Visit code,
Code:nan

Name: USERDATE,
Type: S,
Desc: Date record created,
Code:nan

Name: USERDATE2,
Type: S,
Desc: Date record last updated,
Code:nan

Name: IHNUM,
Type: N,
Desc: Condition Number,
Code:nan

Name: IHSYMPTOM,
Type: N,
Desc: System/Category,
Code:1=Psychiatric;2=Neurologic (other than Cognitive Disorder);3=Head, Eyes, Ears, Nose, Throat;4=Cardiovascular;5=Respiratory;6=Hepatic;7=Dermatologic-Connective Tissue;8=Musculoskeletal;9=Endocrine-Metabolic;10=Gastrointestinal;11=Hematopoietic-Lymphatic;12=Renal-Genitourinary;13=Allergies or Drug Sensitivities;14=Smoking, Alcohol Use, and/or Drug Use;17=Malignancy;18=Other;