In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [2]:
demo=pd.read_csv('demographic.csv')
diet=pd.read_csv('diet.csv')
exam=pd.read_csv('examination.csv')
labs=pd.read_csv('labs.csv')
medi=pd.read_csv('medications.csv',encoding='latin1')
ques=pd.read_csv('questionnaire.csv')

In [3]:
#merge datasets
df = demo.join(diet.set_index('SEQN'), on='SEQN', how='inner')
df = df.join(exam.set_index('SEQN'), on='SEQN', how='inner')
df = df.join(labs.set_index('SEQN'), on='SEQN', how='inner')
#df = df.join(medi.set_index('SEQN'), on='SEQN', how='inner')
df = df.join(ques.set_index('SEQN'), on='SEQN', how='inner')
df=df.dropna(axis=1,how='all')
df=df.dropna(axis=0,how='all')

In [4]:
#select features
df=df.loc[:,['SEQN', 'RIAGENDR', 'INDFMPIR', 'LBXGH', 'DBD100', 'DMDEDUC2', 'RIDAGEYR', 'BMXBMI', 'BMDAVSAD', 'MGDCGSZ']]
df.columns = ['SEQN','Gender','Family_income','LBXGH','Salt_level','Education_level','Age','BMI','Abdominal_size','Grip_strength']
df=df.dropna(axis=1,how='all')
df=df.dropna(axis=0,how='all')
df.shape

(9813, 10)

In [5]:
#deal with nan value
df = df[df['Family_income'].notnull()]
df['LBXGH']=df['LBXGH'].fillna(df['LBXGH'].mean())
df['Salt_level']=df['Salt_level'].fillna(df['Salt_level'].median())
df['Education_level']=df['Education_level'].fillna(df['Education_level'].median())
df['BMI']=df['BMI'].fillna(df['BMI'].mean())
df['Abdominal_size']=df['Abdominal_size'].fillna(df['Abdominal_size'].mean())
df['Grip_strength']=df['Grip_strength'].fillna(df['Grip_strength'].mean())
df.describe()

Unnamed: 0,SEQN,Gender,Family_income,LBXGH,Salt_level,Education_level,Age,BMI,Abdominal_size,Grip_strength
count,9083.0,9083.0,9083.0,9083.0,9083.0,9083.0,9083.0,9083.0,9083.0,9083.0
mean,78641.697016,1.508092,2.24893,5.643572,1.361775,3.742816,31.524386,25.670061,21.116572,63.026197
std,2935.838385,0.499962,1.629325,0.832025,0.684011,0.946209,24.381378,7.700796,4.288664,22.280017
min,73557.0,1.0,0.0,3.5,1.0,1.0,0.0,12.1,10.1,8.0
25%,76101.5,1.0,0.88,5.3,1.0,4.0,10.0,20.1,18.5,50.3
50%,78634.0,2.0,1.7,5.643572,1.0,4.0,26.0,25.6,21.116572,63.026197
75%,81174.5,2.0,3.58,5.643572,2.0,4.0,52.0,29.6,22.9,73.3
max,83731.0,2.0,5.0,17.5,9.0,9.0,80.0,82.9,40.1,162.8


In [6]:
#target variable
df.loc[df['LBXGH']<6.0, 'Diabetes']=0
df.loc[df['LBXGH']>=6.0, 'Diabetes']=1
df=df.drop('LBXGH',axis=1)
df=df.astype({'Salt_level':'int64','Education_level':'int64','Diabetes':'int64'})
df.head()

Unnamed: 0,SEQN,Gender,Family_income,Salt_level,Education_level,Age,BMI,Abdominal_size,Grip_strength,Diabetes
0,73557,1,0.84,2,3,69,26.7,20.6,55.2,1
1,73558,1,1.78,2,3,54,28.6,24.4,61.5,1
2,73559,1,4.51,1,4,72,28.9,25.6,91.0,1
3,73560,1,2.52,1,4,9,17.1,14.9,32.2,0
4,73561,2,5.0,1,5,73,19.7,21.116572,30.9,0


In [13]:
#split train-test
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
X=df.drop('Diabetes',axis=1)
y=df['Diabetes']
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=123)
#X_val, X_test, y_val, y_test=train_test_split(X_test, y_test, test_size=0.5)
print(X_train.shape,X_test.shape)

(7266, 9) (1817, 9)


In [226]:
#downsample training set
#from sklearn.utils import resample
#train=pd.concat([X_train,y_train],axis=1)
#major=train[train['Diabetes']==0]
#minor=train[train['Diabetes']==1]
#major_down=resample(major, replace=False, n_samples=minor.shape[0], random_state=123) 
#train_down=pd.concat([major_down,minor])
#X_train=train_down.drop('Diabetes',axis=1)
#y_train=train_down['Diabetes']

(1746, 10)

In [14]:
#lr
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
lr=LogisticRegression()
params={'class_weight':[{1:3},{1:3.5},{1:4},{1:4.5},{1:10},'balanced'], 'C':[0.5,0.8,1,2,5,10], 'tol':[1e-4,1e-6]}
gs=GridSearchCV(lr, params,'f1')
gs.fit(X_train,y_train)
clf=gs.best_estimator_
y_pred = clf.predict(X_test)
mean_accu=clf.score(X_test,y_test)
f1=f1_score(y_test,y_pred)
print(mean_accu, f1)

0.8365437534397359 0.5308056872037914


In [16]:
#random forest
from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier()
params={'class_weight':[{1:3},{1:3.5},{1:4},{1:4.5},{1:10},'balanced','balanced_subsample'],'n_estimators':[25,50,100],'max_depth':[2, 5,10,20,50,100]}
gs=GridSearchCV(rf, params,'f1')
gs.fit(X_train,y_train)
clf=gs.best_estimator_
y_pred = clf.predict(X_test)
mean_accu=clf.score(X_test,y_test)
f1=f1_score(y_test,y_pred)
print(mean_accu, f1)

0.8458998348926803 0.5469255663430421


In [17]:
#save random forest
import pickle
model=gs.best_estimator_
filename = 'trained_randomforest.pkl'
pickle.dump(model, open(filename, 'wb'))