# A Bayesian Network for Health Risk Analysis Using NHANES Data

In [1]:
import pandas as pd

# Load dataset
### NHANES 2017-March 2020 Pre-pandemic
https://wwwn.cdc.gov/nchs/nhanes/continuousnhanes/default.aspx?Cycle=2017-2020 \
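The NHANES program suspended field operations in March 2020 due to the coronavirus disease 2019 (COVID-19) pandemic, so the partial 2019-March 2020 data were combined with the 2017-2018 cycle to form this pre-pandemic release.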

In [2]:
# Read every NHANES component file (.xpt) into its own DataFrame.
# The generator is unpacked positionally, so the order of the paths below
# must match the order of the names on the left-hand side.
demo, bmx, alq, bpq, diq, smq, paq = (
    pd.read_sas(xpt_path)
    for xpt_path in (
        "./dataset/P_DEMO.xpt",
        "./dataset/P_BMX.xpt",
        "./dataset/P_ALQ.xpt",
        "./dataset/P_BPQ.xpt",
        "./dataset/P_DIQ.xpt",
        "./dataset/P_SMQ.xpt",
        "./dataset/P_PAQ.xpt",
    )
)

# List of attributes
* https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2017/DataFiles/P_DEMO.htm
  * **RIDAGEYR**: Age in years, at the time of the screening interview
  * **RIAGENDR**: Gender of the participant
      * Code: 1: Male; 2: Female; .: Missing
  * **RIDRETH3**: Recode of reported race and Hispanic origin information, with Non-Hispanic Asian Category
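      * Code: 1: Mexican American; 2: Other Hispanic; 3: Non-Hispanic White; 4: Non-Hispanic Black; 6: Non-Hispanic Asian; 7: Other Race - Including Multi-Racial; .: Missing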
  * **DMDEDUC2**: Education level - Adults 20+
  * **INDFMPIR**: Ratio of family income to poverty
* https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2017/DataFiles/P_BMX.htm
    * **BMXBMI**: Body Mass Index (kg/m²)
    * **BMXWAIST**: Waist Circumference (cm)
<br/>
* https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2017/DataFiles/P_ALQ.htm
    * **ALQ111**: Ever had a drink of any kind of alcohol
    * **ALQ130**: Avg # alcoholic drinks/day - past 12 mos
<br/>
* https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2017/DataFiles/P_BPQ.htm
    * **BPQ020**: Ever told you had high blood pressure
    * **BPQ050A**: Now taking prescribed medicine for HBP
<br/>
* https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2017/DataFiles/P_DIQ.htm
    * **DIQ010**: Doctor told you have diabetes
<br/>
* https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2017/DataFiles/P_SMQ.htm
    * **SMQ020**: Smoked at least 100 cigarettes in life
    * **SMQ040**: {Do you/Does SP} now smoke cigarettes?
<br/>
* https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2017/DataFiles/P_PAQ.htm
    * **PAQ605**: Vigorous work activity
    * **PAQ620**: Moderate work activity

In [3]:
# For each source table, list the columns to keep and take that subset
# right away. Every list starts with "SEQN", the column the tables are
# joined on in the merge step.

# Demographics
demo_cols = [
    "SEQN",
    "RIDAGEYR",
    "RIAGENDR",
    "RIDRETH3",
    "DMDEDUC2",
    "INDFMPIR",
]
demo_sel = demo[demo_cols]

# Body measures
bmx_cols = ["SEQN", "BMXBMI", "BMXWAIST"]
bmx_sel = bmx[bmx_cols]

# Alcohol use
alq_cols = ["SEQN", "ALQ111", "ALQ130"]
alq_sel = alq[alq_cols]

# Blood pressure
bpq_cols = ["SEQN", "BPQ020", "BPQ050A"]
bpq_sel = bpq[bpq_cols]

# Diabetes
diq_cols = ["SEQN", "DIQ010"]
diq_sel = diq[diq_cols]

# Smoking
smq_cols = ["SEQN", "SMQ020", "SMQ040"]
smq_sel = smq[smq_cols]

# Physical activity
paq_cols = ["SEQN", "PAQ605", "PAQ620"]
paq_sel = paq[paq_cols]

In [4]:
# Join all selected tables on SEQN in a single chain. Inner joins keep only
# the rows whose SEQN appears in every one of the seven tables.
merged = (
    demo_sel
    .merge(bmx_sel, on="SEQN", how="inner")
    .merge(alq_sel, on="SEQN", how="inner")
    .merge(bpq_sel, on="SEQN", how="inner")
    .merge(diq_sel, on="SEQN", how="inner")
    .merge(smq_sel, on="SEQN", how="inner")
    .merge(paq_sel, on="SEQN", how="inner")
)

### Explore structure of the dataset

In [5]:
# Preview the first rows of the merged table to check that the columns
# selected from every source file made it through the joins.
merged.head()

Unnamed: 0,SEQN,RIDAGEYR,RIAGENDR,RIDRETH3,DMDEDUC2,INDFMPIR,BMXBMI,BMXWAIST,ALQ111,ALQ130,BPQ020,BPQ050A,DIQ010,SMQ020,SMQ040,PAQ605,PAQ620
0,109266.0,29.0,2.0,6.0,5.0,5.0,37.8,117.9,1.0,1.0,2.0,,2.0,2.0,,2.0,2.0
1,109271.0,49.0,1.0,3.0,2.0,,29.7,120.4,1.0,,2.0,,2.0,1.0,1.0,2.0,1.0
2,109273.0,36.0,1.0,3.0,4.0,0.83,21.9,86.8,1.0,,2.0,,2.0,1.0,1.0,1.0,2.0
3,109274.0,68.0,1.0,7.0,4.0,1.2,30.2,109.6,1.0,2.0,1.0,1.0,1.0,2.0,,1.0,1.0
4,109282.0,76.0,1.0,3.0,5.0,3.61,26.6,,1.0,,1.0,1.0,2.0,1.0,3.0,2.0,2.0


In [6]:
# Summary statistics (count, mean, std, quartiles, min/max) for every column.
# NOTE(review): in the recorded output the maxima of 9 (e.g. BPQ020, DIQ010,
# SMQ020, PAQ605, PAQ620, DMDEDUC2) and 999 (ALQ130) look like questionnaire
# "refused / don't know" sentinel codes rather than real values; confirm
# against the NHANES codebooks and recode them before modelling.
# NOTE(review): the INDFMPIR minimum of 5.397605e-79 is presumably how a zero
# comes out of the .xpt file; verify and round to 0 if so.
merged.describe()

Unnamed: 0,SEQN,RIDAGEYR,RIAGENDR,RIDRETH3,DMDEDUC2,INDFMPIR,BMXBMI,BMXWAIST,ALQ111,ALQ130,BPQ020,BPQ050A,DIQ010,SMQ020,SMQ040,PAQ605,PAQ620
count,8965.0,8965.0,8965.0,8965.0,8544.0,7705.0,8790.0,8449.0,8370.0,5863.0,8965.0,3032.0,8965.0,8965.0,3596.0,8965.0,8965.0
mean,117107.849637,49.465142,1.513999,3.484551,3.554775,2.55874,29.883413,100.397574,1.103584,4.187958,1.639264,1.148417,1.883882,1.601562,2.225528,1.75449,1.572337
std,4501.048325,18.457564,0.499832,1.574461,1.210994,1.627623,7.603916,17.349142,0.304739,40.354534,0.552809,0.355571,0.435757,0.508822,0.921705,0.475485,0.537145
min,109266.0,18.0,1.0,1.0,1.0,5.397605e-79,14.2,56.4,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,113211.0,33.0,1.0,3.0,3.0,1.17,24.7,88.1,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0
50%,117091.0,50.0,2.0,3.0,4.0,2.16,28.6,99.0,1.0,2.0,2.0,1.0,2.0,2.0,3.0,2.0,2.0
75%,121022.0,64.0,2.0,4.0,4.0,4.14,33.6,111.0,1.0,3.0,2.0,1.0,2.0,2.0,3.0,2.0,2.0
max,124822.0,80.0,2.0,7.0,9.0,5.0,92.3,187.5,2.0,999.0,9.0,2.0,9.0,9.0,3.0,9.0,9.0
