In [1]:
import pandas as pd
import numpy as np

In [2]:
df_raw = pd.read_csv('LLCP2018.csv.gz')

In [3]:
# DIABETES and CVD
df_raw['DIABETES'] = df_raw['DIABETE3'].replace({2: 1, 3: 0, 4: 0, 7: np.nan, 9: np.nan})
df_raw['DIABETES'].value_counts(dropna=False)
df_raw['CVD'] = ((df_raw['CVDINFR4'] == 1) | (df_raw['CVDCRHD4'] == 1) | (df_raw['CVDSTRK3'] == 1)).astype(int)

In [4]:
df = df_raw.loc[:, ['DIABETES', 'CVD']]

In [5]:
df['POPULATION'] = df_raw['_LLCPWT']

In [6]:
df_prev = df.dropna()
diabetes_prevalence_overall = df_prev.loc[df_prev.DIABETES == 1]['POPULATION'].sum() / df_prev['POPULATION'].sum()
cvd_prevalence_overall = df_prev.loc[df_prev.CVD == 1]['POPULATION'].sum() / df_prev['POPULATION'].sum()

In [7]:
# AGE
# 0: 20-30 (25)
# 1: 30-40 (35)
# 2: 40-50 (45)
# 3: 50-60 (55)
# 4: 60-70 (65)
# 5: 70-80 (75)
# 6: 80-90 (85)
df['AGE'] = (df_raw['_AGEG5YR'] - 1) // 2
df['AGE'].value_counts(dropna=False)

4.0    94323
3.0    77220
5.0    69598
2.0    54429
1.0    50387
0.0    48291
6.0    43188
Name: AGE, dtype: int64

In [8]:
# BMI
# 0: Underweight (< 18.5)
# 1: Normal (18.5 <= BMI < 25)
# 2: Overweight (25 <= BMI < 30)
# 3: Obese (>= 30)
df['BMI'] = df_raw['_BMI5CAT'] - 1
df['BMI'].value_counts(dropna=False)

2.0    143878
3.0    127998
1.0    123522
NaN     35262
0.0      6776
Name: BMI, dtype: int64

In [9]:
# SLEEP: Sleep Time
# 0: <7 hr
# 1: 7-9 hr
# 2: > 9 hr
def bin_sleep(x):
    if pd.isna(x):
        return np.nan
    elif x < 7:
        return 0
    elif x > 9:
        return 2
    else:
        return 1

df['SLEEP'] = df_raw['SLEPTIM1'].replace({77: np.nan, 99: np.nan}).apply(bin_sleep)
df['SLEEP'].value_counts(dropna=False)

1.0    273866
0.0    142309
2.0     15924
NaN      5337
Name: SLEEP, dtype: int64

In [10]:
# Smoke
# 0: No
# 1: Yes
df['SMOKE'] = df_raw['_RFSMOK3'].replace({9: np.nan, 1: 0, 2: 1})
df['SMOKE'].value_counts(dropna=False)

0.0    359348
1.0     61272
NaN     16816
Name: SMOKE, dtype: int64

In [11]:
df = df.dropna()
len(df)

389851

In [12]:
def get_prevalence(df, age, bmi, sleep, smoke):
    sub_df = df.loc[
        (df.AGE == age) &
        (df.BMI == bmi) &
        (df.SLEEP == sleep) &
        (df.SMOKE == smoke)
    ]
    
    diabetes_count = len(sub_df.loc[sub_df.DIABETES == 1])
    cvd_count = len(sub_df.loc[sub_df.CVD == 1])
    
    if diabetes_count <= 10:
        diabetes_prevalence = np.nan
    else:
        diabetes_prevalence = sub_df.loc[sub_df.DIABETES == 1]['POPULATION'].sum() / sub_df['POPULATION'].sum()
    
    if cvd_count <= 10:
        cvd_prevalence = np.nan
    else:
        cvd_prevalence = sub_df.loc[sub_df.CVD == 1]['POPULATION'].sum() / sub_df['POPULATION'].sum()
        
    return diabetes_prevalence, cvd_prevalence

In [13]:
result = []
for age in range(7):
    for bmi in range(4):
        for sleep in range(3):
                for smoke in range(2):
                    diabetes, cvd = get_prevalence(df, age, bmi, sleep, smoke)
                    result.append((age, bmi, sleep, smoke, diabetes, cvd))
result_df = pd.DataFrame(
    result, 
    columns=['AGE', 'BMI', 'SLEEP', 'SMOKE', 'DIABETES_PREVALENCE', 'CVD_PREVALENCE']
)

In [14]:
result_df

Unnamed: 0,AGE,BMI,SLEEP,SMOKE,DIABETES_PREVALENCE,CVD_PREVALENCE
0,0,0,0,0,,
1,0,0,0,1,,
2,0,0,1,0,0.009777,
3,0,0,1,1,,
4,0,0,2,0,,
...,...,...,...,...,...,...
163,6,3,0,1,0.256538,0.274313
164,6,3,1,0,0.302242,0.296064
165,6,3,1,1,0.158360,0.275670
166,6,3,2,0,0.376692,0.302538


In [15]:
from sklearn.ensemble import RandomForestRegressor

In [16]:
diabetes_train_df = result_df[['AGE', 'BMI', 'SLEEP', 'SMOKE', 'DIABETES_PREVALENCE']].dropna()
X_diabs = diabetes_train_df[['AGE', 'BMI', 'SLEEP', 'SMOKE']]
y_diabs = diabetes_train_df['DIABETES_PREVALENCE']
model_diabs = RandomForestRegressor(n_estimators=100, max_depth=3)
model_diabs.fit(X_diabs, y_diabs)
model_diabs.score(X_diabs, y_diabs)

0.8327395037842262

In [17]:
cvd_train_df = result_df[['AGE', 'BMI', 'SLEEP', 'SMOKE', 'CVD_PREVALENCE']].dropna()
X_cvd = cvd_train_df[['AGE', 'BMI', 'SLEEP', 'SMOKE']]
y_cvd = cvd_train_df['CVD_PREVALENCE']
model_cvd = RandomForestRegressor(n_estimators=100, max_depth=3)
model_cvd.fit(X_cvd, y_cvd)
model_cvd.score(X_cvd, y_cvd)

0.7933990285260049

In [18]:
X_pred = result_df[['AGE', 'BMI', 'SLEEP', 'SMOKE']]
result_df['DIABETES_PREVALENCE'] = model_diabs.predict(X_pred)
result_df['CVD_PREVALENCE'] = model_cvd.predict(X_pred)

In [19]:
diabetes_prevalence_overall = df.loc[df.DIABETES == 1]['POPULATION'].sum() / df['POPULATION'].sum()
cvd_prevalence_overall = df.loc[df.CVD == 1]['POPULATION'].sum() / df['POPULATION'].sum()

In [20]:
diabetes_prevalence_overall

0.12369516493324022

In [21]:
cvd_prevalence_overall

0.09139971088876757

In [22]:
result_df['DIABETES_RISK'] = result_df['DIABETES_PREVALENCE'] / diabetes_prevalence_overall
result_df['CVD_RISK'] = result_df['CVD_PREVALENCE'] / cvd_prevalence_overall
result_df['AGE'] = (result_df['AGE'] * 10) + 25

In [23]:
result_df.to_csv('model.csv', index=False)