In [43]:
import pandas as pd

In [44]:
# Define file paths
mcq_path = "MCQ_J.xpt"   # Family history (already uploaded)        # Demographics (already uploaded)
demo_path = "DEMO_J.xpt" #demographics


# Load both datasets
mcq_df = pd.read_sas(mcq_path)
demo_df = pd.read_sas(demo_path)

In [45]:
readable_columns = {
    "SEQN": "Respondent_ID",
    "SDDSRVYR": "Survey_Cycle",
    "RIDSTATR": "Interview_Exam_Status",
    "RIAGENDR": "Gender",
    "RIDAGEYR": "Age_Years",
    "RIDAGEMN": "Age_Months_Screening",
    "RIDRETH1": "Race_Ethnicity",
    "RIDRETH3": "Race_Ethnicity_Asian",
    "RIDEXMON": "Exam_Period",
    "RIDEXAGM": "Age_Months_Exam",
    "DMQMILIZ": "Military_Service",
    "DMQADFC": "Served_Abroad",
    "DMDBORN4": "Country_of_Birth",
    "DMDCITZN": "Citizenship_Status",
    "DMDYRSUS": "Years_in_US",
    "DMDEDUC3": "Education_Level_6_19",
    "DMDEDUC2": "Education_Level_20+",
    "DMDMARTL": "Marital_Status",
    "RIDEXPRG": "Pregnancy_Status",
    "SIALANG": "SP_Interview_Language",
    "SIAPROXY": "SP_Interview_Proxy",
    "SIAINTRP": "SP_Interview_Interpreter",
    "FIALANG": "Family_Interview_Language",
    "FIAPROXY": "Family_Interview_Proxy",
    "FIAINTRP": "Family_Interview_Interpreter",
    "MIALANG": "MEC_Interview_Language",
    "MIAPROXY": "MEC_Interview_Proxy",
    "MIAINTRP": "MEC_Interview_Interpreter",
    "AIALANGA": "ACASI_Interview_Language",
    "DMDHHSIZ": "Household_Size",
    "DMDFMSIZ": "Family_Size",
    "DMDHHSZA": "Children_0_5_in_HH",
    "DMDHHSZB": "Children_6_17_in_HH",
    "DMDHHSZE": "Adults_60plus_in_HH",
    "DMDHRGND": "HH_Ref_Person_Gender",
    "DMDHRAGZ": "HH_Ref_Person_Age",
    "DMDHREDZ": "HH_Ref_Person_Education",
    "DMDHRMAZ": "HH_Ref_Person_Marital_Status",
    "DMDHSEDZ": "HH_Ref_Person_Spouse_Education",
    "WTINT2YR": "Interview_Weight",
    "WTMEC2YR": "Exam_Weight",
    "SDMVPSU": "Masked_PSU",
    "SDMVSTRA": "Masked_Stratum",
    "INDHHIN2": "Annual_Household_Income",
    "INDFMIN2": "Annual_Family_Income",
    "INDFMPIR": "Income_Poverty_Ratio"
}

# Rename columns in the DataFrame
demo_df.rename(columns=readable_columns, inplace=True)


In [46]:
# Select specific columns after renaming
selected_columns = [
    "Respondent_ID",
    "Gender",
    "Age_Years",
]

# Create a new DataFrame with only those columns
clean_demo_df = demo_df[selected_columns]


In [47]:
# Mapping of original variable names to their readable labels from the codebook
readable_mcq_columns = {
    "SEQN": "Respondent_ID",
    "MCQ010": "Ever_Told_Had_Asthma",
    "MCQ025": "Age_When_First_Had_Asthma",
    "MCQ035": "Still_Have_Asthma",
    "MCQ040": "Had_Asthma_Attack_Past_Year",
    "MCQ050": "ER_Visit_For_Asthma_Past_Year",
    "AGQ030": "Had_Hay_Fever_Past_Year",
    "MCQ053": "Treated_For_Anemia_Past_3mo",
    "MCQ080": "Doctor_Said_Overweight",
    "MCQ092": "Ever_Had_Blood_Transfusion",
    "MCD093": "Year_Received_First_Transfusion",
    "MCQ149": "Menstrual_Periods_Started",
    "MCQ151": "Age_First_Menstrual_Period",
    "RHD018": "Age_Menarche_Months",
    "MCQ160A": "Doctor_Said_Arthritis",
    "MCD180A": "Age_Told_Had_Arthritis",
    "MCQ195": "Type_Of_Arthritis",
    "MCQ160N": "Doctor_Said_Gout",
    "MCD180N": "Age_Told_Had_Gout",
    "MCQ160B": "Told_Had_Heart_Failure",
    "MCD180B": "Age_Told_Heart_Failure",
    "MCQ160C": "Told_Had_Coronary_Heart_Disease",
    "MCD180C": "Age_Told_Coronary_Heart_Disease",
    "MCQ160D": "Told_Had_Angina",
    "MCD180D": "Age_Told_Had_Angina",
    "MCQ160E": "Told_Had_Heart_Attack",
    "MCD180E": "Age_Told_Had_Heart_Attack",
    "MCQ160F": "Told_Had_Stroke",
    "MCD180F": "Age_Told_Had_Stroke",
    "MCQ160M": "Told_Had_Thyroid_Problem",
    "MCQ170M": "Still_Have_Thyroid_Problem",
    "MCD180M": "Age_Told_Had_Thyroid_Problem",
    "MCQ160G": "Told_Had_Emphysema",
    "MCD180G": "Age_Told_Had_Emphysema",
    "MCQ160K": "Told_Had_Chronic_Bronchitis",
    "MCQ170K": "Still_Have_Chronic_Bronchitis",
    "MCD180K": "Age_Told_Had_Chronic_Bronchitis",
    "MCQ160O": "Told_Had_COPD",
    "MCQ160L": "Told_Had_Liver_Condition",
    "MCQ170L": "Still_Have_Liver_Condition",
    "MCD180L": "Age_Told_Had_Liver_Condition",
    "MCQ500": "Told_Had_Liver_Condition_Youth",
    "MCQ510A": "Liver_Condition_Fatty_Liver",
    "MCQ510B": "Liver_Condition_Fibrosis",
    "MCQ510C": "Liver_Condition_Cirrhosis",
    "MCQ510D": "Liver_Condition_Hepatitis",
    "MCQ510E": "Liver_Condition_Autoimmune_Hepatitis",
    "MCQ510F": "Liver_Condition_Other",
    "MCQ520": "Abdominal_Pain_Past_Year",
    "MCQ530": "Location_Of_Most_Uncomfortable_Pain",
    "MCQ540": "Seen_Doctor_For_Pain",
    "MCQ550": "Doctor_Said_Gallstones",
    "MCQ560": "Had_Gallbladder_Surgery",
    "MCQ570": "Age_First_Gallbladder_Surgery",
    "MCQ203": "Told_Had_Jaundice",
    "MCQ206": "Age_Told_Jaundice",
    "MCQ220": "Told_Had_Cancer_Or_Malignancy",
    "MCQ230A": "Type_First_Cancer",
    "MCD240A": "Age_First_Cancer_Diagnosed",
    "MCQ230B": "Type_Second_Cancer",
    "MCD240B": "Age_Second_Cancer_Diagnosed",
    "MCQ230C": "Type_Third_Cancer",
    "MCD240C": "Age_Third_Cancer_Diagnosed",
    "MCQ230D": "More_Than_Three_Cancers",
    "MCQ300B": "Relative_Had_Asthma",
    "MCQ300C": "Relative_Had_Diabetes",
    "MCQ300A": "Relative_Had_Heart_Attack",
    "MCQ366A": "Doctor_Told_To_Lose_Weight",
    "MCQ366B": "Doctor_Told_To_Exercise",
    "MCQ366C": "Doctor_Told_To_Reduce_Salt",
    "MCQ366D": "Doctor_Told_To_Reduce_Fat_Calories",
    "MCQ371A": "Currently_Losing_Weight",
    "MCQ371B": "Currently_Exercising",
    "MCQ371C": "Currently_Reducing_Salt",
    "MCQ371D": "Currently_Reducing_Fat_Calories",
    "OSQ230": "Currently_Taking_Osteoporosis_Meds"
}

# Rename columns in the MCQ dataframe
mcq_df.rename(columns=readable_mcq_columns, inplace=True)

In [48]:
# List of columns to extract from the renamed mcq_df
selected_mcq_columns = [
    "Respondent_ID",
    "Ever_Told_Had_Asthma",
    "Treated_For_Anemia_Past_3mo",
    "Doctor_Said_Overweight",
    "Ever_Had_Blood_Transfusion",
    "Doctor_Said_Arthritis",
    "Doctor_Said_Gout",
    "Told_Had_Heart_Failure",
    "Told_Had_Coronary_Heart_Disease",
    "Told_Had_Angina",
    "Told_Had_Heart_Attack",
    "Told_Had_Stroke",
    "Told_Had_Thyroid_Problem",
    "Told_Had_Emphysema",
    "Told_Had_Chronic_Bronchitis",
    "Told_Had_COPD",
    "Told_Had_Liver_Condition",
    "Told_Had_Liver_Condition_Youth",
    "Liver_Condition_Fatty_Liver",
    "Liver_Condition_Fibrosis",
    "Liver_Condition_Cirrhosis",
    "Liver_Condition_Hepatitis",
    "Liver_Condition_Autoimmune_Hepatitis",
    "Abdominal_Pain_Past_Year",
    "Location_Of_Most_Uncomfortable_Pain",
    "Doctor_Said_Gallstones",
    "Had_Gallbladder_Surgery",
    "Told_Had_Jaundice",
    "Told_Had_Cancer_Or_Malignancy",
    "Type_First_Cancer",
    "Relative_Had_Asthma",
    "Relative_Had_Diabetes",
    "Relative_Had_Heart_Attack",
    "Doctor_Told_To_Lose_Weight",
    "Doctor_Told_To_Exercise",
    "Doctor_Told_To_Reduce_Salt",
    "Doctor_Told_To_Reduce_Fat_Calories",
    "Currently_Losing_Weight",
    "Currently_Exercising",
    "Currently_Reducing_Salt",
    "Currently_Reducing_Fat_Calories",
    "Currently_Taking_Osteoporosis_Meds"
]

# Create a new DataFrame with just the selected columns
clean_mcq_df = mcq_df[selected_mcq_columns]


In [49]:
merged = pd.merge(clean_demo_df,clean_mcq_df, on = "Respondent_ID", how = "inner")
merged.head()

Unnamed: 0,Respondent_ID,Gender,Age_Years,Ever_Told_Had_Asthma,Treated_For_Anemia_Past_3mo,Doctor_Said_Overweight,Ever_Had_Blood_Transfusion,Doctor_Said_Arthritis,Doctor_Said_Gout,Told_Had_Heart_Failure,...,Relative_Had_Heart_Attack,Doctor_Told_To_Lose_Weight,Doctor_Told_To_Exercise,Doctor_Told_To_Reduce_Salt,Doctor_Told_To_Reduce_Fat_Calories,Currently_Losing_Weight,Currently_Exercising,Currently_Reducing_Salt,Currently_Reducing_Fat_Calories,Currently_Taking_Osteoporosis_Meds
0,93703.0,2.0,2.0,2.0,2.0,,,,,,...,,,,,,,,,,
1,93704.0,1.0,2.0,2.0,2.0,,,,,,...,,,,,,,,,,
2,93705.0,2.0,66.0,1.0,2.0,2.0,2.0,1.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,2.0
3,93706.0,1.0,18.0,2.0,2.0,2.0,2.0,,,,...,,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,
4,93707.0,1.0,13.0,2.0,2.0,,2.0,,,,...,,,,,,,,,,


In [50]:
# Drop rows with NaN in target or input
merged = merged.dropna(subset=["Ever_Told_Had_Asthma"])


In [51]:
X = merged.drop(columns=["Ever_Told_Had_Asthma"])
y = merged["Ever_Told_Had_Asthma"]



In [52]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale (very important for neural nets)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [53]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping

# Clean target
clean_mcq_df = clean_mcq_df.dropna(subset=["Ever_Told_Had_Asthma"])
clean_mcq_df["Ever_Told_Had_Asthma"] = clean_mcq_df["Ever_Told_Had_Asthma"].replace({2: 0})

# Feature/target split
X = clean_mcq_df.drop(columns=["Ever_Told_Had_Asthma", "Respondent_ID"])
X = X.fillna(0)  # or better: impute
y = clean_mcq_df["Ever_Told_Had_Asthma"]

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Build model
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train
history = model.fit(
    X_train_scaled, y_train,
    validation_data=(X_test_scaled, y_test),
    epochs=50,
    batch_size=32,
    callbacks=[EarlyStopping(patience=5, restore_best_weights=True)]
)


Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m223/223[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.7999 - loss: 0.4973 - val_accuracy: 0.8506 - val_loss: 0.4195
Epoch 2/50
[1m223/223[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 850us/step - accuracy: 0.8552 - loss: 0.3819 - val_accuracy: 0.8478 - val_loss: 0.4159
Epoch 3/50
[1m223/223[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 834us/step - accuracy: 0.8491 - loss: 0.4022 - val_accuracy: 0.8506 - val_loss: 0.4174
Epoch 4/50
[1m223/223[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 836us/step - accuracy: 0.8476 - loss: 0.3924 - val_accuracy: 0.8506 - val_loss: 0.4148
Epoch 5/50
[1m223/223[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 827us/step - accuracy: 0.8433 - loss: 0.3721 - val_accuracy: 0.8449 - val_loss: 0.4147
Epoch 6/50
[1m223/223[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 827us/step - accuracy: 0.8538 - loss: 0.3732 - val_accuracy: 0.8483 - val_loss: 0.4218
Epoch 7/50
[1m223/223[0m [

In [54]:
# Save model
model.save("asthma_risk_model.keras")

# Save scaler
import joblib
joblib.dump(scaler, "asthma_scaler.pkl")


['asthma_scaler.pkl']

In [55]:
clean_mcq_df.columns.tolist()

['Respondent_ID',
 'Ever_Told_Had_Asthma',
 'Treated_For_Anemia_Past_3mo',
 'Doctor_Said_Overweight',
 'Ever_Had_Blood_Transfusion',
 'Doctor_Said_Arthritis',
 'Doctor_Said_Gout',
 'Told_Had_Heart_Failure',
 'Told_Had_Coronary_Heart_Disease',
 'Told_Had_Angina',
 'Told_Had_Heart_Attack',
 'Told_Had_Stroke',
 'Told_Had_Thyroid_Problem',
 'Told_Had_Emphysema',
 'Told_Had_Chronic_Bronchitis',
 'Told_Had_COPD',
 'Told_Had_Liver_Condition',
 'Told_Had_Liver_Condition_Youth',
 'Liver_Condition_Fatty_Liver',
 'Liver_Condition_Fibrosis',
 'Liver_Condition_Cirrhosis',
 'Liver_Condition_Hepatitis',
 'Liver_Condition_Autoimmune_Hepatitis',
 'Abdominal_Pain_Past_Year',
 'Location_Of_Most_Uncomfortable_Pain',
 'Doctor_Said_Gallstones',
 'Had_Gallbladder_Surgery',
 'Told_Had_Jaundice',
 'Told_Had_Cancer_Or_Malignancy',
 'Type_First_Cancer',
 'Relative_Had_Asthma',
 'Relative_Had_Diabetes',
 'Relative_Had_Heart_Attack',
 'Doctor_Told_To_Lose_Weight',
 'Doctor_Told_To_Exercise',
 'Doctor_Told_To_Reduc