In [5]:
import pandas as pd

In [20]:
# Define file paths
mcq_path = "MCQ_J.xpt"               # Family history (already uploaded)
demo_path = "DEMO_J.xpt"         # Demographics (already uploaded)

# Load both datasets
mcq_df = pd.read_sas(mcq_path)
demo_df = pd.read_sas(demo_path)

In [21]:
# Mapping of original variable names to their readable labels from the codebook
readable_mcq_columns = {
    "SEQN": "Respondent_ID",
    "MCQ010": "Ever_Told_Had_Asthma",
    "MCQ025": "Age_When_First_Had_Asthma",
    "MCQ035": "Still_Have_Asthma",
    "MCQ040": "Had_Asthma_Attack_Past_Year",
    "MCQ050": "ER_Visit_For_Asthma_Past_Year",
    "AGQ030": "Had_Hay_Fever_Past_Year",
    "MCQ053": "Treated_For_Anemia_Past_3mo",
    "MCQ080": "Doctor_Said_Overweight",
    "MCQ092": "Ever_Had_Blood_Transfusion",
    "MCD093": "Year_Received_First_Transfusion",
    "MCQ149": "Menstrual_Periods_Started",
    "MCQ151": "Age_First_Menstrual_Period",
    "RHD018": "Age_Menarche_Months",
    "MCQ160A": "Doctor_Said_Arthritis",
    "MCD180A": "Age_Told_Had_Arthritis",
    "MCQ195": "Type_Of_Arthritis",
    "MCQ160N": "Doctor_Said_Gout",
    "MCD180N": "Age_Told_Had_Gout",
    "MCQ160B": "Told_Had_Heart_Failure",
    "MCD180B": "Age_Told_Heart_Failure",
    "MCQ160C": "Told_Had_Coronary_Heart_Disease",
    "MCD180C": "Age_Told_Coronary_Heart_Disease",
    "MCQ160D": "Told_Had_Angina",
    "MCD180D": "Age_Told_Had_Angina",
    "MCQ160E": "Told_Had_Heart_Attack",
    "MCD180E": "Age_Told_Had_Heart_Attack",
    "MCQ160F": "Told_Had_Stroke",
    "MCD180F": "Age_Told_Had_Stroke",
    "MCQ160M": "Told_Had_Thyroid_Problem",
    "MCQ170M": "Still_Have_Thyroid_Problem",
    "MCD180M": "Age_Told_Had_Thyroid_Problem",
    "MCQ160G": "Told_Had_Emphysema",
    "MCD180G": "Age_Told_Had_Emphysema",
    "MCQ160K": "Told_Had_Chronic_Bronchitis",
    "MCQ170K": "Still_Have_Chronic_Bronchitis",
    "MCD180K": "Age_Told_Had_Chronic_Bronchitis",
    "MCQ160O": "Told_Had_COPD",
    "MCQ160L": "Told_Had_Liver_Condition",
    "MCQ170L": "Still_Have_Liver_Condition",
    "MCD180L": "Age_Told_Had_Liver_Condition",
    "MCQ500": "Told_Had_Liver_Condition_Youth",
    "MCQ510A": "Liver_Condition_Fatty_Liver",
    "MCQ510B": "Liver_Condition_Fibrosis",
    "MCQ510C": "Liver_Condition_Cirrhosis",
    "MCQ510D": "Liver_Condition_Hepatitis",
    "MCQ510E": "Liver_Condition_Autoimmune_Hepatitis",
    "MCQ510F": "Liver_Condition_Other",
    "MCQ520": "Abdominal_Pain_Past_Year",
    "MCQ530": "Location_Of_Most_Uncomfortable_Pain",
    "MCQ540": "Seen_Doctor_For_Pain",
    "MCQ550": "Doctor_Said_Gallstones",
    "MCQ560": "Had_Gallbladder_Surgery",
    "MCQ570": "Age_First_Gallbladder_Surgery",
    "MCQ203": "Told_Had_Jaundice",
    "MCQ206": "Age_Told_Jaundice",
    "MCQ220": "Told_Had_Cancer_Or_Malignancy",
    "MCQ230A": "Type_First_Cancer",
    "MCD240A": "Age_First_Cancer_Diagnosed",
    "MCQ230B": "Type_Second_Cancer",
    "MCD240B": "Age_Second_Cancer_Diagnosed",
    "MCQ230C": "Type_Third_Cancer",
    "MCD240C": "Age_Third_Cancer_Diagnosed",
    "MCQ230D": "More_Than_Three_Cancers",
    "MCQ300B": "Relative_Had_Asthma",
    "MCQ300C": "Relative_Had_Diabetes",
    "MCQ300A": "Relative_Had_Heart_Attack",
    "MCQ366A": "Doctor_Told_To_Lose_Weight",
    "MCQ366B": "Doctor_Told_To_Exercise",
    "MCQ366C": "Doctor_Told_To_Reduce_Salt",
    "MCQ366D": "Doctor_Told_To_Reduce_Fat_Calories",
    "MCQ371A": "Currently_Losing_Weight",
    "MCQ371B": "Currently_Exercising",
    "MCQ371C": "Currently_Reducing_Salt",
    "MCQ371D": "Currently_Reducing_Fat_Calories",
    "OSQ230": "Currently_Taking_Osteoporosis_Meds"
}

# Rename columns in the MCQ dataframe
mcq_df.rename(columns=readable_mcq_columns, inplace=True)

In [22]:
# List of columns to extract from the renamed mcq_df
selected_mcq_columns = [
    "Respondent_ID",
    "Ever_Told_Had_Asthma",
    "Treated_For_Anemia_Past_3mo",
    "Doctor_Said_Overweight",
    "Ever_Had_Blood_Transfusion",
    "Doctor_Said_Arthritis",
    "Doctor_Said_Gout",
    "Told_Had_Heart_Failure",
    "Told_Had_Coronary_Heart_Disease",
    "Told_Had_Angina",
    "Told_Had_Heart_Attack",
    "Told_Had_Stroke",
    "Told_Had_Thyroid_Problem",
    "Told_Had_Emphysema",
    "Told_Had_Chronic_Bronchitis",
    "Told_Had_COPD",
    "Told_Had_Liver_Condition",
    "Told_Had_Liver_Condition_Youth",
    "Liver_Condition_Fatty_Liver",
    "Liver_Condition_Fibrosis",
    "Liver_Condition_Cirrhosis",
    "Liver_Condition_Hepatitis",
    "Liver_Condition_Autoimmune_Hepatitis",
    "Abdominal_Pain_Past_Year",
    "Location_Of_Most_Uncomfortable_Pain",
    "Doctor_Said_Gallstones",
    "Had_Gallbladder_Surgery",
    "Told_Had_Jaundice",
    "Told_Had_Cancer_Or_Malignancy",
    "Type_First_Cancer",
    "Relative_Had_Asthma",
    "Relative_Had_Diabetes",
    "Relative_Had_Heart_Attack",
    "Doctor_Told_To_Lose_Weight",
    "Doctor_Told_To_Exercise",
    "Doctor_Told_To_Reduce_Salt",
    "Doctor_Told_To_Reduce_Fat_Calories",
    "Currently_Losing_Weight",
    "Currently_Exercising",
    "Currently_Reducing_Salt",
    "Currently_Reducing_Fat_Calories",
    "Currently_Taking_Osteoporosis_Meds"
]

# Create a new DataFrame with just the selected columns
clean_mcq_df = mcq_df[selected_mcq_columns]


In [23]:
# Drop rows with NaN in target or input
clean_mcq_df = clean_mcq_df.dropna(subset=["Ever_Told_Had_Asthma"])


In [24]:
X = clean_mcq_df.drop(columns=["Ever_Told_Had_Asthma"])
y = clean_mcq_df["Ever_Told_Had_Asthma"]



In [27]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale (very important for neural nets)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [28]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping

# Clean target
clean_mcq_df = clean_mcq_df.dropna(subset=["Ever_Told_Had_Asthma"])
clean_mcq_df["Ever_Told_Had_Asthma"] = clean_mcq_df["Ever_Told_Had_Asthma"].replace({2: 0})

# Feature/target split
X = clean_mcq_df.drop(columns=["Ever_Told_Had_Asthma", "Respondent_ID"])
X = X.fillna(0)  # or better: impute
y = clean_mcq_df["Ever_Told_Had_Asthma"]

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Build model
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train
history = model.fit(
    X_train_scaled, y_train,
    validation_data=(X_test_scaled, y_test),
    epochs=50,
    batch_size=32,
    callbacks=[EarlyStopping(patience=5, restore_best_weights=True)]
)


Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m223/223[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.8205 - loss: 0.4793 - val_accuracy: 0.8500 - val_loss: 0.4186
Epoch 2/50
[1m223/223[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 844us/step - accuracy: 0.8458 - loss: 0.4082 - val_accuracy: 0.8500 - val_loss: 0.4048
Epoch 3/50
[1m223/223[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 876us/step - accuracy: 0.8511 - loss: 0.3968 - val_accuracy: 0.8511 - val_loss: 0.4145
Epoch 4/50
[1m223/223[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 825us/step - accuracy: 0.8546 - loss: 0.3852 - val_accuracy: 0.8522 - val_loss: 0.4044
Epoch 5/50
[1m223/223[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 838us/step - accuracy: 0.8455 - loss: 0.3712 - val_accuracy: 0.8545 - val_loss: 0.4096
Epoch 6/50
[1m223/223[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 832us/step - accuracy: 0.8480 - loss: 0.3570 - val_accuracy: 0.8534 - val_loss: 0.4094
Epoch 7/50
[1m223/223[0m [

In [29]:
# Save model
model.save("asthma_risk_model.keras")

# Save scaler
import joblib
joblib.dump(scaler, "asthma_scaler.pkl")


['asthma_scaler.pkl']

In [30]:
clean_mcq_df.columns.tolist()

['Respondent_ID',
 'Ever_Told_Had_Asthma',
 'Treated_For_Anemia_Past_3mo',
 'Doctor_Said_Overweight',
 'Ever_Had_Blood_Transfusion',
 'Doctor_Said_Arthritis',
 'Doctor_Said_Gout',
 'Told_Had_Heart_Failure',
 'Told_Had_Coronary_Heart_Disease',
 'Told_Had_Angina',
 'Told_Had_Heart_Attack',
 'Told_Had_Stroke',
 'Told_Had_Thyroid_Problem',
 'Told_Had_Emphysema',
 'Told_Had_Chronic_Bronchitis',
 'Told_Had_COPD',
 'Told_Had_Liver_Condition',
 'Told_Had_Liver_Condition_Youth',
 'Liver_Condition_Fatty_Liver',
 'Liver_Condition_Fibrosis',
 'Liver_Condition_Cirrhosis',
 'Liver_Condition_Hepatitis',
 'Liver_Condition_Autoimmune_Hepatitis',
 'Abdominal_Pain_Past_Year',
 'Location_Of_Most_Uncomfortable_Pain',
 'Doctor_Said_Gallstones',
 'Had_Gallbladder_Surgery',
 'Told_Had_Jaundice',
 'Told_Had_Cancer_Or_Malignancy',
 'Type_First_Cancer',
 'Relative_Had_Asthma',
 'Relative_Had_Diabetes',
 'Relative_Had_Heart_Attack',
 'Doctor_Told_To_Lose_Weight',
 'Doctor_Told_To_Exercise',
 'Doctor_Told_To_Reduc