In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_predict

In [None]:
# Read the three raw CSVs (training features, training labels, test features)
# from the current working directory, in that order.
train_features, train_labels, test_features = map(
    pd.read_csv,
    [
        'training_set_features.csv',
        'training_set_labels.csv',
        'test_set_features.csv',
    ],
)

In [None]:
# Attach the label columns to the training features, matching rows on
# respondent_id (default inner join).
train_data = train_features.merge(train_labels, on='respondent_id')

In [None]:
# Impute missing values with the per-column mode learned from the training
# data, and reuse those same fill values for the test set so both frames are
# preprocessed identically. (Filling the test set with its own modes would
# make the encoding of "missing" depend on which file a row came from.)
# Keys of `imputation_values` that are not columns of the test frame
# (e.g. the label columns) are simply not used there.
# fillna returns new frames instead of mutating in place, so re-running this
# cell is harmless.
imputation_values = train_data.mode().iloc[0]
train_data = train_data.fillna(imputation_values)
test_features = test_features.fillna(imputation_values)

In [None]:
# Columns that are one-hot encoded before modelling.
categorical_features = [
    'age_group',
    'education',
    'race',
    'sex',
    'income_poverty',
    'marital_status',
    'rent_or_own',
    'employment_status',
    'hhs_geo_region',
    'census_msa',
    'employment_industry',
    'employment_occupation',
]

In [None]:
# One-hot encode the categorical columns of each frame independently.
# The two results may end up with different dummy columns; nothing in this
# cell reconciles them.
train_data, test_features = (
    pd.get_dummies(frame, columns=categorical_features)
    for frame in (train_data, test_features)
)

In [None]:
# Separate the two target columns from the model inputs, and strip the
# respondent_id identifier from both feature frames so it is not used as a
# predictor.
label_columns = ['xyz_vaccine', 'seasonal_vaccine']
train_labels = train_data[label_columns]
train_features = train_data.drop(columns=['respondent_id'] + label_columns)
test_features = test_features.drop(columns=['respondent_id'])

In [None]:
# Give the test frame exactly the training frame's columns: a left join on
# the column axis keeps every training column, drops test-only columns, and
# fills columns missing from the test frame with 0.
train_features, test_features = train_features.align(
    test_features,
    axis=1,
    join='left',
    fill_value=0,
)

In [None]:
# One random forest per target, both built with the same settings and a
# fixed seed so results are repeatable.
forest_params = {'n_estimators': 100, 'random_state': 42}
model_xyz = RandomForestClassifier(**forest_params)
model_seasonal = RandomForestClassifier(**forest_params)

In [None]:
# Train each forest on the shared feature matrix against its own target column.
model_xyz.fit(X=train_features, y=train_labels['xyz_vaccine'])
model_seasonal.fit(X=train_features, y=train_labels['seasonal_vaccine'])

In [None]:
# predict_proba returns one column per class; column 1 is kept as the score
# for each target.
xyz_class_proba = model_xyz.predict_proba(test_features)
seasonal_class_proba = model_seasonal.predict_proba(test_features)
test_preds_xyz = xyz_class_proba[:, 1]
test_preds_seasonal = seasonal_class_proba[:, 1]

In [None]:
# Restore the real respondent ids by re-reading that single column from the
# test file, rather than hard-coding range(26707, 53415): a fixed range
# silently mislabels rows (or raises on a length mismatch) if the test file
# ever changes. The preprocessing steps in this notebook only fill values and
# add/drop columns, so the row order still matches the CSV.
# NOTE: this adds a non-feature column to test_features, so its columns no
# longer match the training columns; do not re-run the prediction cell after
# this one without rebuilding test_features.
test_features['respondent_id'] = pd.read_csv(
    'test_set_features.csv', usecols=['respondent_id']
)['respondent_id'].to_numpy()

In [None]:
# Assemble the submission table: respondent ids followed by the predicted
# probability for each of the two targets.
submission = test_features[['respondent_id']].assign(
    h1n1_vaccine=test_preds_xyz,
    seasonal_vaccine=test_preds_seasonal,
)

In [None]:
# Write the submission to disk without the DataFrame index column.
submission.to_csv(path_or_buf='submission_.csv', index=False)

In [None]:
# Score each model on out-of-fold predictions instead of on the rows it was
# fitted on: a random forest reproduces its own training labels almost
# perfectly, so in-sample ROC AUC comes out at ~1.0 and says nothing about
# how the model generalises.
# cross_val_predict fits clones of the estimator on each fold, so the fitted
# models used for the submission are left untouched. With an integer `cv` and
# a classifier, the folds are stratified on the target.
# NOTE(review): imputation and one-hot encoding were applied to the full
# training frame before this point, so the folds are not perfectly isolated;
# treat these scores as a slightly optimistic estimate.
xyz_oof_proba = cross_val_predict(
    model_xyz,
    train_features,
    train_labels['xyz_vaccine'],
    cv=5,
    method='predict_proba',
)[:, 1]
seasonal_oof_proba = cross_val_predict(
    model_seasonal,
    train_features,
    train_labels['seasonal_vaccine'],
    cv=5,
    method='predict_proba',
)[:, 1]

xyz_roc_auc = roc_auc_score(train_labels['xyz_vaccine'], xyz_oof_proba)
seasonal_roc_auc = roc_auc_score(train_labels['seasonal_vaccine'], seasonal_oof_proba)

In [None]:
# Report both ROC AUC scores, then confirm the submission step, one message
# per line.
print(
    f"ROC AUC Score for XYZ Vaccine: {xyz_roc_auc}",
    f"ROC AUC Score for Seasonal Vaccine: {seasonal_roc_auc}",
    "Submission file created successfully!",
    sep="\n",
)

ROC AUC Score for XYZ Vaccine: 1.0
ROC AUC Score for Seasonal Vaccine: 1.0
Submission file created successfully!
