In [16]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score


In [17]:

# Read the three input CSVs from the working directory, in the same order as
# the names on the left-hand side.
train_features, train_labels, test_features = (
    pd.read_csv(csv_path)
    for csv_path in (
        'training_set_features.csv',
        'training_set_labels.csv',
        'test_set_features.csv',
    )
)


In [18]:
# Attach the labels to the features, matching rows on respondent_id.
# validate='one_to_one' makes pandas raise a MergeError if respondent_id is
# duplicated on either side, rather than silently multiplying rows and feeding
# repeated respondents into training.
train_data = pd.merge(
    train_features,
    train_labels,
    on='respondent_id',
    how='inner',
    validate='one_to_one',
)

In [19]:
# Pull the two target columns out of the merged frame while it still has them.
y_xyz, y_seasonal = (
    train_data[target_name]
    for target_name in ('xyz_vaccine', 'seasonal_vaccine')
)


In [20]:
# Keep predictor columns only: the id and both targets leave the training
# frame, and the id leaves the test frame.
train_data = train_data.drop(columns=['respondent_id', 'xyz_vaccine', 'seasonal_vaccine'])
test_data = test_features.drop(columns='respondent_id')


In [21]:
# Split the predictor columns by dtype so each group can get its own
# preprocessing. A column that is neither numeric nor object dtype ends up in
# neither list.
numeric_features = list(train_data.select_dtypes(include='number').columns)
categorical_features = list(train_data.select_dtypes(include='object').columns)


In [22]:
# Numeric columns: fill missing values with the column median, then standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: replace missing values with the literal 'missing', then
# one-hot encode. handle_unknown='ignore' keeps transform from raising on
# categories that were not present when the encoder was fitted.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(fill_value='missing', strategy='constant')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own transformer. `remainder` is left at
# its default, so columns listed in neither group are not passed through.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])


In [23]:
# Full modelling pipeline: preprocessing followed by logistic regression.
# The step names ('preprocessor', 'classifier') are what the grid-search
# parameter keys refer to.
logreg_model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=42, max_iter=1000)),
])


In [24]:
# Hyperparameter candidates for the 'classifier' pipeline step: six values of
# C spanning 1e-3 to 1e2, crossed with two solvers.
param_grid = dict(
    classifier__C=[0.001, 0.01, 0.1, 1, 10, 100],
    classifier__solver=['liblinear', 'lbfgs'],
)

In [25]:
# 5-fold cross-validated grid search for the xyz_vaccine target, scored by ROC AUC.
grid_search_xyz = GridSearchCV(
    estimator=logreg_model,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
)
grid_search_xyz.fit(train_data, y_xyz)



In [26]:
# Best pipeline found by the xyz_vaccine search, and its cross-validated score.
best_model_xyz, best_score_xyz = (
    grid_search_xyz.best_estimator_,
    grid_search_xyz.best_score_,
)
print(f"Best Model for xyz_vaccine: {best_model_xyz}")
print(f"Best ROC AUC Score for xyz_vaccine: {best_score_xyz}")

# Same search configuration, fitted against the seasonal_vaccine target.
grid_search_seasonal = GridSearchCV(
    estimator=logreg_model,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
)
grid_search_seasonal.fit(train_data, y_seasonal)
best_model_seasonal, best_score_seasonal = (
    grid_search_seasonal.best_estimator_,
    grid_search_seasonal.best_score_,
)

print(f"Best Model for seasonal_vaccine: {best_model_seasonal}")
print(f"Best ROC AUC Score for seasonal_vaccine: {best_score_seasonal}")

# Test-set predictions: keep the second column of predict_proba for each target.
preds_xyz, preds_seasonal = (
    fitted_model.predict_proba(test_data)[:, 1]
    for fitted_model in (best_model_xyz, best_model_seasonal)
)


Best Model for xyz_vaccine: Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['xyz_concern',
                                                   'xyz_knowledge',
                                                   'behavioral_antiviral_meds',
                                                   'behavioral_avoidance',
                                                   'behavioral_face_mask',
                                                   'behavioral_wash_hands',
                                                   'behavioral_large_gatherings',
               

In [27]:
# One row per test respondent: the id plus the predicted probability for each target.
submission = test_features[['respondent_id']].assign(
    xyz_vaccine=preds_xyz,
    seasonal_vaccine=preds_seasonal,
)

# Write the submission file without the DataFrame index column.
submission.to_csv('submission.csv', index=False)

# Average of the two best cross-validated ROC AUC scores from the grid searches.
mean_auc = 0.5 * (best_score_xyz + best_score_seasonal)
print(f"Mean ROC AUC: {mean_auc}")


Mean ROC AUC: 0.8439051605665693


In [28]:
# Preview the first five rows of the submission frame.
submission.head(n=5)

Unnamed: 0,respondent_id,xyz_vaccine,seasonal_vaccine
0,26707,0.060641,0.294485
1,26708,0.047384,0.046038
2,26709,0.447648,0.5981
3,26710,0.485494,0.877166
4,26711,0.160231,0.476273
