In [1]:
import pandas as pd

# Read the three CSV files (training features, training labels, test
# features) in that order and bind each resulting frame to its own name.
train_features, train_labels, test_features = (
    pd.read_csv(csv_name)
    for csv_name in (
        'training_set_features.csv',
        'training_set_labels.csv',
        'test_set_features.csv',
    )
)

# Quick look at the first rows of every frame, printed as plain text.
for frame in (train_features, train_labels, test_features):
    print(frame.head())




   respondent_id  xyz_concern  xyz_knowledge  behavioral_antiviral_meds  \
0              0          1.0            0.0                        0.0   
1              1          3.0            2.0                        0.0   
2              2          1.0            1.0                        0.0   
3              3          1.0            1.0                        0.0   
4              4          2.0            1.0                        0.0   

   behavioral_avoidance  behavioral_face_mask  behavioral_wash_hands  \
0                   0.0                   0.0                    0.0   
1                   1.0                   0.0                    1.0   
2                   1.0                   0.0                    0.0   
3                   1.0                   0.0                    1.0   
4                   1.0                   0.0                    1.0   

   behavioral_large_gatherings  behavioral_outside_home  \
0                          0.0                      1.0  

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Identify feature columns: every column of the training frame except the
# row identifier. The same feature_cols index is reused later to select the
# test columns, so train and test are always presented in the same order.
feature_cols = train_features.columns.difference(['respondent_id'])
target_cols = ['xyz_vaccine', 'seasonal_vaccine']

# Features and labels are paired purely by row position below. When the label
# file also carries respondent_id, fail loudly if the two files are not in the
# same row order instead of silently training on mismatched labels.
if 'respondent_id' in train_labels.columns:
    if train_features['respondent_id'].tolist() != train_labels['respondent_id'].tolist():
        raise ValueError(
            'train_features and train_labels are not aligned on respondent_id')

# Separate features and target from training data
X = train_features[feature_cols]
y = train_labels[target_cols]

# Split into training and validation sets (80/20, stratified on the targets,
# fixed seed so the split is reproducible).
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Define preprocessing pipeline.
# Partition the columns into "numeric" and "everything else" so that every
# feature is routed to exactly one transformer. Selecting by an explicit dtype
# list (int64/float64/object) would leave columns of any other dtype (bool,
# int32, category, string, ...) in neither group, and ColumnTransformer drops
# unlisted columns by default.
numeric_features = X.select_dtypes(include='number').columns
categorical_features = X.select_dtypes(exclude='number').columns

# Numeric columns: fill missing values with the median, then standardise.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Non-numeric columns: fill missing values with the most frequent value, then
# one-hot encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Apply each transformer to its own column group.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])


In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier

# Base estimator: a seeded random forest. MultiOutputClassifier wraps it so
# that each target column gets its own classifier.
forest = RandomForestClassifier(random_state=42)

# Full pipeline: preprocessing followed by the multi-output classifier.
model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', MultiOutputClassifier(forest)),
    ]
)

# Fit on the training split; left as the last expression so the cell still
# displays the fitted pipeline.
model.fit(X_train, y_train)


In [5]:
from sklearn.metrics import roc_auc_score
import numpy as np
# predict_proba on the multi-output model yields one probability array per
# target; keep column 1 of each and place them side by side, one column per
# entry of target_cols.
val_proba_per_target = model.predict_proba(X_val)
y_pred_proba = np.column_stack(
    [val_proba_per_target[i][:, 1] for i in range(len(target_cols))]
)

# ROC AUC for each target (column 0 -> xyz_vaccine, column 1 ->
# seasonal_vaccine), then their unweighted mean.
roc_auc_xyz, roc_auc_seasonal = (
    roc_auc_score(y_val[target], y_pred_proba[:, col])
    for col, target in enumerate(('xyz_vaccine', 'seasonal_vaccine'))
)
mean_roc_auc = np.mean([roc_auc_xyz, roc_auc_seasonal])

# Report the three scores, one per line.
print(
    f'ROC AUC for xyz_vaccine: {roc_auc_xyz}',
    f'ROC AUC for seasonal_vaccine: {roc_auc_seasonal}',
    f'Mean ROC AUC: {mean_roc_auc}',
    sep='\n',
)


ROC AUC for xyz_vaccine: 0.8285037838132167
ROC AUC for seasonal_vaccine: 0.8480600364581103
Mean ROC AUC: 0.8382819101356636


In [6]:
# Select the test columns with the same feature_cols used for training.
X_test = test_features[feature_cols]

# One probability array per target comes back from predict_proba; keep
# column 1 of each and stack them as the columns of a single 2-D array.
test_proba_per_target = model.predict_proba(X_test)
y_test_pred_proba = np.column_stack(
    [test_proba_per_target[i][:, 1] for i in range(len(target_cols))]
)



In [7]:
# Build the submission table: start from the test respondent ids and attach
# the two predicted-probability columns (column 0 -> xyz_vaccine,
# column 1 -> seasonal_vaccine).
submission = test_features[['respondent_id']].assign(
    xyz_vaccine=y_test_pred_proba[:, 0],
    seasonal_vaccine=y_test_pred_proba[:, 1],
)

# Write the table to disk without the index column.
submission.to_csv('submission.csv', index=False)
