<a href="https://colab.research.google.com/github/tisha-uwu/Ayushi_Sahu_DataHack/blob/main/Ayushi_Sahu_DataHack.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler


# --- Load data -------------------------------------------------------------
# Read the four input CSVs from /content.
features_df = pd.read_csv('/content/training_set_features.csv')
labels_df = pd.read_csv('/content/training_set_labels.csv')
test_features_df = pd.read_csv('/content/test_set_features.csv')
submission_df = pd.read_csv('/content/submission_format.csv')

# Attach the labels to their feature rows through the shared respondent_id key
# (default inner join: only respondents present in both files are kept).
merged_train_df = pd.merge(features_df, labels_df, on='respondent_id')

# Separate the two target columns from the predictors; respondent_id is an
# identifier, not a feature, so it is dropped from X as well.
target_columns = ['xyz_vaccine', 'seasonal_vaccine']
y = merged_train_df[target_columns]
X = merged_train_df.drop(columns=['respondent_id'] + target_columns)


# --- Build the modelling pipeline -------------------------------------------
# Columns with pandas dtype "object" are treated as categorical; every other
# column is treated as numeric.
cat_features = X.select_dtypes(include=['object']).columns
num_features = X.select_dtypes(exclude=['object']).columns

# Numeric branch: fill missing values with the column mean, then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
]
num_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' means categories not seen
# during fit do not raise at transform time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
cat_transformer = Pipeline(steps=categorical_steps)

# Route each column group to its own branch.
column_branches = [
    ('num', num_transformer, num_features),
    ('cat', cat_transformer, cat_features),
]
data_preprocessor = ColumnTransformer(transformers=column_branches)

# One random forest per target column, all sharing the same hyperparameters
# and seed.
base_forest = RandomForestClassifier(n_estimators=100, random_state=42)
classifier_model = MultiOutputClassifier(base_forest)

# Full pipeline: preprocessing followed by the multi-output classifier.
model_pipeline = Pipeline(steps=[
    ('preprocessor', data_preprocessor),
    ('model', classifier_model),
])

# --- Hold-out validation ----------------------------------------------------
# Keep 20% of the labelled rows aside for validation; the seed fixes the split.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

model_pipeline.fit(X_train, y_train)

# predict_proba gives one probability array per target, indexed here in the
# same order as the columns of y (0 -> xyz_vaccine, 1 -> seasonal_vaccine).
# Column 1 is used as the positive-class probability (assumes binary 0/1
# labels -- TODO confirm against the label file).
y_pred_probabilities = model_pipeline.predict_proba(X_val)
xyz_val_scores = y_pred_probabilities[0][:, 1]
seasonal_val_scores = y_pred_probabilities[1][:, 1]

# Per-target ROC AUC and their unweighted mean.
roc_auc_xyz_vaccine = roc_auc_score(y_val['xyz_vaccine'], xyz_val_scores)
roc_auc_seasonal_vaccine = roc_auc_score(y_val['seasonal_vaccine'], seasonal_val_scores)
average_roc_auc = (roc_auc_xyz_vaccine + roc_auc_seasonal_vaccine) / 2

print(f'ROC AUC for xyz_vaccine: {roc_auc_xyz_vaccine}')
print(f'ROC AUC for seasonal_vaccine: {roc_auc_seasonal_vaccine}')
print(f'Mean ROC AUC: {average_roc_auc}')


# --- Final model and submission ----------------------------------------------
# The validated `model_pipeline` was fitted on the 80% training split only.
# For the submission, refit an unfitted copy of the same pipeline on ALL
# labelled rows so the 20% hold-out also contributes to the final model.
# `clone` copies the configuration (not the fitted state), which leaves
# `model_pipeline` -- and the validation metrics reported for it -- untouched.
final_pipeline = clone(model_pipeline).fit(X, y)

# respondent_id is an identifier, not a feature. `drop` returns a new frame,
# so `test_features_df` keeps its id column and no defensive copy is needed.
X_test = test_features_df.drop(columns=['respondent_id'])

# One probability array per target, in the column order of y; column 1 is
# used as the positive-class probability (assumes binary 0/1 labels).
test_predictions = final_pipeline.predict_proba(X_test)

submission = pd.DataFrame({
    'respondent_id': test_features_df['respondent_id'],
    'xyz_vaccine': test_predictions[0][:, 1],
    'seasonal_vaccine': test_predictions[1][:, 1]
})

submission.to_csv('ayushi_submission.csv', index=False)

ROC AUC for xyz_vaccine: 0.864173999277244
ROC AUC for seasonal_vaccine: 0.8570519011081396
Mean ROC AUC: 0.8606129501926918
