In [56]:
# necessary libraries

import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import matplotlib.pyplot as plt
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
import numpy as np

In [2]:
# Datasets
dtf = pd.read_csv('training_set_features.csv')
dtfe = pd.read_csv('test_set_features.csv')
dtl = pd.read_csv('training_set_labels.csv')

In [3]:
#setting respondent id as index
dtf.set_index('respondent_id', inplace=True)
dtfe.set_index('respondent_id', inplace=True)
dtl.set_index('respondent_id', inplace=True)

In [4]:
dtf.shape

(26707, 35)

In [5]:
dtfe.shape

(26708, 35)

In [7]:
dtl.shape

(26707, 2)

In [10]:
#numerical column
numerical_cols = dtf.select_dtypes(include=['number']).columns.tolist()

In [12]:
len(numerical_cols)

23

In [13]:
# categorical columns
categorical_cols = dtf.select_dtypes(exclude=['number']).columns.tolist()

In [14]:
len(categorical_cols)

12

In [32]:
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse=False))  
])

In [33]:
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  
    ('scaler', StandardScaler())  
])

In [34]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_cols),
        ('cat', categorical_pipeline, categorical_cols)
    ],
    remainder='passthrough'  
)

In [36]:
# feature seperation
X_train = dtf
y_train = dtl
X_test = dtfe

In [37]:
# passing through transformer
X_train_new = preprocessor.fit_transform(X_train)
X_test_new = preprocessor.transform(X_test) 



In [38]:
X_train_new.shape

(26707, 105)

In [39]:
X_test_new.shape

(26708, 105)

In [43]:
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train_new, y_train, test_size=0.3, random_state=49)

In [45]:
# base model
rf_model = RandomForestClassifier(n_estimators=100, random_state=49)

In [46]:
multi_target_model = MultiOutputClassifier(rf_model, n_jobs=-1)

In [47]:
multi_target_model.fit(X_train_split, y_train_split)

In [50]:
y_pred_proba = multi_target_model.predict_proba(X_val_split)
y_pred_proba

[array([[0.85, 0.15],
        [0.71, 0.29],
        [0.74, 0.26],
        ...,
        [0.47, 0.53],
        [0.91, 0.09],
        [0.89, 0.11]]),
 array([[0.51, 0.49],
        [0.49, 0.51],
        [0.71, 0.29],
        ...,
        [0.34, 0.66],
        [0.62, 0.38],
        [0.68, 0.32]])]

In [51]:
y_pred_proba = np.array(y_pred_proba)[:, :, 1].T

In [54]:
roc_auc_xyz = roc_auc_score(y_val_split['xyz_vaccine'], y_pred_proba[:, 0])
roc_auc_seasonal = roc_auc_score(y_val_split['seasonal_vaccine'], y_pred_proba[:, 1])
mean_roc_auc = (roc_auc_xyz + roc_auc_seasonal) / 2

print(f'ROC AUC for xyz Vaccine: {roc_auc_xyz}')
print(f'ROC AUC for Seasonal Vaccine: {roc_auc_seasonal}')
print(f'Mean ROC AUC: {mean_roc_auc}')

ROC AUC for xyz Vaccine: 0.8275296287030087
ROC AUC for Seasonal Vaccine: 0.8546001743910553
Mean ROC AUC: 0.841064901547032


In [58]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=2)
multi_target_model = MultiOutputClassifier(rf_model, n_jobs=2)

pa_grid = {
    'estimator__n_estimators': [50, 100],
    'estimator__max_depth': [None, 10],
    'estimator__min_samples_split': [2, 5],
    'estimator__min_samples_leaf': [1, 2]
}

# Initialize GridSearchCV with limited n_jobs
grid_search = GridSearchCV(multi_target_model, pa_grid, cv=3, scoring='roc_auc', n_jobs=2)

# Fit GridSearchCV
grid_search.fit(X_train_split, y_train_split)

best_params = grid_search.best_params_
best_score = grid_search.best_score_

print(f'Best parameters: {best_params}')
print(f'Best ROC AUC score from GridSearchCV: {best_score}')



Best parameters: {'estimator__max_depth': None, 'estimator__min_samples_leaf': 1, 'estimator__min_samples_split': 2, 'estimator__n_estimators': 50}
Best ROC AUC score from GridSearchCV: nan


In [60]:
best_model = grid_search.best_estimator_
y_test_pred_proba = best_model.predict_proba(X_test_new)

# Convert the list of arrays into a 2D array
y_test_pred_proba = np.array(y_test_pred_proba)[:, :, 1].T

# Prepare the submission file
submission = pd.DataFrame({
    'respondent_id': dtfe.index,
    'xyz_vaccine': y_test_pred_proba[:, 0],
    'seasonal_vaccine': y_test_pred_proba[:, 1]
})

submission.to_csv('submission.csv', index=False)
print(submission.head())

   respondent_id  xyz_vaccine  seasonal_vaccine
0          26707         0.18              0.30
1          26708         0.10              0.08
2          26709         0.52              0.76
3          26710         0.42              0.86
4          26711         0.26              0.30
