In [5]:
!pip install pandas
!pip install numpy
!pip install scikit-learn




[notice] A new release of pip available: 22.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip available: 22.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting scikit-learn


[notice] A new release of pip available: 22.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip



  Using cached scikit_learn-1.6.1-cp311-cp311-win_amd64.whl (11.1 MB)
Collecting joblib>=1.2.0
  Using cached joblib-1.4.2-py3-none-any.whl (301 kB)
Collecting threadpoolctl>=3.1.0
  Using cached threadpoolctl-3.5.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, joblib, scikit-learn
Successfully installed joblib-1.4.2 scikit-learn-1.6.1 threadpoolctl-3.5.0


In [6]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [15]:
# Read the three CSV files from the working directory in one pass:
# training features, training labels, and the features to predict on.
train_features, train_labels, test_features = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./training_set_features.csv",
        "./training_set_labels.csv",
        "./test_set_features.csv",
    )
)

In [None]:
# Purpose of this cell: preprocess the features, train one random forest per
# target, report validation ROC AUC, and write test-set probabilities to
# 'submission.csv'.

# --- Configuration ------------------------------------------------------------
ID_COL = 'respondent_id'                        # row identifier, not a feature
TARGETS = ['h1n1_vaccine', 'seasonal_vaccine']  # one binary classifier per target
SEED = 42                                       # used by the split and both forests


def fit_random_forest(X, y):
    """Fit and return a 100-tree RandomForestClassifier seeded with SEED.

    X is the preprocessed feature matrix and y the labels of a single target.
    """
    return RandomForestClassifier(n_estimators=100, random_state=SEED).fit(X, y)


# --- Separate the ID column from the features -----------------------------------
# Labels are loaded from their own CSV and are matched to the feature rows by
# position, so confirm both files list the respondents in the same order before
# the ID column is discarded.
if ID_COL in train_features.columns and ID_COL in train_labels.columns:
    ids_match = np.array_equal(
        train_features[ID_COL].to_numpy(), train_labels[ID_COL].to_numpy()
    )
    if not ids_match:
        raise ValueError(
            f"'{ID_COL}' differs between training features and training labels; "
            "rows must be aligned before training."
        )

# The guard and `errors='ignore'` keep this cell re-runnable: the original
# in-place drop raised KeyError on a second execution because the column was
# already gone.
if ID_COL in test_features.columns:
    test_respondent_id = test_features[ID_COL].copy()  # Save IDs for submission
train_features = train_features.drop(columns=[ID_COL], errors='ignore')
test_features = test_features.drop(columns=[ID_COL], errors='ignore')

# Give the test frame exactly the training columns, in training order. Columns
# absent from the test file are created filled with NaN; extra ones are dropped.
test_features = test_features.reindex(columns=train_features.columns)

# --- Column groups --------------------------------------------------------------
# Numeric columns are imputed only; every other column is treated as categorical.
# Selecting by "number" / "not number" means no feature column is left out of
# both lists and silently discarded by ColumnTransformer's default
# remainder='drop'.
numerical_features = [
    col
    for col in train_features.select_dtypes(include='number').columns
    if col not in TARGETS  # targets are never used as inputs
]
categorical_features = train_features.select_dtypes(exclude='number').columns.tolist()

# --- Preprocessing definition ---------------------------------------------------
# Median imputation for numeric columns.
numerical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy='median')),
])
# Most-frequent imputation followed by one-hot encoding for categorical columns;
# categories not seen during fit are encoded as all zeros instead of raising.
categorical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy='most_frequent')),
    ("encoder", OneHotEncoder(handle_unknown='ignore')),
])
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_pipeline, numerical_features),
        ("cat", categorical_pipeline, categorical_features),
    ]
)

# --- Train / validation split ---------------------------------------------------
# Split BEFORE fitting the preprocessor so that imputation statistics and the
# one-hot vocabulary are learned from the training rows only. Fitting on the
# full frame first would leak validation data into preprocessing and bias the
# validation AUC.
targets = train_labels[TARGETS]
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    train_features, targets, test_size=0.2, random_state=SEED
)

X_train = preprocessor.fit_transform(X_train_raw)
X_val = preprocessor.transform(X_val_raw)
X_test_processed = preprocessor.transform(test_features)

# --- Model training -------------------------------------------------------------
rf_h1n1 = fit_random_forest(X_train, y_train['h1n1_vaccine'])
rf_seasonal = fit_random_forest(X_train, y_train['seasonal_vaccine'])

# --- Validation -----------------------------------------------------------------
# Column 1 of predict_proba is the probability of the positive class.
y_pred_h1n1 = rf_h1n1.predict_proba(X_val)[:, 1]
y_pred_seasonal = rf_seasonal.predict_proba(X_val)[:, 1]

auc_h1n1 = roc_auc_score(y_val['h1n1_vaccine'], y_pred_h1n1)
auc_seasonal = roc_auc_score(y_val['seasonal_vaccine'], y_pred_seasonal)
mean_auc = (auc_h1n1 + auc_seasonal) / 2

print(f"ROC AUC Score for H1N1 Vaccine: {auc_h1n1:.4f}")
print(f"ROC AUC Score for Seasonal Vaccine: {auc_seasonal:.4f}")
print(f"Mean ROC AUC Score: {mean_auc:.4f}")

# --- Test-set predictions and submission file -----------------------------------
test_pred_h1n1 = rf_h1n1.predict_proba(X_test_processed)[:, 1]
test_pred_seasonal = rf_seasonal.predict_proba(X_test_processed)[:, 1]

submission = pd.DataFrame({
    "respondent_id": test_respondent_id,
    "h1n1_vaccine": test_pred_h1n1,
    "seasonal_vaccine": test_pred_seasonal
})

submission.to_csv("submission.csv", index=False)
print("Submission file 'submission.csv' has been created successfully!")

ROC AUC Score for H1N1 Vaccine: 0.8294
ROC AUC Score for Seasonal Vaccine: 0.8518
Mean ROC AUC Score: 0.8406
Submission file 'submission.csv' has been created successfully!
