In [8]:
!pip install pandas
!pip install numpy
!pip install scikit-learn
!pip install xgboost




[notice] A new release of pip available: 22.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip available: 22.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip






[notice] A new release of pip available: 22.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip available: 22.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [9]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [10]:
# Load datasets
train_features = pd.read_csv("./training_set_features.csv")
train_labels = pd.read_csv("./training_set_labels.csv")
test_features = pd.read_csv("./test_set_features.csv")

In [11]:
# Merge training labels with features
train = train_features.merge(train_labels, on="respondent_id")

# Extract respondent_id for test
test_respondent_id = test_features["respondent_id"]

# Drop respondent_id from features
train.drop(columns=["respondent_id"], inplace=True)
test_features.drop(columns=["respondent_id"], inplace=True)

In [12]:
# Define categorical and numerical features
categorical_features = train.select_dtypes(include=["object"]).columns.tolist()
numerical_features = train.select_dtypes(include=["int64", "float64"]).columns.tolist()
numerical_features.remove("h1n1_vaccine")
numerical_features.remove("seasonal_vaccine")

In [13]:
# Preprocessing Pipeline
num_pipeline = SimpleImputer(strategy="mean")
cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer([
    ("num", num_pipeline, numerical_features),
    ("cat", cat_pipeline, categorical_features)
])

# Fit and transform training data
X_train_processed = preprocessor.fit_transform(train.drop(columns=["h1n1_vaccine", "seasonal_vaccine"]))
y_train = train[["h1n1_vaccine", "seasonal_vaccine"]]

# Transform test data
X_test_processed = preprocessor.transform(test_features)

In [26]:
# Define number of folds
FOLDS = 10
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

# Create empty arrays for OOF and test predictions
oof_h1n1 = np.zeros(len(X_train_processed))
oof_seasonal = np.zeros(len(X_train_processed))

pred_h1n1 = np.zeros(len(X_test_processed))
pred_seasonal = np.zeros(len(X_test_processed))

for fold, (train_idx, val_idx) in enumerate(skf.split(X_train_processed, y_train["h1n1_vaccine"])):

    print("#" * 25)
    print(f"### Fold {fold + 1}")
    print("#" * 25)

    # Split data for this fold
    x_train, x_valid = X_train_processed[train_idx], X_train_processed[val_idx]
    y_train_h1n1, y_valid_h1n1 = y_train["h1n1_vaccine"].iloc[train_idx], y_train["h1n1_vaccine"].iloc[val_idx]
    y_train_seasonal, y_valid_seasonal = y_train["seasonal_vaccine"].iloc[train_idx], y_train["seasonal_vaccine"].iloc[val_idx]

    # Define models
    model_h1n1 = XGBClassifier(
        max_depth=5,  
        colsample_bytree=0.5,  
        subsample=0.8,  
        n_estimators=2000,  
        learning_rate=0.015,  
        reg_alpha=0.1, reg_lambda=1.0, 
        min_child_weight=80,
        use_label_encoder=False,
        eval_metric="logloss",
        random_state=42
    )

    model_seasonal = XGBClassifier(
        max_depth=5,  
        colsample_bytree=0.5,  
        subsample=0.8,  
        n_estimators=2000,  
        learning_rate=0.015,  
        reg_alpha=0.1, reg_lambda=1.0, 
        min_child_weight=80,
        use_label_encoder=False,
        eval_metric="logloss",
        random_state=42
    )


    # Train H1N1 model
    model_h1n1.fit(
        x_train, y_train_h1n1,
        eval_set=[(x_valid, y_valid_h1n1)],
        verbose=500,
    )

    # Train Seasonal model
    model_seasonal.fit(
        x_train, y_train_seasonal,
        eval_set=[(x_valid, y_valid_seasonal)],
        verbose=500,
    )

    # Infer OOF predictions
    oof_h1n1[val_idx] = model_h1n1.predict_proba(x_valid)[:, 1]
    oof_seasonal[val_idx] = model_seasonal.predict_proba(x_valid)[:, 1]

    # Infer test predictions (averaging across folds)
    pred_h1n1 += model_h1n1.predict_proba(X_test_processed)[:, 1] / FOLDS
    pred_seasonal += model_seasonal.predict_proba(X_test_processed)[:, 1] / FOLDS


#########################
### Fold 1
#########################
[0]	validation_0-logloss:0.51682


Parameters: { "use_label_encoder" } are not used.



[500]	validation_0-logloss:0.35569
[1000]	validation_0-logloss:0.35098
[1500]	validation_0-logloss:0.34934
[1999]	validation_0-logloss:0.34844
[0]	validation_0-logloss:0.68639


Parameters: { "use_label_encoder" } are not used.



[500]	validation_0-logloss:0.46095
[1000]	validation_0-logloss:0.45700
[1500]	validation_0-logloss:0.45629
[1999]	validation_0-logloss:0.45649
#########################
### Fold 2
#########################
[0]	validation_0-logloss:0.51679


Parameters: { "use_label_encoder" } are not used.



[500]	validation_0-logloss:0.34522
[1000]	validation_0-logloss:0.34433
[1500]	validation_0-logloss:0.34486
[1999]	validation_0-logloss:0.34569
[0]	validation_0-logloss:0.68610


Parameters: { "use_label_encoder" } are not used.



[500]	validation_0-logloss:0.48088
[1000]	validation_0-logloss:0.48068
[1500]	validation_0-logloss:0.48009
[1999]	validation_0-logloss:0.47980
#########################
### Fold 3
#########################
[0]	validation_0-logloss:0.51651


Parameters: { "use_label_encoder" } are not used.



[500]	validation_0-logloss:0.34307
[1000]	validation_0-logloss:0.34111
[1500]	validation_0-logloss:0.34114
[1999]	validation_0-logloss:0.34128
[0]	validation_0-logloss:0.68759


Parameters: { "use_label_encoder" } are not used.



[500]	validation_0-logloss:0.46208
[1000]	validation_0-logloss:0.45767
[1500]	validation_0-logloss:0.45739
[1999]	validation_0-logloss:0.45765
#########################
### Fold 4
#########################
[0]	validation_0-logloss:0.51700


Parameters: { "use_label_encoder" } are not used.



[500]	validation_0-logloss:0.33311
[1000]	validation_0-logloss:0.32909
[1500]	validation_0-logloss:0.32768
[1999]	validation_0-logloss:0.32706
[0]	validation_0-logloss:0.68630


Parameters: { "use_label_encoder" } are not used.



[500]	validation_0-logloss:0.47391
[1000]	validation_0-logloss:0.46771
[1500]	validation_0-logloss:0.46651
[1999]	validation_0-logloss:0.46640
#########################
### Fold 5
#########################
[0]	validation_0-logloss:0.51718


Parameters: { "use_label_encoder" } are not used.



[500]	validation_0-logloss:0.36224
[1000]	validation_0-logloss:0.36120
[1500]	validation_0-logloss:0.36098
[1999]	validation_0-logloss:0.36133
[0]	validation_0-logloss:0.68620


Parameters: { "use_label_encoder" } are not used.



[500]	validation_0-logloss:0.46970
[1000]	validation_0-logloss:0.46632
[1500]	validation_0-logloss:0.46606
[1999]	validation_0-logloss:0.46657
#########################
### Fold 6
#########################
[0]	validation_0-logloss:0.51684


Parameters: { "use_label_encoder" } are not used.



[500]	validation_0-logloss:0.33474
[1000]	validation_0-logloss:0.32982
[1500]	validation_0-logloss:0.32845
[1999]	validation_0-logloss:0.32723
[0]	validation_0-logloss:0.68608


Parameters: { "use_label_encoder" } are not used.



[500]	validation_0-logloss:0.46229
[1000]	validation_0-logloss:0.45727
[1500]	validation_0-logloss:0.45641
[1999]	validation_0-logloss:0.45626
#########################
### Fold 7
#########################
[0]	validation_0-logloss:0.51699


Parameters: { "use_label_encoder" } are not used.



[500]	validation_0-logloss:0.34812
[1000]	validation_0-logloss:0.34330
[1500]	validation_0-logloss:0.34183
[1999]	validation_0-logloss:0.34149
[0]	validation_0-logloss:0.68621


Parameters: { "use_label_encoder" } are not used.



[500]	validation_0-logloss:0.44586
[1000]	validation_0-logloss:0.44041
[1500]	validation_0-logloss:0.43895
[1999]	validation_0-logloss:0.43838
#########################
### Fold 8
#########################
[0]	validation_0-logloss:0.51652


Parameters: { "use_label_encoder" } are not used.



[500]	validation_0-logloss:0.34292
[1000]	validation_0-logloss:0.33891
[1500]	validation_0-logloss:0.33787
[1999]	validation_0-logloss:0.33733
[0]	validation_0-logloss:0.68474


Parameters: { "use_label_encoder" } are not used.



[500]	validation_0-logloss:0.47097
[1000]	validation_0-logloss:0.46798
[1500]	validation_0-logloss:0.46740
[1999]	validation_0-logloss:0.46805
#########################
### Fold 9
#########################
[0]	validation_0-logloss:0.51691


Parameters: { "use_label_encoder" } are not used.



[500]	validation_0-logloss:0.35094
[1000]	validation_0-logloss:0.34782
[1500]	validation_0-logloss:0.34743
[1999]	validation_0-logloss:0.34852
[0]	validation_0-logloss:0.68695


Parameters: { "use_label_encoder" } are not used.



[500]	validation_0-logloss:0.47466
[1000]	validation_0-logloss:0.47304
[1500]	validation_0-logloss:0.47280
[1999]	validation_0-logloss:0.47313
#########################
### Fold 10
#########################
[0]	validation_0-logloss:0.51679


Parameters: { "use_label_encoder" } are not used.



[500]	validation_0-logloss:0.35739
[1000]	validation_0-logloss:0.35530
[1500]	validation_0-logloss:0.35505
[1999]	validation_0-logloss:0.35511
[0]	validation_0-logloss:0.68875


Parameters: { "use_label_encoder" } are not used.



[500]	validation_0-logloss:0.45543
[1000]	validation_0-logloss:0.45184
[1500]	validation_0-logloss:0.45128
[1999]	validation_0-logloss:0.45190


In [27]:
# Compute AUC for out-of-fold predictions
auc_h1n1 = roc_auc_score(y_train["h1n1_vaccine"], oof_h1n1)
auc_seasonal = roc_auc_score(y_train["seasonal_vaccine"], oof_seasonal)

print(f"\nFinal OOF AUC - H1N1: {auc_h1n1:.4f}")
print(f"Final OOF AUC - Seasonal: {auc_seasonal:.4f}")


Final OOF AUC - H1N1: 0.8683
Final OOF AUC - Seasonal: 0.8628


In [29]:
# Prepare final test predictions
submission = pd.DataFrame({
    "respondent_id": test_respondent_id,
    "h1n1_vaccine": pred_h1n1,
    "seasonal_vaccine": pred_seasonal
})

# Save submission file
submission.to_csv("submission_KFold-XGBoost.csv", index=False)
print("Final submission file 'submission_KFold-XGBoost.csv' created successfully!")

Final submission file 'submission_KFold-XGBoost.csv' created successfully!
