In [8]:
!pip install pandas
!pip install numpy
!pip install scikit-learn
!pip install xgboost




[notice] A new release of pip available: 22.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip available: 22.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip






[notice] A new release of pip available: 22.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip available: 22.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [3]:
train_features = pd.read_csv("./Training Set Features.csv")
train_labels = pd.read_csv("./Training Set Labels.csv")
test_features = pd.read_csv("./Test Set Features.csv")

In [4]:
# Merge training labels with features
train = train_features.merge(train_labels, on="respondent_id")

# Extract respondent_id for test
test_respondent_id = test_features["respondent_id"]

# Drop respondent_id from features
train.drop(columns=["respondent_id"], inplace=True)
test_features.drop(columns=["respondent_id"], inplace=True)

In [5]:
# Define categorical and numerical features
categorical_features = train.select_dtypes(include=["object"]).columns.tolist()
numerical_features = train.select_dtypes(include=["int64", "float64"]).columns.tolist()
numerical_features.remove("h1n1_vaccine")
numerical_features.remove("seasonal_vaccine")

In [6]:
# Preprocessing Pipeline
num_pipeline = SimpleImputer(strategy="mean")
cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer([
    ("num", num_pipeline, numerical_features),
    ("cat", cat_pipeline, categorical_features)
])

# Fit and transform training data
X_train_processed = preprocessor.fit_transform(train.drop(columns=["h1n1_vaccine", "seasonal_vaccine"]))
y_train = train[["h1n1_vaccine", "seasonal_vaccine"]]

# Transform test data
X_test_processed = preprocessor.transform(test_features)

In [7]:
# Define number of folds
FOLDS = 10
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

# Create empty arrays for OOF and test predictions
oof_h1n1 = np.zeros(len(X_train_processed))
oof_seasonal = np.zeros(len(X_train_processed))

pred_h1n1 = np.zeros(len(X_test_processed))
pred_seasonal = np.zeros(len(X_test_processed))

for fold, (train_idx, val_idx) in enumerate(skf.split(X_train_processed, y_train["h1n1_vaccine"])):

    print("#" * 25)
    print(f"### Fold {fold + 1}")
    print("#" * 25)

    # Split data for this fold
    x_train, x_valid = X_train_processed[train_idx], X_train_processed[val_idx]
    y_train_h1n1, y_valid_h1n1 = y_train["h1n1_vaccine"].iloc[train_idx], y_train["h1n1_vaccine"].iloc[val_idx]
    y_train_seasonal, y_valid_seasonal = y_train["seasonal_vaccine"].iloc[train_idx], y_train["seasonal_vaccine"].iloc[val_idx]

    # Define models
    model_h1n1 = XGBClassifier(
        max_depth=5,  
        colsample_bytree=0.5,  
        subsample=0.8,  
        n_estimators=2000,  
        learning_rate=0.015,  
        reg_alpha=0.1, reg_lambda=1.0, 
        min_child_weight=80,
        use_label_encoder=False,
        eval_metric="logloss",
        random_state=42
    )

    model_seasonal = XGBClassifier(
        max_depth=5,  
        colsample_bytree=0.5,  
        subsample=0.8,  
        n_estimators=2000,  
        learning_rate=0.015,  
        reg_alpha=0.1, reg_lambda=1.0, 
        min_child_weight=80,
        use_label_encoder=False,
        eval_metric="logloss",
        random_state=42
    )


    # Train H1N1 model
    model_h1n1.fit(
        x_train, y_train_h1n1,
        eval_set=[(x_valid, y_valid_h1n1)],
        verbose=500,
    )

    # Train Seasonal model
    model_seasonal.fit(
        x_train, y_train_seasonal,
        eval_set=[(x_valid, y_valid_seasonal)],
        verbose=500,
    )

    # Infer OOF predictions
    oof_h1n1[val_idx] = model_h1n1.predict_proba(x_valid)[:, 1]
    oof_seasonal[val_idx] = model_seasonal.predict_proba(x_valid)[:, 1]

    # Infer test predictions (averaging across folds)
    pred_h1n1 += model_h1n1.predict_proba(X_test_processed)[:, 1] / FOLDS
    pred_seasonal += model_seasonal.predict_proba(X_test_processed)[:, 1] / FOLDS


#########################
### Fold 1
#########################
[0]	validation_0-logloss:0.51381


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[500]	validation_0-logloss:0.35482
[1000]	validation_0-logloss:0.35016
[1500]	validation_0-logloss:0.34858
[1999]	validation_0-logloss:0.34797
[0]	validation_0-logloss:0.68882


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[500]	validation_0-logloss:0.46120
[1000]	validation_0-logloss:0.45687
[1500]	validation_0-logloss:0.45601
[1999]	validation_0-logloss:0.45623
#########################
### Fold 2
#########################
[0]	validation_0-logloss:0.51364


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[500]	validation_0-logloss:0.34545
[1000]	validation_0-logloss:0.34498
[1500]	validation_0-logloss:0.34516
[1999]	validation_0-logloss:0.34580
[0]	validation_0-logloss:0.68839


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[500]	validation_0-logloss:0.48039
[1000]	validation_0-logloss:0.48082
[1500]	validation_0-logloss:0.48026
[1999]	validation_0-logloss:0.47980
#########################
### Fold 3
#########################
[0]	validation_0-logloss:0.51333


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[500]	validation_0-logloss:0.34219
[1000]	validation_0-logloss:0.34094
[1500]	validation_0-logloss:0.34073
[1999]	validation_0-logloss:0.34114
[0]	validation_0-logloss:0.68991


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[500]	validation_0-logloss:0.46271
[1000]	validation_0-logloss:0.45871
[1500]	validation_0-logloss:0.45815
[1999]	validation_0-logloss:0.45830
#########################
### Fold 4
#########################
[0]	validation_0-logloss:0.51421


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[500]	validation_0-logloss:0.33254
[1000]	validation_0-logloss:0.32855
[1500]	validation_0-logloss:0.32771
[1999]	validation_0-logloss:0.32725
[0]	validation_0-logloss:0.68862


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[500]	validation_0-logloss:0.47418
[1000]	validation_0-logloss:0.46814
[1500]	validation_0-logloss:0.46679
[1999]	validation_0-logloss:0.46687
#########################
### Fold 5
#########################
[0]	validation_0-logloss:0.51421


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[500]	validation_0-logloss:0.36145
[1000]	validation_0-logloss:0.35968
[1500]	validation_0-logloss:0.35973
[1999]	validation_0-logloss:0.36016
[0]	validation_0-logloss:0.68852


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[500]	validation_0-logloss:0.47037
[1000]	validation_0-logloss:0.46687
[1500]	validation_0-logloss:0.46616
[1999]	validation_0-logloss:0.46682
#########################
### Fold 6
#########################
[0]	validation_0-logloss:0.51384


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[500]	validation_0-logloss:0.33441
[1000]	validation_0-logloss:0.33018
[1500]	validation_0-logloss:0.32843
[1999]	validation_0-logloss:0.32775
[0]	validation_0-logloss:0.68833


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[500]	validation_0-logloss:0.46226
[1000]	validation_0-logloss:0.45772
[1500]	validation_0-logloss:0.45668
[1999]	validation_0-logloss:0.45654
#########################
### Fold 7
#########################
[0]	validation_0-logloss:0.51403


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[500]	validation_0-logloss:0.34783
[1000]	validation_0-logloss:0.34313
[1500]	validation_0-logloss:0.34206
[1999]	validation_0-logloss:0.34169
[0]	validation_0-logloss:0.68862


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[500]	validation_0-logloss:0.44553
[1000]	validation_0-logloss:0.44001
[1500]	validation_0-logloss:0.43840
[1999]	validation_0-logloss:0.43793
#########################
### Fold 8
#########################
[0]	validation_0-logloss:0.51359


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[500]	validation_0-logloss:0.34232
[1000]	validation_0-logloss:0.33849
[1500]	validation_0-logloss:0.33707
[1999]	validation_0-logloss:0.33642
[0]	validation_0-logloss:0.68701


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[500]	validation_0-logloss:0.47017
[1000]	validation_0-logloss:0.46773
[1500]	validation_0-logloss:0.46733
[1999]	validation_0-logloss:0.46764
#########################
### Fold 9
#########################
[0]	validation_0-logloss:0.51388


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[500]	validation_0-logloss:0.35127
[1000]	validation_0-logloss:0.34809
[1500]	validation_0-logloss:0.34822
[1999]	validation_0-logloss:0.34881
[0]	validation_0-logloss:0.68934


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[500]	validation_0-logloss:0.47488
[1000]	validation_0-logloss:0.47347
[1500]	validation_0-logloss:0.47323
[1999]	validation_0-logloss:0.47359
#########################
### Fold 10
#########################
[0]	validation_0-logloss:0.51365


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[500]	validation_0-logloss:0.35675
[1000]	validation_0-logloss:0.35486
[1500]	validation_0-logloss:0.35416
[1999]	validation_0-logloss:0.35437
[0]	validation_0-logloss:0.69110


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[500]	validation_0-logloss:0.45540
[1000]	validation_0-logloss:0.45192
[1500]	validation_0-logloss:0.45164
[1999]	validation_0-logloss:0.45194


In [8]:
# Compute AUC for out-of-fold predictions
auc_h1n1 = roc_auc_score(y_train["h1n1_vaccine"], oof_h1n1)
auc_seasonal = roc_auc_score(y_train["seasonal_vaccine"], oof_seasonal)

print(f"\nFinal OOF AUC - H1N1: {auc_h1n1:.4f}")
print(f"Final OOF AUC - Seasonal: {auc_seasonal:.4f}")


Final OOF AUC - H1N1: 0.8685
Final OOF AUC - Seasonal: 0.8627


In [11]:
# Prepare final test predictions
submission = pd.DataFrame({
    "respondent_id": test_respondent_id,
    "h1n1_vaccine": pred_h1n1,
    "seasonal_vaccine": pred_seasonal
})

# Save submission file
submission.to_csv("submission_KFold-XGBoost.csv", index=False)
print("Final submission file 'submission_df.csv' created successfully!")

Final submission file 'submission_df.csv' created successfully!


## Ensemble Model for Higher Accuracy

In [14]:

# Load predictions from different models
catboost_preds = pd.read_csv('./cb.csv', index_col='respondent_id')
lgbm_preds = pd.read_csv('./lgbm.csv', index_col='respondent_id')
xgboost_preds = pd.read_csv('./submission_KFold-XGBoost.csv', index_col='respondent_id')

# Simple average ensemble
ensemble_preds = (catboost_preds + lgbm_preds + xgboost_preds) / 3

# Save ensemble predictions
ensemble_preds.to_csv(f'ensemble_submission.csv', index=True)

ensemble_preds.head()


Unnamed: 0_level_0,h1n1_vaccine,seasonal_vaccine
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1
26707,0.213101,0.257885
26708,0.051261,0.035531
26709,0.326377,0.766655
26710,0.76109,0.886581
26711,0.42125,0.509461


In [16]:
# Define weights for each model (adjust according to validation performance)
weights = {
    'catboost': 0.4,
    'lgbm': 0.4,
    'xgboost': 0.2
}

# Load predictions
catboost_preds = pd.read_csv('./cb.csv', index_col='respondent_id')
lgbm_preds = pd.read_csv('./lgbm.csv', index_col='respondent_id')
xgboost_preds = pd.read_csv('./submission_KFold-XGBoost.csv', index_col='respondent_id')

# Weighted ensemble calculation
ensemble_preds = (
    catboost_preds * weights['catboost'] +
    lgbm_preds * weights['lgbm'] +
    xgboost_preds * weights['xgboost']
)

# Save weighted ensemble predictions
ensemble_preds.to_csv('weighted_ensemble_submission.csv', index=True)

ensemble_preds.head()

Unnamed: 0_level_0,h1n1_vaccine,seasonal_vaccine
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1
26707,0.227762,0.256806
26708,0.054392,0.037561
26709,0.337687,0.765825
26710,0.771766,0.888729
26711,0.443386,0.519597
