In [6]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [7]:
from xgboost import XGBClassifier
print("XGBoost successfully imported!")


XGBoost successfully imported!


In [8]:
#installing required models
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

# Loading data
train = pd.read_csv("Train_Data.csv")
test = pd.read_csv("Test_Data.csv")
sample_submission = pd.read_csv("Sample_Submission.csv")

# Drop rows with missing target and encode
train = train.dropna(subset=['age_group']).copy()
train['age_group'] = train['age_group'].map({'Adult': 0, 'Senior': 1})

# Feature list
base_features = ['RIAGENDR', 'PAQ605', 'BMXBMI', 'LBXGLU', 'DIQ010', 'LBXGLT', 'LBXIN']

# Feature engineering
def add_features(df):
    df = df.copy()
    df['GLU_IN_RATIO'] = df['LBXGLU'] / (df['LBXIN'] + 1e-3)
    df['GLT_IN_RATIO'] = df['LBXGLT'] / (df['LBXIN'] + 1e-3)
    df['BMI_BIN'] = pd.qcut(df['BMXBMI'], 4, labels=False, duplicates='drop')
    return df

train = add_features(train)
test = add_features(test)

# Define full feature set
features = base_features + ['GLU_IN_RATIO', 'GLT_IN_RATIO', 'BMI_BIN']

X = train[features]
y = train['age_group']
X_test = test[features]

# Combine train+test for consistent imputation
combined = pd.concat([X, X_test], axis=0)
imputer = IterativeImputer(random_state=42)
combined_imputed = imputer.fit_transform(combined)

# Split back
X_imputed = combined_imputed[:len(X)]
X_test_imputed = combined_imputed[len(X):]

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

# Initialize XGBoost
model = XGBClassifier(
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss',
    scale_pos_weight=(y == 0).sum() / (y == 1).sum(),
    n_estimators=250,
    max_depth=5,
    learning_rate=0.05
)


skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
test_preds = np.zeros(X_test_scaled.shape[0])
f1_scores = []

for train_idx, val_idx in skf.split(X_scaled, y):
    X_train, X_val = X_scaled[train_idx], X_scaled[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model.fit(X_train, y_train)
    val_preds = model.predict(X_val)
    f1 = f1_score(y_val, val_preds)
    f1_scores.append(f1)

    test_preds += model.predict(X_test_scaled)


final_preds = (test_preds / skf.n_splits >= 0.5).astype(int)


submission = pd.DataFrame({'age_group': final_preds})
submission.to_csv("submission.csv", index=False)

print("✅ submission.csv created.")
print("📊 CV F1 Scores:", f1_scores)
print("📈 Mean F1 Score:", np.mean(f1_scores))


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


✅ submission.csv created.
📊 CV F1 Scores: [0.375, 0.3841059602649007, 0.375, 0.3472222222222222, 0.4533333333333333]
📈 Mean F1 Score: 0.3869323031640912
