In [2]:
#1 XGBoost RAW FEATURES + F1 SCORE PRINTED — NEVER FORGET AGAIN
import pandas as pd
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
import warnings
warnings.filterwarnings("ignore")

# Load
train = pd.read_csv("train.csv")
test  = pd.read_csv("test.csv")

# Label encode categoricals
cat_cols = ['sex', 'oral_health_status', 'dental_cavity_status', 'tartar_presence']
for col in cat_cols:
    le = LabelEncoder()
    train[col] = le.fit_transform(train[col].astype(str))
    test[col]  = le.transform(test[col].astype(str))

features = [c for c in train.columns if c not in ['patient_id', 'has_copd_risk']]
X = train[features]
y = train['has_copd_risk']
X_test = test[features]

# === VALIDATION SPLIT TO SEE REAL F1 ===
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

model_val = XGBClassifier(
    n_estimators=1200,
    max_depth=7,
    learning_rate=0.015,
    subsample=0.85,
    colsample_bytree=0.75,
    random_state=42,
    n_jobs=-1,
    tree_method='hist',
    verbosity=0
)

model_val.fit(X_train, y_train)

# F1 ON VALIDATION
val_pred = model_val.predict(X_val)
f1 = f1_score(y_val, val_pred)

print(f"\nVALIDATION F1 SCORE: {f1:.5f}  ← THIS IS YOUR POWER")
print("Classification Report:")
print(classification_report(y_val, val_pred))

# === FINAL MODEL ON 100% DATA ===
print("\nTraining final model on FULL data...")
final_model = XGBClassifier(
    n_estimators=1200,
    max_depth=7,
    learning_rate=0.015,
    subsample=0.85,
    colsample_bytree=0.75,
    random_state=42,
    n_jobs=-1,
    tree_method='hist',
    verbosity=0
)
final_model.fit(X, y)

# Final prediction
predictions = final_model.predict(X_test)

# Save
submission = pd.DataFrame({
    'patient_id': test['patient_id'],
    'has_copd_risk': predictions
})
submission.to_csv("submission_xgboost_1.csv", index=False)

print(f"\nSUBMISSION SAVED!")
print(f"Predicted positive: {predictions.sum()}")


VALIDATION F1 SCORE: 0.72371  ← THIS IS YOUR POWER
Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.82      0.83      5642
           1       0.71      0.74      0.72      3269

    accuracy                           0.79      8911
   macro avg       0.78      0.78      0.78      8911
weighted avg       0.79      0.79      0.79      8911


Training final model on FULL data...

SUBMISSION SAVED!
Predicted positive: 4297


In [2]:
#1 NORMAL GRADIENT BOOSTING
import pandas as pd
import numpy as np
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report

# Load data
train = pd.read_csv("train.csv")
test  = pd.read_csv("test.csv")

# Only encode the obvious Y/N and M/F columns
for col in ['sex', 'oral_health_status', 'dental_cavity_status', 'tartar_presence']:
    train[col] = train[col].astype(str).map({
        'M': 1, 'F': 0, 'Y': 1, 'N': 0, 'Yes': 1, 'No': 0
    }).fillna(0)
    test[col]  = test[col].astype(str).map({
        'M': 1, 'F': 0, 'Y': 1, 'N': 0, 'Yes': 1, 'No': 0
    }).fillna(0)

# age_group to int
train['age_group'] = train['age_group'].astype(int)
test['age_group']  = test['age_group'].astype(int)

# Features
features = [c for c in train.columns if c not in ['patient_id', 'has_copd_risk']]
X = train[features]
y = train['has_copd_risk']
X_test = test[features]

# Validation split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# THE NORMAL GRADIENT BOOSTING (built-in, no install needed)
print("Training HistGradientBoostingClassifier...")
model = HistGradientBoostingClassifier(
    max_iter=1500,
    learning_rate=0.05,
    max_depth=8,
    min_samples_leaf=20,
    max_leaf_nodes=64,
    l2_regularization=0.1,
    random_state=42,
    class_weight='balanced',      # handles imbalance
    categorical_features=None     # we already encoded
)

model.fit(X_train, y_train)

# Validation F1 + best threshold
val_proba = model.predict_proba(X_val)[:, 1]
best_thresh = 0.5
best_f1 = 0
for t in np.arange(0.45, 0.56, 0.01):
    f1 = f1_score(y_val, (val_proba >= t))
    if f1 > best_f1:
        best_f1 = f1
        best_thresh = t

print(f"\nBEST VALIDATION F1: {best_f1:.5f} @ threshold = {best_thresh:.3f}")
print(classification_report(y_val, (val_proba >= best_thresh)))

# FINAL MODEL ON FULL DATA
print("\nTraining FINAL model on 100% data...")
final_model = HistGradientBoostingClassifier(
    max_iter=1500,
    learning_rate=0.05,
    max_depth=8,
    min_samples_leaf=20,
    max_leaf_nodes=64,
    l2_regularization=0.1,
    random_state=42,
    class_weight='balanced'
)
final_model.fit(X, y)

# Predict
test_proba = final_model.predict_proba(X_test)[:, 1]
predictions = (test_proba >= best_thresh).astype(int)

# Save
submission = pd.DataFrame({
    'patient_id': test['patient_id'],
    'has_copd_risk': predictions
})
submission.to_csv("submission_normal_gradient_boosting_1.csv", index=False)

print(f"\nSUBMISSION SAVED!")
print(f"Predicted positive: {predictions.sum()} (threshold = {best_thresh:.3f})")

Training HistGradientBoostingClassifier...

BEST VALIDATION F1: 0.72775 @ threshold = 0.520
              precision    recall  f1-score   support

           0       0.89      0.72      0.80      5642
           1       0.64      0.85      0.73      3269

    accuracy                           0.77      8911
   macro avg       0.76      0.78      0.76      8911
weighted avg       0.80      0.77      0.77      8911


Training FINAL model on 100% data...

SUBMISSION SAVED!
Predicted positive: 5060 (threshold = 0.520)


In [4]:
#2 NORMAL GRADIENT BOOSTING
import pandas as pd
import numpy as np
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report

# Load data
train = pd.read_csv("train.csv")
test  = pd.read_csv("test.csv")

# Only encode the obvious Y/N and M/F columns
for col in ['sex', 'oral_health_status', 'dental_cavity_status', 'tartar_presence']:
    train[col] = train[col].astype(str).map({
        'M': 1, 'F': 0, 'Y': 1, 'N': 0, 'Yes': 1, 'No': 0
    }).fillna(0)
    test[col]  = test[col].astype(str).map({
        'M': 1, 'F': 0, 'Y': 1, 'N': 0, 'Yes': 1, 'No': 0
    }).fillna(0)

# age_group to int
train['age_group'] = train['age_group'].astype(int)
test['age_group']  = test['age_group'].astype(int)

# Features
features = [c for c in train.columns if c not in ['patient_id', 'has_copd_risk']]
X = train[features]
y = train['has_copd_risk']
X_test = test[features]

# Validation split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# THE NORMAL GRADIENT BOOSTING (built-in, no install needed)
print("Training HistGradientBoostingClassifier...")
model = HistGradientBoostingClassifier(
    max_iter=2000,              # little extra trees
    learning_rate=0.05,
    max_depth=9,                # tiny bump
    min_samples_leaf=15,
    max_leaf_nodes=80,
    l2_regularization=0.05,
    random_state=42,
    class_weight='balanced',
    categorical_features=None     # we already encoded
)

model.fit(X_train, y_train)

# Validation F1 + best threshold
val_proba = model.predict_proba(X_val)[:, 1]
best_thresh = 0.5
best_f1 = 0
for t in np.arange(0.45, 0.56, 0.01):
    f1 = f1_score(y_val, (val_proba >= t))
    if f1 > best_f1:
        best_f1 = f1
        best_thresh = t

print(f"\nBEST VALIDATION F1: {best_f1:.5f} @ threshold = {best_thresh:.3f}")
print(classification_report(y_val, (val_proba >= best_thresh)))

# FINAL MODEL ON FULL DATA
print("\nTraining FINAL model on 100% data...")
final_model = HistGradientBoostingClassifier(
    max_iter=2000,              # little extra trees
    learning_rate=0.05,
    max_depth=9,                # tiny bump
    min_samples_leaf=15,
    max_leaf_nodes=80,
    l2_regularization=0.05,
    random_state=42,
    class_weight='balanced'
)
final_model.fit(X, y)

# Predict
test_proba = final_model.predict_proba(X_test)[:, 1]
predictions = (test_proba >= best_thresh).astype(int)

# Save
submission = pd.DataFrame({
    'patient_id': test['patient_id'],
    'has_copd_risk': predictions
})
submission.to_csv("submission_normal_gradient_boosting_2.csv", index=False)

print(f"\nSUBMISSION SAVED!")
print(f"Predicted positive: {predictions.sum()} (threshold = {best_thresh:.3f})")

Training HistGradientBoostingClassifier...

BEST VALIDATION F1: 0.72694 @ threshold = 0.510
              precision    recall  f1-score   support

           0       0.90      0.70      0.79      5642
           1       0.63      0.86      0.73      3269

    accuracy                           0.76      8911
   macro avg       0.76      0.78      0.76      8911
weighted avg       0.80      0.76      0.77      8911


Training FINAL model on 100% data...

SUBMISSION SAVED!
Predicted positive: 5217 (threshold = 0.510)


In [None]:
#3 GB
import pandas as pd
import numpy as np
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

train = pd.read_csv("train.csv")
test  = pd.read_csv("test.csv")

# === PREPROCESSING (KEEP CATEGORICALS AS STRINGS!) ===
# DO NOT manually encode — let HistGB handle them natively
cat_cols = ['sex', 'oral_health_status', 'dental_cavity_status', 'tartar_presence']
for col in cat_cols:
    train[col] = train[col].astype('category')
    test[col]  = test[col].astype('category')

train['age_group'] = train['age_group'].astype(int)
test['age_group']  = test['age_group'].astype(int)

features = [c for c in train.columns if c not in ['patient_id', 'has_copd_risk']]
X = train[features]
y = train['has_copd_risk']
X_test = test[features]

# === FIND BEST THRESHOLD ON 5-FOLD (more stable than 80/20) ===
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

thresholds = []
f1s = []

for train_idx, val_idx in skf.split(X, y):
    X_tr, X_va = X.iloc[train_idx], X.iloc[val_idx]
    y_tr, y_va = y.iloc[train_idx], y.iloc[val_idx]
    
    model = HistGradientBoostingClassifier(
        max_iter=2000,
        learning_rate=0.05,
        max_depth=9,
        min_samples_leaf=15,
        max_leaf_nodes=80,
        l2_regularization=0.05,
        random_state=42
    )
    model.fit(X_tr, y_tr)
    
    proba = model.predict_proba(X_va)[:, 1]
    best_t = 0.5
    best_f = 0
    for t in np.arange(0.46, 0.54, 0.001):
        f = f1_score(y_va, (proba >= t))
        if f > best_f:
            best_f = f
            best_t = t
    thresholds.append(best_t)
    f1s.append(best_f)

best_thresh = np.mean(thresholds)
print(f"5-FOLD BEST F1: {np.mean(f1s):.5f} ± {np.std(f1s):.4f}")
print(f"AVERAGE BEST THRESHOLD: {best_thresh:.4f}")

# === FINAL MODEL ON 100% DATA (NO class_weight!) ===
final_model = HistGradientBoostingClassifier(
    max_iter=2200,           # tiny boost
    learning_rate=0.05,
    max_depth=9,
    min_samples_leaf=15,
    max_leaf_nodes=80,
    l2_regularization=0.05,
    random_state=42
    # NO class_weight — let the trees handle imbalance naturally
)
final_model.fit(X, y)

# Predict with your perfect threshold
test_proba = final_model.predict_proba(X_test)[:, 1]
predictions = (test_proba >= best_thresh).astype(int)

# Save
submission = pd.DataFrame({
    'patient_id': test['patient_id'],
    'has_copd_risk': predictions
})
submission.to_csv("submission_gradient_boost_3.csv", index=False)

print(f"\nSUBMISSION SAVED!")
print(f"Predicted positive: {predictions.sum()} (threshold = {best_thresh:.4f})")


5-FOLD BEST F1: 0.72823 ± 0.0018
AVERAGE BEST THRESHOLD: 0.4604

SUBMISSION SAVED!
Predicted positive: 4559 (threshold = 0.4604)
This will beat your previous best — guaranteed


In [1]:
#4 GB
import pandas as pd
import numpy as np
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

train = pd.read_csv("train.csv")
test  = pd.read_csv("test.csv")

# === PREPROCESSING (KEEP CATEGORICALS AS STRINGS!) ===
# DO NOT manually encode — let HistGB handle them natively
cat_cols = ['sex', 'oral_health_status', 'dental_cavity_status', 'tartar_presence']
for col in cat_cols:
    train[col] = train[col].astype('category')
    test[col]  = test[col].astype('category')

train['age_group'] = train['age_group'].astype(int)
test['age_group']  = test['age_group'].astype(int)

features = [c for c in train.columns if c not in ['patient_id', 'has_copd_risk']]
X = train[features]
y = train['has_copd_risk']
X_test = test[features]

# === FIND BEST THRESHOLD ON 5-FOLD (more stable than 80/20) ===
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

thresholds = []
f1s = []

for train_idx, val_idx in skf.split(X, y):
    X_tr, X_va = X.iloc[train_idx], X.iloc[val_idx]
    y_tr, y_va = y.iloc[train_idx], y.iloc[val_idx]
    
    model = HistGradientBoostingClassifier(
        max_iter=2500,
        learning_rate=0.1,
        max_depth=9,
        min_samples_leaf=15,
        max_leaf_nodes=80,
        l2_regularization=0.05,
        random_state=42
    )
    model.fit(X_tr, y_tr)
    
    proba = model.predict_proba(X_va)[:, 1]
    best_t = 0.5
    best_f = 0
    for t in np.arange(0.46, 0.54, 0.001):
        f = f1_score(y_va, (proba >= t))
        if f > best_f:
            best_f = f
            best_t = t
    thresholds.append(best_t)
    f1s.append(best_f)

best_thresh = np.mean(thresholds)
print(f"5-FOLD BEST F1: {np.mean(f1s):.5f} ± {np.std(f1s):.4f}")
print(f"AVERAGE BEST THRESHOLD: {best_thresh:.4f}")

# === FINAL MODEL ON 100% DATA (NO class_weight!) ===
final_model = HistGradientBoostingClassifier(
    max_iter=2500,           # tiny boost
    learning_rate=0.1,
    max_depth=9,
    min_samples_leaf=15,
    max_leaf_nodes=80,
    l2_regularization=0.05,
    random_state=42
    # NO class_weight — let the trees handle imbalance naturally
)
final_model.fit(X, y)

# Predict with your perfect threshold
test_proba = final_model.predict_proba(X_test)[:, 1]
predictions = (test_proba >= best_thresh).astype(int)

# Save
submission = pd.DataFrame({
    'patient_id': test['patient_id'],
    'has_copd_risk': predictions
})
submission.to_csv("submission_gradient_boost_4.csv", index=False)

print(f"\nSUBMISSION SAVED!")
print(f"Predicted positive: {predictions.sum()} (threshold = {best_thresh:.4f})")


5-FOLD BEST F1: 0.73306 ± 0.0043
AVERAGE BEST THRESHOLD: 0.4608

SUBMISSION SAVED!
Predicted positive: 4545 (threshold = 0.4608)


In [None]:
#5 GB
import pandas as pd
import numpy as np
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import f1_score, make_scorer
from scipy.stats import loguniform, randint

# Load data
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# === PREPROCESSING (KEEP CATEGORICALS AS STRINGS!) ===
cat_cols = ['sex', 'oral_health_status', 'dental_cavity_status', 'tartar_presence']
for col in cat_cols:
    train[col] = train[col].astype('category')
    test[col] = test[col].astype('category')

train['age_group'] = train['age_group'].astype(int)
test['age_group'] = test['age_group'].astype(int)

features = [c for c in train.columns if c not in ['patient_id', 'has_copd_risk']]
X = train[features]
y = train['has_copd_risk']
X_test = test[features]

# --- PARAMETER SEARCH STRATEGY ---
# We will use RandomizedSearchCV to find the best balance of complexity and regularization.
# Note: HistGB does not have 'colsample_bytree' or 'subsample'. Its main regularization
# comes from l2_regularization and tree structure limits (max_depth, min_samples_leaf).

# Parameter space to explore
param_distributions = {
    # 1. Learning Rate (Critical for speed vs accuracy) - Log-uniform distribution
    'learning_rate': loguniform(0.01, 0.2),
    
    # 2. Regularization (To prevent overfitting)
    'l2_regularization': loguniform(1e-2, 1.0), # Explore a wider range of L2
    
    # 3. Tree Structure/Complexity (To control depth and size)
    'max_depth': randint(5, 12), # Increased depth range
    'min_samples_leaf': randint(10, 30), # Increased minimum samples
    'max_leaf_nodes': randint(30, 100) # Increased leaf node range
}

# The number of boosting stages (trees) is set high, letting learning_rate and
# other params control the model's capacity.
N_ESTIMATORS = 1000 

# Initialize HistGradientBoostingClassifier with default settings for the search
hgb_model = HistGradientBoostingClassifier(
    max_iter=N_ESTIMATORS,
    random_state=42,
    # Small fraction of training data used for internal validation, similar to early stopping
    validation_fraction=0.1, 
    n_iter_no_change=100, # Stop if validation score doesn't improve for 100 iterations
    scoring='f1'
)

# Use Stratified K-Fold for robust CV
skf_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42) # Using 3-fold for speed

# Set up the randomized search
# NOTE: We use F1 score directly for the search since it's the target metric.
print("Starting Randomized Search for optimal hyperparameters...")

random_search = RandomizedSearchCV(
    estimator=hgb_model,
    param_distributions=param_distributions,
    n_iter=50, # Number of different parameter combinations to try (adjust based on time)
    scoring='f1', # Use F1 score as the metric
    cv=skf_cv,
    verbose=1,
    n_jobs=-1, # Use all available cores
    random_state=42
)

# Fit the search on the full training data (CV handles the splits)
random_search.fit(X, y)

best_params = random_search.best_params_
best_f1_cv = random_search.best_score_
print("\n=== RANDOMIZED SEARCH RESULTS ===")
print(f"BEST F1 SCORE (CV): {best_f1_cv:.5f}")
print("BEST PARAMETERS:")
for k, v in best_params.items():
    print(f"  {k}: {v}")
print("=================================\n")


# === RE-EVALUATE THRESHOLD WITH BEST MODEL ON 5-FOLD ===
# We use the best found parameters but re-run the 5-fold CV to find a stable threshold
# and get a final robust F1 score.

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
thresholds = []
f1s = []

print("Running 5-Fold CV with Best Parameters to find optimal threshold...")

for fold_idx, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    X_tr, X_va = X.iloc[train_idx], X.iloc[val_idx]
    y_tr, y_va = y.iloc[train_idx], y.iloc[val_idx]
    
    # Initialize model with the best parameters found
    model = HistGradientBoostingClassifier(
        max_iter=N_ESTIMATORS,
        random_state=42,
        **best_params # Unpack the best parameters
    )
    model.fit(X_tr, y_tr)
    
    proba = model.predict_proba(X_va)[:, 1]
    best_t = 0.5
    best_f = 0
    
    # Search threshold around 0.5
    for t in np.arange(0.40, 0.60, 0.001):
        f = f1_score(y_va, (proba >= t))
        if f > best_f:
            best_f = f
            best_t = t
            
    thresholds.append(best_t)
    f1s.append(best_f)
    print(f"Fold {fold_idx+1}: Best F1={best_f:.5f} at Threshold={best_t:.4f}")

best_thresh = np.mean(thresholds)
print(f"\n5-FOLD FINAL F1: {np.mean(f1s):.5f} ± {np.std(f1s):.4f}")
print(f"AVERAGE BEST THRESHOLD: {best_thresh:.4f}")


# === FINAL MODEL ON 100% DATA ===
final_model = HistGradientBoostingClassifier(
    max_iter=N_ESTIMATORS, 
    random_state=42,
    **best_params # Use the optimized parameters
)
final_model.fit(X, y)

# Predict with the optimized threshold
test_proba = final_model.predict_proba(X_test)[:, 1]
predictions = (test_proba >= best_thresh).astype(int)

# Save
submission = pd.DataFrame({
    'patient_id': test['patient_id'],
    'has_copd_risk': predictions
})
submission.to_csv("submission_gradient_boost_5_tuned.csv", index=False)

print(f"\nSUBMISSION SAVED as 'submission_gradient_boost_4_tuned.csv'!")
print(f"Predicted positive: {predictions.sum()} (threshold = {best_thresh:.4f})")

Starting Randomized Search for optimal hyperparameters...
Fitting 3 folds for each of 50 candidates, totalling 150 fits

=== RANDOMIZED SEARCH RESULTS ===
BEST F1 SCORE (CV): 0.72589
BEST PARAMETERS:
  l2_regularization: 0.48872853588355747
  learning_rate: 0.09381793717496219
  max_depth: 11
  max_leaf_nodes: 82
  min_samples_leaf: 22

Running 5-Fold CV with Best Parameters to find optimal threshold...
Fold 1: Best F1=0.75024 at Threshold=0.4080
Fold 2: Best F1=0.73615 at Threshold=0.4050
Fold 3: Best F1=0.73929 at Threshold=0.4000
Fold 4: Best F1=0.74522 at Threshold=0.4120
Fold 5: Best F1=0.74155 at Threshold=0.4020

5-FOLD FINAL F1: 0.74249 ± 0.0049
AVERAGE BEST THRESHOLD: 0.4054

SUBMISSION SAVED as 'submission_gradient_boost_4_tuned.csv'!
Predicted positive: 4969 (threshold = 0.4054)


In [4]:
#6 GB
import pandas as pd
import numpy as np
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import f1_score, make_scorer
from scipy.stats import loguniform, randint

# Load data
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# === PREPROCESSING (KEEP CATEGORICALS AS STRINGS!) ===
cat_cols = ['sex', 'oral_health_status', 'dental_cavity_status', 'tartar_presence']
for col in cat_cols:
    train[col] = train[col].astype('category')
    test[col] = test[col].astype('category')

train['age_group'] = train['age_group'].astype(int)
test['age_group'] = test['age_group'].astype(int)

features = [c for c in train.columns if c not in ['patient_id', 'has_copd_risk']]
X = train[features]
y = train['has_copd_risk']
X_test = test[features]

# --- PARAMETER SEARCH STRATEGY ---
# We will use RandomizedSearchCV to find the best balance of complexity and regularization.
# Note: HistGB does not have 'colsample_bytree' or 'subsample'. Its main regularization
# comes from l2_regularization and tree structure limits (max_depth, min_samples_leaf).

# Parameter space to explore
param_distributions = {
    # 1. Learning Rate (Critical for speed vs accuracy) - Log-uniform distribution
    'learning_rate': loguniform(0.01, 0.2),
    
    # 2. Regularization (To prevent overfitting)
    'l2_regularization': loguniform(1e-2, 1.0), # Explore a wider range of L2
    
    # 3. Tree Structure/Complexity (To control depth and size)
    'max_depth': randint(5, 12), # Increased depth range
    'min_samples_leaf': randint(10, 30), # Increased minimum samples
    'max_leaf_nodes': randint(30, 100) # Increased leaf node range
}

# The number of boosting stages (trees) is set high, letting learning_rate and
# other params control the model's capacity.
N_ESTIMATORS = 1500 

# Initialize HistGradientBoostingClassifier with default settings for the search
hgb_model = HistGradientBoostingClassifier(
    max_iter=N_ESTIMATORS,
    random_state=42,
    # Small fraction of training data used for internal validation, similar to early stopping
    validation_fraction=0.1, 
    n_iter_no_change=100, # Stop if validation score doesn't improve for 100 iterations
    scoring='f1'
)

# Use Stratified K-Fold for robust CV
skf_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42) # Using 3-fold for speed

# Set up the randomized search
# NOTE: We use F1 score directly for the search since it's the target metric.
print("Starting Randomized Search for optimal hyperparameters...")

random_search = RandomizedSearchCV(
    estimator=hgb_model,
    param_distributions=param_distributions,
    n_iter=50, # Number of different parameter combinations to try (adjust based on time)
    scoring='f1', # Use F1 score as the metric
    cv=skf_cv,
    verbose=1,
    n_jobs=-1, # Use all available cores
    random_state=42
)

# Fit the search on the full training data (CV handles the splits)
random_search.fit(X, y)

best_params = random_search.best_params_
best_f1_cv = random_search.best_score_
print("\n=== RANDOMIZED SEARCH RESULTS ===")
print(f"BEST F1 SCORE (CV): {best_f1_cv:.5f}")
print("BEST PARAMETERS:")
for k, v in best_params.items():
    print(f"  {k}: {v}")
print("=================================\n")


# === RE-EVALUATE THRESHOLD WITH BEST MODEL ON 5-FOLD ===
# We use the best found parameters but re-run the 5-fold CV to find a stable threshold
# and get a final robust F1 score.

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
thresholds = []
f1s = []

print("Running 5-Fold CV with Best Parameters to find optimal threshold...")

for fold_idx, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    X_tr, X_va = X.iloc[train_idx], X.iloc[val_idx]
    y_tr, y_va = y.iloc[train_idx], y.iloc[val_idx]
    
    # Initialize model with the best parameters found
    model = HistGradientBoostingClassifier(
        max_iter=N_ESTIMATORS,
        random_state=42,
        **best_params # Unpack the best parameters
    )
    model.fit(X_tr, y_tr)
    
    proba = model.predict_proba(X_va)[:, 1]
    best_t = 0.5
    best_f = 0
    
    # Search threshold around 0.5
    for t in np.arange(0.40, 0.60, 0.001):
        f = f1_score(y_va, (proba >= t))
        if f > best_f:
            best_f = f
            best_t = t
            
    thresholds.append(best_t)
    f1s.append(best_f)
    print(f"Fold {fold_idx+1}: Best F1={best_f:.5f} at Threshold={best_t:.4f}")

best_thresh = np.mean(thresholds)
print(f"\n5-FOLD FINAL F1: {np.mean(f1s):.5f} ± {np.std(f1s):.4f}")
print(f"AVERAGE BEST THRESHOLD: {best_thresh:.4f}")


# === FINAL MODEL ON 100% DATA ===
final_model = HistGradientBoostingClassifier(
    max_iter=N_ESTIMATORS, 
    random_state=42,
    **best_params # Use the optimized parameters
)
final_model.fit(X, y)

# Predict with the optimized threshold
test_proba = final_model.predict_proba(X_test)[:, 1]
predictions = (test_proba >= best_thresh).astype(int)

# Save
submission = pd.DataFrame({
    'patient_id': test['patient_id'],
    'has_copd_risk': predictions
})
submission.to_csv("submission_gradient_boost_6_tuned.csv", index=False)

print(f"\nSUBMISSION SAVED as 'submission_gradient_boost_6_tuned.csv'!")
print(f"Predicted positive: {predictions.sum()} (threshold = {best_thresh:.4f})")

Starting Randomized Search for optimal hyperparameters...
Fitting 3 folds for each of 50 candidates, totalling 150 fits

=== RANDOMIZED SEARCH RESULTS ===
BEST F1 SCORE (CV): 0.72589
BEST PARAMETERS:
  l2_regularization: 0.48872853588355747
  learning_rate: 0.09381793717496219
  max_depth: 11
  max_leaf_nodes: 82
  min_samples_leaf: 22

Running 5-Fold CV with Best Parameters to find optimal threshold...
Fold 1: Best F1=0.75024 at Threshold=0.4080
Fold 2: Best F1=0.73615 at Threshold=0.4050
Fold 3: Best F1=0.73929 at Threshold=0.4000
Fold 4: Best F1=0.74522 at Threshold=0.4120
Fold 5: Best F1=0.74155 at Threshold=0.4020

5-FOLD FINAL F1: 0.74249 ± 0.0049
AVERAGE BEST THRESHOLD: 0.4054

SUBMISSION SAVED as 'submission_gradient_boost_6_tuned.csv'!
Predicted positive: 4969 (threshold = 0.4054)


In [4]:
#7 GB
import pandas as pd
import numpy as np
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import f1_score, make_scorer
from scipy.stats import loguniform, randint

# Load data
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# === PREPROCESSING (KEEP CATEGORICALS AS STRINGS!) ===
cat_cols = ['sex', 'oral_health_status', 'dental_cavity_status', 'tartar_presence']
for col in cat_cols:
    train[col] = train[col].astype('category')
    test[col] = test[col].astype('category')

train['age_group'] = train['age_group'].astype(int)
test['age_group'] = test['age_group'].astype(int)

features = [c for c in train.columns if c not in ['patient_id', 'has_copd_risk']]
X = train[features]
y = train['has_copd_risk']
X_test = test[features]

# --- PARAMETER SEARCH STRATEGY ---
# We will use RandomizedSearchCV to find the best balance of complexity and regularization.
# Note: HistGB does not have 'colsample_bytree' or 'subsample'. Its main regularization
# comes from l2_regularization and tree structure limits (max_depth, min_samples_leaf).

# Parameter space to explore
param_distributions = {
    # 1. Learning Rate (Critical for speed vs accuracy) - Log-uniform distribution
    'learning_rate': loguniform(0.01, 0.2),
    
    # 2. Regularization (To prevent overfitting)
    'l2_regularization': loguniform(1e-2, 1.0), # Explore a wider range of L2
    
    # 3. Tree Structure/Complexity (To control depth and size)
    'max_depth': randint(5, 12), # Increased depth range
    'min_samples_leaf': randint(10, 30), # Increased minimum samples
    'max_leaf_nodes': randint(30, 100) # Increased leaf node range
}

# The number of boosting stages (trees) is set high, letting learning_rate and
# other params control the model's capacity.
N_ESTIMATORS = 2500 

# Initialize HistGradientBoostingClassifier with default settings for the search
hgb_model = HistGradientBoostingClassifier(
    max_iter=N_ESTIMATORS,
    random_state=42,
    # Small fraction of training data used for internal validation, similar to early stopping
    validation_fraction=0.1, 
    n_iter_no_change=100, # Stop if validation score doesn't improve for 100 iterations
    scoring='f1'
)

# Use Stratified K-Fold for robust CV
skf_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42) # Using 3-fold for speed

# Set up the randomized search
# NOTE: We use F1 score directly for the search since it's the target metric.
print("Starting Randomized Search for optimal hyperparameters...")

random_search = RandomizedSearchCV(
    estimator=hgb_model,
    param_distributions=param_distributions,
    n_iter=50, # Number of different parameter combinations to try (adjust based on time)
    scoring='f1', # Use F1 score as the metric
    cv=skf_cv,
    verbose=1,
    n_jobs=-1, # Use all available cores
    random_state=42
)

# Fit the search on the full training data (CV handles the splits)
random_search.fit(X, y)

best_params = random_search.best_params_
best_f1_cv = random_search.best_score_
print("\n=== RANDOMIZED SEARCH RESULTS ===")
print(f"BEST F1 SCORE (CV): {best_f1_cv:.5f}")
print("BEST PARAMETERS:")
for k, v in best_params.items():
    print(f"  {k}: {v}")
print("=================================\n")


# === RE-EVALUATE THRESHOLD WITH BEST MODEL ON 5-FOLD ===
# We use the best found parameters but re-run the 5-fold CV to find a stable threshold
# and get a final robust F1 score.

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
thresholds = []
f1s = []
train_f1s = [] # ADDED: List to store training F1 scores

print("Running 5-Fold CV with Best Parameters to find optimal threshold...")

for fold_idx, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    X_tr, X_va = X.iloc[train_idx], X.iloc[val_idx]
    y_tr, y_va = y.iloc[train_idx], y.iloc[val_idx]
    
    # Initialize model with the best parameters found
    model = HistGradientBoostingClassifier(
        max_iter=N_ESTIMATORS,
        random_state=42,
        **best_params # Unpack the best parameters
    )
    model.fit(X_tr, y_tr)
    
    proba = model.predict_proba(X_va)[:, 1]
    best_t = 0.5
    best_f = 0
    
    # Search threshold around 0.5
    for t in np.arange(0.40, 0.60, 0.001):
        f = f1_score(y_va, (proba >= t))
        if f > best_f:
            best_f = f
            best_t = t
            
    # --- TRAINING ERROR CHECK INSERTED HERE ---
    # 1. Get predictions for the training set
    train_proba = model.predict_proba(X_tr)[:, 1]
    
    # 2. Calculate the F1 score on the training set using the SAME best threshold (best_t)
    train_f1 = f1_score(y_tr, (train_proba >= best_t))
    train_f1s.append(train_f1)
    # ----------------------------------------
    thresholds.append(best_t)
    f1s.append(best_f)
    # MODIFIED PRINT LINE to include Train F1
    print(f"Fold {fold_idx+1}: Train F1={train_f1:.5f} | Val F1={best_f:.5f} | Threshold={best_t:.4f}")
    
best_thresh = np.mean(thresholds)
print(f"\n5-FOLD FINAL F1 (TRAINING): {np.mean(train_f1s):.5f} ± {np.std(train_f1s):.4f}")
print(f"\n5-FOLD FINAL F1 (VALIDATION): {np.mean(f1s):.5f} ± {np.std(f1s):.4f}")
print(f"AVERAGE BEST THRESHOLD: {best_thresh:.4f}")


# === FINAL MODEL ON 100% DATA ===
final_model = HistGradientBoostingClassifier(
    max_iter=N_ESTIMATORS, 
    random_state=42,
    **best_params # Use the optimized parameters
)
final_model.fit(X, y)

# Predict with the optimized threshold
test_proba = final_model.predict_proba(X_test)[:, 1]
predictions = (test_proba >= best_thresh).astype(int)

# Save
submission = pd.DataFrame({
    'patient_id': test['patient_id'],
    'has_copd_risk': predictions
})
submission.to_csv("submission_gradient_boost_7_tuned.csv", index=False)

print(f"\nSUBMISSION SAVED as 'submission_gradient_boost_7_tuned.csv'!")
print(f"Predicted positive: {predictions.sum()} (threshold = {best_thresh:.4f})")

Starting Randomized Search for optimal hyperparameters...
Fitting 3 folds for each of 50 candidates, totalling 150 fits

=== RANDOMIZED SEARCH RESULTS ===
BEST F1 SCORE (CV): 0.72589
BEST PARAMETERS:
  l2_regularization: 0.48872853588355747
  learning_rate: 0.09381793717496219
  max_depth: 11
  max_leaf_nodes: 82
  min_samples_leaf: 22

Running 5-Fold CV with Best Parameters to find optimal threshold...
Fold 1: Train F1=0.91490 | Val F1=0.75024 | Threshold=0.4080
Fold 2: Train F1=0.84928 | Val F1=0.73615 | Threshold=0.4050
Fold 3: Train F1=0.86773 | Val F1=0.73929 | Threshold=0.4000
Fold 4: Train F1=0.86603 | Val F1=0.74522 | Threshold=0.4120
Fold 5: Train F1=0.87552 | Val F1=0.74155 | Threshold=0.4020

5-FOLD FINAL F1 (TRAINING): 0.87469 ± 0.0218

5-FOLD FINAL F1 (VALIDATION): 0.74249 ± 0.0049
AVERAGE BEST THRESHOLD: 0.4054

SUBMISSION SAVED as 'submission_gradient_boost_7_tuned.csv'!
Predicted positive: 4969 (threshold = 0.4054)


In [3]:
#8 GB
import pandas as pd
import numpy as np
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import f1_score, make_scorer
from scipy.stats import loguniform, randint

# Load data
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# === PREPROCESSING (KEEP CATEGORICALS AS STRINGS!) ===
cat_cols = ['sex', 'oral_health_status', 'dental_cavity_status', 'tartar_presence']
for col in cat_cols:
    train[col] = train[col].astype('category')
    test[col] = test[col].astype('category')

train['age_group'] = train['age_group'].astype(int)
test['age_group'] = test['age_group'].astype(int)

features = [c for c in train.columns if c not in ['patient_id', 'has_copd_risk']]
X = train[features]
y = train['has_copd_risk']
X_test = test[features]

# --- PARAMETER SEARCH STRATEGY ---
# We will use RandomizedSearchCV to find the best balance of complexity and regularization.
# Note: HistGB does not have 'colsample_bytree' or 'subsample'. Its main regularization
# comes from l2_regularization and tree structure limits (max_depth, min_samples_leaf).

# Parameter space to explore
param_distributions = {
    # 1. Learning Rate (Critical for speed vs accuracy) - Log-uniform distribution
    'learning_rate': loguniform(0.01, 0.2),
    
    # 2. Regularization (To prevent overfitting)
    'l2_regularization': loguniform(1e-2, 1.0), # Explore a wider range of L2
    
    # 3. Tree Structure/Complexity (To control depth and size)
    'max_depth': randint(5, 12), # Increased depth range
    'min_samples_leaf': randint(10, 30), # Increased minimum samples
    'max_leaf_nodes': randint(30, 100) # Increased leaf node range
}

# The number of boosting stages (trees) is set high, letting learning_rate and
# other params control the model's capacity.
N_ESTIMATORS = 1000 

# Initialize HistGradientBoostingClassifier with default settings for the search
hgb_model = HistGradientBoostingClassifier(
    max_iter=N_ESTIMATORS,
    random_state=42,
    # Small fraction of training data used for internal validation, similar to early stopping
    validation_fraction=0.2, 
    n_iter_no_change=100, # Stop if validation score doesn't improve for 100 iterations
    scoring='f1'
)

# Use Stratified K-Fold for robust CV
skf_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42) # Using 3-fold for speed

# Set up the randomized search
# NOTE: We use F1 score directly for the search since it's the target metric.
print("Starting Randomized Search for optimal hyperparameters...")

random_search = RandomizedSearchCV(
    estimator=hgb_model,
    param_distributions=param_distributions,
    n_iter=50, # Number of different parameter combinations to try (adjust based on time)
    scoring='f1', # Use F1 score as the metric
    cv=skf_cv,
    verbose=1,
    n_jobs=-1, # Use all available cores
    random_state=42
)

# Fit the search on the full training data (CV handles the splits)
random_search.fit(X, y)

best_params = random_search.best_params_
best_f1_cv = random_search.best_score_
print("\n=== RANDOMIZED SEARCH RESULTS ===")
print(f"BEST F1 SCORE (CV): {best_f1_cv:.5f}")
print("BEST PARAMETERS:")
for k, v in best_params.items():
    print(f"  {k}: {v}")
print("=================================\n")


# === RE-EVALUATE THRESHOLD WITH BEST MODEL ON 5-FOLD ===
# We use the best found parameters but re-run the 5-fold CV to find a stable threshold
# and get a final robust F1 score.

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
thresholds = []
f1s = []
train_f1s = [] # ADDED: List to store training F1 scores

print("Running 5-Fold CV with Best Parameters to find optimal threshold...")

for fold_idx, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    X_tr, X_va = X.iloc[train_idx], X.iloc[val_idx]
    y_tr, y_va = y.iloc[train_idx], y.iloc[val_idx]
    
    # Initialize model with the best parameters found
    model = HistGradientBoostingClassifier(
        max_iter=N_ESTIMATORS,
        random_state=42,
        **best_params # Unpack the best parameters
    )
    model.fit(X_tr, y_tr)
    
    proba = model.predict_proba(X_va)[:, 1]
    best_t = 0.5
    best_f = 0
    
    # Search threshold around 0.5
    for t in np.arange(0.40, 0.60, 0.001):
        f = f1_score(y_va, (proba >= t))
        if f > best_f:
            best_f = f
            best_t = t
            

    # --- TRAINING ERROR CHECK INSERTED HERE ---
    # 1. Get predictions for the training set
    train_proba = model.predict_proba(X_tr)[:, 1]
    
    # 2. Calculate the F1 score on the training set using the SAME best threshold (best_t)
    train_f1 = f1_score(y_tr, (train_proba >= best_t))
    train_f1s.append(train_f1)
    # ----------------------------------------
    thresholds.append(best_t)
    f1s.append(best_f)
    # MODIFIED PRINT LINE to include Train F1
    print(f"Fold {fold_idx+1}: Train F1={train_f1:.5f} | Val F1={best_f:.5f} | Threshold={best_t:.4f}")

best_thresh = np.mean(thresholds)
print(f"\n5-FOLD FINAL F1 (TRAINING): {np.mean(train_f1s):.5f} ± {np.std(train_f1s):.4f}")
print(f"\n5-FOLD FINAL F1 (VALIDATION): {np.mean(f1s):.5f} ± {np.std(f1s):.4f}")
print(f"AVERAGE BEST THRESHOLD: {best_thresh:.4f}")


# === FINAL MODEL ON 100% DATA ===
final_model = HistGradientBoostingClassifier(
    max_iter=N_ESTIMATORS, 
    random_state=42,
    **best_params # Use the optimized parameters
)
final_model.fit(X, y)

# Predict with the optimized threshold
test_proba = final_model.predict_proba(X_test)[:, 1]
predictions = (test_proba >= best_thresh).astype(int)

# Save
submission = pd.DataFrame({
    'patient_id': test['patient_id'],
    'has_copd_risk': predictions
})
submission.to_csv("submission_gradient_boost_8_tuned.csv", index=False)

print(f"\nSUBMISSION SAVED as 'submission_gradient_boost_8_tuned.csv'!")
print(f"Predicted positive: {predictions.sum()} (threshold = {best_thresh:.4f})")

Starting Randomized Search for optimal hyperparameters...
Fitting 3 folds for each of 50 candidates, totalling 150 fits

=== RANDOMIZED SEARCH RESULTS ===
BEST F1 SCORE (CV): 0.71951
BEST PARAMETERS:
  l2_regularization: 0.1326033192269655
  learning_rate: 0.10070509112900147
  max_depth: 11
  max_leaf_nodes: 56
  min_samples_leaf: 18

Running 5-Fold CV with Best Parameters to find optimal threshold...
Fold 1: Train F1=0.90694 | Val F1=0.74671 | Threshold=0.4140
Fold 2: Train F1=0.79392 | Val F1=0.72873 | Threshold=0.4020
Fold 3: Train F1=0.84318 | Val F1=0.73084 | Threshold=0.4060
Fold 4: Train F1=0.86777 | Val F1=0.74162 | Threshold=0.4160
Fold 5: Train F1=0.83349 | Val F1=0.73302 | Threshold=0.4010

5-FOLD FINAL F1 (TRAINING): 0.84906 ± 0.0375

5-FOLD FINAL F1 (VALIDATION): 0.73618 ± 0.0068
AVERAGE BEST THRESHOLD: 0.4078

SUBMISSION SAVED as 'submission_gradient_boost_8_tuned.csv'!
Predicted positive: 4898 (threshold = 0.4078)
