<a href="https://colab.research.google.com/github/vivekvj18/ML_PROJECT/blob/main/Shared_ML_Project_91_597.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install optuna -q

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/400.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━[0m [32m276.5/400.9 kB[0m [31m8.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.9/400.9 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# XGBoost

# 1. Max Data (Train + Original)
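# 2. Exact duplicate rows are DROPPED after combining (drop_duplicates is called; "keep dupes" survives only in the study name and result banner)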
# 3. Holistic Preprocessing (All-in-one Scaler/Encoder)
# 4. NO Advanced Feature Engineering
# 5. Robust Optuna Tuning (5-Fold CV + Sample Weights)
# 6. Robust Prediction (5-Fold Averaging)

import pandas as pd
import numpy as np
import os
import gc
import warnings
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.utils.class_weight import compute_class_weight
import optuna
from optuna.samplers import TPESampler

warnings.filterwarnings('ignore')
RND = 42

# Setup Kaggle
print("Setting up Kaggle...")
from google.colab import files
try:
    # To upload kaggle.json only if not present
    if not os.path.exists('/root/.kaggle/kaggle.json'):
        files.upload()
        !mkdir -p ~/.kaggle
        !mv kaggle.json ~/.kaggle/
        !chmod 600 ~/.kaggle/kaggle.json
        !pip install kaggle -q
    else:
        print("Kaggle.json already exists.")

    # Download data if not present
    if not os.path.exists('data/train.csv'):
        !kaggle competitions download -c ait-511-course-project-1-obesity-risk
        !unzip -q ait-511-course-project-1-obesity-risk.zip -d data
    else:
        print("Competition data already exists.")

    if not os.path.exists('data/ObesityDataSet.csv'):
        !kaggle datasets download -d aravindpcoder/obesity-or-cvd-risk-classifyregressorcluster
        !unzip -q obesity-or-cvd-risk-classifyregressorcluster.zip -d data
    else:
        print("Original dataset already exists.")

    print("Kaggle setup and data download complete.")
except Exception as e:
    print(f"Kaggle setup failed (might be running locally or credentials error): {e}")


# Load and Combine Data
print("Loading and combining data...")
try:
    df_train = pd.read_csv('data/train.csv')
    df_test = pd.read_csv('data/test.csv')
    # The original dataset labels its target 'NObeyesdad'; rename it so it
    # lines up with the 'WeightCategory' target column used further down.
    df_train_orig = pd.read_csv("data/ObesityDataSet.csv").rename(
        columns={'NObeyesdad': 'WeightCategory'}
    )
except FileNotFoundError:
    print("Error: CSV files not found. Make sure data downloaded correctly.")
    raise

# Stack the competition training rows and the original dataset rows
df_train_combined = pd.concat([df_train, df_train_orig], ignore_index=True)
print(f"Shape before dropping duplicates: {df_train_combined.shape}")

# Remove fully identical rows (index labels of the surviving rows are kept)
df_train_combined = df_train_combined.drop_duplicates()
print(f"Shape after dropping duplicates: {df_train_combined.shape}")

# 'typ' tags where each row came from (0 = training, 1 = test) so the frames
# can be separated again after being preprocessed together
df_train_combined = df_train_combined.assign(typ=0)
df_test_copy = df_test.assign(typ=1)

# One frame holding every row for preprocessing
df_all = pd.concat([df_train_combined, df_test_copy], ignore_index=True)
print(f"Total shape for preprocessing: {df_all.shape}")


# STEP 3: Holistic Preprocessing
# Training and test rows are encoded and scaled together in one pass over df_all.

print("Starting holistic preprocessing...")

# Column bookkeeping: the label plus the two non-feature helper columns
target = 'WeightCategory'
exclude_cols = ['id', 'typ', target]

# Object/category columns are one-hot encoded; the label must not be, so it is filtered out
object_like_cols = df_all.select_dtypes(include=['object', 'category']).columns
cat_feats = [name for name in object_like_cols if name != target]

# Numeric columns are scaled, except the id / typ helpers
numeric_cols = df_all.select_dtypes(include=np.number).columns
num_feats = [name for name in numeric_cols if name not in exclude_cols]

print(f"Found {len(num_feats)} numeric features and {len(cat_feats)} categorical features to encode.")

# One-hot encode, dropping the first level of every categorical column
df_all = pd.get_dummies(df_all, columns=cat_feats, drop_first=True)

# Standardise the numeric columns; the scaler is fitted on all rows of df_all
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df_all[num_feats])
df_all[num_feats] = scaled_values

print("Preprocessing complete.")


# STEP 4: Split Data for Training

print("Splitting data back into train and test...")

# Recover the two row groups through the 'typ' marker (0 = train, 1 = test)
df_train_final = df_all.loc[df_all['typ'].eq(0)].reset_index(drop=True)
df_test_final = df_all.loc[df_all['typ'].eq(1)].reset_index(drop=True)

# Model inputs: every remaining column except id / typ / target
feat_cols = [name for name in df_train_final.columns if name not in exclude_cols]

X = df_train_final[feat_cols]
y_raw = df_train_final[target]

# Select the test features, then re-select by X's columns so both frames
# are guaranteed to share the same column order
X_test = df_test_final[feat_cols]
X_test = X_test.loc[:, X.columns]

# Encode the string labels as integers; `le` is kept to decode predictions later
le = LabelEncoder()
y = le.fit_transform(y_raw)

# Drop references to the intermediate frames and ask the collector to reclaim them
del (
    df_train,
    df_test,
    df_train_orig,
    df_train_combined,
    df_all,
    df_train_final,
)
gc.collect()

print(f"Final training features: {X.shape[1]}")

# Optuna Objective

def objective(trial):
    """Score one Optuna trial by the mean 5-fold CV accuracy of an XGBoost model.

    Reads the module-level ``X`` (feature frame), ``y`` (encoded labels),
    ``le`` (fitted label encoder) and ``RND`` (seed).

    Args:
        trial: Optuna trial used to sample the hyperparameters listed under
            "Tuned Ranges".

    Returns:
        The mean validation accuracy across the stratified folds.
    """

    # Using the same proven ranges from the 91.377 script
    params = {
        'objective': 'multi:softprob',
        'num_class': len(le.classes_),
        'eval_metric': 'mlogloss',
        'seed': RND,
        'use_label_encoder': False,
        'verbosity': 0,

        # Tuned Ranges
        'n_estimators': trial.suggest_int('n_estimators', 500, 2000, step=100),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.05, log=True),
        'max_depth': trial.suggest_int('max_depth', 5, 12),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 7),
        'subsample': trial.suggest_float('subsample', 0.5, 0.9, step=0.05),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 0.9, step=0.05),
        'gamma': trial.suggest_float('gamma', 0.0, 3.0, step=0.1),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.01, 5.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.01, 5.0, log=True),
    }

    # 5-fold CV for robust scores; the fixed seed gives every trial the same splits
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RND)
    scores = []

    # Use .iloc for safe indexing
    for train_idx, val_idx in cv.split(X, y):
        X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
        y_train_fold, y_val_fold = y[train_idx], y[val_idx]

        # Robust Imbalance Handling: 'balanced' class weights computed on the
        # training fold only.
        fold_classes = np.unique(y_train_fold)
        class_weights = compute_class_weight('balanced',
                                             classes=fold_classes,
                                             y=y_train_fold)
        # class_weights[i] belongs to fold_classes[i], so translate each label
        # to its position in fold_classes before the lookup. Indexing by the
        # raw label value would only be right when the labels present happen
        # to be exactly 0..k-1. This is vectorised and no longer reuses the
        # name `y`.
        sample_weights = class_weights[np.searchsorted(fold_classes, y_train_fold)]

        model = XGBClassifier(**params)
        model.fit(X_train_fold, y_train_fold, sample_weight=sample_weights, verbose=False)

        preds = model.predict(X_val_fold)
        scores.append(accuracy_score(y_val_fold, preds))

    return np.mean(scores)


# Optuna Optimization

banner_rule = "=" * 60

print("\n" + banner_rule)
print("PHASE 2: Starting Optuna optimization ...")
print("Trials: 300 configurations (using all cores)")
print("Timeout: 5 hours")
print("CV: 5-fold (for robustness)")
print(banner_rule + "\n")

# Maximise the value returned by `objective` with a seeded TPE sampler
study = optuna.create_study(
    direction='maximize',
    sampler=TPESampler(seed=RND),
    study_name='xgboost_keep_dupes_optuna',
)

# The search ends at whichever limit is reached first.
# NOTE(review): with n_jobs=-1 trials run concurrently, so the seeded sampler
# presumably no longer yields a repeatable trial sequence - confirm against
# the Optuna documentation if reproducibility matters.
study.optimize(
    objective,
    n_trials=300,            # at most 300 trials
    timeout=18000,           # 18000 s = 5 hours
    show_progress_bar=True,
    n_jobs=-1,
)

print("\n" + banner_rule)
print("Optimization Complete!")
print(banner_rule)
print(f"Best CV Score: {study.best_value:.6f}")
print("Best Parameters:")
for param_name, param_value in study.best_params.items():
    print(f"  {param_name:20s}: {param_value}")
print(banner_rule + "\n")

# Final Training
# Refit one model per fold with the best Optuna parameters, record each fold's
# validation accuracy, and average the per-fold test probabilities.

print("Training final model with best parameters (5-fold CV)...\n")

# Fixed settings plus the tuned values from the study
best_params = {
    'objective': 'multi:softprob',
    'num_class': len(le.classes_),
    'eval_metric': 'mlogloss',
    'seed': RND,
    'use_label_encoder': False,
    **study.best_params
}

K_FOLDS = 5
skf = StratifiedKFold(n_splits=K_FOLDS, shuffle=True, random_state=RND)

# Running sum of class probabilities for the test rows (one column per class)
test_preds_proba = np.zeros((X_test.shape[0], len(le.classes_)))
fold_scores = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y), 1):
    print(f"  --- Fold {fold}/{K_FOLDS} ---")

    X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
    y_train_fold, y_val_fold = y[train_idx], y[val_idx]

    # Sample weights: 'balanced' class weights computed on the training fold.
    fold_classes = np.unique(y_train_fold)
    class_weights = compute_class_weight('balanced',
                                         classes=fold_classes,
                                         y=y_train_fold)
    # class_weights[i] belongs to fold_classes[i]; map each label to its
    # position in fold_classes instead of indexing by the raw label value.
    # This is vectorised and avoids reusing the name `y` as a loop variable.
    sample_weights = class_weights[np.searchsorted(fold_classes, y_train_fold)]

    #  USE 'best_params'
    model = XGBClassifier(**best_params)
    model.fit(X_train_fold, y_train_fold, sample_weight=sample_weights, verbose=False)

    val_acc = accuracy_score(y_val_fold, model.predict(X_val_fold))
    fold_scores.append(val_acc)
    print(f"    Fold {fold} Accuracy: {val_acc:.6f}")

    # Predict on the test data (accumulated here, averaged after the loop)
    test_preds_proba += model.predict_proba(X_test)

# Average predictions, then pick the most probable class and decode it back
# to the original label strings
test_preds_proba /= K_FOLDS
test_pred_encoded = np.argmax(test_preds_proba, axis=1)
test_pred_labels = le.inverse_transform(test_pred_encoded)

final_cv = np.mean(fold_scores)
final_std = np.std(fold_scores)

# Summary of the per-fold validation accuracies
summary_rule = '=' * 60
formatted_scores = [f'{s:.6f}' for s in fold_scores]

print(f"\n{summary_rule}")
print("FINAL RESULTS (Keep Duplicates + Optuna)")
print(summary_rule)
print(f"▶ Mean CV Accuracy: {final_cv:.6f} ± {final_std:.6f}")
print(f"▶ Fold Scores: {formatted_scores}")
print(f"{summary_rule}\n")

# Pick one verdict line from the mean CV accuracy.
# NOTE(review): the success cut-off is 0.9185 while the message says 0.919 -
# presumably "0.919 after rounding to three decimals"; confirm the intent.
if final_cv >= 0.9185:
    verdict = f"🎉🎉🎉 SUCCESS: CV >= 0.919! (Score: {final_cv:.6f})"
elif final_cv >= 0.917:
    verdict = f" Excellent Score: CV = {final_cv:.6f} (target: 0.919)"
else:
    verdict = f" Final CV: {final_cv:.6f} (target: 0.919)"
print(verdict)

# Save Submission
# Two CSVs are written from the same predictions; they differ only in the
# name of the label column.

submission_file = 'kaggle_submission_optuna.csv'

# Pair each test id with its predicted label
submission_df = pd.DataFrame(
    {
        'id': df_test_copy['id'],
        'WeightCategory': test_pred_labels,
    }
)
submission_df.to_csv(submission_file, index=False)

# Second copy with the label column renamed to 'NObeyesdad'
submission_df = submission_df.rename(columns={'WeightCategory': 'NObeyesdad'})
submission_df.to_csv("submission.csv", index=False)

print(f"\n Submission file created: {submission_file}")
print(" Also saved as 'submission.csv' for Kaggle")
print(f" Optimized with {len(study.trials)} Optuna trials.")
print(f" Best trial score: {study.best_value:.6f}\n")

print(submission_df.head())

Setting up Kaggle...


Saving kaggle.json to kaggle.json
Downloading ait-511-course-project-1-obesity-risk.zip to /content
  0% 0.00/570k [00:00<?, ?B/s]
100% 570k/570k [00:00<00:00, 968MB/s]
Dataset URL: https://www.kaggle.com/datasets/aravindpcoder/obesity-or-cvd-risk-classifyregressorcluster
License(s): CC-BY-SA-4.0
Downloading obesity-or-cvd-risk-classifyregressorcluster.zip to /content
  0% 0.00/57.5k [00:00<?, ?B/s]
100% 57.5k/57.5k [00:00<00:00, 188MB/s]
Kaggle setup and data download complete.
Loading and combining data...
Shape before dropping duplicates: (17644, 18)
Shape after dropping duplicates: (17620, 18)
Total shape for preprocessing: (22845, 19)
NO Advanced features being created.
Starting holistic preprocessing...
Found 8 numeric features and 8 categorical features to encode.
Preprocessing complete.
Splitting data back into train and test...


[I 2025-10-25 05:26:00,852] A new study created in memory with name: xgboost_keep_dupes_optuna


Final training features: 23

PHASE 2: Starting Optuna optimization (KEEPING Duplicates)...
Trials: 150 configurations (using all cores)
Timeout: 2 hours
CV: 5-fold (for robustness)



  0%|          | 0/300 [00:00<?, ?it/s]

[I 2025-10-25 05:28:04,406] Trial 0 finished with value: 0.9119182746878547 and parameters: {'n_estimators': 1300, 'learning_rate': 0.0306079986068559, 'max_depth': 5, 'min_child_weight': 3, 'subsample': 0.8500000000000001, 'colsample_bytree': 0.65, 'gamma': 1.8, 'reg_alpha': 0.013169219264507252, 'reg_lambda': 4.3747430881287395}. Best is trial 0 with value: 0.9119182746878547.
[I 2025-10-25 05:29:38,491] Trial 1 finished with value: 0.9159477866061294 and parameters: {'n_estimators': 1900, 'learning_rate': 0.02244465395853898, 'max_depth': 9, 'min_child_weight': 7, 'subsample': 0.8500000000000001, 'colsample_bytree': 0.4, 'gamma': 0.8, 'reg_alpha': 0.27789040825559985, 'reg_lambda': 0.7439574760713358}. Best is trial 1 with value: 0.9159477866061294.
[I 2025-10-25 05:31:44,425] Trial 2 finished with value: 0.9107264472190693 and parameters: {'n_estimators': 1800, 'learning_rate': 0.010325386112631162, 'max_depth': 8, 'min_child_weight': 1, 'subsample': 0.55, 'colsample_bytree': 0.4, 