<a href="https://colab.research.google.com/github/vivekvj18/ML_PROJECT/blob/main/ML_Project_(91_212).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ==================================================
# XGBoost FINAL (NO EARLY STOPPING) + 5-FOLD CV AVERAGING
# ==================================================
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score
from google.colab import files

# Seed reused by the hold-out split, the CV splitter and the XGBoost models.
RND = 42

# --- Load Data ---
TRAIN_CSV, TEST_CSV = "combined_data.csv", "test.csv"
train_df = pd.read_csv(TRAIN_CSV)
# The test frame keeps its `id` column; it is read back when the submission is built.
test_df_original = pd.read_csv(TEST_CSV)

# =====================================================
# --- FEATURE ENGINEERING (same as before) ---
# =====================================================
def _zscore(values):
    """Standardise one group: subtract its mean, divide by its standard deviation."""
    return (values - values.mean()) / values.std()


# Both frames are extended in place with the same derived columns, in the same
# order. Every statistic used below (per-gender mean/std, column medians) is
# computed from the frame currently being processed.
# NOTE(review): this means test rows are standardised and imputed with test-set
# statistics rather than train-set ones -- confirm that is intended.
for df in (train_df, test_df_original):
    weight, height, age = df['Weight'], df['Height'], df['Age']
    # Age 0 is mapped to NaN so the "per age" ratios give NaN rather than inf.
    nonzero_age = age.replace(0, np.nan)

    bmi = weight / (height ** 2)
    df['BMI'] = bmi
    df['AgeGroup'] = pd.cut(
        age,
        bins=[0, 18, 30, 45, 60, 100],
        labels=['Teen', 'Young', 'Adult', 'MidAge', 'Senior'],
    )
    df['BMICategory'] = pd.cut(
        bmi,
        bins=[0, 18.5, 25, 30, 100],
        labels=['Underweight', 'Normal', 'Overweight', 'Obese'],
    )
    df['WeightHeightRatio'] = weight / height
    df['BMI_per_Age'] = bmi / nonzero_age
    df['Height_per_Age'] = height / nonzero_age

    # --- Extra ratio-based features; the +1 keeps the denominators non-zero ---
    df['BMI_Age_Ratio'] = bmi / (age + 1)
    df['Age_Height_Ratio'] = age / (height + 1)

    # --- Interaction features ---
    df['Weight_x_Age'] = weight * age
    df['BMI_x_Height'] = bmi * height

    if 'Gender' in df.columns:
        # Weight/height standardised within each gender group.
        df['Weight_z_byGender'] = df.groupby('Gender')['Weight'].transform(_zscore)
        df['Height_z_byGender'] = df.groupby('Gender')['Height'].transform(_zscore)
        # AgeGroup was created above, so only Gender needs checking here.
        df['Gender_AgeGroup'] = df['Gender'].astype(str) + "_" + df['AgeGroup'].astype(str)

    # Turn infinities into NaN, then impute numeric columns with their median.
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    numeric_medians = df.median(numeric_only=True)
    df.fillna(numeric_medians, inplace=True)

# --- Prepare Training Data ---
# Features are every column except the target and the row id; the target is
# kept separately and label-encoded further down.
X = train_df.drop(columns=['WeightCategory', 'id'], errors='ignore')
y = train_df['WeightCategory']

# Impute missing AgeGroup values with the most frequent group.
# The filled column is assigned back to X instead of calling
# fillna(inplace=True) on X['AgeGroup']: that form acts on a temporary Series,
# which triggers a chained-assignment warning in pandas 2.x and leaves X
# unchanged once copy-on-write is enabled.
if 'AgeGroup' in X.columns and X['AgeGroup'].isnull().any():
    X['AgeGroup'] = X['AgeGroup'].fillna(X['AgeGroup'].mode()[0])

# Column lists are derived from dtypes so the preprocessor follows whatever
# features were engineered above.
numeric_features = X.select_dtypes(include=np.number).columns.tolist()
categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()

# Scale numeric columns and one-hot encode categorical ones; categories unseen
# at fit time are ignored at transform time.
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numeric_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
])

# Integer-encode the class labels; `le` is used later to map predictions back.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# =====================================================
# --- XGBoost PARAMETERS ---
# =====================================================
# Keyword arguments shared by every XGBClassifier built in this cell.
xgb_params = {
    # Multi-class objective that outputs one probability per class.
    'objective': 'multi:softprob',
    'num_class': len(le.classes_),
    # Many boosting rounds paired with a small learning rate.
    'n_estimators': 1200,
    'learning_rate': 0.03,
    # Tree shape.
    'max_depth': 6,
    'min_child_weight': 3,
    # Row / column subsampling per tree.
    'subsample': 0.85,
    'colsample_bytree': 0.85,
    # Minimum loss reduction required to make a split.
    'gamma': 0.15,
    # L1 and L2 regularization.
    'reg_alpha': 0.1,
    'reg_lambda': 1.5,
    'eval_metric': 'mlogloss',
    'random_state': RND,
    'n_jobs': -1,
}

# =====================================================
# STEP 1: Quick Train/Val evaluation (NO early stopping)
# =====================================================
# Stratified 80/20 hold-out split used only for a quick sanity check.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    stratify=y_encoded,
    random_state=RND,
)

print("▶ Preprocessing (fit on train)...")
# The preprocessor is fitted on the training part only, then applied to both parts.
X_train_t = preprocessor.fit_transform(X_train)
X_val_t = preprocessor.transform(X_val)

print("▶ Fitting XGBoost (no early stopping)...")
model = XGBClassifier(**xgb_params)
model.fit(X_train_t, y_train)  # runs all n_estimators rounds

train_acc = accuracy_score(y_train, model.predict(X_train_t))
val_acc = accuracy_score(y_val, model.predict(X_val_t))

# Report both accuracies and their difference (in percentage points).
print(
    "\n=============================================",
    "MODEL PERFORMANCE (NO EARLY STOPPING)",
    "=============================================",
    f"Training Accuracy: {train_acc:.4f}",
    f"Validation Accuracy: {val_acc:.4f}",
    f"Overfitting Gap: {(train_acc - val_acc)*100:.2f}%",
    "=============================================\n",
    sep="\n",
)

# =====================================================
# STEP 2: Stratified 5-Fold CV Averaging (CORRECTED)
# =====================================================
# Single source of truth for the fold count: the splitter, the log messages and
# the averaging divisor all derive from it, so they cannot disagree (the
# splitter must not use a different count from the "Fold i/N" counter).
N_FOLDS = 5

print(f"▶ Performing Stratified {N_FOLDS}-Fold CV averaging (no early stopping)...")
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RND)
test_data = test_df_original.drop(columns=['id', 'WeightCategory'], errors='ignore')

# Accumulate class PROBABILITIES (one column per class) rather than hard labels,
# so the folds can be averaged before taking the argmax.
test_preds_proba = np.zeros((test_data.shape[0], len(le.classes_)))

# val_idx is intentionally unused: each fold model only contributes test-set
# probabilities; no per-fold validation score is computed here.
for fold, (train_idx, val_idx) in enumerate(skf.split(X, y_encoded), 1):
    print(f"  - Fold {fold}/{skf.n_splits}")
    X_tr = X.iloc[train_idx]
    y_tr = y_encoded[train_idx]

    # Refit the preprocessor on this fold's training rows, then apply it to the
    # training rows and to the full test set.
    preprocessor.fit(X_tr)
    X_tr_t = preprocessor.transform(X_tr)
    X_test_t = preprocessor.transform(test_data)

    fold_model = XGBClassifier(**xgb_params)
    fold_model.fit(X_tr_t, y_tr)

    test_preds_proba += fold_model.predict_proba(X_test_t)

# Average over folds, pick the most probable class, and map back to label names.
test_preds_avg = test_preds_proba / skf.n_splits
test_pred_encoded = np.argmax(test_preds_avg, axis=1)
test_pred_labels = le.inverse_transform(test_pred_encoded)
print("▶ CV-averaged test predictions ready.")

# =====================================================
# STEP 3: (Optional) Retrain on FULL training data for final model
# =====================================================
print("▶ Retraining model on full training data (no early stopping)...")
# Preprocessor and model are both refitted on every training row.
X_full_t = preprocessor.fit_transform(X)
final_model = XGBClassifier(**xgb_params)
final_model.fit(X_full_t, y_encoded)  # runs all n_estimators rounds
print("▶ Retrain complete.")

# =====================================================
# STEP 4: Save submission (using CV-averaged predictions)
# =====================================================
#submission_file = 'kaggle_submission_xgb_no_earlystop_cvavg_FIXED.csv'
#submission_df = pd.DataFrame({
#    'id': test_df_original['id'],
#    'WeightCategory': test_pred_labels # Ensure this is the corrected variable
#})
#submission_df.to_csv(submission_file, index=False)
#print(f"✅ Submission file created: {submission_file}")

# Uncomment to auto-download in Colab:
# files.download(submission_file)

▶ Preprocessing (fit on train)...
▶ Fitting XGBoost (no early stopping)...

MODEL PERFORMANCE (NO EARLY STOPPING)
Training Accuracy: 0.9956
Validation Accuracy: 0.9150
Overfitting Gap: 8.06%

▶ Performing Stratified 5-Fold CV averaging (no early stopping)...
  - Fold 1/5
  - Fold 2/5
  - Fold 3/5
  - Fold 4/5


KeyboardInterrupt: 

In [None]:
# Pair each test id with its predicted label and write the two-column CSV.
submission_file = 'kaggle_submission_xgb_no_earlystop_cvavg_FIXED.csv'
submission_df = test_df_original[['id']].assign(WeightCategory=test_pred_labels)
submission_df.to_csv(submission_file, index=False)
print(f"✅ Submission file created: {submission_file}")

✅ Submission file created: kaggle_submission_xgb_no_earlystop_cvavg_FIXED.csv


In [None]:
# ==================================================
# 🏆 XGBoost FINAL (NO EARLY STOPPING) + 5-FOLD CV AVERAGING
# Optimized for Kaggle (Single Model Only)
# ==================================================
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score
from google.colab import files

# Seed reused by NumPy's global RNG, the hold-out split, the CV splitter and XGBoost.
RND = 42
np.random.seed(RND)

# ==================================================
# --- LOAD DATA ---
# ==================================================
TRAIN_CSV, TEST_CSV = "combined_data.csv", "test.csv"
train_df = pd.read_csv(TRAIN_CSV)
# The test frame keeps its `id` column; it is read back when the submission is built.
test_df_original = pd.read_csv(TEST_CSV)

# ==================================================
# --- FEATURE ENGINEERING ---
# ==================================================
# Inputs required by the extra ratio/product features; those features are only
# added to a frame that contains all of these columns.
EXTRA_INPUT_COLS = ['HCVC', 'NCP', 'CH2O', 'FAF', 'TUE', 'Water']

# Both frames are extended in place with the same derived columns, in the same
# order. Every statistic used below (per-gender mean/std, column medians) is
# computed from the frame currently being processed.
# NOTE(review): this means test rows are standardised and imputed with test-set
# statistics rather than train-set ones -- confirm that is intended.
for df in (train_df, test_df_original):
    age, height, weight = df['Age'], df['Height'], df['Weight']
    # Age 0 is mapped to NaN so the "per age" ratios give NaN rather than inf.
    age_or_nan = age.replace(0, np.nan)

    # --- Base ratios ---
    bmi = weight / (height ** 2)
    df['BMI'] = bmi
    df['AgeGroup'] = pd.cut(
        age,
        bins=[0, 18, 30, 45, 60, 100],
        labels=['Teen', 'Young', 'Adult', 'MidAge', 'Senior'],
    )
    df['BMICategory'] = pd.cut(
        bmi,
        bins=[0, 18.5, 25, 30, 100],
        labels=['Underweight', 'Normal', 'Overweight', 'Obese'],
    )
    df['WeightHeightRatio'] = weight / height
    df['BMI_per_Age'] = bmi / age_or_nan
    df['Height_per_Age'] = height / age_or_nan
    df['BMI_Age_Ratio'] = bmi / (age + 1)
    df['Age_Height_Ratio'] = age / (height + 1)
    df['Weight_x_Age'] = weight * age
    df['BMI_x_Height'] = bmi * height

    if 'Gender' in df.columns:
        # --- Gender normalization: z-score within each gender group ---
        for raw_col, z_col in (('Weight', 'Weight_z_byGender'),
                               ('Height', 'Height_z_byGender')):
            df[z_col] = df.groupby('Gender')[raw_col].transform(
                lambda grp: (grp - grp.mean()) / grp.std()
            )
        # --- Categorical interaction (AgeGroup was created above) ---
        df['Gender_AgeGroup'] = df['Gender'].astype(str) + "_" + df['AgeGroup'].astype(str)

    # --- Extra engineered ratios & products ---
    if set(EXTRA_INPUT_COLS).issubset(df.columns):
        df['HCVC_NCP_Ratio'] = df['HCVC'] / (df['NCP'] + 1)
        df['CH2O_FAF_Product'] = df['CH2O'] * df['FAF']
        df['FAF_TUE_Ratio'] = df['FAF'] / (df['TUE'] + 1)
        df['Water_CH2O_Ratio'] = df['Water'] / (df['CH2O'] + 1)
        df['BMIxFAF'] = bmi * df['FAF']

    # --- Handle inf/nan: infinities become NaN, numeric NaNs get the column median ---
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    numeric_medians = df.median(numeric_only=True)
    df.fillna(numeric_medians, inplace=True)

# ==================================================
# --- PREPARE TRAINING DATA ---
# ==================================================
# Features are every column except the target and the row id; the target is
# kept separately and label-encoded further down.
X = train_df.drop(columns=['WeightCategory', 'id'], errors='ignore')
y = train_df['WeightCategory']

# Impute missing AgeGroup values with the most frequent group.
# The filled column is assigned back to X instead of calling
# fillna(inplace=True) on X['AgeGroup']: that form acts on a temporary Series,
# which triggers a chained-assignment warning in pandas 2.x and leaves X
# unchanged once copy-on-write is enabled.
if 'AgeGroup' in X.columns and X['AgeGroup'].isnull().any():
    X['AgeGroup'] = X['AgeGroup'].fillna(X['AgeGroup'].mode()[0])

# Column lists are derived from dtypes so the preprocessor follows whatever
# features were engineered above.
numeric_features = X.select_dtypes(include=np.number).columns.tolist()
categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()

# Scale numeric columns and one-hot encode categorical ones; categories unseen
# at fit time are ignored at transform time.
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numeric_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
])

# Integer-encode the class labels; `le` is used later to map predictions back.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# ==================================================
# --- XGBOOST PARAMETERS (TUNED) ---
# ==================================================
# Keyword arguments shared by every XGBClassifier built in this cell.
xgb_params = dict(
    objective='multi:softprob',
    num_class=len(le.classes_),
    n_estimators=1200,          # Must be higher for smaller learning rate
    learning_rate=0.02,         # Much slower learning
    max_depth=4,                # Conservative tree depth
    subsample=0.75,
    colsample_bytree=0.75,
    gamma=0.5,                  # Stricter pruning
    min_child_weight=6,
    reg_alpha=0.01,             # L1 regularization (reg_lambda, the L2 term, is not set here)
    eval_metric='mlogloss',
    random_state=RND,
    n_jobs=-1
)


# ==================================================
# --- TRAIN / VALIDATION SPLIT (Quick Check) ---
# ==================================================
# Stratified 80/20 hold-out split used only for a quick sanity check.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    stratify=y_encoded,
    random_state=RND,
)

print("▶ Preprocessing (fit on train)...")
# The preprocessor is fitted on the training part only, then applied to both parts.
X_train_t = preprocessor.fit_transform(X_train)
X_val_t = preprocessor.transform(X_val)

print("▶ Training single XGBoost model (no early stopping)...")
model = XGBClassifier(**xgb_params)
model.fit(X_train_t, y_train)

train_acc = accuracy_score(y_train, model.predict(X_train_t))
val_acc = accuracy_score(y_val, model.predict(X_val_t))

# Report both accuracies and their difference (in percentage points).
print(
    "\n=============================================",
    "MODEL PERFORMANCE (NO EARLY STOPPING)",
    "=============================================",
    f"Training Accuracy: {train_acc:.4f}",
    f"Validation Accuracy: {val_acc:.4f}",
    f"Overfitting Gap: {(train_acc - val_acc)*100:.2f}%",
    "=============================================\n",
    sep="\n",
)

# ==================================================
# --- STRATIFIED 5-FOLD CV AVERAGING ---
# ==================================================
print("▶ Performing Stratified 5-Fold CV averaging (no early stopping)...")
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RND)
test_data = test_df_original.drop(columns=['id', 'WeightCategory'], errors='ignore')

# Running sum of per-class probabilities for every test row, one fold at a time.
test_preds_proba = np.zeros((len(test_data), len(le.classes_)))

# Only the training indices of each split are used; val_idx is not read.
for fold, (train_idx, val_idx) in enumerate(skf.split(X, y_encoded), start=1):
    print(f"  - Fold {fold}/5")
    X_tr, y_tr = X.iloc[train_idx], y_encoded[train_idx]

    # Refit the preprocessor on this fold's training rows, then apply it to the
    # full test set as well.
    X_tr_t = preprocessor.fit_transform(X_tr)
    X_test_t = preprocessor.transform(test_data)

    fold_model = XGBClassifier(**xgb_params)
    fold_model.fit(X_tr_t, y_tr)

    test_preds_proba += fold_model.predict_proba(X_test_t)

# --- Average predicted probabilities, then decode the most probable class ---
test_preds_avg = test_preds_proba / skf.n_splits
test_pred_encoded = test_preds_avg.argmax(axis=1)
test_pred_labels = le.inverse_transform(test_pred_encoded)
print("▶ CV-averaged test predictions ready.")

# ==================================================
# --- RETRAIN ON FULL TRAIN DATA (FINAL MODEL) ---
# ==================================================
print("▶ Retraining model on full training data (no early stopping)...")
# Preprocessor and model are both refitted on every training row.
X_full_t = preprocessor.fit_transform(X)
final_model = XGBClassifier(**xgb_params)
final_model.fit(X_full_t, y_encoded)
print("▶ Retrain complete.")

# ==================================================
# --- SAVE SUBMISSION ---
# ==================================================
#submission_file = 'kaggle_submission_xgb_FINAL.csv'
#submission_df = pd.DataFrame({
#    'id': test_df_original['id'],
#    'WeightCategory': test_pred_labels
#})
#submission_df.to_csv(submission_file, index=False)
#print(f"✅ Submission file created: {submission_file}")

# Uncomment in Colab to auto-download:
# files.download(submission_file)


▶ Preprocessing (fit on train)...
▶ Training single XGBoost model (no early stopping)...

MODEL PERFORMANCE (NO EARLY STOPPING)
Training Accuracy: 0.9508
Validation Accuracy: 0.9158
Overfitting Gap: 3.50%

▶ Performing Stratified 5-Fold CV averaging (no early stopping)...
  - Fold 1/5
  - Fold 2/5
  - Fold 3/5
  - Fold 4/5
  - Fold 5/5
▶ CV-averaged test predictions ready.
▶ Retraining model on full training data (no early stopping)...
▶ Retrain complete.


In [None]:
# Pair each test id with its predicted label and write the two-column CSV.
submission_file = 'kaggle_submission_xgb_FINAL.csv'
submission_columns = {
    'id': test_df_original['id'],
    'WeightCategory': test_pred_labels,
}
submission_df = pd.DataFrame(submission_columns)
submission_df.to_csv(submission_file, index=False)
print(f"✅ Submission file created: {submission_file}")

# To download the file automatically in Colab, uncomment:
# files.download(submission_file)

✅ Submission file created: kaggle_submission_xgb_FINAL.csv
