<a href="https://colab.research.google.com/github/vivekvj18/ML_PROJECT/blob/main/ML_Project(91_019).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ==================================================
# CLEAN XGBOOST TRAINING (80-20 SPLIT)
# From scratch: Data Cleaning + Preprocessing + Training + Accuracy
# ==================================================
# Baseline cell: loads train.csv, adds the BMI and AgeGroup features, fits a
# preprocessing + XGBoost pipeline on a stratified 80% split and prints the
# metrics obtained on the remaining 20%.

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from xgboost import XGBClassifier

# 1️⃣ Load Data
train_df = pd.read_csv("train.csv")
print("✅ Data Loaded Successfully!")
print("Shape:", train_df.shape)
print("Columns:", train_df.columns.tolist())

# 2️⃣ Basic Cleaning
# Drop exact duplicate rows.
train_df = train_df.drop_duplicates()

# Fill every missing value with the first mode of its column
# (numeric columns included).
train_df = train_df.fillna(train_df.mode().iloc[0])

# 3️⃣ Feature Engineering
# BMI = Weight / Height^2 (assumes kg and metres — TODO confirm units).
train_df['BMI'] = train_df['Weight'] / (train_df['Height']**2)
# Bucket Age into five labelled ranges; ages outside (0, 100] become NaN.
train_df['AgeGroup'] = pd.cut(
    train_df['Age'],
    bins=[0, 18, 30, 45, 60, 100],
    labels=['Teen', 'Young', 'Adult', 'MidAge', 'Senior']
)

# 4️⃣ Separate features and target
X = train_df.drop(columns=['WeightCategory', 'id'], errors='ignore')
y = train_df['WeightCategory']

# Encode target labels as integers 0..k-1; `le` is kept to map them back.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# 5️⃣ Split data into train/test (stratified so class ratios are preserved)
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42
)
print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")

# 6️⃣ Identify categorical and numeric columns
categorical_features = X.select_dtypes(include=['object', 'category']).columns
numeric_features = X.select_dtypes(include=np.number).columns

# 7️⃣ Preprocessing: scale numeric columns, one-hot encode categorical ones.
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# 8️⃣ Model Definition (Basic XGBoost)
# `use_label_encoder` is deliberately not passed: the installed XGBoost does
# not recognise it and only logs
# 'Parameters: { "use_label_encoder" } are not used.'
xgb_model = XGBClassifier(
    objective='multi:softmax',
    num_class=len(le.classes_),
    learning_rate=0.05,
    n_estimators=300,
    max_depth=6,
    subsample=0.85,
    colsample_bytree=0.85,
    gamma=0.1,
    random_state=42,
    eval_metric='mlogloss'
)

# 9️⃣ Pipeline: the preprocessor is fitted on the training split only.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', xgb_model)
])

# 🔟 Train the model
print("\n🚀 Training XGBoost Model...")
model.fit(X_train, y_train)
print("✅ Training Complete!")

# 1️⃣1️⃣ Evaluate on the 20% hold-out split
y_pred = model.predict(X_test)

print("\n📊 Model Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=le.classes_))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


✅ Data Loaded Successfully!
Shape: (15533, 18)
Columns: ['id', 'Gender', 'Age', 'Height', 'Weight', 'family_history_with_overweight', 'FAVC', 'FCVC', 'NCP', 'CAEC', 'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE', 'CALC', 'MTRANS', 'WeightCategory']
Train shape: (12426, 18), Test shape: (3107, 18)

🚀 Training XGBoost Model...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


✅ Training Complete!

📊 Model Performance:
Accuracy: 0.9051

Classification Report:
                     precision    recall  f1-score   support

Insufficient_Weight       0.92      0.94      0.93       374
      Normal_Weight       0.89      0.89      0.89       469
     Obesity_Type_I       0.89      0.86      0.88       441
    Obesity_Type_II       0.96      0.97      0.96       481
   Obesity_Type_III       0.99      1.00      0.99       597
 Overweight_Level_I       0.81      0.75      0.78       369
Overweight_Level_II       0.81      0.85      0.83       376

           accuracy                           0.91      3107
          macro avg       0.90      0.90      0.90      3107
       weighted avg       0.90      0.91      0.90      3107


Confusion Matrix:
[[353  19   0   0   0   2   0]
 [ 26 417   0   0   0  22   4]
 [  0   0 380  18   4  12  27]
 [  0   0   9 468   2   0   2]
 [  0   0   0   0 596   1   0]
 [  4  31  13   0   0 277  44]
 [  0   1  23   3   0  28 321]]


In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Re-evaluate the fitted pipeline on the 20% hold-out split.
# Relies on `model`, `X_test`, `y_test` and `le` left in the kernel by the
# training cell.

# ✅ Predict on test (20%) data
y_test_pred = model.predict(X_test)

# ✅ Calculate accuracy
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"\n🎯 Accuracy on 20% Test Data: {test_accuracy:.4f}")

# ✅ Detailed metrics
# Pass the original class names so rows read as weight categories rather
# than the bare encoded ids 0..6.
print("\nClassification Report on 20% Test Data:")
print(classification_report(y_test, y_test_pred, target_names=le.classes_))

print("\nConfusion Matrix on 20% Test Data:")
print(confusion_matrix(y_test, y_test_pred))



🎯 Accuracy on 20% Test Data: 0.9051

Classification Report on 20% Test Data:
              precision    recall  f1-score   support

           0       0.92      0.94      0.93       374
           1       0.89      0.89      0.89       469
           2       0.89      0.86      0.88       441
           3       0.96      0.97      0.96       481
           4       0.99      1.00      0.99       597
           5       0.81      0.75      0.78       369
           6       0.81      0.85      0.83       376

    accuracy                           0.91      3107
   macro avg       0.90      0.90      0.90      3107
weighted avg       0.90      0.91      0.90      3107


Confusion Matrix on 20% Test Data:
[[353  19   0   0   0   2   0]
 [ 26 417   0   0   0  22   4]
 [  0   0 380  18   4  12  27]
 [  0   0   9 468   2   0   2]
 [  0   0   0   0 596   1   0]
 [  4  31  13   0   0 277  44]
 [  0   1  23   3   0  28 321]]


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
import numpy as np
from scipy.stats import uniform, randint

# Randomized hyperparameter search over a preprocessing + XGBoost pipeline.
# Relies on `X_train` / `y_train` (integer-encoded target) left in the
# kernel by an earlier cell.

# Identify features
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns
numeric_features = X_train.select_dtypes(include=np.number).columns

# Preprocessing: scale numeric columns, one-hot encode categorical ones.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)

# Base model
# `use_label_encoder` is deliberately not passed: the installed XGBoost does
# not recognise it and only logs that the parameter is "not used".
num_classes = len(np.unique(y_train))
xgb_base = XGBClassifier(
    objective='multi:softmax',
    num_class=num_classes,
    eval_metric='mlogloss',
    random_state=42
)

# Pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', xgb_base)
])

# Hyperparameter distributions.
# scipy's uniform(loc, scale) samples from [loc, loc + scale];
# randint(low, high) samples integers in [low, high).
param_dist = {
    'classifier__max_depth': randint(4, 9),             # 4..8
    'classifier__learning_rate': uniform(0.01, 0.09),   # 0.01..0.10
    'classifier__n_estimators': randint(300, 900),      # 300..899
    'classifier__subsample': uniform(0.7, 0.3),         # 0.7..1.0
    'classifier__colsample_bytree': uniform(0.7, 0.3),  # 0.7..1.0
    'classifier__min_child_weight': randint(1, 6),      # 1..5
    'classifier__gamma': uniform(0, 0.3)                # 0..0.3
}

# Randomized Search
random_search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_dist,
    n_iter=25,  # smaller for speed
    scoring='accuracy',
    cv=3,
    verbose=2,
    n_jobs=-1,
    random_state=42
)

# Fit
random_search.fit(X_train, y_train)

# Best estimator (refitted on all of X_train with the best parameters)
best_model = random_search.best_estimator_

# Report the outcome so the result of the search is visible in the notebook.
print("Best Hyperparameters:", random_search.best_params_)
print(f"Best CV Accuracy: {random_search.best_score_:.4f}")


Fitting 3 folds for each of 25 candidates, totalling 75 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [None]:
# ===============================
# XGBoost Full Pipeline with Faster Hypertuning
# ===============================
# Self-contained cell: reloads train.csv, rebuilds the features, runs a small
# randomized search on the 80% split and reports train / hold-out metrics of
# the refitted best pipeline.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings('ignore')

# -------------------------------
# 1️⃣ Load Data
# -------------------------------
train_df = pd.read_csv("train.csv")  # make sure train.csv is uploaded

# -------------------------------
# 2️⃣ Feature Engineering
# -------------------------------
train_df['BMI'] = train_df['Weight'] / (train_df['Height']**2)
# Ages outside (0, 100] fall outside every bin and become NaN.
train_df['AgeGroup'] = pd.cut(
    train_df['Age'],
    bins=[0, 18, 30, 45, 60, 100],
    labels=['Teen', 'Young', 'Adult', 'MidAge', 'Senior']
)

# -------------------------------
# 3️⃣ Separate features and target
# -------------------------------
X = train_df.drop(columns=['id', 'WeightCategory'], errors='ignore')
y = train_df['WeightCategory']

# -------------------------------
# 4️⃣ Train-Test Split (80%-20%)
# -------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# -------------------------------
# 5️⃣ Preprocessing Pipeline
# -------------------------------
# Identify categorical and numeric features
categorical_features = X.select_dtypes(include=['object', 'category']).columns
numeric_features = X.select_dtypes(include=np.number).columns

# Transformers
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Encode target: fit on the training labels, reuse the mapping for the test labels.
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)

# -------------------------------
# 6️⃣ XGBoost Model & RandomizedSearchCV
# -------------------------------
# `random_state` is the scikit-learn-API spelling of the seed, as used by the
# other cells. `use_label_encoder` is not passed because the installed
# XGBoost reports it as an unused parameter.
xgb_model = XGBClassifier(
    objective='multi:softmax',
    eval_metric='mlogloss',
    n_jobs=-1,
    random_state=42
)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', xgb_model)
])

# Faster hypertuning
param_dist = {
    'classifier__n_estimators': [200, 400, 600],
    'classifier__max_depth': [4, 5, 6],
    'classifier__learning_rate': [0.01, 0.03, 0.05],
    'classifier__subsample': [0.8, 0.85],
    'classifier__colsample_bytree': [0.8, 0.85],
    'classifier__gamma': [0, 0.1, 0.2]
}

cv_strategy = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# random_state fixes which 15 combinations are sampled; without it every
# run can try different candidates and report different "best" parameters.
random_search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_dist,
    n_iter=15,  # random 15 combinations
    cv=cv_strategy,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1,
    random_state=42
)

# -------------------------------
# 7️⃣ Fit RandomizedSearchCV
# -------------------------------
print("🚀 Starting Faster XGBoost Hypertuning...")
random_search.fit(X_train, y_train_enc)
print("✅ Hypertuning Complete!")

# -------------------------------
# 8️⃣ Evaluate on Train & Test
# -------------------------------
# best_estimator_ is the pipeline refitted on X_train with the best params.
best_model = random_search.best_estimator_

# Predictions
y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)

print(f"Best Hyperparameters: {random_search.best_params_}\n")
print(f"Train Accuracy: {accuracy_score(y_train_enc, y_train_pred):.4f}")
print(f"Test Accuracy: {accuracy_score(y_test_enc, y_test_pred):.4f}\n")

print("Classification Report (Test Set):")
print(classification_report(y_test_enc, y_test_pred, target_names=le.classes_))

print("Confusion Matrix (Test Set):")
print(confusion_matrix(y_test_enc, y_test_pred))


🚀 Starting Faster XGBoost Hypertuning...
Fitting 3 folds for each of 15 candidates, totalling 45 fits
✅ Hypertuning Complete!
Best Hyperparameters: {'classifier__subsample': 0.8, 'classifier__n_estimators': 400, 'classifier__max_depth': 4, 'classifier__learning_rate': 0.05, 'classifier__gamma': 0.2, 'classifier__colsample_bytree': 0.85}

Train Accuracy: 0.9485
Test Accuracy: 0.9063

Classification Report (Test Set):
                     precision    recall  f1-score   support

Insufficient_Weight       0.93      0.95      0.94       374
      Normal_Weight       0.89      0.91      0.90       469
     Obesity_Type_I       0.89      0.87      0.88       441
    Obesity_Type_II       0.96      0.97      0.97       481
   Obesity_Type_III       0.99      1.00      0.99       597
 Overweight_Level_I       0.82      0.75      0.78       369
Overweight_Level_II       0.80      0.84      0.82       376

           accuracy                           0.91      3107
          macro avg       0.9

In [None]:
# ==================================================
# FINAL SUBMISSION CODE WITH TUNED XGBOOST
# ==================================================
# Trains one XGBoost pipeline on all of train.csv with fixed hyperparameters
# and writes predictions for test.csv to a Kaggle submission file.
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from google.colab import files

# --- Load Data ---
train_df = pd.read_csv("train.csv")
test_df_original = pd.read_csv("test.csv")  # Keep original 'id'

# --- Feature Engineering ---
# Applied identically to both frames; ages outside (0, 100] become NaN.
for df in [train_df, test_df_original]:
    df['BMI'] = df['Weight'] / (df['Height']**2)
    df['AgeGroup'] = pd.cut(
        df['Age'],
        bins=[0, 18, 30, 45, 60, 100],
        labels=['Teen', 'Young', 'Adult', 'MidAge', 'Senior']
    )

# --- Prepare Training Data ---
X = train_df.drop(columns=['WeightCategory', 'id'], errors='ignore')
y = train_df['WeightCategory']

# Fill missing AgeGroup (if any) with the most frequent training group.
# Plain assignment is used instead of `X['AgeGroup'].fillna(..., inplace=True)`:
# the chained in-place form is deprecated and does not modify X under
# pandas Copy-on-Write.
mode_age_group = X['AgeGroup'].mode()[0]
X['AgeGroup'] = X['AgeGroup'].fillna(mode_age_group)

# Identify numeric and categorical features
numeric_features = X.select_dtypes(include=np.number).columns
categorical_features = X.select_dtypes(include=['object', 'category']).columns

# Preprocessing pipeline
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Encode target
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# --- Define Final XGBoost Model with Tuned Hyperparameters ---
# NOTE(review): max_depth=5 / colsample_bytree=0.8 differ from the best
# parameters recorded in the previous search output (max_depth=4,
# colsample_bytree=0.85) — confirm which search run these came from.
# `use_label_encoder` is not passed because the installed XGBoost reports it
# as an unused parameter; `random_state` is the scikit-learn-API seed name.
final_xgb = XGBClassifier(
    objective='multi:softmax',
    num_class=len(le.classes_),
    n_estimators=400,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=0.2,
    eval_metric='mlogloss',
    random_state=42
)

# --- Full Pipeline ---
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', final_xgb)
])

# --- Train on FULL training data ---
print("🚀 Training final XGBoost model on full training data...")
pipeline.fit(X, y_encoded)
print("✅ Training complete!")

# --- Prepare Test Data ---
# Named X_test_submission so the hold-out `X_test` created by the earlier
# evaluation cells is not overwritten with unlabeled Kaggle rows.
X_test_submission = test_df_original.drop(columns=['id', 'WeightCategory'], errors='ignore')
# Reuse the training mode so train and test are imputed the same way.
X_test_submission['AgeGroup'] = X_test_submission['AgeGroup'].fillna(mode_age_group)

# --- Predict ---
test_pred_encoded = pipeline.predict(X_test_submission)
test_pred_labels = le.inverse_transform(test_pred_encoded)
print("✅ Test predictions generated!")

# --- Create Submission ---
submission_file = 'kaggle_submission_xgb_final_withhypertuning.csv'
submission_df = pd.DataFrame({
    'id': test_df_original['id'],
    'WeightCategory': test_pred_labels
})
submission_df.to_csv(submission_file, index=False)
print(f"✅ Submission file created: {submission_file}")

# Uncomment to download automatically in Colab
# files.download(submission_file)


🚀 Training final XGBoost model on full training data...
✅ Training complete!
✅ Test predictions generated!
✅ Submission file created: kaggle_submission_xgb_final_withhypertuning.csv


In [None]:
# =============================
# XGBoost Hypertuning + Final Model + Submission
# =============================
# Self-contained cell: tunes XGBoost on an 80% split, reports accuracy on the
# untouched 20% split, retrains the best configuration on all of train.csv
# and writes submission.csv for test.csv.

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings('ignore')

# -----------------------------
# 1️⃣ Load Data
# -----------------------------
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
# Announce success only after both files were actually read.
print("✅ Data Loaded Successfully!")
print("Train shape:", train_df.shape, ", Test shape:", test_df.shape)

# -----------------------------
# 2️⃣ Feature Engineering (Apply to both train and test)
# -----------------------------
for df in [train_df, test_df]:
    df['BMI'] = df['Weight'] / (df['Height']**2)
    # Ages outside (0, 100] fall outside every bin and become NaN.
    df['AgeGroup'] = pd.cut(
        df['Age'],
        bins=[0, 18, 30, 45, 60, 100],
        labels=['Teen', 'Young', 'Adult', 'MidAge', 'Senior']
    )

# Fill missing AgeGroup (if any) with the most frequent *training* group so
# both frames are imputed with the same value. Plain assignment replaces the
# chained `df['AgeGroup'].fillna(..., inplace=True)`, which is deprecated and
# does not modify the frame under pandas Copy-on-Write.
mode_age_group = train_df['AgeGroup'].mode()[0]
for df in [train_df, test_df]:
    df['AgeGroup'] = df['AgeGroup'].fillna(mode_age_group)


# -----------------------------
# 3️⃣ Separate features and target
# -----------------------------
X = train_df.drop(columns=["WeightCategory", 'id'])   # Dropping 'id' as well
y = train_df["WeightCategory"]

# Always encode the target: `le` is needed further down to decode the
# predictions, so it must not depend on the dtype of `y` (a conditional fit
# left `le` undefined or stale from another cell).
le = LabelEncoder()
y = le.fit_transform(y)

# -----------------------------
# 4️⃣ Split Train (80%) / Validation (20%)
# -----------------------------
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# -----------------------------
# 5️⃣ Preprocessing Pipeline
# -----------------------------
# Identify categorical and numeric features (based on X, which is train_df features)
categorical_features = X.select_dtypes(include=['object', 'category']).columns
numeric_features = X.select_dtypes(include=np.number).columns

# Transformers
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    # Columns matched by neither selector are forwarded unchanged.
    # 'id' is already dropped from X above, so it never reaches the model.
    remainder='passthrough'
)

# -----------------------------
# 6️⃣ Hypertuning with RandomizedSearchCV
# -----------------------------
param_dist = {
    'classifier__n_estimators': [200, 400, 600, 800],
    'classifier__max_depth': [3, 4, 5, 6, 8],
    'classifier__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'classifier__subsample': [0.7, 0.8, 0.9, 1.0],
    'classifier__colsample_bytree': [0.7, 0.8, 0.9, 1.0],
    'classifier__gamma': [0, 0.1, 0.2, 0.3],
    'classifier__min_child_weight': [1, 3, 5],
    'classifier__reg_lambda': [0.5, 1, 1.5],
}

# `use_label_encoder` is not passed: the installed XGBoost reports it as an
# unused parameter.
xgb = XGBClassifier(
    objective='multi:softmax',  # multiclass objective
    eval_metric='mlogloss',     # multiclass log-loss
    random_state=42,
    tree_method='hist'
)

# Create a pipeline for hyperparameter tuning
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', xgb)
])

cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


random_search = RandomizedSearchCV(
    estimator=pipeline, # Tuned on the pipeline
    param_distributions=param_dist,
    n_iter=20, # Reduced n_iter for quicker execution, can increase for better tuning
    scoring='accuracy',
    cv=cv_strategy,
    verbose=1,
    n_jobs=-1,
    random_state=42
)

print("\n🚀 Starting Advanced XGBoost Hypertuning...")
random_search.fit(X_train, y_train)
print("\n✅ Hypertuning Complete!")
print("Best Hyperparameters:", random_search.best_params_)

# Score the refitted best pipeline on the 20% split it has never seen; this
# is the only unbiased accuracy estimate produced by this cell.
valid_accuracy = accuracy_score(y_valid, random_search.predict(X_valid))
print(f"Validation Accuracy (20% hold-out): {valid_accuracy:.4f}")

# -----------------------------
# 7️⃣ Retrain final model on 100% train.csv with best parameters
# -----------------------------
print("\n🏁 Retraining Final Model on Full Data with Best Hyperparameters...")

# Strip the 'classifier__' pipeline prefix to get plain XGBClassifier kwargs.
best_classifier_params = {
    key.replace('classifier__', ''): value
    for key, value in random_search.best_params_.items()
    if key.startswith('classifier__')
}

# Define final XGBoost model with best parameters
final_xgb_best = XGBClassifier(
    **best_classifier_params, # Pass the extracted classifier params
    objective='multi:softmax',
    eval_metric='mlogloss',
    random_state=42,
    tree_method='hist'
)

# Create final pipeline with preprocessor and best model
final_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', final_xgb_best)
])

# Train on the full training data
final_pipeline.fit(X, y)

# -----------------------------
# 8️⃣ Predict on test.csv
# -----------------------------
print("\n🧮 Predicting on test.csv...")
# Prepare test data (excluding 'id' and 'WeightCategory' if it exists)
X_test_submission = test_df.drop(columns=['id', 'WeightCategory'], errors='ignore')

test_pred_encoded = final_pipeline.predict(X_test_submission)

# Decode integer predictions back to the original category names
test_pred_labels = le.inverse_transform(test_pred_encoded)

# -----------------------------
# 9️⃣ Generate submission.csv
# -----------------------------
submission = pd.DataFrame({
    "id": test_df['id'],      # Use the 'id' column from the original test_df
    "WeightCategory": test_pred_labels
})
submission.to_csv("submission.csv", index=False)
print("\n✅ submission.csv generated successfully!")

✅ Data Loaded Successfully!
Train shape: (15533, 18) , Test shape: (5225, 17)

🚀 Starting Advanced XGBoost Hypertuning...
Fitting 5 folds for each of 20 candidates, totalling 100 fits

✅ Hypertuning Complete!
Best Hyperparameters: {'classifier__subsample': 1.0, 'classifier__reg_lambda': 0.5, 'classifier__n_estimators': 800, 'classifier__min_child_weight': 1, 'classifier__max_depth': 4, 'classifier__learning_rate': 0.1, 'classifier__gamma': 0.1, 'classifier__colsample_bytree': 0.8}

🏁 Retraining Final Model on Full Data with Best Hyperparameters...

🧮 Predicting on test.csv...

✅ submission.csv generated successfully!


In [None]:
# =============================
# Advanced XGBoost Hypertuning + Final Model + Submission (No Early Stopping)
# =============================
# Self-contained cell: runs a 50-candidate randomized search on an 80% split,
# retrains the best configuration on all of train.csv, reports accuracies and
# writes submission.csv for test.csv.

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings('ignore')

# -----------------------------
# 1️⃣ Load Data
# -----------------------------
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
print("✅ Data Loaded Successfully!")
print("Train shape:", train_df.shape, ", Test shape:", test_df.shape)

# -----------------------------
# 2️⃣ Feature Engineering
# -----------------------------
for df in [train_df, test_df]:
    df['BMI'] = df['Weight'] / (df['Height']**2)
    # Ages outside (0, 100] fall outside every bin and become NaN.
    df['AgeGroup'] = pd.cut(
        df['Age'],
        bins=[0, 18, 30, 45, 60, 100],
        labels=['Teen', 'Young', 'Adult', 'MidAge', 'Senior']
    )

# Fill missing AgeGroup (if any) with the most frequent *training* group so
# both frames are imputed with the same value. Plain assignment replaces the
# chained `df['AgeGroup'].fillna(..., inplace=True)`, which is deprecated and
# does not modify the frame under pandas Copy-on-Write.
mode_age_group = train_df['AgeGroup'].mode()[0]
for df in [train_df, test_df]:
    df['AgeGroup'] = df['AgeGroup'].fillna(mode_age_group)

# -----------------------------
# 3️⃣ Separate Features & Target
# -----------------------------
X = train_df.drop(columns=["WeightCategory", "id"])
y = train_df["WeightCategory"]

# Encode target
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# -----------------------------
# 4️⃣ Train-Validation Split
# -----------------------------
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# -----------------------------
# 5️⃣ Preprocessing Pipeline
# -----------------------------
categorical_features = X.select_dtypes(include=['object', 'category']).columns
numeric_features = X.select_dtypes(include=np.number).columns

preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), numeric_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
])

# -----------------------------
# 6️⃣ Define XGBoost & Hyperparameter Ranges
# -----------------------------
# `use_label_encoder` is not passed: the installed XGBoost reports it as an
# unused parameter.
xgb = XGBClassifier(
    objective='multi:softmax',
    eval_metric='mlogloss',
    random_state=42,
    tree_method='hist'
)

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', xgb)
])

param_dist = {
    'classifier__n_estimators': [400, 600, 800, 1000],
    'classifier__max_depth': [4, 5, 6, 7],
    'classifier__learning_rate': [0.01, 0.03, 0.05, 0.07],
    'classifier__subsample': [0.75, 0.8, 0.85, 0.9],
    'classifier__colsample_bytree': [0.75, 0.8, 0.85, 0.9],
    'classifier__gamma': [0, 0.1, 0.2, 0.3],
    'classifier__min_child_weight': [1, 2, 3, 4],
    'classifier__reg_lambda': [1, 1.2, 1.5, 2.0]
}

cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    n_iter=50,               # 50 combinations for better tuning
    scoring='accuracy',
    cv=cv_strategy,
    verbose=2,
    n_jobs=-1,
    random_state=42
)

# -----------------------------
# 7️⃣ Hyperparameter Search
# -----------------------------
print("\n🚀 Starting Advanced XGBoost Hypertuning...")
random_search.fit(X_train, y_train)
print("\n✅ Hypertuning Complete!")
print("Best Hyperparameters:", random_search.best_params_)

# Validation predictions must come from a model that never saw X_valid:
# best_estimator_ is refitted on X_train only. The final pipeline below is
# trained on all of X (which contains X_valid), so scoring it on X_valid
# would just report training accuracy under a different name.
y_valid_pred = random_search.best_estimator_.predict(X_valid)

# -----------------------------
# 8️⃣ Retrain Final Model on Full Train Data
# -----------------------------
# Strip the 'classifier__' pipeline prefix to get plain XGBClassifier kwargs.
best_params = {key.replace('classifier__', ''): value
               for key, value in random_search.best_params_.items()
               if key.startswith('classifier__')}

final_xgb = XGBClassifier(
    **best_params,
    objective='multi:softmax',
    eval_metric='mlogloss',
    tree_method='hist',
    random_state=42
)

final_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', final_xgb)
])

print("\n🏁 Retraining Final Model on Full Training Data...")
final_pipeline.fit(X, y_encoded)

# -----------------------------
# 9️⃣ Evaluate Train & Validation Accuracy
# -----------------------------
# Train accuracy: final model scored on the full data it was fitted on.
# Validation accuracy: search-refitted model scored on the 20% hold-out.
y_train_pred = final_pipeline.predict(X)

print(f"\n🎯 Train Accuracy: {accuracy_score(y_encoded, y_train_pred):.4f}")
print(f"🎯 Validation Accuracy: {accuracy_score(y_valid, y_valid_pred):.4f}")

# -----------------------------
# 🔟 Predict on Test Data & Generate Submission
# -----------------------------
X_test_submission = test_df.drop(columns=['id', 'WeightCategory'], errors='ignore')
test_pred_encoded = final_pipeline.predict(X_test_submission)
# Decode integer predictions back to the original category names.
test_pred_labels = le.inverse_transform(test_pred_encoded)

submission = pd.DataFrame({
    "id": test_df['id'],
    "WeightCategory": test_pred_labels
})

submission.to_csv("submission.csv", index=False)
print("\n✅ submission.csv generated successfully!")


In [None]:
from google.colab import files

# Hand the generated submission file to Colab's download helper.
# Expects submission.csv to have been written by an earlier cell.
submission_path = "submission.csv"
files.download(submission_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>