Supercode

In [1]:
import numpy as np
import pandas as pd
import time
import gc
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor

import xgboost as xgb
import lightgbm as lgb
import catboost as cb

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

import optuna
from tqdm import tqdm

# Track execution time: wall-clock reference point that later cells subtract
# from time.time() to report elapsed seconds
start_time = time.time()

In [2]:
# Load datasets
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
sample_submission = pd.read_csv("sample_submission.csv")
extra_train_df = pd.read_csv("training_extra.csv")

# Append the extra training rows to the main training set; ignore_index gives
# the combined frame a fresh 0..n-1 index instead of duplicated labels.
train_df = pd.concat([train_df, extra_train_df], ignore_index=True)

# Display basic info
print("Train Data Info:")
train_df.info()

print("\nTest Data Info:")
test_df.info()

# Check missing values
print("\nMissing Values in Train Data:\n", train_df.isnull().sum())
print("\nMissing Values in Test Data:\n", test_df.isnull().sum())

# Display first few rows (last expression of the cell, so it is rendered)
train_df.head()

Train Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3994318 entries, 0 to 3994317
Data columns (total 11 columns):
 #   Column                Dtype  
---  ------                -----  
 0   id                    int64  
 1   Brand                 object 
 2   Material              object 
 3   Size                  object 
 4   Compartments          float64
 5   Laptop Compartment    object 
 6   Waterproof            object 
 7   Style                 object 
 8   Color                 object 
 9   Weight Capacity (kg)  float64
 10  Price                 float64
dtypes: float64(3), int64(1), object(7)
memory usage: 335.2+ MB

Test Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 10 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    200000 non-null  int64  
 1   Brand                 193773 non-null  object 
 2   Material  

Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),Price
0,0,Jansport,Leather,Medium,7.0,Yes,No,Tote,Black,11.611723,112.15875
1,1,Jansport,Canvas,Small,10.0,Yes,Yes,Messenger,Green,27.078537,68.88056
2,2,Under Armour,Leather,Small,2.0,Yes,No,Messenger,Red,16.64376,39.1732
3,3,Nike,Nylon,Small,8.0,Yes,No,Messenger,Green,12.93722,80.60793
4,4,Adidas,Canvas,Medium,1.0,Yes,Yes,Messenger,Green,17.749338,86.02312


In [3]:
def preprocess_data(train_df, test_df, max_ordinal_cardinality=10):
    """
    Preprocess the data: handle missing values, encode categorical features, and scale numerical features.

    The input frames are not modified, so the calling cell can be re-run safely.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training data with an "id" column, a "Price" target column and the feature columns.
    test_df : pandas.DataFrame
        Test data with an "id" column and the feature columns.
    max_ordinal_cardinality : int, default 10
        Categorical columns with at most this many distinct training values are
        ordinal-encoded; columns with more are one-hot encoded.

    Returns
    -------
    X : array returned by ``ColumnTransformer.fit_transform`` on the training features
    y : pandas.Series
        The "Price" column of ``train_df``.
    test_df : array returned by ``ColumnTransformer.transform`` on the test features
    test_ids : pandas.Series
        The "id" column of ``test_df``.
    preprocessor : sklearn.compose.ColumnTransformer
        The fitted transformer.

    Raises
    ------
    KeyError
        If "id" (either frame) or "Price" (training frame) is missing.
    """
    # Non-mutating drops: the caller's frames keep their "id"/"Price" columns.
    test_ids = test_df["id"]
    y = train_df["Price"]
    X = train_df.drop(columns=["id", "Price"])
    test_features = test_df.drop(columns=["id"])

    # Identify categorical & numerical columns
    categorical_cols = X.select_dtypes(include=["object"]).columns.tolist()
    numerical_cols = X.select_dtypes(include=["number"]).columns.tolist()

    # Imputation values are learned from the training data only and reused for
    # the test set, so both sets are filled consistently.
    fill_values = {}
    for col in categorical_cols:
        modes = X[col].mode()
        if not modes.empty:  # an all-missing column has no mode; leave it as is
            fill_values[col] = modes.iloc[0]
    for col in numerical_cols:
        median = X[col].median()
        if pd.notna(median):  # an all-missing column has no median
            fill_values[col] = median

    X = X.fillna(fill_values)
    test_features = test_features.fillna(fill_values)

    # Encoding strategy: each categorical column goes to exactly one encoder.
    ordinal_cols = [col for col in categorical_cols if X[col].nunique() <= max_ordinal_cardinality]
    onehot_cols = [col for col in categorical_cols if col not in ordinal_cols]

    # Define Column Transformer; each encoder handles its own column group
    # (the encoders are applied side by side, not chained).
    transformers = []
    if numerical_cols:
        transformers.append(("num", StandardScaler(), numerical_cols))
    if ordinal_cols:
        transformers.append((
            "cat_ordinal",
            OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
            ordinal_cols,
        ))
    if onehot_cols:
        transformers.append(("cat_onehot", OneHotEncoder(handle_unknown="ignore"), onehot_cols))

    preprocessor = ColumnTransformer(transformers)

    # Fit on the training features only, then apply the same fitted transform to the test features.
    X_processed = preprocessor.fit_transform(X)
    test_processed = preprocessor.transform(test_features)

    return X_processed, y, test_processed, test_ids, preprocessor

# Apply preprocessing: unpacks the transformed training features, the target,
# the transformed test features, the test ids and the fitted preprocessor
X, y, X_test, test_ids, preprocessor = preprocess_data(train_df, test_df)
print("Preprocessing Done!")

Preprocessing Done!


In [4]:
# Hold out 20% of the training rows for validation; the fixed seed keeps the
# split identical across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# SVR fit time grows more than quadratically with the number of rows, so SVR is
# fitted on a fixed-seed subsample of at most this many training rows. Every
# other model is fitted on all of X_train.
SVR_MAX_TRAIN_ROWS = 20_000

models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(alpha=1),
    "Lasso Regression": Lasso(alpha=0.1),
    # n_jobs=-1 builds the trees in parallel; the forest is unchanged for a fixed random_state.
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=100, random_state=42),
    "SVR": SVR(),
    "MLP Regressor": MLPRegressor(hidden_layer_sizes=(100,), max_iter=500, random_state=42)
}

# Row positions used for the SVR fit (None means "use every row").
# Assumes X_train supports integer-array row indexing (NumPy array or SciPy sparse matrix).
n_train_rows = X_train.shape[0]
svr_rows = None
if n_train_rows > SVR_MAX_TRAIN_ROWS:
    svr_rows = np.sort(
        np.random.default_rng(42).choice(n_train_rows, size=SVR_MAX_TRAIN_ROWS, replace=False)
    )

results = {}
for name, model in tqdm(models.items(), desc="Training Models"):
    if isinstance(model, SVR) and svr_rows is not None:
        # np.asarray makes positional indexing work whether y_train is a Series or an array.
        model.fit(X_train[svr_rows], np.asarray(y_train)[svr_rows])
    else:
        model.fit(X_train, y_train)

    y_pred = model.predict(X_val)
    # RMSE as sqrt(MSE): avoids the `squared=False` argument, which newer
    # scikit-learn releases deprecate and remove.
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    results[name] = rmse
    print(f"{name}: RMSE = {rmse:.4f}")
    # Elapsed time is measured from notebook start (start_time), not per model.
    print(f"Time elapsed: {time.time() - start_time:.2f} seconds")

    # Save submission
    test_preds = model.predict(X_test)
    submission = pd.DataFrame({"id": test_ids, "Price": test_preds})
    submission.to_csv(f"submission_{name}.csv", index=False)
    print(f"Submission file created for {name}.")

Training Models:   0%|          | 0/7 [00:00<?, ?it/s]

Linear Regression: RMSE = 38.9079
Time elapsed: 50.05 seconds


Training Models:  14%|█▍        | 1/7 [00:00<00:04,  1.26it/s]

Submission file created for Linear Regression.
Ridge Regression: RMSE = 38.9079
Time elapsed: 50.53 seconds


Training Models:  29%|██▊       | 2/7 [00:01<00:02,  1.67it/s]

Submission file created for Ridge Regression.
Lasso Regression: RMSE = 38.9083
Time elapsed: 51.18 seconds


Training Models:  43%|████▎     | 3/7 [00:01<00:02,  1.59it/s]

Submission file created for Lasso Regression.
Random Forest: RMSE = 40.5056
Time elapsed: 3501.25 seconds


Training Models:  57%|█████▋    | 4/7 [58:18<1:09:02, 1380.70s/it]

Submission file created for Random Forest.
Gradient Boosting: RMSE = 38.8797
Time elapsed: 4126.02 seconds


Training Models:  71%|███████▏  | 5/7 [1:07:57<36:23, 1091.66s/it]

Submission file created for Gradient Boosting.


In [None]:
def optimize_model(trial, model_name):
    """
    Optuna objective: fit one gradient-boosting model with trial-suggested
    hyperparameters and return its RMSE on the validation split.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.
    model_name : str
        "XGBoost" or "LightGBM"; any other value selects CatBoost.

    Returns
    -------
    float
        Root mean squared error of the fitted model on (X_val, y_val).

    Notes
    -----
    Reads X_train, y_train, X_val and y_val from the notebook's global scope.
    All three models are configured to train on a GPU.
    """
    # Hyperparameters understood by all three libraries.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
    }

    # CatBoostRegressor has no `colsample_bytree` or `reg_alpha` argument, so
    # these are sampled only for the libraries that accept them.
    if model_name in ("XGBoost", "LightGBM"):
        params['colsample_bytree'] = trial.suggest_float('colsample_bytree', 0.5, 1.0)
        params['reg_alpha'] = trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True)

    if model_name == "XGBoost":
        # NOTE(review): newer XGBoost releases deprecate tree_method="gpu_hist" in
        # favour of tree_method="hist" with device="cuda" - confirm against the installed version.
        model = xgb.XGBRegressor(tree_method="gpu_hist", **params)
    elif model_name == "LightGBM":
        # NOTE(review): LightGBM applies `subsample` only when `subsample_freq` > 0 -
        # confirm whether row subsampling is intended to be active here.
        model = lgb.LGBMRegressor(device="gpu", **params)
    else:
        # `subsample` requires a bootstrap type that supports it; Bernoulli does.
        model = cb.CatBoostRegressor(task_type="GPU", bootstrap_type="Bernoulli", verbose=0, **params)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    # RMSE as sqrt(MSE): avoids the `squared=False` argument, which newer
    # scikit-learn releases deprecate and remove.
    return float(np.sqrt(mean_squared_error(y_val, y_pred)))

# Run one Optuna study per model and keep the best hyperparameters of each.
best_params = {}
for model_name in ["XGBoost", "LightGBM", "CatBoost"]:
    # A seeded sampler makes the sequence of suggested hyperparameters repeatable across runs.
    sampler = optuna.samplers.TPESampler(seed=42)
    study = optuna.create_study(direction="minimize", sampler=sampler)
    # The lambda is consumed inside this iteration, so it sees the current model_name.
    study.optimize(lambda trial: optimize_model(trial, model_name), n_trials=20)
    best_params[model_name] = study.best_params
    print(f"Best {model_name} Params:", best_params[model_name])

In [None]:
# Build one estimator per tuned model from the hyperparameters in best_params.
best_models = {}
for name, params in best_params.items():
    if name == "XGBoost":
        best_models[name] = xgb.XGBRegressor(tree_method="gpu_hist", **params)
    elif name == "LightGBM":
        best_models[name] = lgb.LGBMRegressor(device="gpu", **params)
    else:
        # CatBoostRegressor has no `reg_alpha` / `colsample_bytree` arguments, so
        # drop them if present; Bernoulli bootstrap is required for `subsample`.
        cat_params = {
            key: value for key, value in params.items()
            if key not in ("reg_alpha", "colsample_bytree")
        }
        best_models[name] = cb.CatBoostRegressor(
            task_type="GPU", bootstrap_type="Bernoulli", verbose=0, **cat_params
        )

# Refit every tuned model on the full training data and write its submission file.
for name, model in best_models.items():
    model.fit(X, y)
    test_preds = model.predict(X_test)

    submission = pd.DataFrame({"id": test_ids, "Price": test_preds})
    submission.to_csv(f"submission_{name}.csv", index=False)
    print(f"Submission file created for {name}. Time elapsed: {time.time() - start_time:.2f} seconds")

In [None]:
# Report the total wall-clock time since start_time was recorded in the first cell
end_time = time.time()
print(f"\nTotal Execution Time: {end_time - start_time:.2f} seconds")