In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/playground-series-s5e2/sample_submission.csv
/kaggle/input/playground-series-s5e2/train.csv
/kaggle/input/playground-series-s5e2/test.csv
/kaggle/input/playground-series-s5e2/training_extra.csv


In [2]:
train_df = pd.read_csv('/kaggle/input/playground-series-s5e2/train.csv')
test_df = pd.read_csv('/kaggle/input/playground-series-s5e2/test.csv')
train_extra_df = pd.read_csv('/kaggle/input/playground-series-s5e2/training_extra.csv')

In [3]:
train_df.head()

Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),Price
0,0,Jansport,Leather,Medium,7.0,Yes,No,Tote,Black,11.611723,112.15875
1,1,Jansport,Canvas,Small,10.0,Yes,Yes,Messenger,Green,27.078537,68.88056
2,2,Under Armour,Leather,Small,2.0,Yes,No,Messenger,Red,16.64376,39.1732
3,3,Nike,Nylon,Small,8.0,Yes,No,Messenger,Green,12.93722,80.60793
4,4,Adidas,Canvas,Medium,1.0,Yes,Yes,Messenger,Green,17.749338,86.02312


In [4]:
test_df.head()

Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg)
0,300000,Puma,Leather,Small,2.0,No,No,Tote,Green,20.671147
1,300001,Nike,Canvas,Medium,7.0,No,Yes,Backpack,Green,13.564105
2,300002,Adidas,Canvas,Large,9.0,No,Yes,Messenger,Blue,11.809799
3,300003,Adidas,Nylon,Large,1.0,Yes,No,Messenger,Green,18.477036
4,300004,,Nylon,Large,2.0,Yes,Yes,Tote,Black,9.907953


In [5]:
train_df.isna().sum()

id                         0
Brand                   9705
Material                8347
Size                    6595
Compartments               0
Laptop Compartment      7444
Waterproof              7050
Style                   7970
Color                   9950
Weight Capacity (kg)     138
Price                      0
dtype: int64

In [6]:
train_extra_df.isna().sum()

id                           0
Brand                   117053
Material                102615
Size                     81190
Compartments                 0
Laptop Compartment       91089
Waterproof               87274
Style                    96210
Color                   123667
Weight Capacity (kg)      1670
Price                        0
dtype: int64

In [7]:
train_df = pd.concat([train_df, train_extra_df], axis=0, ignore_index=True)

In [8]:
categorical_cols = ['Brand', 'Material', 'Size', 'Style', 'Color']
numerical_cols = ['Compartments', 'Weight Capacity (kg)']

cat_preprocessor = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='None')),
    ('encoder', OneHotEncoder(drop='first'))
])

pipeline = ColumnTransformer([
    ('cat', cat_preprocessor, categorical_cols),
    ('num_imputer', SimpleImputer(strategy='median'), numerical_cols),
])

X_train_transformed = pipeline.fit_transform(train_df.drop(columns=['id', 'Price']))
y_train = train_df['Price']

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X_train_transformed, y_train, test_size=0.2, stratify=y_train, random_state=42)

In [10]:
import time
from functools import wraps

def timeit(func):
    @wraps(func)
    def wrapper(*args, **kwargs):
        start = time.perf_counter()
        result = func(*args, **kwargs)
        print(f"{func.__name__} took {time.perf_counter() - start:.2f} seconds")
        return result
    return wrapper       

In [11]:
@timeit
def train_model_xgb():
    
    xgb = XGBRegressor(objective='reg:squarederror', random_state=42)
    model = xgb.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f"RMSE: {rmse:.4f}")
    
    return model
    
model = train_model_xgb()

RMSE: 38.8965
train_model_xgb took 20.95 seconds


In [12]:
# @timeit
# def train_model_xgb_random_search():

#     param_dist = {
#         'max_depth': [3, 4, 5, 6, 7],
#         'learning_rate': [0.01, 0.05, 0.1, 0.2],
#         'n_estimators': [100, 200, 300],
#         'subsample': [0.5, 0.7, 1.0],
#         'colsample_bytree': [0.5, 0.7, 1.0]
#     }
    
#     xgb = XGBRegressor(objective='reg:squarederror', random_state=42)
    
#     random_search = RandomizedSearchCV(
#         estimator=xgb,
#         param_distributions=param_dist,
#         n_iter=30,
#         cv=5,
#         scoring='neg_mean_squared_error',
#         random_state=42,
#         n_jobs=-1,
#         verbose=1
#     )
    
#     random_search.fit(X_train, y_train)
    
#     best_model = random_search.best_estimator_
    
#     y_pred = best_model.predict(X_test)
#     rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    
#     print("Best Parameters:", random_search.best_params_)
#     print(f"RMSE: {rmse:.4f}")
    
#     return best_model

# hypertuned_model = train_model_xgb_random_search()

In [13]:
@timeit
def train_model_xgb_hypertuned():
    
    xgb = XGBRegressor(objective='reg:squarederror', 
                   random_state=42,
                   subsample=1.0,
                   n_estimators=200,
                   max_depth=7,
                   learning_rate=0.05,
                   colsample_bytree=0.5)
    
    model = xgb.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f"RMSE: {rmse:.4f}")
    
    return model

model = train_model_xgb_hypertuned()

RMSE: 38.8901
train_model_xgb_hypertuned took 51.62 seconds


In [14]:
X_test_transformed = pipeline.transform(test_df.drop('id', axis=1))
y_pred = model.predict(X_test_transformed)

test_df['Price'] = y_pred
test_df[['id', 'Price']].to_csv('/kaggle/working/submission.csv', index=False)