In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

/kaggle/input/playground-series-s5e2/sample_submission.csv
/kaggle/input/playground-series-s5e2/train.csv
/kaggle/input/playground-series-s5e2/test.csv
/kaggle/input/playground-series-s5e2/training_extra.csv


In [2]:
# Read the three competition CSVs in order (train, test, extra training rows)
# and bind each resulting DataFrame to its own name.
train_df, test_df, train_extra_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/playground-series-s5e2/train.csv',
        '/kaggle/input/playground-series-s5e2/test.csv',
        '/kaggle/input/playground-series-s5e2/training_extra.csv',
    )
)

In [3]:
# Preview the first rows of the training frame to inspect its columns and sample values.
train_df.head()

Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),Price
0,0,Jansport,Leather,Medium,7.0,Yes,No,Tote,Black,11.611723,112.15875
1,1,Jansport,Canvas,Small,10.0,Yes,Yes,Messenger,Green,27.078537,68.88056
2,2,Under Armour,Leather,Small,2.0,Yes,No,Messenger,Red,16.64376,39.1732
3,3,Nike,Nylon,Small,8.0,Yes,No,Messenger,Green,12.93722,80.60793
4,4,Adidas,Canvas,Medium,1.0,Yes,Yes,Messenger,Green,17.749338,86.02312


In [4]:
# Preview the first rows of the test frame (same feature columns, no Price target).
test_df.head()

Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg)
0,300000,Puma,Leather,Small,2.0,No,No,Tote,Green,20.671147
1,300001,Nike,Canvas,Medium,7.0,No,Yes,Backpack,Green,13.564105
2,300002,Adidas,Canvas,Large,9.0,No,Yes,Messenger,Blue,11.809799
3,300003,Adidas,Nylon,Large,1.0,Yes,No,Messenger,Green,18.477036
4,300004,,Nylon,Large,2.0,Yes,Yes,Tote,Black,9.907953


In [5]:
# Count missing values per column in the training frame.
train_df.isna().sum()

id                         0
Brand                   9705
Material                8347
Size                    6595
Compartments               0
Laptop Compartment      7444
Waterproof              7050
Style                   7970
Color                   9950
Weight Capacity (kg)     138
Price                      0
dtype: int64

In [6]:
# Count missing values per column in the extra training frame.
train_extra_df.isna().sum()

id                           0
Brand                   117053
Material                102615
Size                     81190
Compartments                 0
Laptop Compartment       91089
Waterproof               87274
Style                    96210
Color                   123667
Weight Capacity (kg)      1670
Price                        0
dtype: int64

In [7]:
# Append the extra training rows to the main training frame; ignore_index=True
# renumbers the rows from 0 so the combined frame has no duplicate index labels.
# NOTE(review): this rebinds train_df from itself, so re-running this cell
# without first reloading the CSVs appends train_extra_df a second time.
train_df = pd.concat([train_df, train_extra_df], axis=0, ignore_index=True)

In [8]:
class FeatureEnginner(BaseEstimator, TransformerMixin):
    """Stateless transformer that appends two engineered columns.

    Columns are addressed by position, so the input must be a DataFrame whose
    first four columns are, in order: two columns to be joined as text, a
    numerator column and a denominator column. In this notebook it is fed
    ``['Brand', 'Material', 'Compartments', 'Weight Capacity (kg)']``.

    Columns added by ``transform``:
        Brand_Material: ``"<col0>_<col1>"`` built with string formatting, so a
            NaN entry is rendered as the text ``nan``.
        Compartments_Weight_Capacity: column 2 divided element-wise by column 3.
    """

    def fit(self, X, y=None):
        """Learn nothing; return ``self`` so the object fits the estimator API."""
        return self

    def transform(self, X, y=None):
        """Return a new DataFrame equal to ``X`` plus the two engineered columns.

        Parameters:
            X: DataFrame with at least four columns in the order described in
                the class docstring.
            y: Ignored; accepted for API compatibility.

        Returns:
            A new DataFrame; ``X`` itself is left unmodified.
        """
        brand_material = np.array([f"{b}_{m}" for b, m in zip(X.iloc[:, 0], X.iloc[:, 1])])
        compartments_weight = X.iloc[:, 2] / X.iloc[:, 3]

        # assign() builds a new frame instead of writing into X, so the caller's
        # data is never mutated and no SettingWithCopyWarning can be raised on
        # the column slice handed over by ColumnTransformer.
        return X.assign(
            Brand_Material=brand_material,
            Compartments_Weight_Capacity=compartments_weight,
        )

In [9]:
# Column groups; each group is routed to its own preprocessing branch below.
categorical_cols = ['Brand', 'Material', 'Style', 'Color']
ordinal_cols = ['Size']
numerical_cols = ['Compartments', 'Weight Capacity (kg)']
feature_eng_cols = ['Brand', 'Material', 'Compartments', 'Weight Capacity (kg)']

# Categorical branch: fill gaps with the literal string 'None', then one-hot
# encode while dropping the first level of every feature.
cat_preprocessor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='None')),
    ('encoder', OneHotEncoder(drop='first')),
])

# Numeric branch: median imputation, standardisation, degree-2 polynomial terms.
num_preprocessor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
])

# Ordinal branch: fill gaps with the most frequent size, then encode using the
# explicit order Small < Medium < Large.
ord_preprocessor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ord', OrdinalEncoder(categories=[['Small', 'Medium', 'Large']])),
])

# Engineered-feature branch: derive the two extra columns, one-hot encode the
# combined text column and pass the ratio column through unchanged.
fe_preprocessor = Pipeline(steps=[
    ('feature_eng', FeatureEnginner()),
    ('encoder', ColumnTransformer(transformers=[
        ('brand_material_enc', OneHotEncoder(drop='first'), ['Brand_Material']),
        ('num_passthrough', 'passthrough', ['Compartments_Weight_Capacity']),
    ])),
])

# Top-level transformer that applies every branch to its column group.
pipeline = ColumnTransformer(transformers=[
    ('cat', cat_preprocessor, categorical_cols),
    ('num', num_preprocessor, numerical_cols),
    ('ord', ord_preprocessor, ordinal_cols),
    ('feature_eng', fe_preprocessor, feature_eng_cols),
])


# Target column, then the fitted/transformed feature matrix (id and target removed).
y_train = train_df['Price']
X_train_transformed = pipeline.fit_transform(train_df.drop(['id', 'Price'], axis=1))

In [10]:
# Hold out 20% of the rows for evaluation (fixed seed for reproducibility).
# The target is read straight from train_df rather than from y_train: this
# cell rebinds y_train to the 80% slice, so splitting y_train itself would fail
# with an inconsistent-length error whenever the cell is re-run on its own.
X_train, X_test, y_train, y_test = train_test_split(
    X_train_transformed, train_df['Price'], test_size=0.2, random_state=42
)

In [11]:
import time
from functools import wraps

def timeit(func):
    """Decorator that prints how long each call to ``func`` takes.

    The wrapped function's return value is passed through unchanged, and its
    metadata (name, docstring) is preserved via ``functools.wraps``. The
    elapsed wall-clock time is printed with two decimals after the call
    returns; nothing is printed if the call raises.
    """
    @wraps(func)
    def timed_call(*args, **kwargs):
        started_at = time.perf_counter()
        outcome = func(*args, **kwargs)
        elapsed = time.perf_counter() - started_at
        print(f"{func.__name__} took {elapsed:.2f} seconds")
        return outcome

    return timed_call

In [12]:
@timeit
def train_lgbm_model():
    """Fit a baseline LGBMRegressor and report its RMSE on the held-out split.

    Reads X_train, y_train, X_test and y_test from the notebook's global
    namespace. Only ``random_state`` is set on the model; every other
    hyperparameter is left at the library default.

    Returns:
        The fitted LGBMRegressor.
    """
    regressor = LGBMRegressor(random_state=42)
    regressor.fit(X_train, y_train)

    holdout_pred = regressor.predict(X_test)
    holdout_rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=holdout_pred))
    print("Test RMSE for LGBMRegressor: {:.4f}".format(holdout_rmse))

    return regressor

model = train_lgbm_model()

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.071167 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1139
[LightGBM] [Info] Number of data points in the train set: 3195454, number of used features: 54
[LightGBM] [Info] Start training from score 81.361311
Test RMSE for LGBMRegressor: 38.8726
train_lgbm_model took 16.73 seconds


In [13]:
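# Disabled experiment: 5-fold grid search over 81 LightGBM hyperparameter
# combinations (3 candidate values for each of 4 parameters). The whole cell is
# commented out, so running the notebook top to bottom does not start the search.
# @timeit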
# def hypertune_lgbm_model():
#     lgbm_param_grid = {
#         'n_estimators': [100, 200, 300],
#         'max_depth': [5, 7, 10],
#         'learning_rate': [0.01, 0.05, 0.1],
#         'num_leaves': [31, 50, 70]
#     }
    
#     lgbm = LGBMRegressor(random_state=42)
    
#     lgbm_grid = GridSearchCV(estimator=lgbm, 
#                              param_grid=lgbm_param_grid, 
#                              cv=5,
#                              scoring='neg_mean_squared_error', 
#                              n_jobs=-1,
#                              verbose=1)
#     lgbm_grid.fit(X_train, y_train)

#     print("\nBest parameters for LGBMRegressor:", lgbm_grid.best_params_)
#     best_lgbm_rmse = np.sqrt(-lgbm_grid.best_score_)
#     print("Best cross-validated RMSE for LGBMRegressor: {:.4f}".format(best_lgbm_rmse))
    
#     y_pred_lgbm = lgbm_grid.predict(X_test)
#     test_rmse_lgbm = np.sqrt(mean_squared_error(y_test, y_pred_lgbm))
#     print("Test RMSE for LGBMRegressor: {:.4f}".format(test_rmse_lgbm))

# hypertune_lgbm_model()

In [14]:
# Refit LightGBM with explicit hyperparameters; this rebinds `model`, replacing
# the baseline model created earlier in the notebook.
model = LGBMRegressor(
    random_state=42,
    learning_rate=0.05,
    n_estimators=300,
    max_depth=7,
    num_leaves=31,
)
model.fit(X_train, y_train)

# Score on the held-out split using the same RMSE report as the baseline run.
y_pred_lgbm = model.predict(X_test)
test_rmse_lgbm = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred_lgbm))
print("Test RMSE for LGBMRegressor: {:.4f}".format(test_rmse_lgbm))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.068707 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1139
[LightGBM] [Info] Number of data points in the train set: 3195454, number of used features: 54
[LightGBM] [Info] Start training from score 81.361311
Test RMSE for LGBMRegressor: 38.8719


In [15]:
# Push the competition test features through the already-fitted preprocessing
# pipeline (transform only, no refit) and predict with the current `model`.
X_test_transformed = pipeline.transform(test_df.drop(columns=['id']))
y_pred = model.predict(X_test_transformed)

# Store the predictions on test_df and write the two-column submission file.
test_df['Price'] = y_pred
test_df.loc[:, ['id', 'Price']].to_csv('/kaggle/working/submission.csv', index=False)