In [1]:
!mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd
!sudo apt install nvidia-driver-460 nvidia-cuda-toolkit clinfo
!apt-get update --fix-missing
!pip install -q  lightgbm==4.1.0 \
  --config-settings=cmake.define.USE_GPU=ON \
  --config-settings=cmake.define.OpenCL_INCLUDE_DIR="/usr/local/cuda/include/" \
  --config-settings=cmake.define.OpenCL_LIBRARY="/usr/local/cuda/lib64/libOpenCL.so"




clinfo is already the newest version (3.0.21.02.21-1).
Some packages could not be installed. This may mean that you have
requested an impossible situation or if you are using the unstable
distribution that some required packages have not yet been created
or been moved out of Incoming.
The following information may help to resolve the situation:

The following packages have unmet dependencies:
 libnvidia-compute-510 : Depends: libnvidia-compute-535 but it is not installable
 nvidia-cuda-dev : Breaks: libcuda1 (< 495)
                   Recommends: libnvcuvid1 but it is not installable
E: Unable to correct problems, you have held broken packages.
Get:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:3 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge
from sklearn.ensemble import StackingRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

/kaggle/input/playground-series-s5e2/sample_submission.csv
/kaggle/input/playground-series-s5e2/train.csv
/kaggle/input/playground-series-s5e2/test.csv
/kaggle/input/playground-series-s5e2/training_extra.csv


In [3]:
# Read the three competition CSVs in one pass: main training set, test set,
# and the supplementary training rows (same order as the targets on the left).
train_df, test_df, train_extra_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/playground-series-s5e2/train.csv',
        '/kaggle/input/playground-series-s5e2/test.csv',
        '/kaggle/input/playground-series-s5e2/training_extra.csv',
    )
)

In [4]:
# Stack the supplementary rows under the main training rows and renumber the
# index from 0. Note: train_df is rebound to the combined frame, so re-running
# this cell without reloading the CSVs appends train_extra_df a second time.
train_df = pd.concat(
    objs=[train_df, train_extra_df],
    axis='index',
    ignore_index=True,
)

In [5]:
# Per-column count of missing values in train_df, shown as the cell's output.
missing_per_column = train_df.isna().sum()
missing_per_column

id                           0
Brand                   126758
Material                110962
Size                     87785
Compartments                 0
Laptop Compartment       98533
Waterproof               94324
Style                   104180
Color                   133617
Weight Capacity (kg)      1808
Price                        0
dtype: int64

In [6]:
import time
from functools import wraps

def timeit(func):
    """Decorator that prints how long each call to ``func`` takes.

    The wrapped function's arguments and return value pass through untouched.
    After the call returns, one line ``"<name> took <seconds> seconds"`` is
    printed, with the duration measured by ``time.perf_counter`` and shown to
    two decimals. If ``func`` raises, the exception propagates and nothing is
    printed.
    """
    @wraps(func)
    def timed(*args, **kwargs):
        t0 = time.perf_counter()
        outcome = func(*args, **kwargs)
        elapsed = time.perf_counter() - t0
        print(f"{func.__name__} took {elapsed:.2f} seconds")
        return outcome

    return timed

In [7]:
@timeit
def preprocess_data(df: pd.DataFrame) -> pd.DataFrame:
    """Turn a raw feature frame into a one-hot encoded model matrix.

    Steps, applied to a copy so ``df`` itself is left untouched:
      1. Fill gaps in the listed categorical columns with the literal string 'NaN'.
      2. Add ``Brand_Material`` (brand and material joined by '_') and
         ``Weight_Capacity_Per_Compartments`` (weight capacity / compartments).
      3. One-hot encode every object-dtype column, dropping the first level of each.

    Args:
        df: Frame containing at least 'Brand', 'Material', 'Compartments' and
            'Weight Capacity (kg)'.

    Returns:
        The encoded frame. The dummy columns depend on the categories present
        in ``df``, so two different inputs can yield different column sets.
    """
    gap_filled_columns = (
        'Material', 'Style', 'Brand', 'Size', 'Waterproof', 'Color', 'Laptop Compartment',
    )
    features = df.copy().fillna({column: 'NaN' for column in gap_filled_columns})

    # assign() appends the two engineered columns in keyword order.
    features = features.assign(
        Brand_Material=features['Brand'] + '_' + features['Material'],
        Weight_Capacity_Per_Compartments=(
            features['Weight Capacity (kg)'] / features['Compartments']
        ),
    )

    object_columns = features.select_dtypes(include=['object']).columns
    return pd.get_dummies(features, columns=object_columns, drop_first=True)


# Target vector, then the encoded feature matrix built from every column
# except the row identifier and the target itself.
y = train_df['Price']
X = preprocess_data(train_df.drop(columns=['id', 'Price']))

preprocess_data took 8.90 seconds


In [8]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [9]:
@timeit
def train_stacking_model():
    """Fit a stacked XGBoost + LightGBM regressor and print its hold-out RMSE.

    Reads the notebook-level ``X_train`` / ``y_train`` for fitting and
    ``X_test`` / ``y_test`` for evaluation, so it must run after the split
    cell and before anything rebinds those names.

    Returns:
        The fitted ``StackingRegressor`` (Ridge meta-model on top of the two
        gradient-boosting base models).
    """
    N_ITERATIONS = 1000
    RANDOM_STATE = 42

    estimators = [
        ('xgb', XGBRegressor(
            random_state=RANDOM_STATE,
            n_estimators=N_ITERATIONS,
            # `device` is what selects the GPU; it is combined with the plain
            # 'hist' tree method. The older 'gpu_hist' / predictor='gpu_predictor'
            # spellings are deprecated duplicates of the same request and only
            # produce warnings alongside `device`.
            tree_method='hist',
            device='cuda:0',
            verbosity=0
        )),
        ('lgbm', LGBMRegressor(
            random_state=RANDOM_STATE,
            n_estimators=N_ITERATIONS,
            device='gpu',
            gpu_platform_id=0,
            # NOTE(review): LightGBM targets OpenCL device 1 while XGBoost uses
            # cuda:0 -- presumably a two-GPU session; confirm the device exists.
            gpu_device_id=1,
            verbose=-1
        ))
    ]

    meta_model = Ridge(random_state=RANDOM_STATE, alpha=0.1)

    # cv=5: the meta-model is trained on 5-fold out-of-fold base predictions.
    # passthrough=False: it sees only those predictions, not the raw features.
    stacking_regressor = StackingRegressor(
        estimators=estimators,
        final_estimator=meta_model,
        cv=5,
        passthrough=False
    )

    stacking_regressor.fit(X_train, y_train)

    y_pred = stacking_regressor.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f"test RMSE for Stacking Regressor: {rmse:.4f}")

    return stacking_regressor

model = train_stacking_model()

test RMSE for Stacking Regressor: 38.8562
train_stacking_model took 640.71 seconds


In [10]:
# Encode the competition test set under its own name so the hold-out split
# (X_test / y_test) used for validation is not overwritten.
X_submission = preprocess_data(test_df.drop(['id'], axis=1))

# One-hot encoding is derived independently from each frame, so align to the exact
# columns (and order) the model was fitted on: dummy columns the test set did
# not produce are added as 0, and columns unknown to the model are dropped.
X_submission = X_submission.reindex(columns=X_train.columns, fill_value=0)
y_pred = model.predict(X_submission)

# Build the submission in a fresh frame instead of adding 'Price' to test_df;
# mutating test_df would feed the prediction back in as a feature if this cell
# were executed again.
submission_df = pd.DataFrame({'id': test_df['id'], 'Price': y_pred})
submission_df.to_csv('/kaggle/working/submission.csv', index=False)

preprocess_data took 0.28 seconds
