In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s5e2/sample_submission.csv
/kaggle/input/playground-series-s5e2/train.csv
/kaggle/input/playground-series-s5e2/test.csv
/kaggle/input/playground-series-s5e2/training_extra.csv


In [2]:
# Read the competition's training split (features plus the Price target).
# Adjust the path below if train.csv lives somewhere else.
df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/playground-series-s5e2/train.csv",
)

In [3]:
# Read the competition's test split (same feature columns, no Price column).
# Adjust the path below if test.csv lives somewhere else.
test_data = pd.read_csv(
    filepath_or_buffer="/kaggle/input/playground-series-s5e2/test.csv",
)

In [4]:
# Preview the first five rows of the raw test frame.
test_data.head(n=5)

Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg)
0,300000,Puma,Leather,Small,2.0,No,No,Tote,Green,20.671147
1,300001,Nike,Canvas,Medium,7.0,No,Yes,Backpack,Green,13.564105
2,300002,Adidas,Canvas,Large,9.0,No,Yes,Messenger,Blue,11.809799
3,300003,Adidas,Nylon,Large,1.0,Yes,No,Messenger,Green,18.477036
4,300004,,Nylon,Large,2.0,Yes,Yes,Tote,Black,9.907953


In [5]:
# Preview the first five rows of the raw training frame.
df.head(n=5)

Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),Price
0,0,Jansport,Leather,Medium,7.0,Yes,No,Tote,Black,11.611723,112.15875
1,1,Jansport,Canvas,Small,10.0,Yes,Yes,Messenger,Green,27.078537,68.88056
2,2,Under Armour,Leather,Small,2.0,Yes,No,Messenger,Red,16.64376,39.1732
3,3,Nike,Nylon,Small,8.0,Yes,No,Messenger,Green,12.93722,80.60793
4,4,Adidas,Canvas,Medium,1.0,Yes,Yes,Messenger,Green,17.749338,86.02312


In [6]:
# Summarise the test frame: row count, column dtypes and non-null counts.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 10 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    200000 non-null  int64  
 1   Brand                 193773 non-null  object 
 2   Material              194387 non-null  object 
 3   Size                  195619 non-null  object 
 4   Compartments          200000 non-null  float64
 5   Laptop Compartment    195038 non-null  object 
 6   Waterproof            195189 non-null  object 
 7   Style                 194847 non-null  object 
 8   Color                 193215 non-null  object 
 9   Weight Capacity (kg)  199923 non-null  float64
dtypes: float64(2), int64(1), object(7)
memory usage: 15.3+ MB


In [7]:
# Summarise the training frame: number of rows and columns, dtypes and non-null counts.
df.info()  

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 11 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    300000 non-null  int64  
 1   Brand                 290295 non-null  object 
 2   Material              291653 non-null  object 
 3   Size                  293405 non-null  object 
 4   Compartments          300000 non-null  float64
 5   Laptop Compartment    292556 non-null  object 
 6   Waterproof            292950 non-null  object 
 7   Style                 292030 non-null  object 
 8   Color                 290050 non-null  object 
 9   Weight Capacity (kg)  299862 non-null  float64
 10  Price                 300000 non-null  float64
dtypes: float64(3), int64(1), object(7)
memory usage: 25.2+ MB


In [8]:
# Count the missing entries in each column of the training frame.
df.isna().sum()

id                         0
Brand                   9705
Material                8347
Size                    6595
Compartments               0
Laptop Compartment      7444
Waterproof              7050
Style                   7970
Color                   9950
Weight Capacity (kg)     138
Price                      0
dtype: int64

In [9]:
# Count the missing entries in each column of the test frame.
test_data.isna().sum()

id                         0
Brand                   6227
Material                5613
Size                    4381
Compartments               0
Laptop Compartment      4962
Waterproof              4811
Style                   5153
Color                   6785
Weight Capacity (kg)      77
dtype: int64

In [10]:
# Stack the training rows on top of the test rows under a fresh 0..n-1 index so
# that the following cleaning steps can treat both splits in a single pass.
# The test rows have no Price column, so Price is NaN for them.
combined_data = pd.concat(objs=[df, test_data], axis=0, ignore_index=True)

In [11]:
# Fill gaps in every text (object-dtype) column with that column's most frequent value.
categorical_cols = combined_data.select_dtypes(include=[object]).columns
# Series.mode() can return several tied values; position 0 is the one that gets used.
most_frequent = {name: combined_data[name].mode()[0] for name in categorical_cols}
combined_data = combined_data.fillna(value=most_frequent)

In [12]:
# Impute missing numeric feature values with the column mean (plain pandas
# fillna; no SimpleImputer is involved). The mean is computed over the train and
# test rows together because combined_data holds both.
# 'Price' is the target and 'id' is a row identifier, so neither is treated as a
# numeric feature: leaving them out keeps them away from mean imputation and
# from any later step that operates on numeric_cols.
numeric_cols = combined_data.select_dtypes(include=[np.number]).columns.difference(['Price', 'id'])
combined_data[numeric_cols] = combined_data[numeric_cols].fillna(combined_data[numeric_cols].mean())

In [13]:
# Replace each categorical column's strings with integer codes, keeping the
# fitted encoder per column so the mapping can be looked up or inverted later.
label_encoders = {name: LabelEncoder().fit(combined_data[name]) for name in categorical_cols}
for name, encoder in label_encoders.items():
    combined_data[name] = encoder.transform(combined_data[name])

In [14]:
# Undo the earlier concatenation: the first len(df) rows are the training rows,
# everything after them is the test set, whose Price column is dropped again.
n_train_rows = len(df)
train_data = combined_data.iloc[:n_train_rows]
test_data = combined_data.iloc[n_train_rows:].drop('Price', axis=1, errors='ignore')

In [15]:
# Display the combined frame after imputation and encoding; the rendered table
# is truncated in the middle, and Price is empty for the appended test rows.
combined_data

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),Price
0,0,1,1,1,7.0,1,0,2,0,11.611723,112.15875
1,1,1,0,2,10.0,1,1,1,3,27.078537,68.88056
2,2,4,1,2,2.0,1,0,1,5,16.643760,39.17320
3,3,2,2,2,8.0,1,0,1,3,12.937220,80.60793
4,4,0,0,1,1.0,1,1,1,3,17.749338,86.02312
...,...,...,...,...,...,...,...,...,...,...,...
499995,499995,0,0,0,2.0,1,0,1,5,7.383498,
499996,499996,2,3,2,9.0,0,1,1,4,6.058394,
499997,499997,1,2,2,9.0,0,1,2,3,26.890163,
499998,499998,3,2,0,10.0,1,0,2,2,25.769153,


In [16]:
# Spot-check the encoded training rows (compare with the test preview that follows).
train_data.head(n=5)

Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),Price
0,0,1,1,1,7.0,1,0,2,0,11.611723,112.15875
1,1,1,0,2,10.0,1,1,1,3,27.078537,68.88056
2,2,4,1,2,2.0,1,0,1,5,16.64376,39.1732
3,3,2,2,2,8.0,1,0,1,3,12.93722,80.60793
4,4,0,0,1,1.0,1,1,1,3,17.749338,86.02312


In [17]:
# Spot-check the encoded test rows; their index continues after the training rows.
test_data.head(n=5)

Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg)
300000,300000,3,1,2,2.0,0,0,2,3,20.671147
300001,300001,2,0,1,7.0,0,1,0,3,13.564105
300002,300002,0,0,0,9.0,0,1,1,1,11.809799
300003,300003,0,2,0,1.0,1,0,1,3,18.477036
300004,300004,0,2,0,2.0,1,1,2,0,9.907953


In [18]:
# Separate predictors from the target: X keeps every column except 'Price'
# (the 'id' column is still among them), y is the 'Price' column itself.
X = train_data.copy()
y = X.pop('Price')

In [19]:
# Hold out 20% of the labelled rows as a validation set for local scoring.
# The fixed seed keeps the partition reproducible. The later cells rely on the
# X_train / X_val / y_train / y_val names produced here.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [20]:
# Standardise the columns listed in numeric_cols to zero mean / unit variance.
# The scaler learns its statistics from the training fold only; the same
# transformation is then applied to the training, validation and test frames.
scaler = StandardScaler().fit(X_train[numeric_cols])
for frame in (X_train, X_val, test_data):
    frame[numeric_cols] = scaler.transform(frame[numeric_cols])

In [21]:
# Gradient-boosted tree regressor with a fixed set of hyperparameters; no
# parameter search is run in the cells shown here.
xgb_params = {
    "n_estimators": 500,              # number of boosting rounds
    "learning_rate": 0.05,            # shrinkage applied to each new tree
    "max_depth": 6,                   # maximum depth of every tree
    "subsample": 0.8,                 # fraction of rows sampled per tree
    "colsample_bytree": 0.8,          # fraction of columns sampled per tree
    "objective": "reg:squarederror",  # squared-error regression loss
    "random_state": 42,               # fixed seed for reproducible sampling
}
xgb_model = XGBRegressor(**xgb_params)

In [22]:
# Fit the regressor on the training fold (features X_train, target y_train).
xgb_model.fit(X=X_train, y=y_train)

In [23]:
# Predict Price for the held-out validation fold.
y_pred = xgb_model.predict(X=X_val)

In [24]:
# Score the validation predictions: RMSE is the square root of the mean squared error.
val_mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(val_mse)
print(f"XGBoost RMSE: {rmse:.4f}")

XGBoost RMSE: 38.9898


In [25]:
# Predict Price for every row of the preprocessed test frame.
test_predictions = xgb_model.predict(X=test_data)

In [26]:
# Build the submission table (columns: id, Price) and write it to submission.csv.
# The ids are re-read from the raw test CSV instead of being taken from
# test_data.index. That index is only a positional label left over from the
# train/test concatenation, and it matches the real ids solely because they
# happen to be contiguous and to start right after the last training row.
submission_ids = pd.read_csv(
    "/kaggle/input/playground-series-s5e2/test.csv", usecols=["id"]
)["id"].to_numpy()
# Guard against rows having been dropped or duplicated during preprocessing.
assert len(submission_ids) == len(test_predictions), "id/prediction count mismatch"
submission = pd.DataFrame({"id": submission_ids, "Price": test_predictions})
submission.to_csv("submission.csv", index=False)
print("Submitted")

Submitted
