In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [2]:
df = pd.read_csv('/content/drive/MyDrive/khdl/luxury_watches_featured.csv')
df.columns

Index(['Brand', 'Model', 'Case Material', 'Strap Material', 'Movement Type',
       'Water Resistance', 'Case Diameter', 'Case Thickness', 'Band Width',
       'Dial Color', 'Crystal Material', 'Complications', 'Power Reserve',
       'Price', 'Complication_Score', 'Luxury_Index', 'Movement_Complexity',
       'Case_Proportion', 'Dial_Score', 'Material_Match', 'Has_Power_Reserve',
       'Case_Size_Category', 'Has_Complication', 'Brand_Tier_encoded',
       'Water_Tier_encoded', 'Brand_Tier_Mid-Range', 'Brand_Tier_Other',
       'Brand_Tier_Very High-End', 'Water_Tier_Professional',
       'Water_Tier_Standard'],
      dtype='object')

In [3]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [4]:
import catboost
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, make_scorer
from sklearn.model_selection import GridSearchCV

In [5]:
X = df.drop(columns=['Complications', 'Price'])
y = df['Price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

categorical_columns = [
    'Brand', 'Model', 'Case Material', 'Strap Material', 'Movement Type',
    'Dial Color', 'Crystal Material'
]

categorical_columns = X.select_dtypes(include='object').columns.tolist()

model = CatBoostRegressor(iterations=200, depth=6, learning_rate=0.1, loss_function='RMSE', cat_features=categorical_columns)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f'R2 Score: {r2:.4f}')
print(f'RMSE: {rmse:.4f}')

0:	learn: 8034.5955757	total: 68.8ms	remaining: 13.7s
1:	learn: 7521.4533005	total: 89.1ms	remaining: 8.82s
2:	learn: 7046.8499299	total: 105ms	remaining: 6.88s
3:	learn: 6607.9416400	total: 120ms	remaining: 5.87s
4:	learn: 6218.0366910	total: 132ms	remaining: 5.15s
5:	learn: 5908.3116853	total: 150ms	remaining: 4.87s
6:	learn: 5536.2976217	total: 168ms	remaining: 4.64s
7:	learn: 5253.5221733	total: 182ms	remaining: 4.38s
8:	learn: 4955.1817571	total: 193ms	remaining: 4.1s
9:	learn: 4700.4951498	total: 203ms	remaining: 3.85s
10:	learn: 4478.8873403	total: 219ms	remaining: 3.76s
11:	learn: 4274.6552214	total: 230ms	remaining: 3.6s
12:	learn: 4087.3069314	total: 242ms	remaining: 3.49s
13:	learn: 3923.8780262	total: 257ms	remaining: 3.42s
14:	learn: 3791.8672034	total: 271ms	remaining: 3.34s
15:	learn: 3651.8636364	total: 289ms	remaining: 3.32s
16:	learn: 3529.4204552	total: 305ms	remaining: 3.28s
17:	learn: 3419.9298340	total: 317ms	remaining: 3.2s
18:	learn: 3304.1668717	total: 328ms	re

In [6]:
param_grid = {
    'iterations': [200, 500],
    'depth': [4, 6, 8],
    'learning_rate': [0.01, 0.05, 0.1]
}

grid_search = GridSearchCV(estimator=CatBoostRegressor(loss_function='RMSE', cat_features=categorical_columns),
                           param_grid=param_grid,
                           scoring='neg_mean_squared_error',
                           cv=3)

grid_search.fit(X_train, y_train)
print(f"Best Parameters: {grid_search.best_params_}")

[1;30;43mKết quả truyền trực tuyến bị cắt bớt đến 5000 dòng cuối.[0m
101:	learn: 1301.9456504	total: 2.11s	remaining: 2.03s
102:	learn: 1298.9107367	total: 2.14s	remaining: 2.01s
103:	learn: 1291.9101382	total: 2.16s	remaining: 2s
104:	learn: 1284.7561665	total: 2.19s	remaining: 1.98s
105:	learn: 1282.5732507	total: 2.21s	remaining: 1.96s
106:	learn: 1271.5305548	total: 2.22s	remaining: 1.93s
107:	learn: 1266.0240516	total: 2.25s	remaining: 1.91s
108:	learn: 1258.7479196	total: 2.27s	remaining: 1.9s
109:	learn: 1247.7045094	total: 2.29s	remaining: 1.88s
110:	learn: 1240.1224127	total: 2.31s	remaining: 1.85s
111:	learn: 1236.2840546	total: 2.34s	remaining: 1.84s
112:	learn: 1231.9334284	total: 2.36s	remaining: 1.82s
113:	learn: 1229.4143595	total: 2.39s	remaining: 1.8s
114:	learn: 1220.7517521	total: 2.41s	remaining: 1.78s
115:	learn: 1216.1424209	total: 2.43s	remaining: 1.76s
116:	learn: 1212.5176986	total: 2.45s	remaining: 1.74s
117:	learn: 1211.1172534	total: 2.47s	remaining: 1.72s

In [7]:
best_model = CatBoostRegressor(iterations=200, depth=6, learning_rate=0.1, loss_function='RMSE', cat_features=categorical_columns)

best_model.fit(X_train, y_train)

y_pred = best_model.predict(X_test)

r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)

print(f'R2 Score: {r2:.4f}')
print(f'RMSE: {rmse:.4f}')
print(f'MAE: {mae:.4f}')

0:	learn: 8034.5955757	total: 4.43ms	remaining: 882ms
1:	learn: 7521.4533005	total: 8.12ms	remaining: 804ms
2:	learn: 7046.8499299	total: 12ms	remaining: 785ms
3:	learn: 6607.9416400	total: 15.6ms	remaining: 765ms
4:	learn: 6218.0366910	total: 19.3ms	remaining: 754ms
5:	learn: 5908.3116853	total: 22.7ms	remaining: 735ms
6:	learn: 5536.2976217	total: 26.7ms	remaining: 736ms
7:	learn: 5253.5221733	total: 29.8ms	remaining: 715ms
8:	learn: 4955.1817571	total: 33ms	remaining: 701ms
9:	learn: 4700.4951498	total: 36.6ms	remaining: 695ms
10:	learn: 4478.8873403	total: 40.2ms	remaining: 691ms
11:	learn: 4274.6552214	total: 43.4ms	remaining: 680ms
12:	learn: 4087.3069314	total: 46.8ms	remaining: 674ms
13:	learn: 3923.8780262	total: 49.9ms	remaining: 663ms
14:	learn: 3791.8672034	total: 53.4ms	remaining: 658ms
15:	learn: 3651.8636364	total: 56.8ms	remaining: 653ms
16:	learn: 3529.4204552	total: 60.2ms	remaining: 648ms
17:	learn: 3419.9298340	total: 63.5ms	remaining: 642ms
18:	learn: 3304.1668717	