In [None]:
# Use the %pip magic (not !pip) so the package is installed into the environment
# of the running kernel rather than whatever `pip` is first on the shell PATH.
%pip install catboost

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from catboost import CatBoostRegressor

In [14]:
# Prepare the training and validation data
def data_prep(type_score, random_state=42):
    """Load, standardize and split the training data for one score type.

    Reads ``data/<type_score>/train.csv``, drops the ``username`` column, uses
    the ``type_score`` column as the target and every remaining column as a
    feature.

    Args:
        type_score: Name of the target column; also the name of the
            sub-directory of ``data/`` that holds ``train.csv``.
        random_state: Seed forwarded to ``train_test_split`` so the split is
            reproducible across kernel restarts. Pass ``None`` to get a
            different random split on every call (the previous behaviour).

    Returns:
        Tuple ``(X_train, X_val, y_train, y_val)``. The ``X_*`` values are
        float32 arrays of standardized features; the ``y_*`` values are the
        matching slices of the target column. 5% of the rows go to validation.
    """
    df = pd.read_csv('data/' + type_score + '/train.csv')
    df = df.drop('username', axis=1)

    X = df.drop(type_score, axis=1)
    y = df[type_score]

    # The scaler is fitted on every training row *before* the split, so the
    # validation rows also contribute to the mean/std used for scaling.
    X_scaled = StandardScaler().fit_transform(X).astype('float32')

    # Seeded split: without a fixed random_state the validation set (and every
    # score computed on it) changes on each run.
    X_train, X_val, y_train, y_val = train_test_split(
        X_scaled, y, test_size=0.05, random_state=random_state
    )

    return X_train, X_val, y_train, y_val

In [15]:
# Predict the test set and save the results
def predict(model, type_score):
    """Predict ``type_score`` for the test set and write the result to CSV.

    Reads ``data/<type_score>/test.csv``, standardizes its features with the
    statistics of ``data/<type_score>/train.csv``, runs ``model.predict`` and
    writes ``data/<type_score>/<type_score>.csv`` with two columns (username,
    prediction), no header row and no index.

    Args:
        model: Fitted estimator exposing ``predict(X)``.
        type_score: Name of the target column / data sub-directory.

    Returns:
        None. The only effect is the CSV file written to disk.
    """
    data_dir = 'data/' + type_score + '/'
    df_train = pd.read_csv(data_dir + 'train.csv')
    df_test = pd.read_csv(data_dir + 'test.csv')

    # Rebuild the scaler exactly as data_prep does: fitted on all feature
    # columns of train.csv. Fitting a fresh scaler on the test set would
    # standardize with the test set's own mean/std, so the values fed to the
    # model would not be on the scale it was trained on.
    feature_cols = df_train.drop(['username', type_score], axis=1).columns
    scaler = StandardScaler().fit(df_train[feature_cols])

    usernames = df_test['username']
    # Select by name so the test features are in the training column order.
    X_test = scaler.transform(df_test[feature_cols]).astype('float32')

    y_pred = model.predict(X_test)
    predictions = pd.DataFrame(columns=[type_score], data=y_pred)

    result = pd.concat([usernames, predictions], axis=1)
    result.to_csv(data_dir + type_score + '.csv', header=False, index=False)

In [5]:
# Search space used by every GridSearchCV run in this notebook
# (8 learning rates x 2 depths x 2 regularization strengths = 32 combinations).
param_grid = {
    'learning_rate': np.logspace(-8, -1, 8),  # 1e-8, 1e-7, ..., 1e-1
    'depth': np.arange(6, 8),                 # 6, 7
    'l2_leaf_reg': np.arange(3, 5),           # 3, 4
}

### 1. TBTL

In [27]:
# Load, scale and split the TBTL training data (data/TBTL/train.csv)
X_train, X_val, y_train, y_val = data_prep('TBTL')

In [8]:
# Hyperparameter tuning with GridSearchCV: 5-fold CV over param_grid, scored by R^2
# NOTE(review): the saved output of this cell reports that 40 of 160 fits failed;
# the traceback is truncated, so the cause still needs confirming
# (re-run with error_score='raise' to surface it).
model = CatBoostRegressor(verbose=False, iterations=400)
GS = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='r2',
    cv=5,
    n_jobs=-1,
    verbose=False,
)
GS.fit(X_train, y_train)

40 fits failed out of a total of 160.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\admin\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\admin\AppData\Local\Programs\Python\Python312\Lib\site-packages\catboost\core.py", line 5827, in fit
    return self._fit(X, y, cat_features, text_features, embedding_features, None, sample_weight, None, None, None, None, baseline,
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  Fil

In [9]:
# Best hyperparameter set found by the grid search
best_params = GS.best_params_
best_params

{'depth': 6, 'l2_leaf_reg': 3, 'learning_rate': 0.01}

In [28]:
# Retrain with a higher iteration cap, using the tuned hyperparameters.
# The held-out validation split is passed as eval_set, R2 is the evaluation
# metric, and early stopping is configured at 1000 rounds.
model = CatBoostRegressor(
    iterations=10000,
    depth=best_params['depth'],
    l2_leaf_reg=best_params['l2_leaf_reg'],
    learning_rate=best_params['learning_rate'],
    early_stopping_rounds=1000,
    eval_metric='R2',
    verbose=100,  # log every 100th iteration
)
model.fit(X_train, y_train, eval_set=(X_val, y_val))

0:	learn: 0.0019589	test: -0.0665477	best: -0.0665477 (0)	total: 10.4ms	remaining: 3m 28s
100:	learn: 0.1644178	test: 0.0316604	best: 0.0316604 (100)	total: 850ms	remaining: 2m 47s
200:	learn: 0.2652877	test: 0.0642451	best: 0.0642451 (200)	total: 1.64s	remaining: 2m 41s
300:	learn: 0.3391431	test: 0.0896480	best: 0.0896480 (300)	total: 2.39s	remaining: 2m 36s
400:	learn: 0.3987127	test: 0.1022444	best: 0.1023437 (394)	total: 3.08s	remaining: 2m 30s
500:	learn: 0.4444786	test: 0.1127872	best: 0.1127872 (500)	total: 3.75s	remaining: 2m 26s
600:	learn: 0.4858582	test: 0.1209299	best: 0.1212479 (594)	total: 4.49s	remaining: 2m 25s
700:	learn: 0.5230792	test: 0.1272323	best: 0.1272323 (700)	total: 5.16s	remaining: 2m 22s
800:	learn: 0.5587973	test: 0.1467276	best: 0.1467715 (798)	total: 5.85s	remaining: 2m 20s
900:	learn: 0.5923215	test: 0.1672186	best: 0.1672186 (900)	total: 6.58s	remaining: 2m 19s
1000:	learn: 0.6193709	test: 0.1817289	best: 0.1817289 (1000)	total: 7.26s	remaining: 2m 17

<catboost.core.CatBoostRegressor at 0x1819bee4620>

In [29]:
# Predict the TBTL test set and write data/TBTL/TBTL.csv
predict(model, 'TBTL')

### 2. TH

In [30]:
# Load, scale and split the TH training data (data/TH/train.csv)
X_train, X_val, y_train, y_val = data_prep('TH')

In [31]:
# Hyperparameter tuning with GridSearchCV: 5-fold CV over param_grid, scored by R^2
# NOTE(review): the saved output of this cell reports that 40 of 160 fits failed;
# the traceback is truncated, so the cause still needs confirming
# (re-run with error_score='raise' to surface it).
model = CatBoostRegressor(verbose=False, iterations=400)
GS = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='r2',
    cv=5,
    n_jobs=-1,
    verbose=False,
)
GS.fit(X_train, y_train)

40 fits failed out of a total of 160.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\admin\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\admin\AppData\Local\Programs\Python\Python312\Lib\site-packages\catboost\core.py", line 5827, in fit
    return self._fit(X, y, cat_features, text_features, embedding_features, None, sample_weight, None, None, None, None, baseline,
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  Fil

In [32]:
# Best hyperparameter set found by the grid search
best_params = GS.best_params_
best_params

{'depth': 7, 'l2_leaf_reg': 3, 'learning_rate': 0.1}

In [33]:
# Retrain with a higher iteration cap, using the tuned hyperparameters.
# The held-out validation split is passed as eval_set, R2 is the evaluation
# metric, and early stopping is configured at 1000 rounds.
model = CatBoostRegressor(
    iterations=10000,
    depth=best_params['depth'],
    l2_leaf_reg=best_params['l2_leaf_reg'],
    learning_rate=best_params['learning_rate'],
    early_stopping_rounds=1000,
    eval_metric='R2',
    verbose=100,  # log every 100th iteration
)
model.fit(X_train, y_train, eval_set=(X_val, y_val))

0:	learn: 0.0471269	test: 0.0430284	best: 0.0430284 (0)	total: 14.4ms	remaining: 2m 23s
100:	learn: 0.7486626	test: 0.6176568	best: 0.6176568 (100)	total: 1.51s	remaining: 2m 28s
200:	learn: 0.8802285	test: 0.7053130	best: 0.7053130 (200)	total: 2.79s	remaining: 2m 16s
300:	learn: 0.9337987	test: 0.7224610	best: 0.7224767 (299)	total: 4.15s	remaining: 2m 13s
400:	learn: 0.9568348	test: 0.7254677	best: 0.7266719 (385)	total: 5.64s	remaining: 2m 15s
500:	learn: 0.9691412	test: 0.7350455	best: 0.7359178 (494)	total: 6.99s	remaining: 2m 12s
600:	learn: 0.9753751	test: 0.7406285	best: 0.7406285 (600)	total: 8.46s	remaining: 2m 12s
700:	learn: 0.9787827	test: 0.7450022	best: 0.7450022 (700)	total: 10.1s	remaining: 2m 13s
800:	learn: 0.9806255	test: 0.7470437	best: 0.7471029 (799)	total: 11.5s	remaining: 2m 12s
900:	learn: 0.9816218	test: 0.7482260	best: 0.7482762 (895)	total: 13.1s	remaining: 2m 12s
1000:	learn: 0.9821971	test: 0.7493316	best: 0.7494332 (973)	total: 14.6s	remaining: 2m 11s
1

<catboost.core.CatBoostRegressor at 0x1819bf6a4b0>

In [34]:
# Predict the TH test set and write data/TH/TH.csv
predict(model, 'TH')

### 3. QT

In [33]:
# Load, scale and split the QT training data (data/QT/train.csv)
X_train, X_val, y_train, y_val = data_prep('QT')

In [37]:
# Hyperparameter tuning with GridSearchCV: 5-fold CV over param_grid, scored by R^2
model = CatBoostRegressor(verbose=False, iterations=400)
GS = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='r2',
    cv=5,
    n_jobs=-1,
    verbose=False,
)
GS.fit(X_train, y_train)

In [38]:
# Best hyperparameter set found by the grid search
best_params = GS.best_params_
best_params

{'depth': 7, 'l2_leaf_reg': 4, 'learning_rate': 0.01}

In [39]:
# Retrain with a higher iteration cap, using the tuned hyperparameters.
# The held-out validation split is passed as eval_set, R2 is the evaluation
# metric, and early stopping is configured at 1000 rounds.
model = CatBoostRegressor(
    iterations=10000,
    depth=best_params['depth'],
    l2_leaf_reg=best_params['l2_leaf_reg'],
    learning_rate=best_params['learning_rate'],
    early_stopping_rounds=1000,
    eval_metric='R2',
    verbose=100,  # log every 100th iteration
)
model.fit(X_train, y_train, eval_set=(X_val, y_val))

0:	learn: 0.0023068	test: -0.0563206	best: -0.0563206 (0)	total: 15ms	remaining: 2m 30s
100:	learn: 0.1745322	test: 0.0136295	best: 0.0136295 (100)	total: 1.45s	remaining: 2m 21s
200:	learn: 0.2925811	test: 0.0307762	best: 0.0312356 (194)	total: 2.68s	remaining: 2m 10s
300:	learn: 0.3786109	test: 0.0425567	best: 0.0429626 (299)	total: 3.74s	remaining: 2m
400:	learn: 0.4442069	test: 0.0465884	best: 0.0483084 (395)	total: 4.88s	remaining: 1m 56s
500:	learn: 0.5025219	test: 0.0436609	best: 0.0483084 (395)	total: 6.13s	remaining: 1m 56s
600:	learn: 0.5518448	test: 0.0458519	best: 0.0483084 (395)	total: 7.3s	remaining: 1m 54s
700:	learn: 0.5950416	test: 0.0430030	best: 0.0483084 (395)	total: 8.68s	remaining: 1m 55s
800:	learn: 0.6368808	test: 0.0426572	best: 0.0483084 (395)	total: 9.83s	remaining: 1m 52s
900:	learn: 0.6751273	test: 0.0333622	best: 0.0483084 (395)	total: 11.1s	remaining: 1m 52s
1000:	learn: 0.7104479	test: 0.0252582	best: 0.0483084 (395)	total: 12.3s	remaining: 1m 50s
1100:	

<catboost.core.CatBoostRegressor at 0x18d918f8560>

In [40]:
# Predict the QT test set and write data/QT/QT.csv
predict(model, 'QT')

### 4. CK

In [41]:
# Load, scale and split the CK training data (data/CK/train.csv)
X_train, X_val, y_train, y_val = data_prep('CK')

In [45]:
# Hyperparameter tuning with GridSearchCV: 5-fold CV over param_grid, scored by R^2
model = CatBoostRegressor(verbose=False, iterations=400)
GS = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='r2',
    cv=5,
    n_jobs=-1,
    verbose=False,
)
GS.fit(X_train, y_train)

In [46]:
# Best hyperparameter set found by the grid search
best_params = GS.best_params_
best_params

{'depth': 7, 'l2_leaf_reg': 4, 'learning_rate': 0.01}

In [47]:
# Retrain with a higher iteration cap, using the tuned hyperparameters.
# The held-out validation split is passed as eval_set, R2 is the evaluation
# metric, and early stopping is configured at 1000 rounds.
model = CatBoostRegressor(
    iterations=10000,
    depth=best_params['depth'],
    l2_leaf_reg=best_params['l2_leaf_reg'],
    learning_rate=best_params['learning_rate'],
    early_stopping_rounds=1000,
    eval_metric='R2',
    verbose=100,  # log every 100th iteration
)
model.fit(X_train, y_train, eval_set=(X_val, y_val))

0:	learn: 0.0028853	test: -0.0200396	best: -0.0200396 (0)	total: 13.7ms	remaining: 2m 17s
100:	learn: 0.2112657	test: 0.1000500	best: 0.1000500 (100)	total: 1.55s	remaining: 2m 31s
200:	learn: 0.3408863	test: 0.1581473	best: 0.1581473 (200)	total: 2.69s	remaining: 2m 11s
300:	learn: 0.4321182	test: 0.1818821	best: 0.1818833 (298)	total: 3.99s	remaining: 2m 8s
400:	learn: 0.5022476	test: 0.2049790	best: 0.2051372 (399)	total: 5.05s	remaining: 2m
500:	learn: 0.5581715	test: 0.2159180	best: 0.2162802 (498)	total: 6.27s	remaining: 1m 58s
600:	learn: 0.6061620	test: 0.2254111	best: 0.2255482 (597)	total: 7.46s	remaining: 1m 56s
700:	learn: 0.6438953	test: 0.2376683	best: 0.2377919 (699)	total: 8.67s	remaining: 1m 54s
800:	learn: 0.6764504	test: 0.2477411	best: 0.2478232 (798)	total: 9.76s	remaining: 1m 52s
900:	learn: 0.7047304	test: 0.2540833	best: 0.2540904 (897)	total: 10.9s	remaining: 1m 50s
1000:	learn: 0.7307675	test: 0.2585942	best: 0.2591278 (997)	total: 12s	remaining: 1m 47s
1100:	

<catboost.core.CatBoostRegressor at 0x18d918f9a30>

In [48]:
# Predict the CK test set and write data/CK/CK.csv
predict(model, 'CK')