In [50]:
import numpy as np # linear algebra
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier


# Loading the data

In [51]:
train_data = pd.read_csv("poker-hand-testing.csv")
test_data = pd.read_csv("poker-hand-testing.csv")

In [52]:
train_data.head()

Unnamed: 0,Suit of Card 1,Rank of Card 1,Suit of Card 2,Rank of Card 2,Suit of Card 3,Rank of Card 3,Suit of Card 4,Rank of Card 4,Suit of Card 5,Rank of Card 5,Poker Hand
0,1,1,1,13,2,4,2,3,1,12,0
1,3,12,3,2,3,11,4,5,2,5,1
2,1,9,4,6,1,4,3,2,3,9,1
3,1,4,3,13,2,13,2,1,3,6,1
4,3,10,2,7,1,2,2,11,4,9,0


In [53]:
train_data.columns = ["SuitCard1","RC1", "SuitCard2","RC2","SuitCard3","RC3","SuitCard4","RC4","SuitCard5", "RC5","PH"]
test_data.columns = ["SuitCard1","RC1", "SuitCard2","RC2","SuitCard3","RC3","SuitCard4","RC4","SuitCard5", "RC5","PH"]


# Handling missing values

We check if there are any missing values in the data as weel as its types, which can easily be done through the "info()" function or through isnull()


In [54]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 11 columns):
 #   Column     Non-Null Count    Dtype
---  ------     --------------    -----
 0   SuitCard1  1000000 non-null  int64
 1   RC1        1000000 non-null  int64
 2   SuitCard2  1000000 non-null  int64
 3   RC2        1000000 non-null  int64
 4   SuitCard3  1000000 non-null  int64
 5   RC3        1000000 non-null  int64
 6   SuitCard4  1000000 non-null  int64
 7   RC4        1000000 non-null  int64
 8   SuitCard5  1000000 non-null  int64
 9   RC5        1000000 non-null  int64
 10  PH         1000000 non-null  int64
dtypes: int64(11)
memory usage: 83.9 MB


In [55]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 11 columns):
 #   Column     Non-Null Count    Dtype
---  ------     --------------    -----
 0   SuitCard1  1000000 non-null  int64
 1   RC1        1000000 non-null  int64
 2   SuitCard2  1000000 non-null  int64
 3   RC2        1000000 non-null  int64
 4   SuitCard3  1000000 non-null  int64
 5   RC3        1000000 non-null  int64
 6   SuitCard4  1000000 non-null  int64
 7   RC4        1000000 non-null  int64
 8   SuitCard5  1000000 non-null  int64
 9   RC5        1000000 non-null  int64
 10  PH         1000000 non-null  int64
dtypes: int64(11)
memory usage: 83.9 MB


In [56]:
test_data.isnull().sum()

SuitCard1    0
RC1          0
SuitCard2    0
RC2          0
SuitCard3    0
RC3          0
SuitCard4    0
RC4          0
SuitCard5    0
RC5          0
PH           0
dtype: int64

In [57]:
train_data.head()

Unnamed: 0,SuitCard1,RC1,SuitCard2,RC2,SuitCard3,RC3,SuitCard4,RC4,SuitCard5,RC5,PH
0,1,1,1,13,2,4,2,3,1,12,0
1,3,12,3,2,3,11,4,5,2,5,1
2,1,9,4,6,1,4,3,2,3,9,1
3,1,4,3,13,2,13,2,1,3,6,1
4,3,10,2,7,1,2,2,11,4,9,0


# Handling categorical data

In [58]:
train_X = train_data.drop("PH", axis = 1)
test_X  = test_data.drop("PH", axis = 1)

In [59]:
train_y = train_data.PH
test_y = test_data.PH

In [60]:
train_y.value_counts()



PH
0    501209
1    422498
2     47622
3     21121
4      3885
5      1996
6      1424
7       230
8        12
9         3
Name: count, dtype: int64

In [61]:
train_X.head()

Unnamed: 0,SuitCard1,RC1,SuitCard2,RC2,SuitCard3,RC3,SuitCard4,RC4,SuitCard5,RC5
0,1,1,1,13,2,4,2,3,1,12
1,3,12,3,2,3,11,4,5,2,5
2,1,9,4,6,1,4,3,2,3,9
3,1,4,3,13,2,13,2,1,3,6
4,3,10,2,7,1,2,2,11,4,9


In [62]:
test_X.head()

Unnamed: 0,SuitCard1,RC1,SuitCard2,RC2,SuitCard3,RC3,SuitCard4,RC4,SuitCard5,RC5
0,1,1,1,13,2,4,2,3,1,12
1,3,12,3,2,3,11,4,5,2,5
2,1,9,4,6,1,4,3,2,3,9
3,1,4,3,13,2,13,2,1,3,6
4,3,10,2,7,1,2,2,11,4,9


# Splitting the training data

In [63]:
len(train_y)

1000000

In [64]:
import sklearn.model_selection
train2_X_dum, val_X, train2_y, val_y = sklearn.model_selection.train_test_split(train_X, train_y)

# Tree-based model - XGBoost

Sử dụng một trong những mô hình dựa trên cây hiệu quả nhất: XGBoost

In [65]:
import xgboost as xgb

And now we just implement it to our train2 data, using the validation data to track the performance of the model

In [66]:
xgb_p = xgb.XGBClassifier(n_estimators = 4000, early_stopping_rounds = 50)
xgb_p.fit(train2_X_dum, train2_y,
             eval_set = [(train2_X_dum, train2_y), (val_X, val_y)],
             verbose = 100)

[0]	validation_0-mlogloss:1.77387	validation_1-mlogloss:1.77385
[100]	validation_0-mlogloss:0.59115	validation_1-mlogloss:0.59898
[200]	validation_0-mlogloss:0.41515	validation_1-mlogloss:0.42822
[300]	validation_0-mlogloss:0.30590	validation_1-mlogloss:0.32143
[400]	validation_0-mlogloss:0.23341	validation_1-mlogloss:0.25029
[500]	validation_0-mlogloss:0.16988	validation_1-mlogloss:0.18754
[600]	validation_0-mlogloss:0.12558	validation_1-mlogloss:0.14341
[700]	validation_0-mlogloss:0.09530	validation_1-mlogloss:0.11302
[800]	validation_0-mlogloss:0.07268	validation_1-mlogloss:0.09005
[900]	validation_0-mlogloss:0.05473	validation_1-mlogloss:0.07185
[1000]	validation_0-mlogloss:0.04034	validation_1-mlogloss:0.05725
[1100]	validation_0-mlogloss:0.03234	validation_1-mlogloss:0.04918
[1200]	validation_0-mlogloss:0.02563	validation_1-mlogloss:0.04240
[1300]	validation_0-mlogloss:0.01958	validation_1-mlogloss:0.03607
[1400]	validation_0-mlogloss:0.01549	validation_1-mlogloss:0.03169
[1500]	

In [67]:
prediction = xgb_p.predict(val_X)

In [68]:
np.mean(prediction==val_y)

np.float64(0.99722)

 - XGBoost có khả năng học tốt, đặc biệt với tập dữ liệu dạng bảng


# Testing the models with the test data

In [69]:
final_prediction = xgb_p.predict(test_X)

In [70]:
np.mean(test_y== final_prediction)

np.float64(0.999305)

In [71]:
from sklearn.model_selection import train_test_split

In [72]:
X = train_X[0: len(train_X)//10]
y = train_y[0: len(train_X)//10]

In [73]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)


# Hyperparameter tuning

In [74]:
param_distributions = {
    "n_estimators": np.arange(50, 501, 50),
    "max_depth": np.arange(5, 51, 5),
    "min_samples_split": np.arange(2, 21, 2),
    "min_samples_leaf": np.arange(1, 21, 2),
    "max_features": np.arange(1, X.shape[1] + 1, 1)
}

In [75]:
from sklearn.ensemble import RandomForestClassifier


In [76]:
rf = RandomForestClassifier(random_state=42, n_jobs=-1)


In [77]:
random_search = RandomizedSearchCV(
    rf,
    param_distributions,
    n_iter=30,
    cv=2,  # Cross-validation k=3
    scoring="accuracy",
    random_state=42,
    n_jobs=-1
)

In [78]:
random_search.fit(X_train, y_train)




In [79]:
best_model = random_search.best_estimator_

In [80]:
y_pred_best = best_model.predict(X_test)

In [81]:
from sklearn.metrics import accuracy_score, roc_auc_score

In [82]:
accuracy_best = accuracy_score(y_test, y_pred_best)

accuracy_best, random_search.best_params_

(0.6834,
 {'n_estimators': np.int64(250),
  'min_samples_split': np.int64(20),
  'min_samples_leaf': np.int64(3),
  'max_features': np.int64(8),
  'max_depth': np.int64(20)})

Accuracy giảm xuống khoảng 68,34 % nhờ tìm số lượng cây (n_estimators), độ sâu (max_depth), và learning rate tối ưu.

Nhận xét:
-  Mô hình tổng quát hơn, nhưng tuning có thể chưa tối ưu, làm accuracy giảm mạnh
- Accuracy giảm xuống còn 68%, chứng tỏ RandomForest không phù hợp bằng.
- RandomizedSearchCV có thể chưa tìm ra bộ siêu tham số tối ưu nhất
- Tốn nhiều tài nguyên hơn do chạy RandomizedSearchCV nhiều lần.

# Regularization

In [83]:
xgb_p_reg = xgb.XGBClassifier(
    n_estimators=4000,
    early_stopping_rounds=50,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,  # L1 Regularization
    reg_lambda=0.5,  # L2 Regularization
    gamma=0.2,
    min_child_weight=3,
    scale_pos_weight=1,
    eval_metric="mlogloss",
    random_state=42
)

In [84]:
param_distributions["ccp_alpha"] = np.linspace(0, 0.05, 10)  # Complexity parameter cho pruning
random_search_reg = RandomizedSearchCV(
    rf,
    param_distributions,
    n_iter=30,
    cv=2,
    scoring="accuracy",
    random_state=42,
    n_jobs=-1
)


#

In [85]:
random_search_reg.fit(X_train, y_train)



In [86]:
best_model_reg = random_search_reg.best_estimator_
y_pred_best_reg = best_model_reg.predict(X_test)

In [87]:
accuracy_best_reg = accuracy_score(y_test, y_pred_best_reg)
accuracy_best_reg , random_search_reg.best_params_


(0.6707,
 {'n_estimators': np.int64(450),
  'min_samples_split': np.int64(20),
  'min_samples_leaf': np.int64(9),
  'max_features': np.int64(8),
  'max_depth': np.int64(45),
  'ccp_alpha': np.float64(0.0)})

Accuracy tiếp tục giảm xuống còn 67.07%

Nhận xét:
- Thêm Regularization giúp giảm overfitting.
- Tối ưu hóa siêu tham số với RandomizedSearchCV
- RandomizedSearchCV không thử hết mọi tổ hợp, chưa tìm được bộ tốt nhất


#  Optimization

In [88]:
def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 2000),
        "max_depth": trial.suggest_int("max_depth", 3, 15),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 0.5),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "reg_alpha": trial.suggest_float("reg_alpha", 0, 1.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 1.0),

    }
    model = xgb.XGBClassifier(**params, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)


    accuracy = accuracy_score(y_test, y_pred)

    return accuracy

In [89]:
import optuna

In [90]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

[I 2025-05-19 10:49:33,155] A new study created in memory with name: no-name-14f100ca-f229-4e7b-8c52-8d9d35c78dc9
[I 2025-05-19 10:50:20,084] Trial 0 finished with value: 0.68985 and parameters: {'n_estimators': 1274, 'max_depth': 4, 'learning_rate': 0.037484019467042405, 'subsample': 0.5547962148241199, 'colsample_bytree': 0.8450273179460314, 'gamma': 0.17948922376494209, 'min_child_weight': 9, 'reg_alpha': 0.7842384871744689, 'reg_lambda': 0.8756547987983047}. Best is trial 0 with value: 0.68985.
[I 2025-05-19 10:51:10,354] Trial 1 finished with value: 0.75365 and parameters: {'n_estimators': 1029, 'max_depth': 15, 'learning_rate': 0.02515272820948573, 'subsample': 0.9368823234504279, 'colsample_bytree': 0.9034922075009992, 'gamma': 0.31297974271087037, 'min_child_weight': 9, 'reg_alpha': 0.6284101448447325, 'reg_lambda': 0.49069457311275977}. Best is trial 1 with value: 0.75365.
[I 2025-05-19 10:52:12,526] Trial 2 finished with value: 0.91185 and parameters: {'n_estimators': 1116, '

In [91]:
best_params = study.best_params
best_params

{'n_estimators': 1582,
 'max_depth': 5,
 'learning_rate': 0.29147920800780686,
 'subsample': 0.7253787183374711,
 'colsample_bytree': 0.6094796282679398,
 'gamma': 0.06674779403528944,
 'min_child_weight': 5,
 'reg_alpha': 0.9978686629690103,
 'reg_lambda': 0.5463955287045978}

In [92]:
final_model = xgb.XGBClassifier(**best_params, random_state=42)
final_model.fit(X_train, y_train)

In [93]:
y_pred_final = final_model.predict(X_test)

In [94]:
final_accuracy = accuracy_score(y_test, y_pred_final)
final_accuracy

0.93195

Accuracy tăng lên khoảng 93-94%.

Nhận xét:
- Optuna tối ưu hóa siêu tham số rất mạnh, giúp tìm bộ tốt hơn.
- Giảm overfitting của XGBoost cơ bản, giúp mô hình tổng quát hóa tốt hơn.
- Tìm siêu tham số tối ưu hiệu quả hơn RandomizedSearchCV.
- Giúp mô hình phục hồi accuracy lên 93%
- Chưa đạt mức accuracy của XGBoost ban đầu (99%). Nhưng có thể tránh overfitting tốt hơn