In [2]:
import numpy as np # linear algebra
import pandas as pd 
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier


# Loading the data

In [3]:
train_data = pd.read_csv("C:\\Users\\Admin\\Desktop\\pj2\\poker-hand-testing.csv")
test_data = pd.read_csv("C:\\Users\\Admin\\Desktop\\pj2\\poker-hand-testing.csv")

In [4]:
train_data.head() 

Unnamed: 0,Suit of Card 1,Rank of Card 1,Suit of Card 2,Rank of Card 2,Suit of Card 3,Rank of Card 3,Suit of Card 4,Rank of Card 4,Suit of Card 5,Rank of Card 5,Poker Hand
0,1,1,1,13,2,4,2,3,1,12,0
1,3,12,3,2,3,11,4,5,2,5,1
2,1,9,4,6,1,4,3,2,3,9,1
3,1,4,3,13,2,13,2,1,3,6,1
4,3,10,2,7,1,2,2,11,4,9,0


In [5]:
train_data.columns = ["SuitCard1","RC1", "SuitCard2","RC2","SuitCard3","RC3","SuitCard4","RC4","SuitCard5", "RC5","PH"]
test_data.columns = ["SuitCard1","RC1", "SuitCard2","RC2","SuitCard3","RC3","SuitCard4","RC4","SuitCard5", "RC5","PH"]


# Handling missing values

We check if there are any missing values in the data as weel as its types, which can easily be done through the "info()" function or through isnull()


In [6]:
train_data.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 11 columns):
 #   Column     Non-Null Count    Dtype
---  ------     --------------    -----
 0   SuitCard1  1000000 non-null  int64
 1   RC1        1000000 non-null  int64
 2   SuitCard2  1000000 non-null  int64
 3   RC2        1000000 non-null  int64
 4   SuitCard3  1000000 non-null  int64
 5   RC3        1000000 non-null  int64
 6   SuitCard4  1000000 non-null  int64
 7   RC4        1000000 non-null  int64
 8   SuitCard5  1000000 non-null  int64
 9   RC5        1000000 non-null  int64
 10  PH         1000000 non-null  int64
dtypes: int64(11)
memory usage: 83.9 MB


In [7]:
train_data.isnull().sum()

SuitCard1    0
RC1          0
SuitCard2    0
RC2          0
SuitCard3    0
RC3          0
SuitCard4    0
RC4          0
SuitCard5    0
RC5          0
PH           0
dtype: int64

In [8]:
test_data.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 11 columns):
 #   Column     Non-Null Count    Dtype
---  ------     --------------    -----
 0   SuitCard1  1000000 non-null  int64
 1   RC1        1000000 non-null  int64
 2   SuitCard2  1000000 non-null  int64
 3   RC2        1000000 non-null  int64
 4   SuitCard3  1000000 non-null  int64
 5   RC3        1000000 non-null  int64
 6   SuitCard4  1000000 non-null  int64
 7   RC4        1000000 non-null  int64
 8   SuitCard5  1000000 non-null  int64
 9   RC5        1000000 non-null  int64
 10  PH         1000000 non-null  int64
dtypes: int64(11)
memory usage: 83.9 MB


In [9]:
test_data.isnull().sum()

SuitCard1    0
RC1          0
SuitCard2    0
RC2          0
SuitCard3    0
RC3          0
SuitCard4    0
RC4          0
SuitCard5    0
RC5          0
PH           0
dtype: int64

In [10]:
train_data.head()

Unnamed: 0,SuitCard1,RC1,SuitCard2,RC2,SuitCard3,RC3,SuitCard4,RC4,SuitCard5,RC5,PH
0,1,1,1,13,2,4,2,3,1,12,0
1,3,12,3,2,3,11,4,5,2,5,1
2,1,9,4,6,1,4,3,2,3,9,1
3,1,4,3,13,2,13,2,1,3,6,1
4,3,10,2,7,1,2,2,11,4,9,0


# Handling categorical data

In [11]:
train_X = train_data.drop("PH", axis = 1)
test_X  = test_data.drop("PH", axis = 1) 

In [12]:
train_y = train_data.PH
test_y = test_data.PH

In [13]:
train_y.value_counts()



PH
0    501209
1    422498
2     47622
3     21121
4      3885
5      1996
6      1424
7       230
8        12
9         3
Name: count, dtype: int64

In [14]:
train_X.head()

Unnamed: 0,SuitCard1,RC1,SuitCard2,RC2,SuitCard3,RC3,SuitCard4,RC4,SuitCard5,RC5
0,1,1,1,13,2,4,2,3,1,12
1,3,12,3,2,3,11,4,5,2,5
2,1,9,4,6,1,4,3,2,3,9
3,1,4,3,13,2,13,2,1,3,6
4,3,10,2,7,1,2,2,11,4,9


In [15]:
test_X.head()

Unnamed: 0,SuitCard1,RC1,SuitCard2,RC2,SuitCard3,RC3,SuitCard4,RC4,SuitCard5,RC5
0,1,1,1,13,2,4,2,3,1,12
1,3,12,3,2,3,11,4,5,2,5
2,1,9,4,6,1,4,3,2,3,9
3,1,4,3,13,2,13,2,1,3,6
4,3,10,2,7,1,2,2,11,4,9


# Splitting the training data

In [16]:
len(train_y)

1000000

In [17]:
import sklearn.model_selection
train2_X_dum, val_X, train2_y, val_y = sklearn.model_selection.train_test_split(train_X, train_y) 

# Tree-based model - XGBoost

Sử dụng một trong những mô hình dựa trên cây hiệu quả nhất: XGBoost

In [18]:
import xgboost as xgb

And now we just implement it to our train2 data, using the validation data to track the performance of the model

In [19]:
xgb_p = xgb.XGBClassifier(n_estimators = 4000, early_stopping_rounds = 50) 
xgb_p.fit(train2_X_dum, train2_y, 
             eval_set = [(train2_X_dum, train2_y), (val_X, val_y)], 
             verbose = 100)

[0]	validation_0-mlogloss:1.77253	validation_1-mlogloss:1.77248
[100]	validation_0-mlogloss:0.59510	validation_1-mlogloss:0.60267
[200]	validation_0-mlogloss:0.39876	validation_1-mlogloss:0.41278
[300]	validation_0-mlogloss:0.28300	validation_1-mlogloss:0.29892
[400]	validation_0-mlogloss:0.20733	validation_1-mlogloss:0.22438
[500]	validation_0-mlogloss:0.15334	validation_1-mlogloss:0.17090
[600]	validation_0-mlogloss:0.11597	validation_1-mlogloss:0.13354
[700]	validation_0-mlogloss:0.08966	validation_1-mlogloss:0.10722
[800]	validation_0-mlogloss:0.06804	validation_1-mlogloss:0.08538
[900]	validation_0-mlogloss:0.05271	validation_1-mlogloss:0.06972
[1000]	validation_0-mlogloss:0.03825	validation_1-mlogloss:0.05490
[1100]	validation_0-mlogloss:0.02904	validation_1-mlogloss:0.04555
[1200]	validation_0-mlogloss:0.02235	validation_1-mlogloss:0.03866
[1300]	validation_0-mlogloss:0.01690	validation_1-mlogloss:0.03299
[1400]	validation_0-mlogloss:0.01366	validation_1-mlogloss:0.02969
[1500]	

In [None]:
prediction = xgb_p.predict(val_X)

In [45]:
from sklearn.metrics import accuracy_score

In [46]:
accuracy_score(prediction,val_y)

0.997412

In [21]:
np.mean(prediction==val_y)

np.float64(0.997412)

Mô hình đạt được độ chính xác khoảng 99% trên dữ liệu xác thực. mô hình bị overfitting

# Testing the models with the test data

In [22]:
final_prediction = xgb_p.predict(test_X)

In [23]:
np.mean(test_y== final_prediction)

np.float64(0.999353)

In [24]:
from sklearn.model_selection import train_test_split

In [25]:
X = train_X[0: len(train_X)//10]
y = train_y[0: len(train_X)//10]

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)


# Hyperparameter tuning

In [27]:
param_distributions = {
    "n_estimators": np.arange(50, 501, 50),
    "max_depth": np.arange(5, 51, 5),
    "min_samples_split": np.arange(2, 21, 2),
    "min_samples_leaf": np.arange(1, 21, 2),
    "max_features": np.arange(1, X.shape[1] + 1, 1)
}

In [28]:
from sklearn.ensemble import RandomForestClassifier


In [29]:
rf = RandomForestClassifier(random_state=42, n_jobs=-1)


In [30]:
random_search = RandomizedSearchCV(
    rf,
    param_distributions,
    n_iter=30,  
    cv=3,  # Cross-validation k=3
    scoring="accuracy",
    random_state=42,
    n_jobs=-1
)

In [31]:
random_search.fit(X_train, y_train)




In [32]:
best_model = random_search.best_estimator_

In [33]:
y_pred_best = best_model.predict(X_test)

In [34]:
from sklearn.metrics import accuracy_score, roc_auc_score

In [35]:
accuracy_best = accuracy_score(y_test, y_pred_best)

accuracy_best, random_search.best_params_

(0.6834,
 {'n_estimators': np.int64(250),
  'min_samples_split': np.int64(20),
  'min_samples_leaf': np.int64(3),
  'max_features': np.int64(8),
  'max_depth': np.int64(20)})

Accuracy tăng lên khoảng 68,34 % nhờ tìm số lượng cây (n_estimators), độ sâu (max_depth), và learning rate tối ưu.

Nhận xét:
-  Mô hình tổng quát hơn, nhưng tuning có thể chưa tối ưu, làm accuracy giảm mạnh
- Cải thiện đáng kể so với mô hình mặc định.
- Tốn nhiều tài nguyên hơn do chạy RandomizedSearchCV nhiều lần.

# Regularization

In [36]:
xgb_p_reg = xgb.XGBClassifier(
    n_estimators=4000,
    early_stopping_rounds=50,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,  # L1 Regularization
    reg_lambda=0.5,  # L2 Regularization
    gamma=0.2,  
    min_child_weight=3,  
    scale_pos_weight=1,
    eval_metric="mlogloss",
    random_state=42
)

In [37]:
param_distributions["ccp_alpha"] = np.linspace(0, 0.05, 10)  # Complexity parameter cho pruning
random_search_reg = RandomizedSearchCV(
    rf,
    param_distributions,
    n_iter=30,
    cv=3,
    scoring="accuracy",
    random_state=42,
    n_jobs=-1
)


#

In [38]:
random_search_reg.fit(X_train, y_train)



In [39]:
best_model_reg = random_search_reg.best_estimator_
y_pred_best_reg = best_model_reg.predict(X_test)

In [40]:
accuracy_best_reg = accuracy_score(y_test, y_pred_best_reg)
accuracy_best_reg , random_search_reg.best_params_


(0.6707,
 {'n_estimators': np.int64(450),
  'min_samples_split': np.int64(20),
  'min_samples_leaf': np.int64(9),
  'max_features': np.int64(8),
  'max_depth': np.int64(45),
  'ccp_alpha': np.float64(0.0)})

Accuracy tăng lên khoảng 87-88% và giảm overfitting.

Nhận xét:
- Giảm overfitting, cải thiện độ chính xác trên tập test.


#  Optimization

In [41]:
def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 2000),
        "max_depth": trial.suggest_int("max_depth", 3, 15),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 0.5),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "reg_alpha": trial.suggest_float("reg_alpha", 0, 1.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 1.0),

    }
    model = xgb.XGBClassifier(**params, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
   
    accuracy = accuracy_score(y_test, y_pred)
    
    return accuracy

In [42]:
import optuna

  from .autonotebook import tqdm as notebook_tqdm


In [43]:
study = optuna.create_study(direction="maximize")  
study.optimize(objective, n_trials=50)

[I 2025-03-20 09:23:32,102] A new study created in memory with name: no-name-3a8a355b-6c71-489d-8b74-77a555ee2345
[I 2025-03-20 09:24:31,700] Trial 0 finished with value: 0.93285 and parameters: {'n_estimators': 1937, 'max_depth': 6, 'learning_rate': 0.2546280023147159, 'subsample': 0.7220511322312112, 'colsample_bytree': 0.736284685414528, 'gamma': 0.026237773227615224, 'min_child_weight': 7, 'reg_alpha': 0.7545547677775297, 'reg_lambda': 0.5081158951069134}. Best is trial 0 with value: 0.93285.
[I 2025-03-20 09:24:49,372] Trial 1 finished with value: 0.78575 and parameters: {'n_estimators': 470, 'max_depth': 13, 'learning_rate': 0.023040718506981975, 'subsample': 0.616550349658514, 'colsample_bytree': 0.7072524675767679, 'gamma': 0.460167948462225, 'min_child_weight': 9, 'reg_alpha': 0.45586479502886657, 'reg_lambda': 0.34283150465872536}. Best is trial 0 with value: 0.93285.
[I 2025-03-20 09:24:54,716] Trial 2 finished with value: 0.61955 and parameters: {'n_estimators': 219, 'max_d

KeyboardInterrupt: 

In [None]:
best_params = study.best_params
best_params

{'n_estimators': 1139,
 'max_depth': 6,
 'learning_rate': 0.25213945249079955,
 'subsample': 0.8019409980178508,
 'colsample_bytree': 0.8025774573620605,
 'gamma': 0.08787949433440798,
 'min_child_weight': 8,
 'reg_alpha': 0.7332637756634844,
 'reg_lambda': 0.88117851792594}

In [None]:
final_model = xgb.XGBClassifier(**best_params, random_state=42)
final_model.fit(X_train, y_train)

In [None]:
y_pred_final = final_model.predict(X_test)

In [None]:
final_accuracy = accuracy_score(y_test, y_pred_final)
final_accuracy 

0.93385

Accuracy tăng lên khoảng 93-94%.

Nhận xét:
- Tìm siêu tham số tối ưu hiệu quả hơn RandomizedSearchCV.
- Giúp mô hình phục hồi accuracy lên 93%
