# Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from mlxtend.plotting import plot_confusion_matrix

import warnings
from sklearn.exceptions import ConvergenceWarning
# Silence the warning categories that would otherwise flood the cell output.
for _category in (UserWarning, FutureWarning, ConvergenceWarning):
    warnings.filterwarnings("ignore", category=_category)

# Load Data

In [None]:
# Read the training split. Column names are passed explicitly, so pandas
# treats every row of the file as data: five (S*, C*) pairs plus "Label".
train = pd.read_csv(
    "/mnt/hdd/Datasets/poker-hand-training-true.data",
    names=[
        "S1", "C1",
        "S2", "C2",
        "S3", "C3",
        "S4", "C4",
        "S5", "C5",
        "Label",
    ],
)
train.head()

In [None]:
# Read the testing split with the same explicit column names as the
# training split so both frames share one schema.
test = pd.read_csv(
    "/mnt/hdd/Datasets/poker-hand-testing.data",
    names=[
        "S1", "C1",
        "S2", "C2",
        "S3", "C3",
        "S4", "C4",
        "S5", "C5",
        "Label",
    ],
)
test.head()

In [None]:
# Report (rows, columns) for both splits.
train_shape, test_shape = train.shape, test.shape
print("Train dataset shape:", train_shape)
print("Test dataset shape:", test_shape)

# Extract Features

In [None]:
# Every column except the last is a feature; the last column is the target.
X_train, y_train = train.iloc[:, :-1], train.iloc[:, -1]
X_test, y_test = test.iloc[:, :-1], test.iloc[:, -1]

# Train

In [None]:
# Shared 5-fold splitter. A fixed random_state makes every kf.split(...) call
# yield the same shuffled folds, so all models are scored on identical splits
# and results are reproducible between runs.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
def cross_validation(model, X, y, cv=None):
    """Score ``model`` with K-fold cross-validation.

    Parameters
    ----------
    model
        Estimator exposing ``fit(X, y)`` and ``predict(X)``; ``fit`` is called
        once per fold on this same object.
    X : pandas.DataFrame
        Feature matrix, indexed positionally via ``.iloc``.
    y : pandas.Series
        Target labels, positionally aligned with ``X``.
    cv : splitter, optional
        Object with a ``split(X=..., y=...)`` method. Defaults to the
        module-level ``kf``.

    Returns
    -------
    model_log : pandas.DataFrame
        One row per fold with columns ``Accuracy``, ``F1``, ``Precision`` and
        ``Recall`` (the last three use weighted averaging).
    cm : numpy.ndarray
        Confusion matrix summed over all folds. Rows and columns follow the
        sorted unique values of ``y``, so the shape does not depend on which
        classes happen to appear in a particular fold.
    """
    splitter = kf if cv is None else cv
    metric_names = ["Accuracy", "F1", "Precision", "Recall"]

    # Fix the label set up front so per-fold matrices share one shape and can
    # be summed, even when a rare class is missing from a fold.
    labels = np.unique(y)
    cm = np.zeros((len(labels), len(labels)), dtype=np.int64)

    fold_rows = []
    for fold, (train_, test_) in enumerate(splitter.split(X=X, y=y)):
        xtrain, xtest = X.iloc[train_, :], X.iloc[test_, :]
        ytrain, ytest = y.iloc[train_], y.iloc[test_]

        model.fit(xtrain, ytrain)
        y_pred = model.predict(xtest)
        print(f"The fold is: {fold}")

        # Accumulate instead of overwriting, so the returned matrix covers
        # every held-out sample rather than only the last fold.
        cm += confusion_matrix(ytest, y_pred, labels=labels)

        fold_rows.append({
            "Accuracy": accuracy_score(ytest, y_pred),
            "F1": f1_score(ytest, y_pred, average="weighted"),
            "Precision": precision_score(ytest, y_pred, average="weighted"),
            "Recall": recall_score(ytest, y_pred, average="weighted"),
        })

    # Building the frame from float rows gives numeric columns (the original
    # empty-frame + .loc assignment produced object dtype).
    model_log = pd.DataFrame(fold_rows, columns=metric_names)
    return model_log, cm

In [None]:
# Baseline: decision tree on the raw columns.
dt = DecisionTreeClassifier()
dt_log, dt_cm = cross_validation(model=dt, X=X_train, y=y_train)
plot_confusion_matrix(
    conf_mat=dt_cm,
    show_absolute=True,
    show_normed=True,
    colorbar=True,
    figsize=(8, 8),
);

In [None]:
# Baseline: XGBoost on the raw columns.
xgb = XGBClassifier()
xgb_log, xgb_cm = cross_validation(model=xgb, X=X_train, y=y_train)
plot_confusion_matrix(
    conf_mat=xgb_cm,
    show_absolute=True,
    show_normed=True,
    colorbar=True,
    figsize=(8, 8),
);

In [None]:
# Baseline: random forest on the raw columns.
rf = RandomForestClassifier()
rf_log, rf_cm = cross_validation(model=rf, X=X_train, y=y_train)
plot_confusion_matrix(
    conf_mat=rf_cm,
    show_absolute=True,
    show_normed=True,
    colorbar=True,
    figsize=(8, 8),
);

In [None]:
# Baseline: LightGBM on the raw columns.
lgbm = LGBMClassifier()
lgbm_log, lgbm_cm = cross_validation(model=lgbm, X=X_train, y=y_train)
plot_confusion_matrix(
    conf_mat=lgbm_cm,
    show_absolute=True,
    show_normed=True,
    colorbar=True,
    figsize=(8, 8),
);

In [None]:
def preprocess(df):
    """Return a copy of ``df`` with engineered hand features appended.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``S1``..``S5`` and ``C1``..``C5``. Any other
        columns (e.g. ``Label``) are carried through unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is not modified. Added columns, in order:

        * ``Diff_1``..``Diff_4``: ``C5-C4``, ``C4-C3``, ``C3-C2``, ``C2-C1``,
          taken in stored column order (the values are not sorted first).
        * ``Unique_Suit``: number of distinct values among ``S1``..``S5``.
        * ``sum_S<k>`` / ``sum_C<k>``: how many of the five suit / card
          columns equal the k-th one (the k-th column counts itself).
    """
    suit_cols = ["S1", "S2", "S3", "S4", "S5"]
    card_cols = ["C1", "C2", "C3", "C4", "C5"]

    # Work on a copy so the caller's frame does not silently grow columns.
    out = df.copy()
    suits = out[suit_cols]
    cards = out[card_cols]

    out["Diff_1"] = out["C5"] - out["C4"]
    out["Diff_2"] = out["C4"] - out["C3"]
    out["Diff_3"] = out["C3"] - out["C2"]
    out["Diff_4"] = out["C2"] - out["C1"]

    # Distinct values per row: sort each row, then count the positions where
    # the value changes (+1 for the first element). Avoids a row-wise apply.
    sorted_suits = np.sort(suits.to_numpy(), axis=1)
    out["Unique_Suit"] = (np.diff(sorted_suits, axis=1) != 0).sum(axis=1) + 1

    # Column-vs-frame comparison replaces the per-row lambda and the
    # deprecated positional ``x[0]`` lookup on a label-indexed Series.
    # Suit/card columns are interleaved to keep the original column order.
    for suit_col, card_col in zip(suit_cols, card_cols):
        out[f"sum_{suit_col}"] = suits.eq(suits[suit_col], axis=0).sum(axis=1)
        out[f"sum_{card_col}"] = cards.eq(cards[card_col], axis=0).sum(axis=1)

    return out

In [None]:
# Add the engineered columns to the training split and peek at 5 random rows.
train_pre = preprocess(df=train)
train_pre.sample(n=5)

In [None]:
# Add the engineered columns to the testing split and peek at 5 random rows.
test_pre = preprocess(df=test)
test_pre.sample(n=5)

In [None]:
# Split the preprocessed frames into features (everything but "Label")
# and the target column.
X_train_pre, y_train_pre = train_pre.drop(columns="Label"), train_pre["Label"]
X_test_pre, y_test_pre = test_pre.drop(columns="Label"), test_pre["Label"]

In [None]:
# Decision tree on the preprocessed (engineered) feature set.
dt2 = DecisionTreeClassifier()
# cross_validation returns (per-fold metric log, confusion matrix); the
# comma between the two targets was missing, which was a SyntaxError.
dt_log2, dt2_cm = cross_validation(dt2, X_train_pre, y_train_pre)
plot_confusion_matrix(conf_mat=dt2_cm, show_absolute=True, show_normed=True, colorbar=True, figsize=(8, 8));

In [None]:
# XGBoost on the preprocessed (engineered) feature set.
xgb2 = XGBClassifier()
xgb_log2, xgb2_cm = cross_validation(model=xgb2, X=X_train_pre, y=y_train_pre)
plot_confusion_matrix(
    conf_mat=xgb2_cm,
    show_absolute=True,
    show_normed=True,
    colorbar=True,
    figsize=(8, 8),
);

In [None]:
# Random forest on the preprocessed (engineered) feature set.
rf2 = RandomForestClassifier()
rf_log2, rf2_cm = cross_validation(model=rf2, X=X_train_pre, y=y_train_pre)
plot_confusion_matrix(
    conf_mat=rf2_cm,
    show_absolute=True,
    show_normed=True,
    colorbar=True,
    figsize=(8, 8),
);

In [None]:
# LightGBM on the preprocessed (engineered) feature set.
lgbm2 = LGBMClassifier()
lgbm_log2, lgbm2_cm = cross_validation(model=lgbm2, X=X_train_pre, y=y_train_pre)
plot_confusion_matrix(
    conf_mat=lgbm2_cm,
    show_absolute=True,
    show_normed=True,
    colorbar=True,
    figsize=(8, 8),
);

# Results

In [None]:
# Summary table: one row per model holding the mean of each per-fold metric.
# Names without a suffix were trained on the raw columns; names ending in "2"
# were trained on the preprocessed feature set.
# The last label is "LGBM2" (it was a second "LGBM"), so the two LightGBM runs
# stay distinct categories in the tables and bar plots.
model_logs = {
    "DT": dt_log,
    "XGB": xgb_log,
    "RF": rf_log,
    "LGBM": lgbm_log,
    "DT2": dt_log2,
    "XGB2": xgb_log2,
    "RF2": rf_log2,
    "LGBM2": lgbm_log2,
}
metric_names = ["Accuracy", "F1", "Precision", "Recall"]

result_df = pd.DataFrame(
    [
        {"Model": name, **{metric: log[metric].mean() for metric in metric_names}}
        for name, log in model_logs.items()
    ],
    columns=["Model", *metric_names],
)

result_df.head(8)

In [None]:
# Models ranked by mean accuracy, best first.
result_df[["Model", "Accuracy"]].sort_values(by="Accuracy", ascending=False)

In [None]:
# Models ranked by mean weighted F1, best first.
result_df[["Model", "F1"]].sort_values(by="F1", ascending=False)

In [None]:
# Models ranked by mean weighted precision, best first.
result_df[["Model", "Precision"]].sort_values(by="Precision", ascending=False)

In [None]:
# Models ranked by mean weighted recall, best first.
result_df[["Model", "Recall"]].sort_values(by="Recall", ascending=False)

In [None]:
# 2x2 grid of bar charts, one panel per metric, each bar labelled with its value.
plt.figure(figsize=(15, 15))

for panel, metric in enumerate(("Accuracy", "F1", "Precision", "Recall"), start=1):
    plt.subplot(2, 2, panel)
    ax = sns.barplot(data=result_df, x="Model", y=metric)
    for container in ax.containers:
        ax.bar_label(container)
    plt.title(f"Model / {metric} Score")

plt.show()