In [41]:
# 載入必要的套件
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# 讀取CSV檔案
file_path = r'..\..\csv\飲料店總表0307final01_補上人氣_補值_hg.csv.csv'
data = pd.read_csv(file_path, sep=",", encoding="UTF-8-sig")

# 分割特徵和目標變數
X = data.drop('popularity', axis=1)  
y = data['popularity']

# 將y中的所有值四捨五入為整數
y = y.round().astype(int)

# 1. 轉換 object 資料類型的欄位
X['name'] = X['name'].astype('category')
X['class'] = X['class'].astype('category')
X['address'] = X['address'].astype('category')
X['district'] = X['district'].astype('category')
X['neighborhood'] = X['neighborhood'].astype('category')
X['brand'] = X['brand'].astype('category')

# 2. 設定 categorical 欄位，同時啟用 enable_categorical
categorical_cols = ['name', 'class', 'address', 'district', 'neighborhood', 'brand']
X[categorical_cols] = X[categorical_cols].astype('category')

# 分割訓練集和測試集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)




In [42]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_train = le.fit_transform(y_train)



In [45]:
# 建立XGBoost模型
model = xgb.XGBClassifier(
    enable_categorical=True,
)

# 訓練模型
model.fit(X_train, y_train)


In [56]:
from sklearn.metrics import RocCurveDisplay
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import numpy as np
import seaborn as sn

y_test_pred_xg = model.predict(X_test)

cm = confusion_matrix(y_test, y_test_pred_xg)
class_names = pd.unique(y_test).astype(str)
df_cm = pd.DataFrame(cm, index=class_names, columns=class_names)
plt.figure(figsize = (28,20))

fig, ax = plt.subplots()
sn.set(font_scale=1.4)
sn.heatmap(df_cm, annot=True, fmt='g')

class_names=[0,1]
tick_marks = np.arange(len(class_names))    
plt.tight_layout()
plt.title('Confusion matrix\n', y=1.1)
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)
ax.xaxis.set_label_position("top")
plt.ylabel('Actual label\n')
plt.xlabel('Predicted label\n')

ValueError: Shape of passed values is (961, 961), indices imply (711, 711)

In [57]:
y_test_pred_xg

array([  1, 588,  22, ...,   0,  25, 391], dtype=int64)

In [58]:
y_test

2574       0
3121     503
1957     588
429     1530
465      153
        ... 
2727       1
1718     341
4252       0
3128      25
108      422
Name: popularity, Length: 1423, dtype: int32

In [46]:
# 預測測試集
y_pred = model.predict(X_test)

print(f'Accuracy : {round((178+926)/(178+126+926+177),3)}')    # (TP + TN) / (TP + FP + TN + FN) 
print(f'Precision : {round((178)/(178+126), 3)}')              # TP / (TP + FP)
print(f'Recall/Sensitivity : {round((178)/(178+177), 3)}')     # TP / (TP + FN)
print(f'Specificity : {round((958)/(958+126), 3)}')            # TN / (TN + FP)

Accuracy : 0.785
Precision : 0.586
Recall/Sensitivity : 0.501
Specificity : 0.884
