In [53]:
from sklearn.metrics import f1_score,accuracy_score,precision_score,recall_score,confusion_matrix
from sklearn.model_selection import cross_val_score
def score(m, x_train, y_train, x_test, y_test, train=True):
    if train:
        pred=m.predict(x_train)
        print('Train Result:\n')
        print(f"Accuracy Score: {accuracy_score(y_train, pred)*100:.2f}%")
        print(f"Precision Score: {precision_score(y_train, pred)*100:.2f}%")
        print(f"Recall Score: {recall_score(y_train, pred)*100:.2f}%")
        print(f"F1 score: {f1_score(y_train, pred)*100:.2f}%")
        print(f"Confusion Matrix:\n {confusion_matrix(y_train, pred)}")
    elif train == False:
        pred=m.predict(x_test)
        print('Test Result:\n')
        print(f"Accuracy Score: {accuracy_score(y_test, pred)*100:.2f}%")
        print(f"Precision Score: {precision_score(y_test, pred)*100:.2f}%")
        print(f"Recall Score: {recall_score(y_test, pred)*100:.2f}%")
        print(f"F1 score: {f1_score(y_test, pred)*100:.2f}%")
        print(f"Confusion Matrix:\n {confusion_matrix(y_test, pred)}")

In [54]:
import pandas as pd

df = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")
null_counts = df.isnull().sum()
nan_counts = df.isna().sum()
# print(null_counts,nan_counts)
df = df.drop("index",axis=1)
null_counts = df.isnull().sum()
nan_counts = df.isna().sum()
# print(null_counts,nan_counts)
print(len(df))

156076


In [55]:
df = df[df["0"] < 100].reset_index(drop=True)
print(len(df))

114514


In [69]:
# !pip install xgboost
# !pip install imblearn
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
base_rf = RandomForestClassifier(
        n_estimators=100,
        max_depth=6,
        min_samples_split=5,
        min_samples_leaf=2,
        max_samples=0.5,       
        max_features='sqrt', 
        bootstrap=True,     
        class_weight='balanced',
        random_state=42,
    )

ada_model = AdaBoostClassifier(
    estimator=base_rf,
    n_estimators=200,      
    learning_rate=0.1,
    random_state=42
)

In [48]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# 計算類別權重（輕度不平衡調整）
X, y = df.drop(["target"], axis=1), df["target"]
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# 分割訓練集和測試集
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)
scale_pos_weight = (len(y) - sum(y)) / sum(y)  # 約 0.71
model = XGBClassifier(
    # 核心參數
    # early_stopping_rounds=20,
    n_estimators=401,
    max_depth=9,                # 捕捉局部結構
    learning_rate=0.1,         # 小學習率配合更多樹
    
    # 不平衡調整
    scale_pos_weight=scale_pos_weight,
    
    # 正則化
    subsample=0.8,
    colsample_bytree=0.7,
    gamma=0.1,

    tree_method='hist',         # 加速訓練
    random_state=42
)

# 訓練與評估  eval_set=[(X_test, y_test)],,verbose=True
model.fit(X_train, y_train)

In [49]:
score(model, X_train, y_train, X_test, y_test, train=False)

Test Result:

Accuracy Score: 88.05%
Precision Score: 89.06%
Recall Score: 87.03%
F1 score: 88.03%
Confusion Matrix:
 [[16401  2009]
 [ 2439 16363]]


In [70]:
m2 = ada_model.fit(X, y)

In [71]:
y_pred = m2.predict(df_test.drop("index",axis=1))

results = pd.DataFrame({
    "index": df_test["index"],  # 保留原始 ID
    "target": y_pred
})

results.to_csv("predictions.csv", index=False)

In [65]:
print(y_resampled.value_counts())

target
0    93028
1    93028
Name: count, dtype: int64


## Neural network

In [36]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score

## 1. 數據準備 (假設你已經有 X 和 y)
print(df["target"].value_counts())
X, y = df.drop(["target"], axis=1), df["target"]  # 請替換為你的數據加載方式
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

## 2. 計算類別權重
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weight_dict = {0: class_weights[0], 1: class_weights[1]}

## 3. 定義模型架構
def create_model(input_dim):
    model = Sequential([
        Dense(256, activation='relu', input_shape=(input_dim,)),
        BatchNormalization(),
        Dropout(0.5),
        Dense(128, activation='relu'),
        BatchNormalization(),
        Dropout(0.3),
        Dense(64, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    return model

model = create_model(X_train.shape[1])

## 4. 第一階段：在不平衡數據上預訓練
print("=== 第一階段：原始不平衡數據訓練 ===")
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              loss='binary_crossentropy',
              metrics=['accuracy', 
                       tf.keras.metrics.AUC(name='auc'),
                       tf.keras.metrics.Precision(name='precision'),
                       tf.keras.metrics.Recall(name='recall')])

# 定義早停和模型檢查點
callbacks = [
    EarlyStopping(monitor='val_auc', patience=10, mode='max', verbose=1),
    ModelCheckpoint('phase1_best_model.h5', monitor='val_auc', 
                   save_best_only=True, mode='max', verbose=1)
]

history1 = model.fit(
    X, y,
    epochs=100,
    batch_size=64,
    class_weight=class_weight_dict,
    callbacks=callbacks,
    verbose=1
)

target
0    93028
1    21486
Name: count, dtype: int64


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


=== 第一階段：原始不平衡數據訓練 ===
Epoch 1/100
[1m1790/1790[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 7ms/step - accuracy: 0.5803 - auc: 0.5957 - loss: 0.7033 - precision: 0.2335 - recall: 0.5434
Epoch 2/100
[1m   8/1790[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m13s[0m 7ms/step - accuracy: 0.6411 - auc: 0.6718 - loss: 0.6452 - precision: 0.2923 - recall: 0.5387   

  current = self.get_monitor_value(logs)
  self._save_model(epoch=epoch, batch=None, logs=logs)


[1m1790/1790[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 7ms/step - accuracy: 0.6494 - auc: 0.6771 - loss: 0.6401 - precision: 0.2864 - recall: 0.5955
Epoch 3/100
[1m1790/1790[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 6ms/step - accuracy: 0.6542 - auc: 0.7031 - loss: 0.6272 - precision: 0.3018 - recall: 0.6423
Epoch 4/100
[1m1790/1790[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 6ms/step - accuracy: 0.6601 - auc: 0.7213 - loss: 0.6166 - precision: 0.3135 - recall: 0.6712
Epoch 5/100
[1m1790/1790[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 6ms/step - accuracy: 0.6628 - auc: 0.7300 - loss: 0.6085 - precision: 0.3160 - recall: 0.6834
Epoch 6/100
[1m1790/1790[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 5ms/step - accuracy: 0.6627 - auc: 0.7328 - loss: 0.6088 - precision: 0.3201 - recall: 0.6874
Epoch 7/100
[1m1790/1790[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 6ms/step - accuracy: 0.6720 - auc: 0.7397 - loss: 0.6011

In [37]:
## 5. 第二階段：在平衡數據上微調
print("\n=== 第二階段：平衡數據微調 ===")

# 使用SMOTE平衡訓練數據
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X, y)

# 重新加載第一階段最佳模型
model = tf.keras.models.load_model('phase1_best_model.h5')

# 降低學習率進行微調
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
              loss='binary_crossentropy',
              metrics=['accuracy', 
                       tf.keras.metrics.AUC(name='auc'),
                       tf.keras.metrics.Precision(name='precision'),
                       tf.keras.metrics.Recall(name='recall')])

# 第二階段不需要類別權重，因為數據已經平衡
history2 = model.fit(
    X_train_balanced, y_train_balanced,
    epochs=50,
    batch_size=64,
    callbacks=[
        EarlyStopping(monitor='val_f1_score', patience=5, mode='max', verbose=1,
                      restore_best_weights=True),
        ModelCheckpoint('final_model.h5', monitor='val_f1_score', 
                       save_best_only=True, mode='max', verbose=1)
    ],
    verbose=1
)


=== 第二階段：平衡數據微調 ===




Epoch 1/50
[1m2908/2908[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - accuracy: 0.7023 - auc: 0.7726 - loss: 0.5720 - precision: 0.7080 - recall: 0.6958
Epoch 2/50
[1m  82/2908[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m5s[0m 2ms/step - accuracy: 0.7212 - auc: 0.7994 - loss: 0.5424 - precision: 0.7182 - recall: 0.7170 

  current = self.get_monitor_value(logs)
  self._save_model(epoch=epoch, batch=None, logs=logs)


[1m2908/2908[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - accuracy: 0.7208 - auc: 0.7957 - loss: 0.5477 - precision: 0.7199 - recall: 0.7237
Epoch 3/50
[1m2908/2908[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - accuracy: 0.7278 - auc: 0.8040 - loss: 0.5385 - precision: 0.7271 - recall: 0.7306
Epoch 4/50
[1m2908/2908[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - accuracy: 0.7316 - auc: 0.8079 - loss: 0.5343 - precision: 0.7312 - recall: 0.7331
Epoch 5/50
[1m2908/2908[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - accuracy: 0.7338 - auc: 0.8122 - loss: 0.5288 - precision: 0.7337 - recall: 0.7339
Epoch 6/50
[1m2908/2908[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - accuracy: 0.7383 - auc: 0.8166 - loss: 0.5239 - precision: 0.7389 - recall: 0.7355
Epoch 7/50
[1m2908/2908[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - accuracy: 0.7407 - auc: 0.8186 - loss: 0.5217 - precisio

In [38]:

model.save('final_model.h5') 

## 6. 評估最終模型
print("\n=== 最終評估 ===")
final_model = tf.keras.models.load_model('final_model.h5')

# # 預測概率
# y_pred_proba = final_model.predict(X_val)
# # 轉換為類別預測 (默認閾值0.5)
# y_pred = (y_pred_proba > 0.5).astype(int)

# print(classification_report(y_val, y_pred))
# print(f"Validation F1 Score: {f1_score(y_val, y_pred):.4f}")

# # 可以調整閾值來優化F1分數
# from sklearn.metrics import precision_recall_curve
# precision, recall, thresholds = precision_recall_curve(y_val, y_pred_proba)
# f1_scores = 2 * (precision * recall) / (precision + recall + 1e-8)
# best_threshold = thresholds[np.argmax(f1_scores)]
# print(f"Best threshold for F1: {best_threshold:.4f}")

# # 用最佳閾值重新預測
# y_pred_optimized = (y_pred_proba > best_threshold).astype(int)
# print("\nOptimized Classification Report:")
# print(classification_report(y_val, y_pred_optimized))




=== 最終評估 ===




In [43]:
import tensorflow as tf
import numpy as np

# 1. 載入模型
model = tf.keras.models.load_model('./final_model.h5')

# 2. 準備新數據（這裡創建一個虛擬數據範例）
# 替換為您的實際數據，確保特徵數量與訓練時相同
X_new = df_test.drop("index",axis=1)  # 假設模型預期10個特徵
# print(X_new)
# 3. 進行預測
y_pred_proba = model.predict(X_new)
best_threshold = 0.5  # 使用您的最佳閾值
y_pred = (y_pred_proba > best_threshold).astype(int).flatten()

# 4. 輸出結果
print("\n=== 預測結果 ===")
print(f"輸入數據形狀: {X_new.shape}")
print(f"預測概率: {y_pred_proba[0][0]:.4f}")
# print(f"預測類別 (閾值={best_threshold}): {y_pred[0][0]}")
# print(f"預測標籤: {'1' if y_pred[0][0] else '0'}")



[1m6073/6073[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 511us/step

=== 預測結果 ===
輸入數據形狀: (194330, 96)
預測概率: 0.3358


In [44]:
print(y_pred.sum())

results = pd.DataFrame({
    "index": df_test["index"],  # 保留原始 ID
    "target": y_pred
})

results.to_csv("predictions.csv", index=False)

42834


## Useless

In [9]:
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from xgboost import XGBClassifier

X, y = df.drop(["target"], axis=1), df["target"]

# 分割訓練集和測試集
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

xgb_model = XGBClassifier(
    n_estimators=100,  # 樹的數量
    learning_rate=0.1,  # 學習率
    random_state=42,
    eval_metric='logloss'  # 二分類常用 logloss
)

xgb_model.fit(X_train, y_train)


In [11]:
score(xgb_model, X_train, y_train, X_test, y_test, train=False)

Test Result:

Accuracy Score: 88.94%
Precision Score: 97.31%
Recall Score: 80.06%
F1 score: 87.85%
Confusion Matrix:
 [[18217   411]
 [ 3706 14878]]


In [12]:
m = XGBClassifier(
    n_estimators=100,  # 樹的數量
    learning_rate=0.1,  # 學習率
    random_state=42,
    eval_metric='logloss'  # 二分類常用 logloss
)
cv_f1_scores = cross_val_score(m, X_resampled, y_resampled, cv=5, scoring="f1")

In [13]:
cv_f1_scores

array([0.82301575, 0.8164022 , 0.81841528, 0.9220514 , 0.95858277])

In [28]:
final_m = m.fit(X_resampled, y_resampled)
y_pred = final_m.predict(df_test.drop("index",axis=1))

results = pd.DataFrame({
    "index": df_test["index"],  # 保留原始 ID
    "target": y_pred
})

results.to_csv("predictions.csv", index=False)

## Get outliers

In [15]:
# 將重採樣後的數據合併回DataFrame
resampled_df = pd.DataFrame(X_resampled, columns=X.columns)
resampled_df['target'] = y_resampled

# 檢查新的類別分佈
print(resampled_df['target'].value_counts())

target
1    93028
0    93028
Name: count, dtype: int64


In [22]:
from sklearn.cluster import DBSCAN
import numpy as np
dbscan = DBSCAN(eps=10, min_samples=5)  # 調整參數
clusters = dbscan.fit_predict(resampled_df)

outlier_indices_dbscan = np.where(clusters == -1)[0]  # -1 代表異常點
print(f"Outliers 索引: {len(outlier_indices_dbscan)}")

Outliers 索引: 18544


KeyError: np.int64(156096)

In [30]:
cnt_1,cnt_0=0,0
for id in outlier_indices_dbscan:
    if resampled_df["target"][id] == 1:
        cnt_1 += 1
    if resampled_df["target"][id] == 0:
        cnt_0 += 1
print(f"DBSCAN : count_1 = {cnt_1}, count_0 = {cnt_0}")

df_dbscan = resampled_df[~resampled_df.index.isin(outlier_indices_dbscan)]
print(df_dbscan["target"].value_counts())

DBSCAN : count_1 = 18532, count_0 = 12
target
0    93016
1    74496
Name: count, dtype: int64


In [33]:
X, y = df_dbscan.drop(["target"], axis=1), df_dbscan["target"]

# 分割訓練集和測試集
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
smote = SMOTE(random_state=42)
X_resampled2, y_resampled2 = smote.fit_resample(X, y)
X_resampled2 = X_resampled2.drop("is_outlier",axis=1)

In [35]:
final_m = m.fit(X_resampled2, y_resampled2)
y_pred = final_m.predict(df_test.drop("index",axis=1))

results = pd.DataFrame({
    "index": df_test["index"],  # 保留原始 ID
    "target": y_pred
})

results.to_csv("predictions2.csv", index=False)

In [21]:
# from sklearn.ensemble import IsolationForest
# import numpy as np

# clf = IsolationForest(contamination=0.05, random_state=42)  # contamination 是異常值比例
# outliers = clf.fit_predict(df)  # 回傳 1（正常）, -1（異常）

# outlier_indices_isoF = np.where(outliers == -1)[0]
# print(f"Outliers 索引: {len(outlier_indices_isoF)}")
# cnt_1,cnt_0=0,0
# for id in outlier_indices_isoF:
#     if df["target"][id] == 1:
#         cnt_1 += 1
#     if df["target"][id] == 0:
#         cnt_0 += 1
# print(f"count_1 = {cnt_1}, count_0 = {cnt_0}")
# df_isoF = df[~df.index.isin(outlier_indices_isoF)]

Outliers 索引: 7804
count_1 = 7802, count_0 = 2


In [22]:
# from sklearn.neighbors import LocalOutlierFactor

# lof = LocalOutlierFactor(n_neighbors=15, contamination=0.05)
# outliers = lof.fit_predict(df)

# outlier_indices_local = np.where(outliers == -1)[0]
# print(f"Outliers 索引: {len(outlier_indices_local)}")
# cnt_1,cnt_0=0,0
# for id in outlier_indices_local:
#     if df["target"][id] == 1:
#         cnt_1 += 1
#     if df["target"][id] == 0:
#         cnt_0 += 1
# print(f"DBSCAN : count_1 = {cnt_1}, count_0 = {cnt_0}")

# df_local = df[~df.index.isin(outlier_indices_local)]
# print(df_local["target"].value_counts())

Outliers 索引: 7804
DBSCAN : count_1 = 4699, count_0 = 3105
target
0    89923
1    58349
Name: count, dtype: int64


In [3]:
pip install xgboost



XGBoost1 測試集準確率: 0.84
XGBoost1 F1-score: 0.6160
XGBoost2 測試集準確率: 0.87
XGBoost2 F1-score: 0.7914
XGBoost3 測試集準確率: 0.88
XGBoost3 F1-score: 0.8170


In [26]:
y_pred = xgb_model1.predict(df_test.drop("index",axis=1))
results = pd.DataFrame({
    "index": df_test["index"],  # 保留原始 ID
    "target": y_pred
})

# 輸出 CSV
results.to_csv("predictions1.csv", index=False)

y_pred = xgb_model2.predict(df_test.drop("index",axis=1))
results = pd.DataFrame({
    "index": df_test["index"],  # 保留原始 ID
    "target": y_pred
})

# 輸出 CSV
results.to_csv("predictions2.csv", index=False)
y_pred = xgb_model3.predict(df_test.drop("index",axis=1))
results = pd.DataFrame({
    "index": df_test["index"],  # 保留原始 ID
    "target": y_pred
})

# 輸出 CSV
results.to_csv("predictions3.csv", index=False)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
import numpy as np
from xgboost import XGBClassifier

n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)
learning_rate=[round(float(x),2) for x in np.linspace(start=0.01, stop=0.2, num=10)]
colsample_bytree =[round(float(x),2) for x in np.linspace(start=0.1, stop=1, num=10)]

random_grid = {'n_estimators': n_estimators,
               'max_depth': max_depth,
               'learning_rate': learning_rate,
               'colsample_bytree': colsample_bytree}

xg4 = XGBClassifier(random_state=42)


# 加載數據
X, y = df.drop(["target"], axis=1), df["target"]

# 分割訓練集和測試集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#Random search of parameters, using 3 fold cross validation, search across 100 different combinations, and use all available cores
xg_random = RandomizedSearchCV(estimator = xg4, param_distributions=random_grid,
                              n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=-1)

xg_random.fit(X_train,y_train)
xg_random.best_params_

xg5 = XGBClassifier(colsample_bytree= 0.2, learning_rate=0.09, max_depth= 10, n_estimators=1200)
xg5=xg5.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits




In [None]:
from sklearn.metrics import f1_score

score(xg5, X_train, y_train, X_test, y_test, train=False)

In [None]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import f1_score
# from xgboost import XGBClassifier
# from imblearn.combine import SMOTEENN

# smote_enn = SMOTEENN(random_state=42)
# X_resampled, y_resampled = smote_enn.fit_resample(X, y)
# # 加載數據
# X1, y1 = df_dbscan.drop(["target"], axis=1), df_dbscan["target"]
# X2, y2 = df_isoF.drop(["target"], axis=1), df_isoF["target"]
# X3, y3 = df_local.drop(["target"], axis=1), df_local["target"]

# # 分割訓練集和測試集
# X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.2, random_state=42)
# X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2, random_state=42)
# X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y3, test_size=0.2, random_state=42)
# # 初始化 XGBoost
# xgb_model1 = XGBClassifier(
#     n_estimators=100,  # 樹的數量
#     learning_rate=0.1,  # 學習率
#     random_state=42,
#     eval_metric='logloss'  # 二分類常用 logloss
# )

# xgb_model2 = XGBClassifier(
#     n_estimators=100,  # 樹的數量
#     learning_rate=0.1,  # 學習率
#     random_state=42,
#     eval_metric='logloss'  # 二分類常用 logloss
# )

# xgb_model3 = XGBClassifier(
#     n_estimators=100,  # 樹的數量
#     learning_rate=0.1,  # 學習率
#     random_state=42,
#     eval_metric='logloss'  # 二分類常用 logloss
# )
# # 訓練模型
# xgb_model1.fit(X1_train, y1_train)
# xgb_model2.fit(X2_train, y2_train)
# xgb_model3.fit(X3_train, y3_train)

# # 預測並評估
# y_pred1 = xgb_model1.predict(X1_test)
# y_pred2 = xgb_model2.predict(X2_test)
# y_pred3 = xgb_model3.predict(X3_test)
# print(f"XGBoost1 測試集準確率: {xgb_model1.score(X1_test, y1_test):.2f}")
# print(f"XGBoost1 F1-score: {f1_score(y1_test, y_pred1):.4f}")
# print(f"XGBoost2 測試集準確率: {xgb_model2.score(X2_test, y2_test):.2f}")
# print(f"XGBoost2 F1-score: {f1_score(y2_test, y_pred2):.4f}")
# print(f"XGBoost3 測試集準確率: {xgb_model3.score(X3_test, y3_test):.2f}")
# print(f"XGBoost3 F1-score: {f1_score(y3_test, y_pred3):.4f}")