# function 說明

save_and_open_excel 存成EXCEL 然後 打開
>save_and_open_excel(dataframe, file_name="模型相關數值.xlsx")

analyze_nan_and_zero_values 檢視空值
>analyze_nan_and_zero_values(data, threshold, sort_column="零值+NAN佔比 (%)", ascending=False)

output_coefficients 輸出模型係數
>output_coefficients(model, poly_features, feature_names)

plot_confusion_matrix 繪製混淆矩陣的熱力圖
>plot_confusion_matrix(y_test, y_pred_lr)

In [None]:
def save_and_open_excel(dataframe, file_name="模型相關數值.xlsx"):
    """
    Save ``dataframe`` to a worksheet of an Excel workbook, then try to open it.

    The worksheet is named after a module-level variable bound to
    ``dataframe`` (names starting with ``_`` are only used when nothing else
    matches, so interactive output caches such as ``_`` do not win). When no
    global refers to the object, the sheet is called ``"Sheet1"`` instead of
    raising ``IndexError``. If the workbook already exists the sheet is
    replaced and the other sheets are kept; otherwise a new workbook is
    created.

    :param dataframe: pandas DataFrame to save.
    :param file_name: path of the Excel workbook, default '模型相關數值.xlsx'.
    """
    import os
    import subprocess
    import sys

    import pandas as pd

    # Look up the global variable name(s) bound to this exact object.
    candidates = [name for name, var in globals().items() if var is dataframe]
    public_candidates = [name for name in candidates if not name.startswith("_")]
    if public_candidates:
        frame_name = public_candidates[0]
    elif candidates:
        frame_name = candidates[0]
    else:
        frame_name = "Sheet1"
    # Excel limits worksheet names to 31 characters.
    frame_name = frame_name[:31]

    # Append (replacing the sheet) when the workbook exists, else create it.
    if os.path.isfile(file_name):
        writer_options = {"mode": "a", "if_sheet_exists": "replace"}
    else:
        writer_options = {"mode": "w"}
    with pd.ExcelWriter(file_name, engine="openpyxl", **writer_options) as writer:
        dataframe.to_excel(writer, sheet_name=frame_name, index=False)

    # Best-effort: open the workbook with the platform's default application.
    try:
        if hasattr(os, "startfile"):
            # Windows
            os.startfile(file_name)
        elif sys.platform == "darwin":
            subprocess.run(["open", file_name], check=False)
        else:
            # Other POSIX desktops
            subprocess.run(["xdg-open", file_name], check=False)
    except OSError as e:
        print(f"打開 Excel 文件時發生錯誤: {e}")

In [None]:
"""檢視空值及零值"""

import pandas as pd


def analyze_nan_and_zero_values(
    data, threshold, sort_column="零值+NAN佔比 (%)", ascending=False
):
    """
    Count zero and NaN values per column and report them as percentages.

    The full (unfiltered) statistics table is always written to
    "零值與空白值統計.csv" in the current working directory.

    :param data: a CSV file path (read with ``pd.read_csv``) or a DataFrame.
    :param threshold: percentage; only columns whose "零值+NAN佔比 (%)" is
        strictly greater than this value are returned (e.g. ``threshold=20``
        keeps columns above 20%). ``threshold == 0`` returns every column.
    :param sort_column: column of the result table to sort by.
    :param ascending: sort direction.
    :return: DataFrame with one row per input column, sorted by
        ``sort_column`` (also when ``threshold == 0``).
    :raises ValueError: if ``data`` is neither a path string nor a DataFrame.
    """
    if isinstance(data, str):
        frame = pd.read_csv(data)
    elif isinstance(data, pd.DataFrame):
        frame = data
    else:
        raise ValueError(
            "Invalid input type. Input must be either file path or Pandas DataFrame."
        )

    row_count = len(frame)
    nan_counts = frame.isnull().sum(axis=0)
    zero_counts = (frame == 0).sum(axis=0)
    combined_counts = zero_counts + nan_counts

    def _as_percent(counts):
        # Share of rows, in percent, rounded to two decimals.
        return ((counts / row_count) * 100).values.round(2)

    values_df = pd.DataFrame(
        {
            "欄位名稱": nan_counts.index,
            "零值+NAN佔比 (%)": _as_percent(combined_counts),
            "空白值(NaN)數量": nan_counts.values,
            "空白值(NaN)佔比 (%)": _as_percent(nan_counts),
            "零值數量": zero_counts.values,
            "零值佔比 (%)": _as_percent(zero_counts),
        }
    )

    # Persist the complete table before any filtering.
    values_df.to_csv("零值與空白值統計.csv", index=False, encoding="utf-8-sig")

    # threshold == 0 means "keep everything" (a plain `> 0` filter would drop
    # the columns that have no zero/NaN values at all).
    if threshold != 0:
        values_df = values_df[values_df["零值+NAN佔比 (%)"] > threshold]

    # Sort in every case so sort_column/ascending are never silently ignored.
    return values_df.sort_values(by=sort_column, ascending=ascending)


# 測試


# result_df = analyze_nan_and_zero_values(clean_data, threshold=0, sort_column='欄位名稱', ascending=False)


# print(result_df)

In [None]:
"""輸出模型係數"""

import pandas as pd
import os


def output_coefficients(model, poly_features, feature_names):
    """
    Print a fitted linear model's non-zero coefficients and save them to Excel.

    The table (intercept first, then every non-zero coefficient with its
    feature name) is printed, written to "coefficients.xlsx" and opened with
    the platform's default application on a best-effort basis.

    :param model: fitted estimator exposing ``coef_`` and ``intercept_``.
        ``coef_`` may be 1-D or have a single row (shape ``(1, n)``).
    :param poly_features: optional polynomial-features transformer. It is
        only consulted when there are fewer ``feature_names`` than
        coefficients and it offers ``get_feature_names_out``; the expanded
        names are then used. May be ``None``.
    :param feature_names: list of feature-name strings.
    :raises ValueError: if ``coef_`` has several rows (one per class/target)
        or if there are fewer names than coefficients.
    """
    import subprocess
    import sys

    import numpy as np

    coef = np.atleast_1d(np.squeeze(np.asarray(model.coef_)))
    if coef.ndim != 1:
        raise ValueError(
            "output_coefficients expects a single coefficient vector; "
            f"got coef_ with shape {np.shape(model.coef_)}."
        )

    # Unwrap a one-element intercept array so the table holds a plain number.
    intercept = np.asarray(model.intercept_)
    intercept = intercept.item() if intercept.size == 1 else model.intercept_

    names = list(feature_names)
    if len(names) < len(coef) and hasattr(poly_features, "get_feature_names_out"):
        # The model was presumably fit on the expanded polynomial features.
        names = list(poly_features.get_feature_names_out(feature_names))
    if len(names) < len(coef):
        raise ValueError(
            f"Got {len(names)} feature names for {len(coef)} coefficients."
        )

    # Table of non-zero coefficients, with the intercept as the first row.
    keep = coef != 0
    coef_df = pd.DataFrame(
        {
            "係數": coef[keep],
            "特徵": [name for name, kept in zip(names, keep) if kept],
        }
    )
    coef_df = pd.concat(
        [pd.DataFrame({"係數": [intercept], "特徵": ["截距"]}), coef_df],
        ignore_index=True,
    )

    # Table listing the feature names as passed in.
    feature_names_df = pd.DataFrame({"特徵名稱": feature_names})

    print("係數和截距表：")
    print(coef_df)
    print("\n特徵名稱表：")
    print(feature_names_df)

    coef_df.to_excel("coefficients.xlsx", index=False)

    # Best-effort: open the workbook with the platform's default application.
    try:
        if hasattr(os, "startfile"):
            os.startfile("coefficients.xlsx")
        elif sys.platform == "darwin":
            subprocess.run(["open", "coefficients.xlsx"], check=False)
        else:
            subprocess.run(["xdg-open", "coefficients.xlsx"], check=False)
    except OSError as e:
        print(f"打開 Excel 文件時發生錯誤: {e}")

In [None]:
"""創建混合矩陣"""

from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np  # 確保導入 numpy


def plot_confusion_matrix(y_true, y_pred, figsize=(10, 7), cmap="Blues"):
    """
    Plot the row-normalized confusion matrix as a heatmap.

    Each row (actual label) is divided by its total so cells show the share
    of that class predicted as each label. A row with no true samples (a
    label that only appears in ``y_pred``) is shown as zeros instead of NaN.

    Parameters:
    y_true -- true labels
    y_pred -- predicted labels
    figsize -- figure size (default (10, 7))
    cmap -- heatmap colour map (default 'Blues')
    """
    cm = confusion_matrix(y_true, y_pred)

    # Normalize per row; leave all-zero rows at 0 rather than dividing by 0.
    row_totals = cm.sum(axis=1, keepdims=True)
    cm_ratio = np.divide(
        cm,
        row_totals,
        out=np.zeros(cm.shape, dtype=float),
        where=row_totals != 0,
    )

    plt.figure(figsize=figsize)
    sns.heatmap(
        cm_ratio, annot=True, fmt=".2f", cmap=cmap
    )  # fmt='.2f' shows two decimals
    plt.title("Confusion Matrix")
    plt.ylabel("Actual label")
    plt.xlabel("Predicted label")
    plt.show()


# 使用此函數的示例
# plot_confusion_matrix(y_test, y_pred_lr)  # 調用函數繪製混淆矩陣

In [None]:
"""二類分類指標"""


from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
)

import matplotlib.pyplot as plt

import seaborn as sn
import numpy as np
import pandas as pd


# Silence warnings of category UserWarning

import warnings


warnings.filterwarnings("ignore", category=UserWarning)


# Use a font that can render the Chinese labels in the plots
# NOTE(review): "Microsoft YaHei" is presumably only installed on Windows —
# confirm it is available where this notebook runs.

plt.rcParams["font.family"] = ["Microsoft YaHei"]

plt.rcParams["axes.unicode_minus"] = False



def evaluate_model_multi_class(y_test, y_pred, figsize=(10, 7), cmap="Blues"):
    """
    Show the confusion matrix and print macro-averaged classification metrics.

    The label set is the union of the labels in ``y_test`` and ``y_pred``,
    so a class that is only predicted (or only present in the test set)
    still gets its own row/column, and the axes show the real label values.

    Parameters:
    y_test -- true labels
    y_pred -- predicted labels
    figsize -- figure size for the heatmap (default (10, 7))
    cmap -- heatmap colour map (default 'Blues')
    """
    # Every label that occurs in either input, in sorted order.
    labels = np.union1d(np.ravel(y_test), np.ravel(y_pred))

    # Confusion matrix (raw counts) with explicit, consistent label order.
    cm = confusion_matrix(y_test, y_pred, labels=labels)
    df_cm = pd.DataFrame(cm, index=labels, columns=labels)

    plt.figure(figsize=figsize)
    sn.heatmap(df_cm, annot=True, fmt="g", cmap=cmap)
    plt.title("Confusion matrix (混淆矩陣)\n", y=1.1)
    plt.ylabel("Actual label (實際標籤)\n")
    plt.xlabel("Predicted label (預測標籤)\n")
    plt.show()

    # Overall accuracy plus macro-averaged precision, recall and F1.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average="macro")
    recall = recall_score(y_test, y_pred, average="macro")
    f1 = f1_score(y_test, y_pred, average="macro")

    print(f"Accuracy (準確率): {accuracy}")

    print(f"Precision (精確率) - Macro Average: {precision}")

    print(f"Recall (召回率) - Macro Average: {recall}")

    print(f"F1 Score (F1分數) - Macro Average: {f1}")



# 使用範例

# y_pred = model.predict(X_test)

# evaluate_model_multi_class(y_test, y_pred)

# 輸入資料

In [None]:
import pandas as pd

# Source workbook, given relative to the parent directory.
# NOTE(review): the backslash separator makes this path Windows-style.
# file_path = r"..\飲料店總表0307final01_補上人氣_補值_xlsxclustered_HG_data.xlsx"
file_path = r"..\飲料店總表0307final01_補上人氣_補值_hg.xlsx"
# dataset = pd.read_csv(file_path, sep=",", encoding="UTF-8")

# Load the Excel file into the `dataset` DataFrame.
dataset = pd.read_excel(file_path)
# print(dataset.info())

In [None]:
# Summary statistics of `dataset`, rounded to two decimals.
dataset.describe().round(2)

In [None]:
# Compute zero/NaN statistics for every column (threshold 0 keeps all of
# them). The returned table is not used here.
analyze_nan_and_zero_values(dataset, 0, sort_column="零值+NAN佔比 (%)", ascending=False)
# Last expression of the cell: the dataset itself.
dataset

## 選擇需要的資料

In [None]:
"""selected_data
"""

# Keep only the columns used for modelling. `.copy()` makes selected_data an
# independent DataFrame, so the columns assigned to it later (e.g.
# "popularity_category") are not written to a slice of `dataset`
# (SettingWithCopyWarning).
selected_data = dataset[
    [
        "star",
        "school_counts",
        "drink_counts",
        "train_counts",
        "youbike_counts",
        "bus_counts",
        "park_counts",
        "night_market_counts",
        "sports_facilities_counts",
        "mrt_counts",
        "movie_theater_counts",
        "hospital_counts",
        "salary_income_median",
        "people_flow_mean",
        "knock_down_price_mean",
        "weekend_open",
        "road_area_ratio",
        "age",
        "weekday_working_hours_average",
        # "comment",
        # "people_flow_average",
        "popularity",
        # "KMEANS",
    ]
].copy()
# selected_data

In [None]:
# Compute the correlation matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Restrict to numeric/bool columns explicitly: `dataset` also holds text
# columns, and DataFrame.corr() raises on those in pandas >= 2.0 instead of
# silently dropping them.
corrmat = dataset.select_dtypes(include=["number", "bool"]).corr()

# Draw the heatmap
plt.figure(figsize=(28, 24))
sns.heatmap(corrmat, annot=True, annot_kws={"size": 12})
plt.show()

## 顯示關係性太低的


In [None]:
"""篩選關係姓小的"""

# Correlation of every column of selected_data with 'popularity'
# (the variable names still say "kmeans", but the target column used here is
# 'popularity').
corr_with_kmeans = selected_data.corr()["popularity"]

# Keep the entries whose absolute correlation with 'popularity' is below 0.1
low_corr_with_kmeans = corr_with_kmeans[abs(corr_with_kmeans) < 0.1]

# Print the names of these columns, then drop them from selected_data
print("與 'popularity' 欄位相關性小於0.1的欄位：")
print(low_corr_with_kmeans.index.tolist())
selected_data = selected_data.drop(low_corr_with_kmeans.index.tolist(), axis=1)

In [None]:
"""將連續數值popularity 轉成 類別數值 popularity_category
"""

# 1. Cast the text columns of `dataset` to the pandas "category" dtype,
#    one column at a time and in the original order.
for column_name in ("name", "class", "address", "district", "neighborhood", "brand"):
    dataset[column_name] = dataset[column_name].astype("category")


# '''Fill missing values with -1'''
# dataset['brand'] = dataset['brand'].fillna(-1)
# dataset['Saturday_open_hours'] = dataset['Saturday_open_hours'].fillna(-1)
# dataset['Sunday_open_hours'] = dataset['Sunday_open_hours'].fillna(-1)

In [None]:
"""依照數量"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# Seed NumPy's global random generator (no random data is drawn in this cell).
np.random.seed(0)


# Split 'popularity' into 5 equally populated groups with pd.qcut
Y_classification_pd = pd.DataFrame({"value": selected_data["popularity"]})


Y_classification_pd["category"] = pd.qcut(
    Y_classification_pd["value"],
    q=5,
    labels=["Category 0", "Category 1", "Category 2", "Category 3", "Category 4"],
)


# Per-group minimum, maximum and size. `observed=False` spells out the
# behaviour the original relied on (all five categories are listed) instead
# of depending on the default for categorical groupers.
summary_df = (
    Y_classification_pd.groupby("category", observed=False)["value"]
    .agg([("最小值", "min"), ("最大值", "max"), ("數量", "size")])
    .reset_index()
)


# Bin edges: the minimum of each group plus the overall maximum
value_category_ranges = [
    summary_df.loc[0, "最小值"],  # minimum of the first group
    summary_df.loc[1, "最小值"],  # minimum of the second group
    summary_df.loc[2, "最小值"],  # minimum of the third group
    summary_df.loc[3, "最小值"],  # minimum of the fourth group
    summary_df.loc[4, "最小值"],  # minimum of the fifth group
    summary_df.loc[4, "最大值"],  # maximum of the fifth group
]


# Re-bin 'popularity' on those edges with pd.cut
selected_data["popularity_category"] = pd.cut(
    selected_data["popularity"],
    bins=value_category_ranges,
    right=False,  # left edge included, right edge excluded
    labels=[0, 1, 2, 3, 4],
)


# Values outside the bins (the overall maximum, excluded by right=False)
# go to the top class 4. Assign the result back: an in-place fillna on a
# column selection is a chained assignment and does not update the frame
# under pandas Copy-on-Write.
selected_data["popularity_category"] = selected_data["popularity_category"].fillna(4)


# Convert 'popularity_category' to integers
selected_data["popularity_category"] = selected_data["popularity_category"].astype(int)


# Print the class counts and the per-group summary
print(selected_data["popularity_category"].value_counts().sort_index())
print(summary_df)

In [None]:
"""因為數值分布差很多 刪掉極端"""

# Keep only the rows whose 'popularity' is at most 2213.641425 (drops the
# extreme high values). `.copy()` keeps selected_data an independent frame so
# later column assignments do not hit a filtered slice.
selected_data = selected_data[selected_data["popularity"] <= 2213.641425].copy()
# selected_data = selected_data[selected_data["popularity"] != 0]

# Show the filtered data
# print(selected_data)

In [None]:
import numpy as np

# Bin 'popularity' into equal-width ranges with pd.cut

# Range of the data

min_val = selected_data["popularity"].min()

max_val = selected_data["popularity"].max()


# Six evenly spaced edges -> five bins

bins = np.linspace(min_val, max_val, 6)


# Assign each row to its bin; this overwrites any existing
# 'popularity_category' column

selected_data["popularity_category"] = pd.cut(
    selected_data["popularity"],
    bins=bins,
    include_lowest=True,  # make sure the minimum value is included
    labels=[0, 1, 2, 3, 4],  # label of each range
)


# Print the class counts

# print(selected_data['popularity_category'].value_counts().sort_index())


# Per-class minimum, maximum and row count

summary_df = (
    selected_data.groupby("popularity_category")["popularity"]
    .agg(最小值="min", 最大值="max", 數量="size")
    .reset_index()
)

print(summary_df)

In [None]:
"""轉位數"""

# Round 'age' to 2 decimals and 'road_area_ratio' to 3 decimals
selected_data["age"] = selected_data["age"].round(2)
selected_data["road_area_ratio"] = selected_data["road_area_ratio"].round(3)


# dataset['brand'] = dataset['brand'].fillna(-1)
# dataset['Saturday_open_hours'] = dataset['Saturday_open_hours'].fillna(-1)
# dataset['Sunday_open_hours'] = dataset['Sunday_open_hours'].fillna(-1)


# X = dataset.drop(
#     ['comment','star','people_flow_average','popularity',"KMEANS"], axis=1
# )
# Target: the popularity class
y = selected_data["popularity_category"]

# Features: everything except the raw popularity and its class
X = selected_data.drop(["popularity", "popularity_category"], axis=1)

# y = dataset["KMEANS"]
# Turn the target Series into a one-column DataFrame
# NOTE(review): estimators generally expect a 1-D target; a column-vector y
# may be what triggers the warnings silenced elsewhere — confirm.
y = y.to_frame()
print(type(y))

### 輸入前最後一次確認參數型態

In [None]:
# Zero/NaN statistics for every feature column of X (threshold 0 keeps all).
analyze_nan_and_zero_values(X, 0, sort_column="零值+NAN佔比 (%)", ascending=False)
# analyze_nan_and_zero_values(y, 0, sort_column="零值+NAN佔比 (%)", ascending=False)

In [None]:
# Split into training and test sets
from sklearn.model_selection import train_test_split

# test_size=0.3 holds out 30% for testing; a fixed random_state makes the
# split reproducible (the same rows are drawn on every run)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=25
)

# 模型

## LogisticRegression

In [None]:
# Silence warnings of category UserWarning
import warnings

warnings.filterwarnings("ignore", category=UserWarning)


# Import and train the Logistic Regression model
from sklearn.linear_model import LogisticRegression

lr_model = LogisticRegression(max_iter=1000, random_state=42)
lr_model.fit(X_train, y_train)

# Predict the test set and compute the accuracy
y_pred_lr = lr_model.predict(X_test)
from sklearn.metrics import accuracy_score, classification_report

accuracy_lr = accuracy_score(y_test, y_pred_lr)
classification_report_lr = classification_report(y_test, y_pred_lr)

# Print the results
print(accuracy_lr)
print(classification_report_lr)

# plot_confusion_matrix(y_test, y_pred_lr)

### 混淆矩陣(Confusion Matrix)

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Predict the test set with the model
# NOTE(review): y_pred is not used below; the matrix is built from y_pred_lr.
y_pred = lr_model.predict(X_test)

# Build the confusion matrix
cm = confusion_matrix(y_test, y_pred_lr)


# Convert counts to per-row ratios (`np` is not imported in this cell)
cm_ratio = cm.astype("float") / cm.sum(axis=1)[:, np.newaxis]

# Draw the confusion-matrix heatmap with Seaborn
plt.figure(figsize=(10, 7))
sns.heatmap(cm_ratio, annot=True, fmt=".2f", cmap="Blues")  # fmt='.2f' shows two decimals
plt.title("Confusion Matrix")
plt.ylabel("Actual label")
plt.xlabel("Predicted label")
plt.show()

### 交叉驗證(Cross-Validation)

In [None]:
# Silence warnings of category UserWarning
import warnings

warnings.filterwarnings("ignore", category=UserWarning)


from sklearn.model_selection import cross_val_score

# Define the model
lr_model_cv = LogisticRegression(max_iter=1000, random_state=42)

# 5-fold cross-validation on the training set
scores = cross_val_score(lr_model_cv, X_train, y_train, cv=5)

# Print the score of each fold and their mean
print("每一輪的準確率:", scores)
print("平均準確率:", scores.mean())

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
import numpy as np

# Ten C values, log-spaced from 1e-4 to 1e3
C_values = np.logspace(-4, 3, 10)

# "Original" search: C combined with L1 / L2 regularization only
original_param_grid = {"C": C_values, "penalty": ["l1", "l2"]}
original_grid_search = GridSearchCV(
    LogisticRegression(
        max_iter=1000, random_state=42, solver="liblinear"
    ),  # 'liblinear' supports both 'l1' and 'l2'
    original_param_grid,
    cv=5,
    scoring="accuracy",
)

# Run the original grid search
original_grid_search.fit(X_train, y_train)
original_best_params = original_grid_search.best_params_
original_best_score = original_grid_search.best_score_

# Elastic Net search: C combined with l1_ratio
elastic_param_grid = {
    "C": C_values,
    "l1_ratio": np.linspace(0, 1, 5),  # 5 evenly spaced points from 0 to 1
}
elastic_grid_search = GridSearchCV(
    LogisticRegression(
        max_iter=1000, random_state=42, penalty="elasticnet", solver="saga"
    ),
    elastic_param_grid,
    cv=5,
    scoring="accuracy",
)

# Run the Elastic Net grid search
elastic_grid_search.fit(X_train, y_train)
elastic_best_params = elastic_grid_search.best_params_
elastic_best_score = elastic_grid_search.best_score_

# Show both results
print("原始最佳參數:", original_best_params)
print("原始最高準確率:", original_best_score)
print("Elastic Net 最佳參數:", elastic_best_params)
print("Elastic Net 最高準確率:", elastic_best_score)

# Report which search scored higher; on a tie the else branch reports
# Elastic Net as the better one
if original_best_score > elastic_best_score:
    print("原始模型較好。")
else:
    print("Elastic Net 模型較好。")

In [None]:
import matplotlib.pyplot as plt

# Use a font that can render the Chinese labels in the plots
plt.rcParams["font.family"] = ["Microsoft YaHei"]
plt.rcParams["axes.unicode_minus"] = False

In [None]:
# Parameter combinations and their mean cross-validated scores
params = original_grid_search.cv_results_["params"]
mean_scores = original_grid_search.cv_results_["mean_test_score"]

# Plot mean accuracy against C for the original search, one line per penalty
plt.figure(figsize=(10, 6))
for penalty, marker in zip(["l1", "l2"], ["o", "s"]):
    # Boolean mask selecting the results that used this penalty
    penalty_mask = [param["penalty"] == penalty for param in params]
    plt.plot(
        C_values, mean_scores[penalty_mask], marker=marker, label=f"penalty={penalty}"
    )

    # Annotate every point with its C value
    for i, c in enumerate(C_values):
        plt.text(
            c,
            mean_scores[penalty_mask][i],
            f"C={c:.2f}",
            fontsize=8,
            ha="center",
            va="bottom",
        )

plt.xscale("log")
plt.xlabel("C 值")
plt.ylabel("平均準確率")
plt.title("C 值與平均準確率的關係")
plt.legend()
plt.grid(True)
plt.show()

### 使用最好的C

In [None]:
"""使用最好的C"""

# Import and train the Logistic Regression model
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Configure the model with the best parameters found by the original
# (L1/L2) grid search. They are read from `original_best_params` instead of
# being hard-coded, so they cannot go stale when the data or the grid
# changes (the previous literal C=278.2559... is not a value of the current
# C_values grid).
lr_model_best = LogisticRegression(
    C=original_best_params["C"],
    penalty=original_best_params["penalty"],
    solver="liblinear",
    max_iter=1000,
    random_state=42,
)
lr_model_best.fit(X_train, y_train)

# Predict the test set and compute the accuracy
y_pred_best_lr = lr_model_best.predict(X_test)
from sklearn.metrics import accuracy_score, classification_report

accuracy_lr = accuracy_score(y_test, y_pred_best_lr)
classification_report_lr = classification_report(y_test, y_pred_best_lr)

# Print the results
print("準確率:", accuracy_lr)
print("分類報告:\n", classification_report_lr)

"""建立混淆矩陣"""

plot_confusion_matrix(y_test, y_pred_best_lr)


"""交叉驗證(Cross-Validation)"""
# 5-fold cross-validation on the training set
scores = cross_val_score(lr_model_best, X_train, y_train, cv=5)

# Print the score of each fold and their mean
print("每一輪的準確率:", scores)
print("平均準確率:", scores.mean())

"""輸出模型"""
from joblib import dump

# Save the model
dump(lr_model_best, "lr_model_best.joblib")


evaluate_model_multi_class(y_test, y_pred_best_lr)

### 模型係數(Coefficients)

In [None]:
# Inspect the model coefficients
import pandas as pd

feature_names = X_train.columns
# NOTE(review): coef_[0] is only the first row of the coefficient matrix;
# for a multi-class model that is presumably one class only — confirm that
# this is the row intended for the "feature importance" plot.
coefficients = lr_model_best.coef_[0]
feature_importance = pd.DataFrame(
    {"Feature": feature_names, "Coefficient": coefficients}
)
# Sort by signed coefficient value, largest first (not by magnitude)
feature_importance = feature_importance.sort_values(by="Coefficient", ascending=False)
print(feature_importance)

# Same descending sort again, stored under a second name
feature_importance_sorted = feature_importance.sort_values(
    by="Coefficient", ascending=False
)

# Horizontal bar chart of the first 10 rows; the largest coefficient is shown
# at the top
plt.figure(figsize=(10, 6))
plt.barh(
    feature_importance_sorted["Feature"][:10],
    feature_importance_sorted["Coefficient"][:10],
)
plt.xlabel("Coefficient")
plt.ylabel("Feature")
plt.title("Top 10 Features Importance - Logistic Regression")
plt.gca().invert_yaxis()  # put the first (largest) entry at the top
plt.show()

## SVM

In [None]:
from sklearn.svm import SVC

# Train a support vector machine with the default kernel settings
# svm_model = SVC(kernel='rbf', random_state=42)
svm_model = SVC(random_state=42)
svm_model.fit(X_train, y_train)

# Predict the test set
y_pred_svm = svm_model.predict(X_test)

# Compute the accuracy and the classification report
# (accuracy_score / classification_report are not imported in this cell)
accuracy_svm = accuracy_score(y_test, y_pred_svm)
classification_report_svm = classification_report(y_test, y_pred_svm)

print(accuracy_svm)
print(classification_report_svm)

plot_confusion_matrix(y_test, y_pred_svm)

### 交叉驗證(Cross-Validation)

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

# Create a fresh support vector machine model
svm_model = SVC(random_state=42)

# 5-fold cross-validation on the training set
scores = cross_val_score(svm_model, X_train, y_train, cv=5)

# Print the score of each fold and their mean
print("每一折的準確率: ", scores)
print("交叉驗證結果：平均準確率", scores.mean())

### 參數調整(Parameter Tuning)

In [None]:
import warnings
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import expon, reciprocal
import numpy as np
import matplotlib.pyplot as plt

# Silence warnings of category UserWarning
warnings.filterwarnings("ignore", category=UserWarning)

# Fixed C candidates for the grid search and a C distribution for the
# randomized search
grid_C_values = [0.1, 1, 10, 100]
dist_C_values = reciprocal(0.1, 100)

# Parameter grid and parameter distributions
param_grid = {"C": grid_C_values, "gamma": [1, 0.1, 0.01, 0.001]}
param_dist = {"C": dist_C_values, "gamma": expon(scale=1.0)}

# Create the GridSearchCV and RandomizedSearchCV objects
# NOTE(review): no random_state is passed to RandomizedSearchCV, so the 100
# sampled candidates presumably differ between runs — confirm whether
# reproducibility is needed.
grid_search = GridSearchCV(SVC(), param_grid, refit=True, verbose=0)
random_search = RandomizedSearchCV(
    SVC(), param_distributions=param_dist, n_iter=100, refit=True, verbose=0
)

# Fit both searches on the training set
grid_search.fit(X_train, y_train)
random_search.fit(X_train, y_train)

# Keep the cross-validation results of both searches
results_grid = grid_search.cv_results_
results_random = random_search.cv_results_

In [None]:
# Best parameters and best cross-validated score of each search
best_grid_params = grid_search.best_params_
best_random_params = random_search.best_params_
best_grid_score = grid_search.best_score_
best_random_score = random_search.best_score_


# Compare the scores, report the better search and build an SVC from its
# parameters
if best_grid_score > best_random_score:
    print("最佳參數 (Grid Search CV): ", best_grid_params)
    print("最佳準確率 (Grid Search CV): ", best_grid_score)
    svc_best = SVC(C=best_grid_params["C"], gamma=best_grid_params["gamma"])
elif best_grid_score < best_random_score:
    print("最佳參數 (Randomized Search CV): ", best_random_params)
    print("最佳準確率 (Randomized Search CV): ", best_random_score)
    svc_best = SVC(C=best_random_params["C"], gamma=best_random_params["gamma"])
else:
    print("兩種搜索方法提供了相同的準確率。")
    # Either parameter set would do; the grid-search parameters are used
    svc_best = SVC(C=best_grid_params["C"], gamma=best_grid_params["gamma"])

"""交叉驗證"""

from sklearn.model_selection import cross_val_score

# 5-fold cross-validation on the training set. It is run once only: it does
# not require svc_best to be fitted first, so the second identical run that
# used to follow the fit was redundant work.
cross_val_scores_best = cross_val_score(svc_best, X_train, y_train, cv=5)


# Train the SVC with the best parameters on the whole training set
svc_best.fit(X_train, y_train)

# Mean cross-validated accuracy
avg_score_best = np.mean(cross_val_scores_best)
print("平均交叉驗證準確率: ", avg_score_best)


"""輸出模型"""
from joblib import dump

# Save the model
dump(svc_best, "svc_best.joblib")

In [None]:
# Evaluate the tuned SVC on the test set. The prediction must come from
# svc_best; it previously came from lr_model_best, so the logistic
# regression results were reported as the SVC's.
y_pred_best_SVC = svc_best.predict(X_test)

evaluate_model_multi_class(y_test, y_pred_best_SVC)

## AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Train the AdaBoost model with 50 estimators and the default base learner
ada_model = AdaBoostClassifier(n_estimators=50, random_state=42)
ada_model.fit(X_train, y_train)

# Predict the test set
y_pred_ada_transformed = ada_model.predict(X_test)

# Accuracy and classification report
accuracy_ada = accuracy_score(y_test, y_pred_ada_transformed)
classification_report_ada = classification_report(y_test, y_pred_ada_transformed)

# Shift the predictions by +1
# NOTE(review): the original comment describes this as converting back to a
# 1~5 range, but no label transformation happens in this cell and the result
# is not used below — confirm whether it is still needed.
y_pred_ada_original = y_pred_ada_transformed + 1

# Print the results
print(f"Accuracy (準確率): {accuracy_ada}")
print("Classification Report (分類報告):\n", classification_report_ada)

# Use y_pred_ada_original if the shifted predictions are needed elsewhere

# Confusion matrix
cm = confusion_matrix(y_test, y_pred_ada_transformed)

# Convert counts to per-row ratios
cm_ratio = cm.astype("float") / cm.sum(axis=1)[:, np.newaxis]

# # Draw the confusion-matrix heatmap with Seaborn
# plt.figure(figsize=(10, 7))
# sns.heatmap(cm_ratio, annot=True, fmt=".2f", cmap="Blues")  # fmt='.2f' shows two decimals
# plt.title("Confusion Matrix (混淆矩陣)")
# plt.ylabel("Actual label (實際標籤)")
# plt.xlabel("Predicted label (預測標籤)")
# plt.show()

### 交叉驗證(Cross-Validation)

In [None]:
from sklearn.model_selection import cross_val_score

# Define the AdaBoost classifier
ada_model = AdaBoostClassifier(n_estimators=50, random_state=42)

# Cross-validate; `cv` sets the number of folds
# A larger fold count (e.g. cv=10) can be considered for small datasets
# X and y must be the feature and label data
# This can take a while on large datasets
# NOTE(review): this uses the full X / y, whereas the logistic-regression
# and SVM cells cross-validate on X_train / y_train — confirm it is intended.
cv_scores = cross_val_score(ada_model, X, y, cv=5)

# Print the cross-validation results
print("交叉驗證準確率:", cv_scores)
print("平均交叉驗證準確率:", np.mean(cv_scores))

### 參數調整(Parameter Tuning)

In [None]:
# Silence warnings of category UserWarning
import warnings

warnings.filterwarnings("ignore", category=UserWarning)

from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

# Parameter grid (grid search) and parameter distributions (randomized search)
param_grid_ada = {
    "n_estimators": [10, 50, 100, 200],
    "learning_rate": [0.01, 0.1, 1, 10],
}
param_dist_ada = {"n_estimators": randint(50, 500), "learning_rate": [0.01, 0.1, 1, 10]}

# Create the GridSearchCV and RandomizedSearchCV objects
grid_search_ada = GridSearchCV(
    AdaBoostClassifier(random_state=42), param_grid_ada, refit=True, verbose=0
)
random_search_ada = RandomizedSearchCV(
    AdaBoostClassifier(random_state=42),
    param_distributions=param_dist_ada,
    n_iter=100,
    refit=True,
    verbose=0,
)

# Fit both searches on the training set
grid_search_ada.fit(X_train, y_train)
random_search_ada.fit(X_train, y_train)

# Best refitted estimator of each search
best_model_grid_ada = grid_search_ada.best_estimator_
best_model_random_ada = random_search_ada.best_estimator_

# Print the best parameters
print(
    "Best parameters (GridSearchCV) (最佳參數-網格搜索):", grid_search_ada.best_params_
)
print(
    "Best parameters (RandomizedSearchCV) (最佳參數-隨機搜索):",
    random_search_ada.best_params_,
)

# Predict the test set with both best models
y_pred_ada_best_grid = best_model_grid_ada.predict(X_test)
y_pred_ada_best_random = best_model_random_ada.predict(X_test)

# y_pred_ada_best_grid and y_pred_ada_best_random can be analysed further

# Test-set accuracy of both models
accuracy_ada_best_grid = accuracy_score(y_test, y_pred_ada_best_grid)
accuracy_ada_best_random = accuracy_score(y_test, y_pred_ada_best_random)

# Print the accuracies
print("Accuracy (GridSearchCV) (準確率-網格搜索):", accuracy_ada_best_grid)
print("Accuracy (RandomizedSearchCV) (準確率-隨機搜索):", accuracy_ada_best_random)

# Keep the model with the higher test accuracy (the randomized-search model
# wins ties)
# NOTE(review): the final model is selected on test-set accuracy, so that
# accuracy is presumably no longer an unbiased estimate — confirm.
if accuracy_ada_best_grid > accuracy_ada_best_random:
    AdaBoost_model_best = best_model_grid_ada
    print("GridSearchCV 的模型表現較好。")
else:
    AdaBoost_model_best = best_model_random_ada
    print("RandomizedSearchCV 的模型表現較好。")

# The best model is now available for further use
# e.g. more predictions or deeper analysis

In [None]:
# Silence warnings of category UserWarning
import warnings

warnings.filterwarnings("ignore", category=UserWarning)

from sklearn.model_selection import cross_val_score

# Evaluate the chosen model with cross-validation on the full X / y
cross_val_scores = cross_val_score(AdaBoost_model_best, X, y, cv=5)  # 5-fold cross-validation

# Print the score of each fold
print("Cross Validation Scores:", cross_val_scores)
# Print the mean cross-validated score
print("平均準確率:", cross_val_scores.mean())

# Predict the test set with the best model
y_pred_best_ada = AdaBoost_model_best.predict(X_test)


evaluate_model_multi_class(y_test, y_pred_best_ada)
plot_confusion_matrix(y_test, y_pred_best_ada)
"""輸出模型"""
from joblib import dump, load

# Save the model
dump(AdaBoost_model_best, "AdaBoost_model_best.joblib")

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# AdaBoost_model_best is the trained model selected earlier
feature_names = X_train.columns  # X_train is the training feature set
importances = AdaBoost_model_best.feature_importances_  # feature importances
feature_importance = pd.DataFrame({"Feature": feature_names, "Importance": importances})

# Sort by importance, largest first
feature_importance = feature_importance.sort_values(by="Importance", ascending=False)

# Show the sorted DataFrame
print(feature_importance)

# Horizontal bar chart of the feature importances; the most important
# feature is shown at the top
plt.figure(figsize=(10, 6))
plt.barh(
    feature_importance["Feature"][:10],  # top 10 features only
    feature_importance["Importance"][:10],
)
plt.xlabel("Importance")
plt.ylabel("Feature")
plt.title("Top 10 Feature Importance - AdaBoost")
plt.gca().invert_yaxis()  # put the most important feature at the top
plt.show()

## XGboost

In [None]:
pip install xgboost -q


In [None]:
from sklearn.preprocessing import LabelEncoder
import numpy as np
import xgboost as xgb

# Encode the class labels as 0..n-1 for XGBoost. The encoder is fitted on
# the full target `y` and applied to BOTH splits, so y_train and y_test stay
# in the same label space (previously only y_train was encoded). The splits
# are rebuilt from `y` through the row index of X_train / X_test, which
# keeps this cell safe to re-run.
le = LabelEncoder()
le.fit(np.ravel(y))
y_train = le.transform(np.ravel(y.loc[X_train.index]))
y_test = le.transform(np.ravel(y.loc[X_test.index]))

# Build the XGBoost model
model = xgb.XGBClassifier(
    enable_categorical=True,
)
# Train the model
model.fit(X_train, y_train)

In [None]:
# Predict the test set with the XGBoost model
y_pred = model.predict(X_test)


from sklearn.metrics import (
    precision_score,
    recall_score,
    confusion_matrix,
    accuracy_score,
)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"準確率: {accuracy}")  # print the accuracy

# Precision
precision = precision_score(
    y_test, y_pred, average="weighted"
)  # 'weighted' averaging to account for class imbalance
print(f"精確率: {precision}")  # print the precision

# Recall
recall = recall_score(
    y_test, y_pred, average="weighted"
)  # 'weighted' averaging to account for class imbalance
print(f"召回率: {recall}")  # print the recall

### 交叉驗證(Cross-Validation)

In [None]:
from sklearn.model_selection import cross_val_score


# Define the XGBoost model (5 classes are hard-coded via num_class)

xgb_model = xgb.XGBClassifier(objective="multi:softmax", num_class=5, random_state=42)


# Run the cross-validation

# `cv` sets the number of folds, e.g. cv=5 means 5-fold cross-validation

# `scoring` can be changed as needed; 'accuracy' reports the accuracy


y_xgb = le.fit_transform(y)  # encode the target (this refits `le` on y)

scores = cross_val_score(xgb_model, X, y_xgb, cv=5, scoring="accuracy")


# Print the results

print("每一輪的準確率:", scores)

print("平均準確率:", scores.mean())

# print('Standard deviation of accuracy:', scores.std())

### 參數調整(Parameter Tuning)

In [None]:
from sklearn.model_selection import GridSearchCV


# Hyper-parameter grid
# NOTE: `param_grid` and `grid_search` reuse the names of the SVM search
# objects and overwrite them.
param_grid = {
    "max_depth": [3, 4, 5],  # maximum tree depth
    "learning_rate": [0.1, 0.01, 0.001],  # learning rate
    "n_estimators": [100, 200, 300],  # number of trees
    "objective": ["multi:softmax", "multi:softprob"],  # objective function
    "subsample": [0.6, 0.8, 1],  # row subsample ratio
    # Column subsample ratio per tree. It must lie in (0, 1]; the former
    # candidate 1.2 is invalid and only produced failed fits.
    "colsample_bytree": [0.8, 1],
}

# Create the XGBoost classifier
xgb_classifier = xgb.XGBClassifier(random_state=42, num_class=5)

# Configure GridSearchCV
grid_search = GridSearchCV(
    estimator=xgb_classifier,
    param_grid=param_grid,
    scoring="accuracy",
    n_jobs=-1,
    cv=3,
    verbose=2,
)

# Fit GridSearchCV on the training set
grid_search.fit(X_train, y_train)

# Print the best parameters
print("找到的最佳參數: ", grid_search.best_params_)

# Print the best cross-validated accuracy
print("找到的最佳準確率: ", grid_search.best_score_)

### 用最好的參數做

In [None]:
# Take the best estimator found by the grid search
best_xgb_model = grid_search.best_estimator_

# Predict the test set
y_pred_best_XG = best_xgb_model.predict(X_test)

# Accuracy and classification report
accuracy_best = accuracy_score(y_test, y_pred_best_XG)
classification_report_best = classification_report(y_test, y_pred_best_XG)

# Print the results
print(f"最佳模型準確率: {accuracy_best}")
print("最佳模型分類報告:\n", classification_report_best)

plot_confusion_matrix(y_test, y_pred_best_XG)

"""輸出模型"""
from joblib import dump, load

# Save the model
dump(best_xgb_model, "XGBoost model_best.joblib")

### 特徵重要性

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Pair every training column with the importance reported by the best model
feature_names = X_train.columns
importances = best_xgb_model.feature_importances_
feature_importance = pd.DataFrame({"Feature": feature_names, "Importance": importances})

# Sort by importance, largest first
feature_importance = feature_importance.sort_values(by="Importance", ascending=False)

# Show the sorted DataFrame
print(feature_importance)


# Horizontal bar chart of the feature importances; the most important
# feature is shown at the top
plt.figure(figsize=(10, 6))
plt.barh(
    feature_importance["Feature"][:10],
    feature_importance["Importance"][:10],
)
plt.xlabel("Importance")
plt.ylabel("Feature")
plt.title("Top 10 Feature Importance - XGBoost")
plt.gca().invert_yaxis()  # put the most important feature at the top
plt.show()

中文化圖

In [None]:
# Mapping from English feature names to Chinese display names
feature_name_map = {
    "knock_down_price_mean": "地區租屋平均值",
    "road_area_ratio": "道路面積比率",
    "mrt_counts": "捷運站數量",
    "weekend_open": "週末開放",
    "star": "星級",
    "sports_facilities_counts": "體育設施數量",
    "people_flow_mean": "人流量平均值",
    "salary_income_median": "薪水收入中位數",
    "youbike_counts": "YouBike站點數量",
    "weekday_working_hours_average": "工作日平均工時",
}

# Replace the feature names in the DataFrame. `replace` leaves names without
# a translation unchanged, whereas `map` with a dict turns them into NaN
# (losing the bar label) and wipes every name when the cell is run twice.
feature_importance["Feature"] = feature_importance["Feature"].replace(feature_name_map)

# Horizontal bar chart of the feature importances; the most important
# feature is shown at the top
plt.figure(figsize=(10, 6))
plt.barh(
    feature_importance["Feature"][:10],  # now with Chinese feature names
    feature_importance["Importance"][:10],
)
plt.xlabel("重要性(Importance)")
plt.ylabel("特徵(Feature)")
plt.title("前 10 特徵重要性 - XGBoost")
plt.gca().invert_yaxis()  # put the most important feature at the top
plt.show()

## 高斯貝式分類器 GaussianNB

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report

# Train the Gaussian naive Bayes classifier
Bayesion_classifier_model = GaussianNB()
Bayesion_classifier_model.fit(X_train, y_train)

# Predict the test set
y_pred_bc = Bayesion_classifier_model.predict(X_test)

# Accuracy and classification report
# print("Accuracy:", accuracy_score(y_test, y_pred))
accuracy_bc = accuracy_score(y_test, y_pred_bc)
classification_report_bc = classification_report(y_test, y_pred_bc)

print(accuracy_bc)
print(classification_report_bc)

In [None]:
# Silence UserWarning messages for the rest of the session.
import warnings

warnings.filterwarnings("ignore", category=UserWarning)

from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB

# Candidate values for var_smoothing: 100 log-spaced points from 1 down to
# 1e-15. The parameter adds a small amount to the feature variances to keep
# the computation numerically stable.
param_grid = dict(var_smoothing=np.logspace(0, -15, num=100))

# 5-fold cross-validated grid search scored by accuracy.
grid_search_bc = GridSearchCV(
    estimator=GaussianNB(),
    param_grid=param_grid,
    cv=5,
    scoring="accuracy",
)
grid_search_bc.fit(X_train, y_train)

# Report the winning parameter and its mean cross-validation score.
print("Best parameters:", grid_search_bc.best_params_)
print("Best cross-validation score:", grid_search_bc.best_score_)

# Mean CV accuracy for every candidate, in the same order as the grid.
mean_scores = grid_search_bc.cv_results_["mean_test_score"]
smoothing_values = param_grid["var_smoothing"]

# Accuracy as a function of var_smoothing, with the best point highlighted.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(smoothing_values, mean_scores)
ax.scatter(
    grid_search_bc.best_params_["var_smoothing"],
    grid_search_bc.best_score_,
    color="red",
    marker="o",
    label="Best parameter",
)
ax.set_xscale("log")
ax.set_xlabel("Var Smoothing (對數刻度)")
ax.set_ylabel("Accuracy (準確率)")
ax.set_title("GridSearchCV Var Smoothing vs. Accuracy (準確率)")
ax.legend()
plt.show()

### 利用最佳參數來做

In [None]:
# Import everything this cell uses so it does not depend on which other cells
# happened to run first (cross_val_score in particular is otherwise only
# imported by a later cell in this part of the notebook).
from joblib import dump, load
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB

# Best var_smoothing value found by the grid search (grid_search_bc).
best_var_smoothing = grid_search_bc.best_params_["var_smoothing"]

# Build a fresh GaussianNB with that value and fit it on the training split.
Bayesion_classifier_model_best = GaussianNB(var_smoothing=best_var_smoothing)
Bayesion_classifier_model_best.fit(X_train, y_train)

# Evaluate on the held-out test split.
y_pred_bc_best = Bayesion_classifier_model_best.predict(X_test)
accuracy_bc_best = accuracy_score(y_test, y_pred_bc_best)
classification_report_bc_best = classification_report(y_test, y_pred_bc_best)

print("最佳 GaussianNB 模型的準確度:", accuracy_bc_best)
print(classification_report_bc_best)

# Confusion-matrix heat map via the notebook's helper.
plot_confusion_matrix(y_test, y_pred_bc_best)

# Cross-validation: 5-fold on the training split with the tuned model.
scores = cross_val_score(Bayesion_classifier_model_best, X_train, y_train, cv=5)

# Per-fold scores and their mean.
print("每一輪的準確率:", scores)
print("平均準確率:", scores.mean())

# Export: persist the fitted model to disk.
dump(Bayesion_classifier_model_best, "Bayesion_classifier_model_best.joblib")

In [None]:
# Run the evaluate_model_multi_class helper (defined outside this section)
# on the tuned GaussianNB test-set predictions.
evaluate_model_multi_class(y_test, y_pred_bc_best)

## 多項式貝氏分類器 MultinomialNB

In [None]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import MultinomialNB

# Baseline multinomial naive Bayes classifier with default settings:
# fit on the training split, then predict the test split.
MultinomialNB_model = MultinomialNB().fit(X_train, y_train)
y_pred_mnb = MultinomialNB_model.predict(X_test)

# Test-set accuracy and per-class report.
accuracy_mnb = accuracy_score(y_test, y_pred_mnb)
classification_report_mnb = classification_report(y_test, y_pred_mnb)

print(accuracy_mnb, classification_report_mnb, sep="\n")

### 參數調整(Parameter Tuning)

In [None]:
# Silence UserWarning messages for the rest of the session.
import warnings

warnings.filterwarnings("ignore", category=UserWarning)

from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB

# Candidate values for alpha, the additive smoothing parameter that avoids
# zero probabilities for features unseen during training: 10 log-spaced
# points from 1e-2 to 1e14.
# (An earlier hand-picked list was [0.01, 0.1, 1, 10, 100, 10000].)
param_grid = dict(alpha=np.logspace(-2, 14, num=10))

# 5-fold cross-validated grid search scored by accuracy.
grid_search_mnb = GridSearchCV(
    estimator=MultinomialNB(),
    param_grid=param_grid,
    cv=5,
    scoring="accuracy",
)
grid_search_mnb.fit(X_train, y_train)

# Report the winning parameter and its mean cross-validation score.
print("Best parameters:", grid_search_mnb.best_params_)
print("Best cross-validation score:", grid_search_mnb.best_score_)

import matplotlib.pyplot as plt

# Candidate alphas and the mean CV accuracy obtained for each of them.
alphas = param_grid["alpha"]
mean_scores = grid_search_mnb.cv_results_["mean_test_score"]

# Accuracy as a function of alpha, with the best point highlighted in red.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(alphas, mean_scores, marker="o")
ax.scatter(
    grid_search_mnb.best_params_["alpha"],
    grid_search_mnb.best_score_,
    color="red",
    marker="o",
    label="最佳参数",
)
ax.set_xlabel("Alpha (平滑参数)")
ax.set_ylabel("Accuracy (预测率)")
ax.set_title("MultinomialNB 不同 Alpha 的预测率")
ax.set_xscale("log")  # alpha spans many orders of magnitude
ax.legend()
ax.grid(True)
plt.show()

### 利用最佳參數來做

In [None]:
# Silence UserWarning messages for the rest of the session.
import warnings

warnings.filterwarnings("ignore", category=UserWarning)

# Import everything this cell uses so it does not depend on which other cells
# happened to run first (the original relied on names imported elsewhere;
# cross_val_score in particular is only imported by a later cell in this part
# of the notebook).
from joblib import dump
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB

# Build a fresh MultinomialNB with the alpha selected by grid_search_mnb and
# fit it on the training split.
MultinomialNB_model_best = MultinomialNB(alpha=grid_search_mnb.best_params_["alpha"])
MultinomialNB_model_best.fit(X_train, y_train)

# Evaluate on the held-out test split.
y_pred_mnb_best = MultinomialNB_model_best.predict(X_test)
accuracy_mnb_best = accuracy_score(y_test, y_pred_mnb_best)
classification_report_mnb_best = classification_report(y_test, y_pred_mnb_best)

print("最佳 MultinomialNB 模型的準確度:", accuracy_mnb_best)
print(classification_report_mnb_best)

# Confusion-matrix heat map via the notebook's helper.
plot_confusion_matrix(y_test, y_pred_mnb_best)

# 5-fold cross-validation on the training split with the tuned model.
scores = cross_val_score(MultinomialNB_model_best, X_train, y_train, cv=5)

# Per-fold scores and their mean.
print("每一輪的準確率:", scores)
print("平均準確率:", scores.mean())

# Persist the fitted model to disk.
dump(MultinomialNB_model_best, "MultinomialNB_model_best.joblib")

In [None]:
# Run the evaluate_model_multi_class helper (defined outside this section)
# on the tuned MultinomialNB test-set predictions.
evaluate_model_multi_class(y_test, y_pred_mnb_best)

## RandomForest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Baseline random forest (default hyper-parameters, fixed seed):
# fit on the training split, then predict the test split.
randomforest_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred_rf = randomforest_model.predict(X_test)

# Test-set accuracy and per-class report.
accuracy_rf = accuracy_score(y_test, y_pred_rf)
classification_report_rf = classification_report(y_test, y_pred_rf)

print(accuracy_rf, classification_report_rf, sep="\n")

# Additional evaluation through the notebook's helper.
evaluate_model_multi_class(y_test, y_pred_rf)

### 參數調整(Parameter Tuning)

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import randint as sp_randint

# Hyper-parameters being tuned:
#   n_estimators      - number of trees in the forest.
#   max_depth         - maximum depth of each tree (None = unlimited).
#   min_samples_split - minimum samples required to split an internal node.
#   min_samples_leaf  - minimum samples required at a leaf node.
#   max_features      - number of features considered when looking for the
#                       best split.
#
# "auto" is intentionally not listed for max_features: for classifiers it was
# only an alias of "sqrt" (so it duplicated a third of the grid), and recent
# scikit-learn releases deprecated and then removed it, making every fit that
# uses it fail.

# Exhaustive grid for GridSearchCV.
param_grid = {
    "n_estimators": [100, 200, 400, 600],
    "max_depth": [None, 5, 10, 20, 30],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [2, 4, 8, 16],
    "max_features": ["sqrt", "log2"],
}

# Sampling distributions for RandomizedSearchCV. scipy's randint excludes the
# upper bound, so each bound is "largest wanted value + 1" (601 lets the
# random search reach the grid's maximum of 600 trees).
param_dist = {
    "n_estimators": sp_randint(100, 601),
    "max_depth": [None, 5, 10, 20, 30],
    "min_samples_split": sp_randint(2, 11),
    "min_samples_leaf": sp_randint(1, 17),
    "max_features": ["sqrt", "log2"],
}

# Base estimator shared by both searches (each search clones it per fit).
rf = RandomForestClassifier(random_state=42)

# Exhaustive search over param_grid, 5-fold CV, scored by accuracy.
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring="accuracy")

# Random search: 100 sampled candidates, 5-fold CV, scored by accuracy.
# random_state fixes the sampled candidates so reruns are reproducible.
random_search = RandomizedSearchCV(
    rf,
    param_distributions=param_dist,
    n_iter=100,
    cv=5,
    scoring="accuracy",
    random_state=42,
)

# Run both searches on the training split.
grid_search.fit(X_train, y_train)
random_search.fit(X_train, y_train)

# Report what each search found.
print("GridSearchCV 最佳參數:", grid_search.best_params_)
print("GridSearchCV 最佳交叉驗證分數:", grid_search.best_score_)
print("RandomizedSearchCV 最佳參數:", random_search.best_params_)
print("RandomizedSearchCV 最佳交叉驗證分數:", random_search.best_score_)

# Keep the parameters of whichever search scored higher; a tie goes to the
# randomized search.
if grid_search.best_score_ > random_search.best_score_:
    print("總體最佳參數來自 GridSearchCV:", grid_search.best_params_)
    best_params = grid_search.best_params_
else:
    print("總體最佳參數來自 RandomizedSearchCV:", random_search.best_params_)
    best_params = random_search.best_params_

In [None]:
# Keep the parameters of whichever search reached the higher best CV score;
# unless the grid search is strictly better, the randomized search is used.
grid_search_is_better = grid_search.best_score_ > random_search.best_score_
if not grid_search_is_better:
    print("總體最佳參數來自 RandomizedSearchCV:", random_search.best_params_)
    best_params = random_search.best_params_
else:
    print("總體最佳參數來自 GridSearchCV:", grid_search.best_params_)
    best_params = grid_search.best_params_

In [None]:
"""用最佳參數在做一遍"""

from sklearn.ensemble import RandomForestClassifier

# Hard-coded hyper-parameters for the tuned forest.
# NOTE(review): the original author notes that re-running takes about an
# hour; these values presumably come from an earlier search run - confirm.
# To use the search result directly instead:
#     best_randomforest_model = RandomForestClassifier(**best_params, random_state=42)
tuned_rf_settings = {
    "n_estimators": 400,
    "max_depth": 10,
    "max_features": "sqrt",
    "min_samples_leaf": 8,
    "min_samples_split": 2,
    "random_state": 42,
}
best_randomforest_model = RandomForestClassifier(**tuned_rf_settings)

# Fit on the training split and predict the test split.
best_randomforest_model.fit(X_train, y_train)
y_pred_best = best_randomforest_model.predict(X_test)

# Confusion matrix, normalised per true class (each row sums to 1).
cm_best = confusion_matrix(y_test, y_pred_best)
cm_best_ratio = cm_best.astype("float") / cm_best.sum(axis=1, keepdims=True)

# Heat map of the row-normalised confusion matrix.
fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(cm_best_ratio, annot=True, fmt=".2f", cmap="Blues", ax=ax)
ax.set_title("Optimized RandomForest Confusion Matrix")
ax.set_ylabel("Actual label")
ax.set_xlabel("Predicted label")
plt.show()

# --- Cross-validation ---
# Silence UserWarning messages for the rest of the session.
import warnings

warnings.filterwarnings("ignore", category=UserWarning)

from sklearn.model_selection import cross_val_score

# 5-fold cross-validation on the training split with the tuned model.
rf_model_cv = best_randomforest_model
scores = cross_val_score(rf_model_cv, X_train, y_train, cv=5)

# Per-fold scores and their mean.
print("每一輪的準確率:", scores)
print("平均準確率:", scores.mean())

# --- Export ---
from joblib import dump

# Persist the fitted model to disk.
dump(best_randomforest_model, "best_randomforest_model.joblib")

In [None]:
# Predict the test split with the tuned random forest and pass the result to
# the evaluate_model_multi_class helper (defined outside this section).
y_pred_best_RF = best_randomforest_model.predict(X_test)
evaluate_model_multi_class(y_test, y_pred_best_RF)

https://chat.openai.com/c/5148fbf7-f239-4393-876b-aa680d7293c5