In [1]:
"""檢視空值及零值"""

import pandas as pd


def analyze_nan_and_zero_values(
    data, threshold, sort_column="零值+NAN佔比 (%)", ascending=False
):
    """
    檢查0值和 nan 的數量
    threshold 為 百分比，例threshold=20，會回傳 零值+NAN佔比 (%) 大於20% 的
    當 threshold = 0 會回傳全部
    """

    # 如果輸入是文件路徑，讀取CSV文件並轉換成Pandas DataFrame

    if isinstance(data, str):

        file_source_pd = pd.read_csv(data)

    # 如果輸入是Pandas DataFrame，直接使用它

    elif isinstance(data, pd.DataFrame):

        file_source_pd = data

    else:

        raise ValueError(
            "Invalid input type. Input must be either file path or Pandas DataFrame."
        )

    # 計算這個範圍內每一欄的空白值(NaN)數量

    nan_values_per_column_in_range = file_source_pd.isnull().sum(axis=0)

    # 計算每個欄位的總數據量

    total_data_per_column = len(file_source_pd)

    # 計算每個欄位的空白值(NaN)佔比

    nan_percentage_per_column = (
        nan_values_per_column_in_range / total_data_per_column
    ) * 100

    # 計算這個範圍內每一欄的零值數量

    zero_values_per_column_in_range = (file_source_pd == 0).sum(axis=0)

    # 計算每個欄位的零值佔比

    zero_percentage_per_column = (
        zero_values_per_column_in_range / total_data_per_column
    ) * 100

    # 計算每個欄位的零值和NaN值的總數

    total_zero_and_nan_per_column = (
        zero_values_per_column_in_range + nan_values_per_column_in_range
    )

    # 計算每個欄位的零值和NaN值總數的佔比

    total_zero_and_nan_percentage_per_column = (
        total_zero_and_nan_per_column / total_data_per_column
    ) * 100

    # 將結果轉換為 DataFrame

    values_df = pd.DataFrame(
        {
            "欄位名稱": nan_values_per_column_in_range.index,
            "零值+NAN佔比 (%)": total_zero_and_nan_percentage_per_column.values.round(
                2
            ),
            "空白值(NaN)數量": nan_values_per_column_in_range.values,
            "空白值(NaN)佔比 (%)": nan_percentage_per_column.values.round(2),
            "零值數量": zero_values_per_column_in_range.values,
            "零值佔比 (%)": zero_percentage_per_column.values.round(2),
        }
    )

    # 將 DataFrame 存儲到 CSV 檔案中
    values_df.to_csv("零值與空白值統計.csv", index=False, encoding="utf-8-sig")

    # 如果 threshold 等於 0，則返回所有欄位
    if threshold == 0:
        return values_df

    # 篩選出 '零值+NAN佔比 (%)' 大於指定閾值的欄位

    filtered_values_df = values_df[values_df["零值+NAN佔比 (%)"] > threshold]

    # 根據指定的列進行排序

    sorted_values_df = filtered_values_df.sort_values(
        by=sort_column, ascending=ascending
    )

    return sorted_values_df


# 測試


# result_df = analyze_nan_and_zero_values(clean_data, threshold=0, sort_column='欄位名稱', ascending=False)


# print(result_df)

In [2]:
"""分類指標 僅適用二元分法"""

from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
)
import matplotlib.pyplot as plt
import seaborn as sn
import numpy as np
import pandas as pd

# 忽略警告用的
import warnings

warnings.filterwarnings("ignore", category=UserWarning)

# 解決字體問題
plt.rcParams["font.family"] = ["Microsoft YaHei"]
plt.rcParams["axes.unicode_minus"] = False


def evaluate_model_multi_class(y_test, y_pred):
    # 從y_test自動讀取類別數
    num_classes = len(np.unique(y_test))

    # # 計算和顯示混淆矩陣
    # cm = confusion_matrix(y_test, y_pred)
    # df_cm = pd.DataFrame(cm, index=range(num_classes), columns=range(num_classes))
    # # 轉換為比例
    # cm_ratio = cm.astype("float") / cm.sum(axis=1)[:, np.newaxis]

    # # plt.figure(figsize=(10,7))
    # # 使用藍色的色調 'Blues'
    # sn.heatmap(cm_ratio, annot=True, fmt=".0%", cmap="Blues")
    # plt.title('Confusion matrix (混淆矩陣)\n', y=1.1)
    # plt.ylabel('Actual label (實際標籤)\n')
    # plt.xlabel('Predicted label (預測標籤)\n')
    # plt.show()

    # 計算和顯示每個類的Accuracy、Precision、Recall和F1 Score
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average="macro")
    recall = recall_score(y_test, y_pred, average="macro")
    f1 = f1_score(y_test, y_pred, average="macro")
    print(f"Accuracy (準確率): {accuracy:.2%}")
    print(f"Precision (精確率) - Macro Average: {precision:.2%}")
    print(f"Recall (召回率) - Macro Average: {recall:.2%}")
    print(f"F1 Score (F1分數) - Macro Average: {f1:.2%}")


# 使用範例
# y_pred = model.predict(X_test)
# evaluate_model_multi_class(y_test, y_pred)

In [3]:
"""(還沒成功)分類指標 用於多模型"""

from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
)
import matplotlib.pyplot as plt
import seaborn as sn
import numpy as np
import pandas as pd

# 忽略警告用的
import warnings

warnings.filterwarnings("ignore", category=UserWarning)


def many_models_evaluate_model_multi_class(y_test, y_pred):
    # 從y_test自動讀取類別數
    num_classes = len(np.unique(y_test))

    # 計算和顯示混淆矩陣
    cm = confusion_matrix(y_test, y_pred)
    df_cm = pd.DataFrame(cm, index=range(num_classes), columns=range(num_classes))
    # 轉換為比例
    cm_ratio = cm.astype("float") / cm.sum(axis=1)[:, np.newaxis]

    plt.figure(figsize=(10, 7))
    sn.heatmap(cm_ratio, annot=True, fmt=".0%", cmap="Blues")
    plt.title("Confusion matrix (混淆矩陣)\n", y=1.1)
    plt.ylabel("Actual label (實際標籤)\n")
    plt.xlabel("Predicted label (預測標籤)\n")
    plt.show()

    # 計算各項指標
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average="macro")
    recall = recall_score(y_test, y_pred, average="macro")
    f1 = f1_score(y_test, y_pred, average="macro")

    # 返回指標值
    return accuracy, precision, recall, f1

In [4]:
import pandas as pd

# file_path = r"..\飲料店總表0307final01_補上人氣_補值_xlsxclustered_HG_data.xlsx"
file_path = r"..\飲料店總表0307final01_補上人氣_補值_hg.xlsx"
# dataset = pd.read_csv(file_path, sep=",", encoding="UTF-8")

dataset = pd.read_excel(file_path)
# print(dataset.info())

In [5]:
"""selected_data
"""

selected_data = dataset[
    [
        "star",
        "school_counts",
        "drink_counts",
        "train_counts",
        "youbike_counts",
        "bus_counts",
        "park_counts",
        "night_market_counts",
        "sports_facilities_counts",
        "mrt_counts",
        "movie_theater_counts",
        "hospital_counts",
        "salary_income_median",
        "people_flow_mean",
        "knock_down_price_mean",
        "weekend_open",
        "road_area_ratio",
        "age",
        "weekday_working_hours_average",
        # "comment",
        # "people_flow_average",
        "popularity",
        # "KMEANS",
    ]
]

# selected_data

In [6]:
"""因為數值分布差很多 刪掉極端"""

import numpy as np

# 直接在原始 DataFrame 中刪除 'popularity' 小於 8726.779 的行
selected_data = selected_data[selected_data["popularity"] <= 2213.641425]

# 使用 pd.cut 根據數值大小劃分 'popularity' 欄位

# 計算數據範圍
min_val = selected_data["popularity"].min()
max_val = selected_data["popularity"].max()

# 創建五個分組的邊界值
bins = np.linspace(min_val, max_val, 6)

# 使用 pd.cut 來分組
selected_data["popularity_category"] = pd.cut(
    selected_data["popularity"],
    bins=bins,
    include_lowest=True,  # 確保包括最低值
    labels=[0, 1, 2, 3, 4],  # 這是每個範圍的標籤
)


# 打印新的分類結果和每類的數量

# print(selected_data['popularity_category'].value_counts().sort_index())


# 建立統計信息表格

summary_df = (
    selected_data.groupby("popularity_category")["popularity"]
    .agg(最小值="min", 最大值="max", 數量="size")
    .reset_index()
)

print(summary_df)

  popularity_category          最小值          最大值    數量
0                   0     0.000000   441.227668  3180
1                   1   441.257427   882.514854   635
2                   2   882.637890  1320.782089   321
3                   3  1325.771070  1758.006505   178
4                   4  1767.739056  2206.287135   112


  selected_data.groupby("popularity_category")["popularity"]


In [7]:
"""轉位數"""

selected_data["age"] = selected_data["age"].round(2)
selected_data["road_area_ratio"] = selected_data["road_area_ratio"].round(3)


# dataset['brand'] = dataset['brand'].fillna(-1)
# dataset['Saturday_open_hours'] = dataset['Saturday_open_hours'].fillna(-1)
# dataset['Sunday_open_hours'] = dataset['Sunday_open_hours'].fillna(-1)


# X = dataset.drop(
#     ['comment','star','people_flow_average','popularity',"KMEANS"], axis=1
# )
y = selected_data["popularity_category"]
X = selected_data.drop(["popularity", "popularity_category"], axis=1)

# y = dataset["KMEANS"]
y = y.to_frame()
print(type(y))

<class 'pandas.core.frame.DataFrame'>


In [12]:
analyze_nan_and_zero_values(X, 0, sort_column="零值+NAN佔比 (%)", ascending=False)
analyze_nan_and_zero_values(y, 0, sort_column="零值+NAN佔比 (%)", ascending=False)

Unnamed: 0,欄位名稱,零值+NAN佔比 (%),空白值(NaN)數量,空白值(NaN)佔比 (%),零值數量,零值佔比 (%)
0,popularity_category,71.85,0,0.0,3180,71.85


In [10]:
# 分割訓練和測試
from sklearn.model_selection import train_test_split

# test_size 切的尺寸 30% random_state讓抽取可以是穩定的結果(第一次抽根第十次抽是一樣的)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=25
)

### 將所有模型結果匯成一張表

In [13]:
import joblib

# 模型檔案位置
model_paths = [
    # "模型\\AdaBoost_model_best.joblib",
    # "模型\\Bayesion_classifier_model_best.joblib",
    "模型\\best_randomforest_model.joblib",
    "模型\\lr_model_best.joblib",
    # "模型\\MultinomialNB_model_best.joblib",
    # "模型\\svc_best.joblib",
    "模型\\XGBoost model_best.joblib",
    "模型\\best_mlp_model.joblib",
]
# 載入模型
models = [joblib.load(model_path) for model_path in model_paths]

# 創建一個空的DataFrame用於存儲結果
results_df = pd.DataFrame

# 計算並顯示每個模型的準確率
for model, path in zip(models, model_paths):
    y_pred = model.predict(X_test)
    evaluate_model_multi_class(y_test, y_pred)
    # results_df
    # acc, prec, rec, f1 = evaluate_model_multi_class(y_test, y_pred)  # 獲取評估指標
    # # 更新DataFrame
    # results_df = results_df.append({'Model': path.split('\\')[-1], 'Accuracy': acc, 'Precision': prec, 'Recall': rec, 'F1 Score': f1}, ignore_index=True)
# 打印結果
# print(results_df)

Accuracy (準確率): 69.50%
Precision (精確率) - Macro Average: 37.33%
Recall (召回率) - Macro Average: 20.28%
F1 Score (F1分數) - Macro Average: 17.18%
Accuracy (準確率): 69.58%
Precision (精確率) - Macro Average: 24.82%
Recall (召回率) - Macro Average: 20.52%
F1 Score (F1分數) - Macro Average: 17.75%
Accuracy (準確率): 69.88%
Precision (精確率) - Macro Average: 30.29%
Recall (召回率) - Macro Average: 20.93%
F1 Score (F1分數) - Macro Average: 18.29%
Accuracy (準確率): 69.65%
Precision (精確率) - Macro Average: 13.93%
Recall (召回率) - Macro Average: 20.00%
F1 Score (F1分數) - Macro Average: 16.42%


## bagging_model

In [None]:
"""非傳統bagging，屬於類似stacking"""

from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

bagging_model = BaggingClassifier(n_estimators=len(models), random_state=42)

# # 計算並顯示每個模型的準確率
# for model, path in zip(models, model_paths):
#     y_pred = model.predict(X_test)
#     acc = accuracy_score(y_test, y_pred)
#     print(f"{path.split('\\')[-1]} 的準確率: {acc}")
#     evaluate_model_multi_class(y_test, y_pred)

import pandas as pd

# 訓練 Bagging 模型
X_train_preds = np.array([model.predict(X_train) for model in models]).T
X_test_preds = np.array([model.predict(X_test) for model in models]).T
bagging_model.fit(X_train_preds, y_train)

# 計算 Bagging 模型的準確率
bagging_model.fit(X_train_preds, y_train)

y_pred_bag = bagging_model.predict(X_test_preds)
acc_bag = accuracy_score(y_test, y_pred_bag)
print(f"Bagging 模型的準確率: {acc_bag}")

evaluate_model_multi_class(y_test, y_pred_bag)

In [14]:
import joblib
from sklearn.ensemble import BaggingClassifier
import numpy as np


# 自定義類別來包含所有子模型和Bagging模型
class CompleteModel:
    def __init__(self, base_models, bagging_model):
        self.base_models = base_models  # 存儲所有子模型
        self.bagging_model = bagging_model  # 存倲Bagging模型

    def predict(self, X):
        # 對每個子模型進行預測並將結果堆疊起來
        X_preds = np.array([model.predict(X) for model in self.base_models]).T
        # 使用Bagging模型對堆疊後的預測進行最終預測
        return self.bagging_model.predict(X_preds)

    def fit(self, X, y):
        # 對每個子模型進行預測並將結果堆疊起來
        X_preds = np.array([model.predict(X) for model in self.base_models]).T
        # 使用Bagging模型對堆疊後的數據進行訓練
        self.bagging_model.fit(X_preds, y)


# 載入你的子模型
base_models = [joblib.load(model_path) for model_path in model_paths]
# 創建Bagging模型實例（這個步驟假設你已經有一個訓練好的Bagging模型）
bagging_model = BaggingClassifier(n_estimators=len(base_models), random_state=42)
# 如果你已經訓練好了Bagging模型，你應該這樣初始化你的CompleteModel
bagging_complete_model = CompleteModel(base_models, bagging_model)

bagging_complete_model.fit(X_train, y_train)


# 假設你已經有了訓練好的CompleteModel實例
# 保存整個模型
# joblib.dump(complete_model, 'complete_model.joblib')

# 當需要時，重新載入模型並進行預測
# loaded_model = joblib.load('complete_model.joblib')
# predictions = loaded_model.predict(X_new)  # X_new是新的輸入數據
y_pred_bag = bagging_complete_model.predict(X_train)

In [15]:
"""輸出模型"""

from joblib import dump

# 保存模型
dump(bagging_complete_model, "bagging_complete_model.joblib")

['bagging_complete_model.joblib']

### 載入模型

In [None]:
from joblib import load
import numpy as np
from sklearn.metrics import classification_report

# 假設 base_models 是你所有預訓練好的基模型列表
base_models = [joblib.load(model_path) for model_path in model_paths]

# 載入 Bagging 分類器模型
bagging_model_loaded = load("bagging_model.joblib")

# 使用所有子模型對原始數據X進行預測，並將這些預測結果堆疊起來作為新的特徵集
X_transformed = np.array([model.predict(X) for model in base_models]).T

# 使用轉換後的特徵集進行預測
y_pred = bagging_model_loaded.predict(X_transformed)

# 使用分類指標進行評估
print(classification_report(y, y_pred))

# 如果你有一個評估函數，也可以使用它來評估模型
# evaluate_model_multi_class(y, y_pred)

In [16]:
from joblib import load

# 載入完整模型
bagging_model_loaded = load("bagging_complete_model.joblib")

# 使用載入的模型進行預測，X_new是新的輸入數據
y_pred_bag = bagging_model_loaded.predict(X_train)
evaluate_model_multi_class(y_train, y_pred_bag)

Accuracy (準確率): 75.15%
Precision (精確率) - Macro Average: 88.48%
Recall (召回率) - Macro Average: 29.58%
F1 Score (F1分數) - Macro Average: 33.69%


In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# 生成混淆矩陣
cm = confusion_matrix(y_test, y_pred)


# 轉換為比例
cm_ratio = cm.astype("float") / cm.sum(axis=1)[:, np.newaxis]

# 使用Seaborn畫出混淆矩陣的熱力圖
plt.figure(figsize=(10, 7))
sns.heatmap(cm_ratio, annot=True, fmt=".2f", cmap="Blues")  # fmt='.2f' 指定顯示兩位小數
plt.title("Confusion Matrix")
plt.ylabel("Actual label")
plt.xlabel("Predicted label")
plt.show()

## voting

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

# 假設 models 是你之前加載的模型列表
estimators = [(f"model_{i}", model) for i, model in enumerate(models)]

# 建立一個硬投票分類器
voting_clf = VotingClassifier(estimators=estimators, voting="hard")

# 訓練 voting_clf
voting_clf.fit(X_train, y_train)

# 進行預測，假設有一個測試資料集 X_test
y_pred_voting = voting_clf.predict(X_test)


# 使用交叉驗證來評估模型
scores = cross_val_score(voting_clf, X, y, cv=5)  # cv 是交叉驗證的摺數


evaluate_model_multi_class(y_test, y_pred_voting)

In [None]:
"""輸出模型"""

from joblib import dump

# 保存模型
dump(voting_clf, "voting_clf.joblib")

### 載入模型

In [None]:
from joblib import load
from sklearn.metrics import classification_report

# 載入模型
voting_clf_loaded = load("voting_clf.joblib")

# 假設你已經有了一組新的預測數據 X_new
# 使用載入的模型進行預測
y_pred_voting = voting_clf_loaded.predict(X)

# 假設你也有相應的真實標籤 y_new（用於評估）
# 使用分類指標進行評估
print(classification_report(y, y_pred_voting))

evaluate_model_multi_class(y, y_pred_voting)

## stacking

In [None]:
from sklearn.ensemble import StackingClassifier  # 導入堆疊分類器(StackingClassifier)
from sklearn.linear_model import LogisticRegression  # 用於最終分類器


# 假設 models 是你之前加載的模型列表，其中每個模型都作為一個基學習器(base learner)
estimators = [(f"model_{i}", model) for i, model in enumerate(models)]

# 建立一個堆疊分類器
stacking_clf = StackingClassifier(
    estimators=estimators, final_estimator=LogisticRegression()
)

# X_train, y_train 是您的訓練數據及其標籤
stacking_clf.fit(X_train, y_train)

# 計算訓練集上的準確率
accuracy = stacking_clf.score(X_train, y_train)
print("Training accuracy:", accuracy)

from sklearn.model_selection import cross_val_score

# 進行5折交叉驗證
scores = cross_val_score(stacking_clf, X_train, y_train, cv=5)
print("Cross-validation scores:", scores)
print("stacking Average score:", scores.mean())

from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

# 重新訓練模型
stacking_clf.fit(X_train, y_train)

# 進行預測
y_pred_stacking = stacking_clf.predict(X_test)

# """輸出模型"""
# from joblib import dump, load

# # 保存模型
# dump(stacking_clf, "stacking_not_adaboost.joblib")

evaluate_model_multi_class(y_test, y_pred_stacking)

## boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# 初始化梯度提升機模型
# 可以透過調整n_estimators、learning_rate和max_depth等參數進行優化
# 請根據您的數據和需求進行調整
gb_model = GradientBoostingClassifier(
    n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42
)

# 將已經訓練好的模型作為初始估計器傳遞給GradientBoostingClassifier
gb_model.estimators_ = models

# 訓練模型
gb_model.fit(X_train, y_train)

# 預測測試集
y_pred_boosting = gb_model.predict(X_test)

# 評估模型性能
evaluate_model_multi_class(y_test, y_pred_boosting)

In [None]:
"""輸出模型"""

from joblib import dump

# 保存模型
dump(svc_best, "svc_best.joblib")