In [4]:
pip install pandas openpyxl



In [5]:
# 重新導入必要的庫
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd

# Raw GitHub URL of the Excel workbook used as the data source.
url = 'https://raw.githubusercontent.com/xixa3333/Applying-Machine-Learning-to-Public-Internet-of-Things/main/EMIS_2006-2013.xlsx'

# Load the worksheet into a DataFrame.
df_sampled = pd.read_excel(url, sheet_name='工作表1')

# Drop rows whose target column (災情類別) is missing; they cannot be labelled.
df_sampled = df_sampled.dropna(subset=['災情類別'])

# Label-encode the target. The encoder is fitted on every class present at this
# point, so the integer codes stay valid after rare classes are filtered out
# below (the surviving codes are simply non-contiguous).
label_encoder = LabelEncoder()
df_sampled.loc[:, '災情類別編碼'] = label_encoder.fit_transform(df_sampled['災情類別'])

# Replace every remaining missing value with the placeholder string '缺失值'
# so that missing feature values become their own one-hot category.
df_sampled = df_sampled.fillna('缺失值')

# Keep a stratified 50% subsample of the data (the other half is discarded).
# This is a down-sampling step, not the train/test split -- that happens below.
# NOTE(review): a stratified split needs at least two rows per class; confirm
# the workbook has no singleton classes or this call will raise.
df_sampled, _ = train_test_split(df_sampled, test_size=0.5, stratify=df_sampled['災情類別'], random_state=42)

# Number of rows per encoded class in the subsample.
class_counts = df_sampled['災情類別編碼'].value_counts()

# Minimum number of rows a class must have to be kept.
threshold = 100

# Encoded classes that meet the threshold.
classes_to_keep = class_counts[class_counts >= threshold].index

# Keep only the rows belonging to those classes.
df = df_sampled[df_sampled['災情類別編碼'].isin(classes_to_keep)]

# One-hot encode the categorical features (縣市, 災情細項, 災情描述).
features = pd.get_dummies(df[['縣市', '災情細項', '災情描述']], drop_first=True)

# Target vector (encoded class).
target = df['災情類別編碼']

# Show how many rows each retained class has.
print(target.value_counts())

# Stratified 70/30 train/test split.
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=42, stratify=target)

# Standardise the features; the scaler is fitted on the training split only
# and then applied to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Build the MLP classifier (one hidden layer of 100 units).
mlp = MLPClassifier(hidden_layer_sizes=(100,),
                    max_iter=100,
                    random_state=42,
                    verbose=True)

# Train the model.
mlp.fit(X_train, y_train)

# Predict on the held-out test split.
y_pred = mlp.predict(X_test)

# Overall accuracy on the test split.
accuracy = accuracy_score(y_test, y_pred)

# classification_report builds its rows from the labels found in y_test and
# y_pred. Compute that label list once and pass it explicitly as `labels`, so
# `target_names` is guaranteed to line up with the rows one-to-one (deriving
# the names from y_train could misalign or raise if the label sets differ).
report_labels = sorted(set(y_test) | set(y_pred))
target_names = label_encoder.inverse_transform(report_labels)

# Per-class precision / recall / F1 report.
classification_rep = classification_report(
    y_test,
    y_pred,
    labels=report_labels,
    target_names=target_names,
)

print(f"準確率: {accuracy:.4f}")
print("分類報告:")
print(classification_rep)

災情類別編碼
0     6073
10    2992
9     2475
5     2135
2     1122
1      746
11     737
3      180
6      148
4      117
Name: count, dtype: int64
Iteration 1, loss = 3.11647743
Iteration 2, loss = 0.87242647
Iteration 3, loss = 0.25712303
Iteration 4, loss = 0.17176940
Iteration 5, loss = 0.12991059
Iteration 6, loss = 0.10125439
Iteration 7, loss = 0.08665353
Iteration 8, loss = 0.07880298
Iteration 9, loss = 0.07348958
Iteration 10, loss = 0.06986185
Iteration 11, loss = 0.06688112
Iteration 12, loss = 0.06480935
Iteration 13, loss = 0.06289304
Iteration 14, loss = 0.06141509
Iteration 15, loss = 0.06075204
Iteration 16, loss = 0.05929850
Iteration 17, loss = 0.05855104
Iteration 18, loss = 0.05691635
Iteration 19, loss = 0.05683030
Iteration 20, loss = 0.05669089
Iteration 21, loss = 0.05617181
Iteration 22, loss = 0.05471987
Iteration 23, loss = 0.05494142
Iteration 24, loss = 0.05451271
Iteration 25, loss = 0.05370589
Iteration 26, loss = 0.05342919
Iteration 27, loss = 0.05363114
It

In [6]:
# 重新導入必要的庫
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd

# Raw GitHub URL of the Excel workbook used as the data source.
url = 'https://raw.githubusercontent.com/xixa3333/Applying-Machine-Learning-to-Public-Internet-of-Things/main/EMIS_2006-2013.xlsx'

# Load the worksheet into a DataFrame.
df_sampled = pd.read_excel(url, sheet_name='工作表1')

# Drop rows whose target column (災情類別) is missing; they cannot be labelled.
df_sampled = df_sampled.dropna(subset=['災情類別'])

# Label-encode the target. The encoder is fitted on every class present at this
# point, so the integer codes stay valid after rare classes are filtered out
# below (the surviving codes are simply non-contiguous).
label_encoder = LabelEncoder()
df_sampled.loc[:, '災情類別編碼'] = label_encoder.fit_transform(df_sampled['災情類別'])

# Replace every remaining missing value with the placeholder string '缺失值'
# so that missing feature values become their own one-hot category.
df_sampled = df_sampled.fillna('缺失值')

# Keep a stratified 50% subsample of the data (the other half is discarded).
# This is a down-sampling step, not the train/test split -- that happens below.
# NOTE(review): a stratified split needs at least two rows per class; confirm
# the workbook has no singleton classes or this call will raise.
df_sampled, _ = train_test_split(df_sampled, test_size=0.5, stratify=df_sampled['災情類別'], random_state=42)

# Number of rows per encoded class in the subsample.
class_counts = df_sampled['災情類別編碼'].value_counts()

# Minimum number of rows a class must have to be kept.
threshold = 100

# Encoded classes that meet the threshold.
classes_to_keep = class_counts[class_counts >= threshold].index

# Keep only the rows belonging to those classes.
df = df_sampled[df_sampled['災情類別編碼'].isin(classes_to_keep)]

# One-hot encode the categorical features (縣市, 災情細項, 災情描述).
features = pd.get_dummies(df[['縣市', '災情細項', '災情描述']], drop_first=True)

# Target vector (encoded class).
target = df['災情類別編碼']

# Show how many rows each retained class has.
print(target.value_counts())

# Stratified 70/30 train/test split.
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=42, stratify=target)

# Standardise the features; the scaler is fitted on the training split only
# and then applied to the test split.
# NOTE(review): tree ensembles are generally insensitive to feature scaling,
# so this step is probably unnecessary here -- kept so X_train / X_test stay
# the same type as before.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Build the random forest classifier (100 trees).
rf = RandomForestClassifier(n_estimators=100,
                            random_state=42,
                            verbose=True)

# Train the model.
rf.fit(X_train, y_train)

# Predict on the held-out test split.
y_pred = rf.predict(X_test)

# Overall accuracy on the test split.
accuracy = accuracy_score(y_test, y_pred)

# classification_report builds its rows from the labels found in y_test and
# y_pred. Compute that label list once and pass it explicitly as `labels`, so
# `target_names` is guaranteed to line up with the rows one-to-one (deriving
# the names from y_train could misalign or raise if the label sets differ).
report_labels = sorted(set(y_test) | set(y_pred))
target_names = label_encoder.inverse_transform(report_labels)

# Per-class precision / recall / F1 report.
classification_rep = classification_report(
    y_test,
    y_pred,
    labels=report_labels,
    target_names=target_names,
)

print(f"準確率: {accuracy:.4f}")
print("分類報告:")
print(classification_rep)


災情類別編碼
0     6073
10    2992
9     2475
5     2135
2     1122
1      746
11     737
3      180
6      148
4      117
Name: count, dtype: int64


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:   26.9s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.4s


準確率: 0.9326
分類報告:
              precision    recall  f1-score   support

        其他災情       0.84      1.00      0.92      1822
        土石災情       1.00      1.00      1.00       224
      廣告招牌災情       1.00      1.00      1.00       337
        建物毀損       1.00      0.04      0.07        54
        橋梁災情       1.00      0.03      0.06        35
   民生、基礎設施災情       1.00      1.00      1.00       640
      水利設施災情       0.86      0.14      0.24        44
       積淹水災情       1.00      1.00      1.00       743
        路樹災情       1.00      1.00      1.00       898
     道路、隧道災情       1.00      0.04      0.08       221

    accuracy                           0.93      5018
   macro avg       0.97      0.62      0.64      5018
weighted avg       0.94      0.93      0.91      5018

