## 讀檔

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 原始data
# df = pd.read_csv('C:\lab\\aigo\\30_Training Dataset_V2\\training_data.csv')
# test_data = pd.read_csv('C:\lab\\aigo\\30_Public Dataset_Public Sumission Template_v2\public_dataset.csv')

# 已生成部分特徵的data
df = pd.read_csv('C:\lab\\aigo\\30_Training Dataset_V2\\training_data_processed.csv')
test_data = pd.read_csv('C:\lab\\aigo\\30_Public Dataset_Public Sumission Template_v2\public_dataset_processed.csv')
df.columns

Index(['ID', '縣市', '鄉鎮市區', '路名', '土地面積', '使用分區', '移轉層次', '總樓層數', '主要用途',
       '主要建材', '建物型態', '屋齡', '建物面積', '車位面積', '車位個數', '橫坐標', '縱坐標', '備註',
       '主建物面積', '陽台面積', '附屬建物面積', '單價', '縣市最低單價', '縣市最高單價', '縣市平均單價',
       '縣市單價中位數', '鄉鎮市區最低單價', '鄉鎮市區最高單價', '鄉鎮市區平均單價', '鄉鎮市區單價中位數', '該路段最低單價',
       '該路段最高單價', '該路段平均單價', '該路段單價中位數'],
      dtype='object')

## 資料前處理

In [2]:
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()

In [3]:
# 使用 LabelEncoder 將文字資料轉換為數值

def oneHotEncode(df, col_list):
    df = pd.get_dummies(df, columns=col_list,dtype=int)
    return df


def labelEncode(df, col_list):
    for col in col_list:
        df[col] = labelencoder.fit_transform(df[col])
    return df


# 處理數值特徵的偏移值
# 將負數進行平移，使其成為正數，並與正數保持平移前的相對關係
def offset_cal(df, col_list):
    for col in col_list:
        offset = abs(min(df[col]))
        df[col] = df[col] + offset
    return df


In [4]:
# 用於解決因使用dummies導致資料得column不一致的狀態，為data加上不存在的col，並補上0的值
def make_columns_consistent(data_1st, data_2nd, default_value = 0):
    # 取得train data、test data的column name
    train_columns = set(data_1st.columns)
    test_columns = set(data_2nd.columns)

    # 檢查兩個data中是否存在額外的column
    missing_columns_1st = test_columns - train_columns
    missing_columns_2nd = train_columns - test_columns

    # 將存在於data_2nd 但不存在data_1st的column加入 data_1st，並將該值設為0
    for col in missing_columns_1st:
        data_1st[col] = default_value  # 可以根据问题需要设置不同的默认值
    
    # 將存在於data_1st 但不存在data_2nd的column加入 data_2nd，並將該值設為0
    for col in missing_columns_2nd:
        data_2nd[col] = default_value  # 可以根据问题需要设置不同的默认值

    return data_1st,data_2nd

#### train data 前處理

In [5]:
# col_list = ['縣市', '主要用途', '主要建材', '建物型態']
# df = oneHotEncode(df, col_list)

# col_list = ['鄉鎮市區', '路名']
all_col = ['縣市', '鄉鎮市區', '路名', '主要用途', '主要建材', '建物型態']
df = labelEncode(df, all_col)

# 處理數值特徵的偏移值
offset_col = ['土地面積', '建物面積', '車位面積', '主建物面積', '陽台面積', '附屬建物面積']
df = offset_cal(df, offset_col)

# 將房子所除樓層進行處理
df['樓層'] = df['移轉層次']/df['總樓層數']

# 刪除ID、string的資料
# df = df.drop(['ID', '使用分區', '備註'], axis=1)
df = df.drop(['ID', '縣市', '鄉鎮市區','使用分區', '移轉層次', '總樓層數', '建物面積', '備註', '陽台面積', '縣市最低單價', '縣市最高單價', '縣市平均單價', '縣市單價中位數'], axis=1)

x = df.drop(['單價'], axis=1)
y = df['單價']

#### test data 前處理

In [6]:
# col_list = ['縣市', '主要用途', '主要建材', '建物型態']
# test_data = labelEncode(test_data, col_list)
# test_data = oneHotEncode(test_data, col_list)

# col_list = ['鄉鎮市區', '路名']
all_col = ['縣市', '鄉鎮市區', '路名', '主要用途', '主要建材', '建物型態']
test_data = labelEncode(test_data, all_col)

# 處理數值特徵的偏移值
offset_col = ['土地面積', '建物面積', '車位面積', '主建物面積', '陽台面積', '附屬建物面積']
test_data = offset_cal(test_data, offset_col)

# 將房子所除樓層進行處理
test_data['樓層'] = test_data['移轉層次']/test_data['總樓層數']

# 刪除ID、string的資料
# test_data = test_data.drop(['ID', '使用分區', '備註'], axis=1)
test_data = test_data.drop(['ID', '縣市', '鄉鎮市區','使用分區', '移轉層次', '總樓層數', '建物面積', '備註', '陽台面積', '縣市最低單價', '縣市最高單價', '縣市平均單價', '縣市單價中位數'], axis=1)

In [7]:
print("x.shape: ",x.shape)
print("test_data.shape: ",test_data.shape)

x.shape:  (11751, 21)
test_data.shape:  (5876, 21)


In [8]:
df.columns

Index(['路名', '土地面積', '主要用途', '主要建材', '建物型態', '屋齡', '車位面積', '車位個數', '橫坐標',
       '縱坐標', '主建物面積', '附屬建物面積', '單價', '鄉鎮市區最低單價', '鄉鎮市區最高單價', '鄉鎮市區平均單價',
       '鄉鎮市區單價中位數', '該路段最低單價', '該路段最高單價', '該路段平均單價', '該路段單價中位數', '樓層'],
      dtype='object')

#### 解決因使用dummies導致資料得column不一致的狀態

In [9]:
# 解決因使用dummies導致資料得column不一致的狀態
x, test_data = make_columns_consistent(x, test_data)

In [10]:
print("x.shape: ",x.shape)
print("x.shape: ",x.shape[1])
print("test_data.shape: ",test_data.shape)

x.shape:  (11751, 21)
x.shape:  21
test_data.shape:  (5876, 21)


In [11]:
x.columns

Index(['路名', '土地面積', '主要用途', '主要建材', '建物型態', '屋齡', '車位面積', '車位個數', '橫坐標',
       '縱坐標', '主建物面積', '附屬建物面積', '鄉鎮市區最低單價', '鄉鎮市區最高單價', '鄉鎮市區平均單價',
       '鄉鎮市區單價中位數', '該路段最低單價', '該路段最高單價', '該路段平均單價', '該路段單價中位數', '樓層'],
      dtype='object')

## 模型訓練

### import package、自定義dataset、定義模型

In [12]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from xgboost import XGBClassifier
from xgboost import XGBRegressor
import lightgbm as lgb
from sklearn.svm import SVR

class CustomDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, index):
        return self.X[index], self.y[index]


# 自定義 MAPE 評估函數
def mape_eval(y_pred, data):
    y_true = data.get_label()
    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    return 'MAPE', mape, False



#### 模型

### data進行transform及切割

In [13]:
# 標準化特徵
scaler = StandardScaler()
X_train = scaler.fit_transform(x)

# 將數據劃分為訓練集和測試集
X_train, X_val, y_train, y_val = train_test_split(X_train, y, test_size=0.2, random_state=42)

train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

In [14]:
# print('X_train ',type(X_train))
# print('X_val ',type(X_val))

# print('y_train ',type(y_train))
# print('y_val ',type(y_val))


### 查看是否有gpu、定義loss及optimizer

In [15]:
# 检查是否有可用的GPU，如果有，选择第一个可用的GPU
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")


# model = XGBRegressor(learning_rate=0.1,)
# model.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric=['rmse', 'mae'], early_stopping_rounds=10) 

# model = SVR(kernel='rbf', C=1.0, epsilon=0.2)

# 创建 LightGBM 模型并使用自定义评估函数
params = {
    'objective': 'regression',
    'metric': 'mae',    # MAE仅用于监控训练过程，而不是最终评估指标
    # 其他参数
}
# model = lgb.LGBMRegressor()
model = lgb.train(params, train_data, valid_sets=[valid_data], feval=mape_eval, early_stopping_rounds=10, num_boost_round=1000)

# model.fit(X_train, y_train) 


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3612
[LightGBM] [Info] Number of data points in the train set: 9400, number of used features: 21
[LightGBM] [Info] Start training from score 1.993345
[1]	valid_0's l1: 0.663902	valid_0's MAPE: 39.337
Training until validation scores don't improve for 10 rounds
[2]	valid_0's l1: 0.603997	valid_0's MAPE: 35.7703
[3]	valid_0's l1: 0.551336	valid_0's MAPE: 32.6156
[4]	valid_0's l1: 0.503711	valid_0's MAPE: 29.778
[5]	valid_0's l1: 0.461076	valid_0's MAPE: 27.2279
[6]	valid_0's l1: 0.423295	valid_0's MAPE: 24.962
[7]	valid_0's l1: 0.389942	valid_0's MAPE: 22.9295
[8]	valid_0's l1: 0.360188	valid_0's MAPE: 21.1357
[9]	valid_0's l1: 0.334079	valid_0's MAPE: 19.5233
[10]	valid_0's l1: 0.311241	valid_0's MAPE: 18.1157
[11]	valid_0's l1: 0.29112	valid_0's MAPE: 16.8702
[12]	valid_0's l1: 0.272994	valid_0's MAPE: 15.7415
[13]	valid_0's l1: 0.257414	valid_0's MAPE: 14.7543
[14]	valid_0's l1: 0.24352	valid_0's MA



## 使用欲預測的data進行test

### 這裡使用public data

In [16]:
# 標準化特徵
scaler = StandardScaler()
X_test = scaler.fit_transform(test_data)

X_test_tensor = torch.Tensor(X_test)

In [17]:
# 使用模型進行預測
y_pred = model.predict(X_test)

# test_predictions = test_outputs.cpu().detach().numpy()
y_pred

array([1.87207299, 1.64577512, 2.71614832, ..., 1.88089578, 3.43835145,
       2.14727519])

In [18]:
# 读取原始CSV文件
result = pd.read_csv('C:\lab\\aigo\\30_Public Dataset_Public Sumission Template_v2\public_submission.csv')

# 将NumPy数组的数据覆盖到第二列
result['predicted_price'] = y_pred

# 保存DataFrame回到CSV文件
result.to_csv('C:\lab\\aigo\\30_Public Dataset_Public Sumission Template_v2\public_LightGBM_dataProcessed_labelencode_1102.csv', index=False)

## 查看特徵對模型的影響力

In [19]:
feature_importance = model.feature_importance(importance_type='split')  # 或 'gain'
#
#  获取特征名称
feature_names = x.columns

In [22]:
feature_importance_ranking = sorted(
    zip(feature_names, feature_importance),
    key=lambda x: x[1],
    reverse=True
)
feature_importance_ranking

[('該路段最高單價', 723),
 ('屋齡', 598),
 ('該路段最低單價', 495),
 ('主建物面積', 364),
 ('該路段單價中位數', 348),
 ('土地面積', 337),
 ('該路段平均單價', 329),
 ('橫坐標', 316),
 ('車位面積', 275),
 ('縱坐標', 268),
 ('樓層', 263),
 ('附屬建物面積', 166),
 ('路名', 161),
 ('鄉鎮市區最低單價', 99),
 ('鄉鎮市區最高單價', 98),
 ('鄉鎮市區平均單價', 90),
 ('建物型態', 87),
 ('主要用途', 56),
 ('主要建材', 52),
 ('車位個數', 21),
 ('鄉鎮市區單價中位數', 14)]

## shap

In [29]:
import shap

explainer = shap.Explainer(model, X_train)
# shap_values = explainer(X_val.iloc[0])         # 解释单个预测
shap_values = explainer.shap_values(X_val)             # 解释整个数据集的预测


ExplainerError: Additivity check failed in TreeExplainer! Please ensure the data matrix you passed to the explainer is the same shape that the model was trained on. If your data shape is correct then please report this on GitHub. This check failed because for one of the samples the sum of the SHAP values was 3.385044, while the model output was 3.210672. If this difference is acceptable you can set check_additivity=False to disable this check.

In [None]:
# 可視化解釋
shap.summary_plot(shap_values, X_test)

# 快速用圖表探索資料資料

In [None]:
#使用pairplot探索數字型資料之間有沒有任何趨勢
# sns.pairplot(df)

In [None]:
#利用distplot來看房價主要集中的區間
# sns.distplot(df['單價'])

In [None]:
#利用df.corr()先做出各變數間的關係係數，再用heatmap作圖
# sns.heatmap(df.corr(),annot=True)