## 讀檔

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv('C:\lab\\aigo\\30_Training Dataset_V2\\training_data.csv')
test_data = pd.read_csv('C:\lab\\aigo\\30_Public Dataset_Public Sumission Template_v2\public_dataset.csv')

df.columns

Index(['ID', '縣市', '鄉鎮市區', '路名', '土地面積', '使用分區', '移轉層次', '總樓層數', '主要用途',
       '主要建材', '建物型態', '屋齡', '建物面積', '車位面積', '車位個數', '橫坐標', '縱坐標', '備註',
       '主建物面積', '陽台面積', '附屬建物面積', '單價'],
      dtype='object')

## 資料前處理

In [2]:
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()

In [3]:
# 使用 LabelEncoder 將文字資料轉換為數值
col_list = ['縣市', '鄉鎮市區', '路名', '主要用途', '主要建材', '建物型態']

def oneHotEncode(df, col_list):
    # for col in col_list:
        # if(df[col].dtype == np.dtype('object')):
        #     # pandas.get_dummies 可以对分类特征进行One-Hot编码
        #     dummies = pd.get_dummies(df[col],prefix=col)
        #     df = pd.concat([df,dummies],axis=1)

        #     # # drop the encoded column
        #     df.drop([col],axis = 1 , inplace=True)
        # df_encoded = pd.get_dummies(df, columns=['Color'])
    df = pd.get_dummies(df, columns=col_list,dtype=int)
    return df


def labelEncode(df, col_list):
    for col in col_list:
        df[col] = labelencoder.fit_transform(df[col])
    return df


# 處理數值特徵的偏移值
# 將負數進行平移，使其成為正數，並與正數保持平移前的相對關係
def offset_cal(df, col_list):
    for col in col_list:
        offset = abs(min(df[col]))
        df[col] = df[col] + offset
    return df


In [4]:
# 用於解決因使用dummies導致資料得column不一致的狀態，為data加上不存在的col，並補上0的值
def make_columns_consistent(data_1st, data_2nd, default_value = 0):
    # 取得train data、test data的column name
    train_columns = set(data_1st.columns)
    test_columns = set(data_2nd.columns)

    # 檢查兩個data中是否存在額外的column
    missing_columns_1st = test_columns - train_columns
    missing_columns_2nd = train_columns - test_columns

    # 將存在於data_2nd 但不存在data_1st的column加入 data_1st，並將該值設為0
    for col in missing_columns_1st:
        data_1st[col] = default_value  # 可以根据问题需要设置不同的默认值
    
    # 將存在於data_1st 但不存在data_2nd的column加入 data_2nd，並將該值設為0
    for col in missing_columns_2nd:
        data_2nd[col] = default_value  # 可以根据问题需要设置不同的默认值

    return data_1st,data_2nd

#### train data 前處理

In [5]:
col_list = ['縣市', '鄉鎮市區', '主要用途', '主要建材', '建物型態']
df = oneHotEncode(df, col_list)

col_list = ['路名']
df = labelEncode(df, col_list)

# 處理數值特徵的偏移值
offset_col = ['土地面積', '建物面積', '車位面積', '主建物面積', '陽台面積', '附屬建物面積']
df = offset_cal(df, offset_col)

# 刪除ID、string的資料
df = df.drop(['ID', '使用分區', '備註'], axis=1)

x = df.drop(['單價'], axis=1)
y = df['單價']

#### test data 前處理

In [6]:
col_list = ['縣市', '鄉鎮市區', '主要用途', '主要建材', '建物型態']
# test_data = labelEncode(test_data, col_list)
test_data = oneHotEncode(test_data, col_list)

col_list = ['路名']
test_data = labelEncode(test_data, col_list)

# 處理數值特徵的偏移值
offset_col = ['土地面積', '建物面積', '車位面積', '主建物面積', '陽台面積', '附屬建物面積']
test_data = offset_cal(test_data, offset_col)

# 刪除ID、string的資料
test_data = test_data.drop(['ID', '使用分區', '備註'], axis=1)

In [7]:
print("x.shape: ",x.shape)
print("test_data.shape: ",test_data.shape)

x.shape:  (11751, 176)
test_data.shape:  (5876, 168)


#### 解決因使用dummies導致資料得column不一致的狀態

In [8]:
# 解決因使用dummies導致資料得column不一致的狀態
x, test_data = make_columns_consistent(x, test_data)

In [9]:
print("x.shape: ",x.shape)
print("x.shape: ",x.shape[1])
print("test_data.shape: ",test_data.shape)

x.shape:  (11751, 183)
x.shape:  183
test_data.shape:  (5876, 183)


## 模型訓練

### import package、自定義dataset、定義模型

In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from xgboost import XGBClassifier
from xgboost import XGBRegressor
import lightgbm as lgb
from sklearn.svm import SVR

class CustomDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, index):
        return self.X[index], self.y[index]

#### 模型

### data進行transform及切割

In [11]:
# 標準化特徵
scaler = StandardScaler()
X_train = scaler.fit_transform(x)

# 將數據劃分為訓練集和測試集
X_train, X_val, y_train, y_val = train_test_split(X_train, y, test_size=0.2, random_state=42)

In [12]:
# print('X_train ',type(X_train))
# print('X_val ',type(X_val))

# print('y_train ',type(y_train))
# print('y_val ',type(y_val))


### 查看是否有gpu、定義loss及optimizer

In [13]:
# 检查是否有可用的GPU，如果有，选择第一个可用的GPU
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")


# model = XGBRegressor(learning_rate=0.1,)
model = lgb.LGBMRegressor()
# model = SVR(kernel='rbf', C=1.0, epsilon=0.2)

model.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric=['rmse', 'mae'], early_stopping_rounds=10)

[1]	valid_0's rmse: 0.920238	valid_0's l1: 0.669253	valid_0's l2: 0.846838
[2]	valid_0's rmse: 0.8517	valid_0's l1: 0.614279	valid_0's l2: 0.725393
[3]	valid_0's rmse: 0.791071	valid_0's l1: 0.565177	valid_0's l2: 0.625793
[4]	valid_0's rmse: 0.737072	valid_0's l1: 0.522287	valid_0's l2: 0.543275
[5]	valid_0's rmse: 0.688969	valid_0's l1: 0.484426	valid_0's l2: 0.474678
[6]	valid_0's rmse: 0.647057	valid_0's l1: 0.452357	valid_0's l2: 0.418682
[7]	valid_0's rmse: 0.610196	valid_0's l1: 0.423804	valid_0's l2: 0.372339
[8]	valid_0's rmse: 0.577197	valid_0's l1: 0.398507	valid_0's l2: 0.333157
[9]	valid_0's rmse: 0.548111	valid_0's l1: 0.376127	valid_0's l2: 0.300426
[10]	valid_0's rmse: 0.521905	valid_0's l1: 0.357244	valid_0's l2: 0.272384
[11]	valid_0's rmse: 0.500043	valid_0's l1: 0.341028	valid_0's l2: 0.250043
[12]	valid_0's rmse: 0.47938	valid_0's l1: 0.326134	valid_0's l2: 0.229805
[13]	valid_0's rmse: 0.462532	valid_0's l1: 0.313598	valid_0's l2: 0.213936
[14]	valid_0's rmse: 0.4



## 使用欲預測的data進行test

### 這裡使用public data

In [14]:
# 標準化特徵
scaler = StandardScaler()
X_test = scaler.fit_transform(test_data)

X_test_tensor = torch.Tensor(X_test)

In [15]:
# 使用模型進行預測
y_pred = model.predict(X_test)

# test_predictions = test_outputs.cpu().detach().numpy()
y_pred

array([1.46303189, 1.0324117 , 1.75153701, ..., 1.3222326 , 2.99195663,
       1.51529917])

In [17]:
# 读取原始CSV文件
df = pd.read_csv('C:\lab\\aigo\\30_Public Dataset_Public Sumission Template_v2\public_submission.csv')

# 将NumPy数组的数据覆盖到第二列
df['predicted_price'] = y_pred

# 保存DataFrame回到CSV文件
df.to_csv('C:\lab\\aigo\\30_Public Dataset_Public Sumission Template_v2\public_LightGBM.csv', index=False)

# 快速用圖表探索資料資料

In [None]:
#使用pairplot探索數字型資料之間有沒有任何趨勢
# sns.pairplot(df)

In [None]:
#利用distplot來看房價主要集中的區間
# sns.distplot(df['單價'])

In [None]:
#利用df.corr()先做出各變數間的關係係數，再用heatmap作圖
# sns.heatmap(df.corr(),annot=True)