## 讀檔

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv('C:\lab\\aigo\\30_Training Dataset_V2\\training_data.csv')
test_data = pd.read_csv('C:\lab\\aigo\\30_Public Dataset_Public Sumission Template_v2\public_dataset.csv')

df.columns

Index(['ID', '縣市', '鄉鎮市區', '路名', '土地面積', '使用分區', '移轉層次', '總樓層數', '主要用途',
       '主要建材', '建物型態', '屋齡', '建物面積', '車位面積', '車位個數', '橫坐標', '縱坐標', '備註',
       '主建物面積', '陽台面積', '附屬建物面積', '單價'],
      dtype='object')

## 資料前處理

In [2]:
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()

In [3]:
# 使用 LabelEncoder 將文字資料轉換為數值
col_list = ['縣市', '鄉鎮市區', '路名', '主要用途', '主要建材', '建物型態']

def oneHotEncode(df, col_list):
    # for col in col_list:
        # if(df[col].dtype == np.dtype('object')):
        #     # pandas.get_dummies 可以对分类特征进行One-Hot编码
        #     dummies = pd.get_dummies(df[col],prefix=col)
        #     df = pd.concat([df,dummies],axis=1)

        #     # # drop the encoded column
        #     df.drop([col],axis = 1 , inplace=True)
        # df_encoded = pd.get_dummies(df, columns=['Color'])
    df = pd.get_dummies(df, columns=col_list,dtype=int)
    return df


def labelEncode(df, col_list):
    for col in col_list:
        df[col] = labelencoder.fit_transform(df[col])
    return df


# 處理數值特徵的偏移值
# 將負數進行平移，使其成為正數，並與正數保持平移前的相對關係
def offset_cal(df, col_list):
    for col in col_list:
        offset = abs(min(df[col]))
        df[col] = df[col] + offset
    return df


In [4]:
# 用於解決因使用dummies導致資料得column不一致的狀態，為data加上不存在的col，並補上0的值
def make_columns_consistent(data_1st, data_2nd, default_value = 0):
    # 取得train data、test data的column name
    train_columns = set(data_1st.columns)
    test_columns = set(data_2nd.columns)

    # 檢查兩個data中是否存在額外的column
    missing_columns_1st = test_columns - train_columns
    missing_columns_2nd = train_columns - test_columns

    # 將存在於data_2nd 但不存在data_1st的column加入 data_1st，並將該值設為0
    for col in missing_columns_1st:
        data_1st[col] = default_value  # 可以根据问题需要设置不同的默认值
    
    # 將存在於data_1st 但不存在data_2nd的column加入 data_2nd，並將該值設為0
    for col in missing_columns_2nd:
        data_2nd[col] = default_value  # 可以根据问题需要设置不同的默认值

    return data_1st,data_2nd

#### train data 前處理

In [5]:
col_list = ['縣市', '鄉鎮市區', '路名', '主要用途', '主要建材', '建物型態']
df = labelEncode(df, col_list)
# df = oneHotEncode(df, col_list)


# 處理數值特徵的偏移值
offset_col = ['土地面積', '建物面積', '車位面積', '主建物面積', '陽台面積', '附屬建物面積']
df = offset_cal(df, offset_col)

# 刪除ID、string的資料
df = df.drop(['ID', '使用分區', '備註'], axis=1)

x = df.drop(['單價'], axis=1)
y = df['單價']

#### test data 前處理

In [6]:
col_list = ['縣市', '鄉鎮市區', '路名', '主要用途', '主要建材', '建物型態']
test_data = labelEncode(test_data, col_list)
# test_data = oneHotEncode(test_data, col_list)

# 處理數值特徵的偏移值
offset_col = ['土地面積', '建物面積', '車位面積', '主建物面積', '陽台面積', '附屬建物面積']
test_data = offset_cal(test_data, offset_col)

# 刪除ID、string的資料
test_data = test_data.drop(['ID', '使用分區', '備註'], axis=1)

In [7]:
print("x.shape: ",x.shape)
print("test_data.shape: ",test_data.shape)

x.shape:  (11751, 18)
test_data.shape:  (5876, 18)


#### 解決因使用dummies導致資料得column不一致的狀態

In [8]:
# 解決因使用dummies導致資料得column不一致的狀態
# x, test_data = make_columns_consistent(x, test_data)

In [9]:
print("x.shape: ",x.shape)
print("x.shape: ",x.shape[1])
print("test_data.shape: ",test_data.shape)

x.shape:  (11751, 18)
x.shape:  18
test_data.shape:  (5876, 18)


## 模型訓練

### import package、自定義dataset、定義模型

In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from xgboost import XGBClassifier

class CustomDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, index):
        return self.X[index], self.y[index]

#### 模型

In [11]:
# 定義 linear regression
class LinearRegression(nn.Module):
    def __init__(self, input_size, output_size):
        super(LinearRegression, self).__init__()
        self.linear = nn.Linear(x.shape[1], 1)

    def forward(self, x):
        return self.linear(x)

In [12]:
# 定義深度學習模型
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(x.shape[1], 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x


# 定義深度學習模型
class Net_v2(nn.Module):
    def __init__(self):
        super(Net_v2, self).__init__()
        self.fc1 = nn.Linear(x.shape[1], 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 32)
        self.fc4 = nn.Linear(32, 1)


    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.relu(self.fc3(x))
        x = self.fc4(x)
        return x

In [13]:
class Net_v3(nn.Module):
    def __init__(self, features):
        super(Net, self).__init__()
        self.linear_relu1 = nn.Linear(features, 128)
        self.linear_relu2 = nn.Linear(128, 256)
        self.linear_relu3 = nn.Linear(256, 256)
        self.linear_relu4 = nn.Linear(256, 256)
        self.linear5 = nn.Linear(256, 1)
        
    def forward(self, x):
        
        y_pred = self.linear_relu1(x)
        y_pred = nn.functional.relu(y_pred)

        y_pred = self.linear_relu2(y_pred)
        y_pred = nn.functional.relu(y_pred)

        y_pred = self.linear_relu3(y_pred)
        y_pred = nn.functional.relu(y_pred)

        y_pred = self.linear_relu4(y_pred)
        y_pred = nn.functional.relu(y_pred)

        y_pred = self.linear5(y_pred)
        return y_pred

### data進行transform及切割

In [14]:
# 標準化特徵
scaler = StandardScaler()
X_train = scaler.fit_transform(x)

# 將數據劃分為訓練集和測試集
X_train, X_val, y_train, y_val = train_test_split(X_train, y, test_size=0.2, random_state=42)

In [15]:
# print('X_train ',type(X_train))
# print('X_val ',type(X_val))

# print('y_train ',type(y_train))
# print('y_val ',type(y_val))


### 查看是否有gpu、定義loss及optimizer

In [16]:
# 检查是否有可用的GPU，如果有，选择第一个可用的GPU
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

model = Net_v2().to(device)
# model = LinearRegression(x.shape[1], 1).to(device)
# model = Net(features = x.shape[1]).to(device)
# model = XGBRegressor()
model = XGBClassifier()

# 定義損失函數和優化器
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

### 給予訓練的參數、將data轉為tensor

In [17]:
import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset
# 訓練模型 - 參數
num_epochs = 300
batch_size = 64

# 将NumPy数组转换为PyTorch Tensor
X_train_tensor = torch.Tensor(X_train)
X_val_tensor = torch.Tensor(X_val)

# 将Pandas Series转换为PyTorch Tensor
y_train_tensor = torch.Tensor(y_train.values)
y_val_tensor = torch.Tensor(y_val.values)



# 使用自定义Dataset创建数据集
train_dataset = CustomDataset(X_train_tensor, y_train_tensor)
val_dataset = CustomDataset(X_val_tensor, y_val_tensor)


train_loader = DataLoader(dataset = train_dataset, batch_size = batch_size, shuffle = True)
val_loader = DataLoader(dataset = val_dataset, batch_size = batch_size, shuffle = True)


for epoch in range(num_epochs):
    current_loss = 0.0
    data_loader  = tqdm(train_loader, desc=f'Epoch {epoch + 1} / {num_epochs} ', ncols=100)     # ,loss: {loss.item()}
    for data, targets in data_loader:
        data = data.to(device)
        targets = targets.to(device)

        # forward
        scores = model(data)
        loss = criterion(scores, targets)

        # backward
        optimizer.zero_grad()
        loss.backward()
        # gradient descent or adam step
        optimizer.step()

        # 使用set_postfix来更新进度条中的显示信息
        data_loader.set_postfix(loss=loss.item())

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
Epoch 1 / 300 : 100%|██████████████████████████████████| 147/147 [00:01<00:00, 99.19it/s, loss=1.02]
Epoch 2 / 300 : 100%|████████████████████████████████| 147/147 [00:00<00:00, 474.19it/s, loss=0.984]
Epoch 3 / 300 : 100%|█████████████████████████████████| 147/147 [00:00<00:00, 457.94it/s, loss=1.38]
Epoch 4 / 300 : 100%|████████████████████████████████| 147/147 [00:00<00:00, 446.81it/s, loss=0.512]
Epoch 5 / 300 : 100%|█████████████████████████████████| 147/147 [00:00<00:00, 444.11it/s, loss=1.08]
Epoch 6 / 300 : 100%|█████████████████████████████████| 147/147 [00:00<00:00, 440.12it/s, loss=1.11]
Epoch 7 / 300 : 100%|████████████████████████████████| 147/147 [00:00<00:00, 445.46it/s, loss=0.829]
Epoch 8 / 300 : 100%|████████████████████████████████| 147/147 [00:00<00:00, 448.17it/s, loss=0.666]
Epoch 9 / 300 : 100%|████████████████████████████████| 147/147 [00:00<

### 使用切割的dataset來驗證模型

In [18]:
# 用val數據評估模型
model.eval()
val_loss = 0
correct = 0
with torch.no_grad(): # 關掉梯度計算
    for data, targets in val_loader:
        data, targets = data.to(device), targets.to(device)
        val_outputs = model(data)
        # _, pred = torch.max(val_outputs, 1)
        # correct += (pred == targets).sum().item()
        val_loss = criterion(val_outputs, targets)

# with torch.no_grad():
#     val_inputs = torch.Tensor(X_val)
#     val_labels = torch.Tensor(y_val)
#     val_outputs = model(val_inputs)
#     val_loss = criterion(val_outputs, val_labels)
#     print(f"均方誤差（Mean Squared Error）: {val_loss.item()}")

  return F.mse_loss(input, target, reduction=self.reduction)


## 使用欲預測的data進行test

### 這裡使用public data

In [19]:
# 標準化特徵
scaler = StandardScaler()
X_test = scaler.fit_transform(test_data)

X_test_tensor = torch.Tensor(X_test)

In [20]:
num = len(test_data)
test_loader = DataLoader(dataset = X_test_tensor, batch_size = num , shuffle = True)

# 用val數據評估模型
model.eval()
correct = 0
with torch.no_grad(): # 關掉梯度計算
    for data in test_loader:
        data = data.to(device)
        test_outputs = model(data)


In [21]:
# 使用模型進行預測
test_predictions = test_outputs.cpu().detach().numpy()
test_predictions

array([[1.971584 ],
       [1.9660476],
       [1.9863964],
       ...,
       [1.9863796],
       [1.9756664],
       [1.9781502]], dtype=float32)

In [22]:
# 读取原始CSV文件
df = pd.read_csv('C:\lab\\aigo\\30_Public Dataset_Public Sumission Template_v2\public_submission.csv')

# 将NumPy数组的数据覆盖到第二列
df['predicted_price'] = test_predictions

# 保存DataFrame回到CSV文件
df.to_csv('C:\lab\\aigo\\30_Public Dataset_Public Sumission Template_v2\public_net2_labelencode_epcho100_batch64.csv', index=False)

# 快速用圖表探索資料資料

In [23]:
#使用pairplot探索數字型資料之間有沒有任何趨勢
# sns.pairplot(df)

In [24]:
#利用distplot來看房價主要集中的區間
# sns.distplot(df['單價'])

In [25]:
#利用df.corr()先做出各變數間的關係係數，再用heatmap作圖
# sns.heatmap(df.corr(),annot=True)