## 讀檔

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the training set and the public test set.
# Raw strings keep the Windows backslashes literal. The previous literals
# mixed escaped and unescaped backslashes and relied on the invalid escape
# sequences "\l" and "\p", which newer Python versions warn about.
df = pd.read_csv(r'C:\lab\aigo\30_Training Dataset_V2\training_data.csv')
test_data = pd.read_csv(r'C:\lab\aigo\30_Public Dataset_Public Sumission Template_v2\public_dataset.csv')

# Notebook cell output: show the column index of the training frame.
df.columns

## 資料前處理

In [None]:
from sklearn.preprocessing import LabelEncoder
# Shared LabelEncoder instance; labelEncode() calls fit_transform on it once per column.
labelencoder = LabelEncoder()

In [None]:
# Text (categorical) columns that are converted to numeric features.
col_list = ['縣市', '鄉鎮市區', '路名', '主要用途', '主要建材', '建物型態']

def oneHotEncode(df, col_list):
    """Return a new frame in which the ``col_list`` columns are one-hot encoded.

    Every listed column is replaced by integer indicator columns produced by
    ``pandas.get_dummies``; the remaining columns are passed through.
    """
    encoded = pd.get_dummies(data=df, columns=col_list, dtype=int)
    return encoded


def labelEncode(df, col_list):
    """Replace each column named in ``col_list`` with integer label codes.

    The module-level ``labelencoder`` is re-fitted on every column in turn.
    ``df`` is modified in place and also returned.
    """
    for name in col_list:
        codes = labelencoder.fit_transform(df[name])
        df[name] = codes
    return df


# Shift numeric feature columns by a per-column offset so that negative
# values become non-negative while the spacing between values is preserved.
def offset_cal(df, col_list):
    """Add ``abs(min)`` of each listed column to that column.

    For a column whose minimum is negative, this moves the minimum to zero.
    A column whose minimum is already positive is shifted up by that minimum
    as well (unchanged from the previous behaviour).

    Args:
        df: DataFrame holding the columns; it is modified in place.
        col_list: names of the numeric columns to shift.

    Returns:
        The same ``df`` object, after shifting.
    """
    for col in col_list:
        # Series.min() skips NaN. The built-in min() returned NaN whenever the
        # first value was NaN, which turned the whole column into NaN.
        offset = abs(df[col].min())
        df[col] = df[col] + offset
    return df


In [None]:
# get_dummies can produce different columns for different frames. This helper
# adds the columns each frame lacks (filled with a default value) and makes
# both frames use the same column order.
def make_columns_consistent(data_1st, data_2nd, default_value = 0):
    """Give two DataFrames the same columns in the same order.

    Columns present in only one frame are added to the other one, filled with
    ``default_value``. The resulting order is the original columns of
    ``data_1st`` followed by the columns that only ``data_2nd`` had, in the
    order they appear in ``data_2nd``. The order is therefore deterministic,
    unlike iterating over a set of column names.

    Args:
        data_1st: first DataFrame; missing columns are added to it in place.
        data_2nd: second DataFrame; missing columns are added to it in place.
        default_value: value used to fill every added column.

    Returns:
        ``(data_1st, aligned_2nd)``, where ``aligned_2nd`` holds the data of
        ``data_2nd`` with its columns reordered to match ``data_1st``.
    """
    first_columns = list(data_1st.columns)
    second_columns = list(data_2nd.columns)
    in_first = set(first_columns)
    in_second = set(second_columns)

    # Columns each frame lacks, kept in the order of the frame that has them.
    missing_columns_1st = [col for col in second_columns if col not in in_first]
    missing_columns_2nd = [col for col in first_columns if col not in in_second]

    # Add the columns that exist only in data_2nd to data_1st.
    for col in missing_columns_1st:
        data_1st[col] = default_value

    # Add the columns that exist only in data_1st to data_2nd.
    for col in missing_columns_2nd:
        data_2nd[col] = default_value

    # Without this reordering the two frames share a column set but not a
    # column order, so positions no longer line up once they become arrays.
    return data_1st, data_2nd[list(data_1st.columns)]

#### train data 前處理

In [None]:
# Columns to one-hot encode, and numeric columns passed to offset_cal.
col_list = ['縣市', '鄉鎮市區', '路名', '主要用途', '主要建材', '建物型態']
offset_col = ['土地面積', '建物面積', '車位面積', '主建物面積', '陽台面積', '附屬建物面積']

# One-hot encode first, then apply the per-column offset.
df = offset_cal(oneHotEncode(df, col_list), offset_col)

# Drop the ID column and the remaining string columns.
df = df.drop(columns=['ID', '使用分區', '備註'])

# Separate the target column from the feature columns.
y = df['單價']
x = df.drop(columns=['單價'])

#### test data 前處理

In [None]:
# Columns to one-hot encode, and numeric columns passed to offset_cal.
col_list = ['縣市', '鄉鎮市區', '路名', '主要用途', '主要建材', '建物型態']
offset_col = ['土地面積', '建物面積', '車位面積', '主建物面積', '陽台面積', '附屬建物面積']

# One-hot encode first, then apply the per-column offset.
test_data = offset_cal(oneHotEncode(test_data, col_list), offset_col)

# Drop the ID column and the remaining string columns.
test_data = test_data.drop(columns=['ID', '使用分區', '備註'])

In [None]:
# Report the shape of the training features and of the test frame.
for label, frame in (("x.shape: ", x), ("test_data.shape: ", test_data)):
    print(label, frame.shape)

#### 解決因使用dummies導致資料的column不一致的狀態

In [None]:
# Give x and test_data the same columns; get_dummies can produce different
# columns for each frame.
x, test_data = make_columns_consistent(x, test_data)

In [None]:
# Report the shape of the training features and of the test frame.
for label, frame in (("x.shape: ", x), ("test_data.shape: ", test_data)):
    print(label, frame.shape)

## 模型訓練

### import package、定義模型

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
    """Dataset that pairs the features ``X`` with the targets ``y`` by index."""

    def __init__(self, X, y):
        self.X, self.y = X, y

    def __len__(self):
        # The sample count is taken from the feature container only.
        return len(self.X)

    def __getitem__(self, index):
        features = self.X[index]
        target = self.y[index]
        return features, target

# Fully connected regression network with two hidden layers.
class Net(nn.Module):
    """MLP: input -> 128 -> 64 -> 1, with ReLU after the two hidden layers.

    Args:
        input_size: number of input features. When omitted, the width is read
            from the module-level feature frame ``x`` (``x.shape[1]``), which
            is what the constructor always did before.
    """

    def __init__(self, input_size=None):
        super().__init__()
        if input_size is None:
            # Backward-compatible default: size the first layer from the
            # global training feature frame.
            input_size = x.shape[1]
        self.fc1 = nn.Linear(input_size, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)

    def forward(self, x):
        """Return one output value per input row."""
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x


# Fully connected regression network with three hidden layers.
class Net_v2(nn.Module):
    """MLP: input -> 128 -> 64 -> 32 -> 1, with ReLU after each hidden layer.

    Args:
        input_size: number of input features. When omitted, the width is read
            from the module-level feature frame ``x`` (``x.shape[1]``).
    """

    def __init__(self, input_size=None):
        # The previous ``super(Net, self)`` call raised TypeError because
        # Net_v2 is not a subclass of Net.
        super().__init__()
        if input_size is None:
            # Backward-compatible default: size the first layer from the
            # global training feature frame.
            input_size = x.shape[1]
        self.fc1 = nn.Linear(input_size, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 32)
        self.fc4 = nn.Linear(32, 1)

    def forward(self, x):
        """Return one output value per input row."""
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.relu(self.fc3(x))
        x = self.fc4(x)
        return x

# Linear regression expressed as a single fully connected layer.
class LinearRegression(nn.Module):
    """Single ``nn.Linear`` layer mapping ``input_size`` features to ``output_size`` outputs.

    Args:
        input_size: number of input features.
        output_size: number of output values per row.
    """

    def __init__(self, input_size, output_size):
        super().__init__()
        # Use the constructor arguments; they were previously ignored in
        # favour of the global ``x.shape[1]`` and a fixed output width of 1.
        self.linear = nn.Linear(input_size, output_size)

    def forward(self, x):
        """Apply the linear layer to ``x``."""
        return self.linear(x)


### data進行transform及切割

In [None]:
# Split into training and validation sets first, so that the scaler below
# never sees validation rows (fitting it on all rows leaks validation
# statistics into training).
X_train, X_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=42)

# Standardize features: fit on the training split only, then apply the same
# transformation to the validation split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

In [None]:
# print('X_train ',type(X_train))
# print('X_val ',type(X_val))

# print('y_train ',type(y_train))
# print('y_val ',type(y_val))


### 查看是否有gpu、定義loss及optimizer

In [None]:
# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Linear model sized to the number of feature columns in x.
model = LinearRegression(x.shape[1], 1)
model = model.to(device)

# Mean-squared-error loss, optimized with Adam.
criterion = nn.MSELoss()
optimizer = optim.Adam(params=model.parameters(), lr=0.001)

### 給予訓練的參數、將data轉為tensor

In [None]:
import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset
# Training hyper-parameters.
num_epochs = 100
batch_size = 32

# Convert the feature arrays to tensors.
X_train_tensor, X_val_tensor = torch.Tensor(X_train), torch.Tensor(X_val)

# Convert the target Series (via .values) to tensors.
y_train_tensor, y_val_tensor = torch.Tensor(y_train.values), torch.Tensor(y_val.values)

# Pair features with targets through the custom Dataset.
train_dataset = CustomDataset(X_train_tensor, y_train_tensor)
val_dataset = CustomDataset(X_val_tensor, y_val_tensor)

# Both loaders use the same batch size and shuffle their samples.
loader_options = {"batch_size": batch_size, "shuffle": True}
train_loader = DataLoader(train_dataset, **loader_options)
val_loader = DataLoader(val_dataset, **loader_options)


# Train the model for num_epochs passes over train_loader.
model.train()  # make sure the model is in training mode, e.g. when this cell is re-run after evaluation
for epoch in range(num_epochs):
    # Progress bar over the batches of this epoch.
    data_loader = tqdm(train_loader, desc=f'Epoch {epoch + 1} / {num_epochs} ', ncols=100)
    for data, targets in data_loader:
        data = data.to(device)
        targets = targets.to(device)

        # Forward pass. The targets are reshaped to the shape of the model
        # output: comparing (batch, 1) scores with (batch,) targets would
        # broadcast to (batch, batch) and produce a wrong loss.
        scores = model(data)
        loss = criterion(scores, targets.reshape(scores.shape))

        # Backward pass and parameter update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Show the current batch loss in the progress bar.
        data_loader.set_postfix(loss=loss.item())

### 使用切割的dataset來驗證模型

In [None]:
# Evaluate the model on the validation split.
model.eval()
squared_error_sum = 0.0
sample_count = 0
with torch.no_grad():  # no gradients are needed for evaluation
    for data, targets in val_loader:
        data, targets = data.to(device), targets.to(device)
        val_outputs = model(data)
        # Reshape the targets to the output shape so the loss is computed
        # element-wise instead of over a broadcast (batch, batch) grid.
        batch_loss = criterion(val_outputs, targets.reshape(val_outputs.shape))
        # The criterion averages over the batch (nn.MSELoss default), so
        # weight each batch by its size before accumulating.
        squared_error_sum += batch_loss.item() * data.size(0)
        sample_count += data.size(0)

# Mean loss over all validation samples. Previously only the last batch's
# loss was kept, and it was never reported.
val_loss = squared_error_sum / sample_count if sample_count else float("nan")
print(f"Validation MSE: {val_loss:.4f}")

# with torch.no_grad():
#     val_inputs = torch.Tensor(X_val)
#     val_labels = torch.Tensor(y_val)
#     val_outputs = model(val_inputs)
#     val_loss = criterion(val_outputs, val_labels)
#     print(f"均方誤差（Mean Squared Error）: {val_loss.item()}")

## 使用欲預測的data進行test

### 這裡使用public data

In [None]:
# test_data = pd.read_csv('C:\lab\\aigo\\30_Public Dataset_Public Sumission Template_v2\public_dataset.csv')

# col_list = ['縣市', '鄉鎮市區', '路名', '主要用途', '主要建材', '建物型態']
# # test_data = labelEncode(test_data, col_list)
# test_data = oneHotEncode(test_data, col_list)

# # 處理數值特徵的偏移值
# offset_col = ['土地面積', '建物面積', '車位面積', '主建物面積', '陽台面積', '附屬建物面積']
# test_data = offset_cal(test_data, offset_col)


# # 刪除ID、string的資料
# test_data = test_data.drop(['ID', '使用分區', '備註'], axis=1)

# # 假设 df_train 是训练数据，df_test 是测试数据
# # 获取训练和测试数据的列名
# train_columns = set(x.columns)
# test_columns = set(test_data.columns)

# # 检查测试数据中是否有额外的列
# missing_columns = train_columns - test_columns

# # 对于在测试数据中存在但不在训练数据中的列，添加列并设置默认值
# for col in missing_columns:
#     test_data[col] = 0  # 你可以根据问题需要设置不同的默认值

# Standardize the test features with the scaler that was already fitted on
# the training features. Fitting a new scaler on the test data would put the
# test features on a different scale from the one the model was trained on.
# Columns are selected in the order of x so positions match the training data.
X_test = scaler.transform(test_data[x.columns])

X_test_tensor = torch.Tensor(X_test)

In [None]:
# X_test.shape[1]
# Notebook cell output: show the column index of the test frame.
test_data.columns

In [None]:
num = len(test_data)
# shuffle must be False: the predictions are later written row by row into
# the submission file, so they have to stay in the original row order.
test_loader = DataLoader(dataset = X_test_tensor, batch_size = num, shuffle = False)

# Run the model on the test features.
model.eval()
batch_outputs = []
with torch.no_grad():  # no gradients are needed for inference
    for data in test_loader:
        data = data.to(device)
        batch_outputs.append(model(data))

# Concatenate the per-batch outputs so every row is kept.
test_outputs = torch.cat(batch_outputs)


In [None]:
# Convert the model outputs to a NumPy array on the CPU.
test_predictions = test_outputs.detach().cpu().numpy()
test_predictions

In [None]:
# Read the submission template CSV.
# Raw strings keep the Windows backslashes literal; the previous literals
# relied on the invalid escape sequences "\l" and "\p".
df = pd.read_csv(r'C:\lab\aigo\30_Public Dataset_Public Sumission Template_v2\public_submission.csv')

# Write the predictions into the 'predicted_price' column. The array is
# flattened first so a column-shaped (n, 1) array is stored as one value per row.
df['predicted_price'] = np.ravel(test_predictions)

# Save the filled-in frame as a new CSV file, without the index column.
df.to_csv(r'C:\lab\aigo\30_Public Dataset_Public Sumission Template_v2\public_linearRegression_dummies.csv', index=False)

## 快速用圖表探索資料

In [None]:
#使用pairplot探索數字型資料之間有沒有任何趨勢
# sns.pairplot(df)

In [None]:
#利用distplot來看房價主要集中的區間
# sns.distplot(df['單價'])

In [None]:
#利用df.corr()先做出各變數間的關係係數，再用heatmap作圖
# sns.heatmap(df.corr(),annot=True)