# 모듈 호출

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt

# 데이터 프레임을 넘파이 배열로 만들기

In [3]:
df = pd.read_csv('./data/reg.csv', index_col=[0])
df

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,Price
0,0.034633,0.206919,0.137057,0.540526,0.193941,0.699239,0.630532,0.239410,0.027375,0.209857,0.347609,0.996394,0.102644,0.422222
1,0.028920,0.014315,0.276113,0.255945,0.618886,0.555407,0.782263,0.482977,0.103031,0.106690,0.520776,0.996650,0.187120,0.368889
2,0.020627,0.033230,0.281116,0.525591,0.165269,0.624102,0.586005,0.272713,0.036010,0.106986,0.595301,0.983284,0.084079,0.660000
3,0.022749,0.033801,0.125044,0.263253,0.251509,0.658532,0.432160,0.344932,0.150018,0.068317,0.651297,0.989989,0.015990,0.631111
4,0.022148,0.029374,0.121057,0.521126,0.399670,0.448086,0.520158,0.495342,0.104383,0.069360,0.560116,0.998723,0.092782,0.693333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.028702,0.019528,0.455716,0.097575,0.576859,0.588769,0.654701,0.188444,0.007595,0.165409,0.736795,0.982923,0.219891,0.386667
502,0.034217,0.009498,0.490485,0.372934,0.532351,0.580505,0.750547,0.143776,0.051186,0.166428,0.867950,0.995114,0.207453,0.346667
503,0.035352,0.021495,0.423918,0.397988,0.349407,0.610529,0.907637,0.087385,0.083448,0.164870,0.782704,0.995791,0.094044,0.420000
504,0.026182,0.028603,0.443442,0.509663,0.229142,0.667841,0.867135,0.236241,0.016177,0.167554,0.749186,0.986855,0.107399,0.377778


In [4]:
X = df.drop(columns=["Price"]).to_numpy()
Y = df['Price'].to_numpy().reshape(-1, 1)
print(X.shape, Y.shape)

(506, 13) (506, 1)


# 텐서 데이터 만들기

In [5]:
class TensorData(Dataset):
    def __init__(self, x_data, y_data):
        self.x_data = torch.FloatTensor(x_data)
        self.y_data = torch.FloatTensor(y_data)
        self.len = self.y_data.shape[0]
        
    def __getitem__(self, idx):
        return self.x_data[idx], self.y_data[idx]
    
    def __len__(self):
        return self.len

In [9]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.7)
trainset = TensorData(X_train, Y_train)
testset = TensorData(X_test, Y_test)

testloader = DataLoader(testset, batch_size = 32, shuffle = True)

In [10]:
print(X_train.shape, X_test.shape)

(151, 13) (355, 13)


# 모델 구축

In [11]:
class Regressor(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(13, 50, bias = True)
        self.fc2 = nn.Linear(50, 30, bias = True)
        self.fc3 = nn.Linear(30, 1, bias = True)
        
    def forward(self, x):
        x = self.fc1(x)
        x = self.fc2(x)
        x = self.fc3(x)
        return x

# 손실 함수와 교차 검증 정의

In [14]:
kfold = KFold(n_splits = 3, shuffle = True)
criterion = nn.MSELoss()

# 평가 함수 정의

In [47]:
def evaluation(dataloader):
    pred = torch.tensor([], dtype = torch.float)
    actual = torch.tensor([], dtype = torch.float)
    
    with torch.no_grad():
        model.eval() # 평가를 할 때는 .eval()을 반드시 사용해야 함.
        for data in dataloader:
            inputs, values = data
            outputs = model(inputs)
            
            pred = torch.cat((pred, outputs), 0) # 예측값 누적
            actual = torch.cat((actual ,values), 0) # 실제값 누적
            
    pred = pred.numpy()
    actual = actual.numpy()
    rmse = np.sqrt(mean_squared_error(pred, actual))
    model.train()
    
    return rmse

# 교차 검증을 이용한 학습 및 평가

In [49]:
validation_loss = []
for fold, (train_idx, val_idx) in enumerate(kfold.split(trainset)):
    # 3개의 폴드로 나눠진 훈련세트와 검증세트를 Tensor 형태로 되어있는 것들을 불러오기 위해 index 역할을 하도록
    # train_idx와 val_idx를 섞는 SubsetRandomSampler()를 사용.
    train_subsampler = torch.utils.data.SubsetRandomSampler(train_idx)
    val_subsampler = torch.utils.data.SubsetRandomSampler(val_idx)
    
    trainloader = DataLoader(trainset, batch_size = 32, sampler = train_subsampler)
    valloader = DataLoader(trainset, batch_size = 32, sampler = val_subsampler)
    
    model = Regressor()
    optimizer = optim.Adam(model.parameters(), lr = 0.001, weight_decay = 1e-7)
    
    for epoch in range(400):
        for data in trainloader:
            inputs, values = data
            optimizer.zero_grad()
            
            outputs = model(inputs)
            loss = criterion(outputs, values)
            loss.backward()
            optimizer.step()
            
    train_rmse = evaluation(trainloader)
    val_rmse = evaluation(valloader)
    print("k-fold {} Train Loss : {:.4f}, Validation Loss : {:.4f}".format(fold, train_rmse, val_rmse))
    validation_loss.append(val_rmse)

k-fold 0 Train Loss : 0.1200, Validation Loss : 0.0802
k-fold 1 Train Loss : 0.0917, Validation Loss : 0.1488
k-fold 2 Train Loss : 0.1034, Validation Loss : 0.1337


# 검증 점수 산출

In [51]:
validation_loss = np.array(validation_loss)
mean = np.mean(validation_loss)
std = np.std(validation_loss)
print("Validation Score : {:.4f}, ± {:.4f}".format(mean, std))

Validation Score : 0.1209, ± 0.0294


# 모델 평가

In [53]:
# 
trainloader = DataLoader(trainset, batch_size = 32, shuffle = True)
train_rmse = evaluation(trainloader)
test_rmse = evaluation(testloader)

print("Train RMSE : {}".format(train_rmse))
print("Test RMSE : {}".format(test_rmse))

Train RMSE : 0.11435050517320633
Test RMSE : 0.1415221393108368
