In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt



df = pd.read_csv('reg.csv',index_col = [0])
X = df.drop('Price', axis=1).to_numpy()
Y = df['Price'].to_numpy().reshape((-1,1))


class TensorData(Dataset):

    def __init__(self, x_data, y_data):
        self.x_data = torch.FloatTensor(x_data)
        self.y_data = torch.FloatTensor(y_data)
        self.len = self.y_data.shape[0]

    def __getitem__(self,index):
        return self.x_data[index], self.y_data[index]

    def __len__(self):
        return self.len

kfold = KFold(n_splits=3, shuffle= True) # k=3으로 훈련 데이터 2개, 검증 데이터 1개로 만들어서 총 3번 검증 후 검증 성능의 평균을 내어 계산한다.
criterion = nn.MSELoss()
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.7)
trainset = TensorData(X_train, Y_train)
testset = TensorData(X_test, Y_test)
testloader = torch.utils.data.DataLoader(testset, batch_size=32, shuffle=False)

class Regressor(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(13, 50, bias = True)
        self.fc2 = nn.Linear(50, 30, bias = True)
        self.fc3 = nn.Linear(30, 1, bias = True)

    def forward(self,x):
        x = self.fc1(x)
        x = self.fc2(x)
        x = self.fc3(x)
        return x


def evaluation(dataloader): # 매개인자로 데이터로더를 불러온다.

    predictions = torch.tensor([],dtype = torch.float) #예측값을 저장할 빈 텐서
    actual = torch.tensor([],dtype=torch.float) #실제값을 저장할 빈 텐서

    with torch.no_grad():
        model.eval()
        for data in dataloader:
            inputs, values = data
            outputs = model(inputs)

            predictions = torch.cat((predictions,outputs),dim=0) #TODO 왜 이렇게 묶었을까? 0을 추가한 이유가 궁금하다. 꼭 디버깅 돌릴꺼다
            actual = torch.cat((actual,values),dim=0)
    predictions = predictions.numpy()
    actual = actual.numpy()
    rmse = np.sqrt(mean_squared_error(predictions,actual))
    model.train()
    return rmse


validation_loss = [] # 폴드 별 로스 저장 리스트를 만듦
print("\n")
for fold, (train_idx, val_idx) in enumerate(kfold.split(trainset)): #K폴드를 이용하여 나눠진 학습 데이터의 인덱스를 불러온다.ㅌ1
    train_subsampler = torch.utils.data.SubsetRandomSampler(train_idx) #텐서 데이터의 일부를 가져와 배치 데이터로 활용하도록 서브 샘플러와 함꼐 쓴다.
    val_subsampler = torch.utils.data.SubsetRandomSampler(val_idx)
    trainloader = torch.utils.data.DataLoader(trainset, batch_size = 32,
                                              sampler = train_subsampler)
    valloader = torch.utils.data.DataLoader(trainset, batch_size = 32, sampler = val_subsampler)

    model = Regressor()
    optimizer = optim.Adam(model.parameters(), lr=0.001,weight_decay=1e-7)

    for epoch in range(400):
        for data in trainloader:
            inputs, values = data
            optimizer.zero_grad()

            outputs = model(inputs)
            loss = criterion(outputs, values)
            loss.backward()
            optimizer.step()
    train_rmse = evaluation(trainloader)
    val_rmse = evaluation(valloader)

    print("k-fold", fold, "train_loss : %.4f, validation_loss:.%4f"%(train_rmse, val_rmse))
    validation_loss.append(val_rmse)

validation_loss = np.array(validation_loss) # 계산하기 위해서 텐서를 넘파이로 바꿈
mean = np.mean(validation_loss)
std = np.std(validation_loss)

print("\n")
print("[mean,std] Validation Score: %.4f, %.4f" %(mean, std))

trainloader = torch.utils.data.DataLoader(trainset, batch_size=32, shuffle=False)
train_rmse = evaluation(trainloader)
test_rmse = evaluation(testloader)

print("Train Validation Score : %.4f"% train_rmse)
print("Test Validation Score : %.4f"% test_rmse)



k-fold 0 train_loss : 0.1138, validation_loss:.0.161580
k-fold 1 train_loss : 0.1021, validation_loss:.0.168190
k-fold 2 train_loss : 0.1336, validation_loss:.0.094744


[mean,std] Validation Score: 0.1415, 0.0332
Train Validation Score : 0.1221
Test Validation Score : 0.1262
