# Split into Train / Valid / Test set

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler

In [2]:
from sklearn.datasets import fetch_california_housing
california = fetch_california_housing()

In [3]:
df = pd.DataFrame(california.data, columns=california.feature_names)
df['Target'] = california.target
df.tail()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Target
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.17192,741.0,2.123209,39.43,-121.32,0.847
20639,2.3886,16.0,5.254717,1.162264,1387.0,2.616981,39.37,-121.24,0.894


## Convert to PyTorch Tensor

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [5]:
data = torch.from_numpy(df.values).float() # numpy value를 float 형태의 tensor type으로 변환

x = data[:, :-1]
y = data[:, -1:]

print(x.size(), y.size())

torch.Size([20640, 8]) torch.Size([20640, 1])


In [6]:
# Train / Valid / Test ratio
ratios = [.6, .2, .2]

In [9]:
train_cnt = int(data.size(0) * ratios[0])
valid_cnt = int(data.size(0) * ratios[1])
test_cnt = int(data.size(0) * ratios[2])

cnts = [train_cnt, valid_cnt, test_cnt]

print(f"Train {cnts[0]} / Valid {cnts[1]} / Test {cnts[2]} ")

Train 12384 / Valid 4128 / Test 4128 


In [10]:
# Shuffle before split.

indices = torch.randperm(data.size(0))
x = torch.index_select(x, dim=0, index=indices)
y = torch.index_select(y, dim=0, index=indices)

# Split train, valid and test set with each count.
x = list(x.split(cnts, dim=0)) # tensor.split([list]) # list index대로 쪼개준다.
y = y.split(cnts, dim=0)


for x_i, y_i in zip(x, y):
    print(x_i.size(), y_i.size())

torch.Size([12384, 8]) torch.Size([12384, 1])
torch.Size([4128, 8]) torch.Size([4128, 1])
torch.Size([4128, 8]) torch.Size([4128, 1])


In [11]:
indices

tensor([ 9483,  6374, 18088,  ...,  7965, 17983, 10515])

# Preprocessing

In [12]:
scaler = StandardScaler()
scaler.fit(x[0].numpy()) # 반드시 train data에 대해서만 fit을 진행해야 함.

In [14]:
x[0] = torch.from_numpy(scaler.transform(x[0].numpy())).float()
x[1] = torch.from_numpy(scaler.transform(x[1].numpy())).float()
x[2] = torch.from_numpy(scaler.transform(x[2].numpy())).float()

df = pd.DataFrame(x[0].numpy(), columns=california.feature_names)
df.tail()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
12379,-2.423824,-2.333177,-2.250677,-2.636867,-1.27659,-0.245926,-16.055618,59.396297
12380,-2.198083,-2.213493,-2.441204,-2.853574,-1.276878,-0.232682,-16.992752,60.148212
12381,-2.029637,-2.263886,-2.373974,-2.976616,-1.27654,-0.247103,-16.292646,59.275993
12382,-1.980859,-2.175698,-2.155669,-2.754537,-1.277268,-0.254385,-16.744751,60.005352
12383,-1.996772,-2.219792,-2.306451,-3.067005,-1.277188,-0.255304,-16.329956,59.268475


# Build Model & Optimizer

In [28]:
model = nn.Sequential(
    nn.Linear(x[0].size(-1), 6),
    nn.LeakyReLU(),
    nn.Linear(6, 5),
    nn.LeakyReLU(),
    nn.Linear(5, 4),
    nn.LeakyReLU(),
    nn.Linear(4, 3),
    nn.LeakyReLU(),
    nn.Linear(3, y[0].size(-1)),
)

model

Sequential(
  (0): Linear(in_features=8, out_features=6, bias=True)
  (1): LeakyReLU(negative_slope=0.01)
  (2): Linear(in_features=6, out_features=5, bias=True)
  (3): ReLU()
  (4): Linear(in_features=5, out_features=4, bias=True)
  (5): ReLU()
  (6): Linear(in_features=4, out_features=3, bias=True)
  (7): ReLU()
  (8): Linear(in_features=3, out_features=1, bias=True)
)

In [29]:
optimizer = optim.Adam(model.parameters())

In [30]:
# figures
n_epochs = 10000
batch_size = 1024
print_interval = 100

In [31]:
from copy import deepcopy

lowest_loss = np.inf
best_model = None

early_stop = 500
lowest_epoch = np.inf 

In [32]:
train_history, valid_history = [], []

for i in range(n_epochs):
    # Shuffle before mini-batch split.
    indices = torch.randperm(x[0].size(0))
    x_ = torch.index_select(x[0], dim=0, index=indices)
    y_ = torch.index_select(x[0], dim=0, index=indices)
    
    # |x_| = (train_data_size, input_dim)
    # |y_| = (train_data_size, output_dim)
    
    x_ = x_.split(batch_size, dim=0)
    y_ = y_.split(batch_size, dim=0)
    
    train_loss, valid_loss = 0, 0
    y_hat = []
    
    for x_i, y_i in zip(x_, y_):
        y_hat_i = model(x_i)
        loss = F.mse_loss(y_hat_i, y_i)
        
        optimizer.zero_grad()
        loss.backward()
        
        optimizer.step()
        train_loss += float(loss)
    
    train_loss = train_loss / len(x_)
    
    # You need to declare to PYTORCH to stop build the compuation graph.
    # validation 
    with torch.no_grad(): # gradient를 사용하지 말 것. # 파라미터를 하지 않음
        # You don't need to shuffle the validation set.
        # Only split is needed.
        x_ = x[1].split(batch_size, dim=0)
        y_ = y[1].split(batch_size, dim=0)
        
        valid_loss = 0
        
        # validation_size/batch_size 만큼 반복
        for x_i, y_i in zip(x_, y_):
            y_hat_i = model(x_i)
            loss = F.mse_loss(y_hat_i, y_i)

            valid_loss += loss
            y_hat += [y_hat_i]
    
    valid_loss = valid_loss / len(x_)
    
    # Log each loss to plot after training is done.
    train_history += [train_loss]
    valid_history += [valid_loss]
    
    if (i + 1) % print_interval == 0:
        print(f"Epoch {i+1}: train loss={train_loss:.5f}, valid_loss={valid_loss:.5f}, lowest_loss={lowest_loss:.5f}")
    if valid_loss <= lowest_loss:
        lowest_loss = valid_loss
        lowest_epoch = i
        
        # 'state_dic()' return model weights as key-value
        # Take a deep copy, if the valid loss is lowest ever.
        best_model = deepcopy(model.state_dict())
    else:
        if early_stop > 0 and lowest_epoch + early_stop < i + 1:
            print("There is no improvement during last {} epochs.".format(early_stop))
            break

print("The best validation loss from epoch {} : {:4f}".format(lowest_epoch + 1, lowest_loss))

# Load best epoch's model.
model.load_state_dict(best_model)

  loss = F.mse_loss(y_hat_i, y_i)
  loss = F.mse_loss(y_hat_i, y_i)


Epoch 100: train loss=469.53219, valid_loss=3.86609, lowest_loss=3.86828
Epoch 200: train loss=469.59066, valid_loss=3.62598, lowest_loss=3.62695
Epoch 300: train loss=469.64407, valid_loss=3.55613, lowest_loss=3.55699
Epoch 400: train loss=469.49367, valid_loss=3.48160, lowest_loss=3.48213
Epoch 500: train loss=469.60239, valid_loss=3.43616, lowest_loss=3.43632
Epoch 600: train loss=469.63051, valid_loss=3.42937, lowest_loss=3.42871
Epoch 700: train loss=469.55365, valid_loss=3.43824, lowest_loss=3.42871
Epoch 800: train loss=469.46285, valid_loss=3.45215, lowest_loss=3.42871
Epoch 900: train loss=469.54902, valid_loss=3.47184, lowest_loss=3.42871
Epoch 1000: train loss=469.58745, valid_loss=3.49615, lowest_loss=3.42871
There is no improvement during last 500 epochs.
The best validation loss from epoch 582 : 3.428708


<All keys matched successfully>