# Import libraries

In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import pandas as pd

In [2]:
use_cuda = torch.cuda.is_available()
device = torch.device("cuda")

# Load data

In [5]:
data = pd.read_csv("../data/train_set_scaled.csv", sep="\t", index_col=0)
data = data.rename(columns={"Log price (1 billion VND)": "Log Price", 'Log area (square meters)': "Log Area", '(Log) Bedrooms' : "Log Bedrooms", '(Log) WC': "Log WC", '(Log) Number of floors': "Log Floors"})

In [6]:
data.describe()

Unnamed: 0,Log Price,Log Area,Log Bedrooms,Log WC,Parking,For rent,Infrastructure,Facade,Log Floors,Latitude,Longitude,Cluster,Distance to center 0,Distance to center 1
count,4764.0,4764.0,4764.0,4764.0,4764.0,4764.0,4764.0,4764.0,4764.0,4764.0,4764.0,4764.0,4764.0,4764.0
mean,-0.001738,-4e-06,0.000266,-0.000957,0.003535,-0.01346,0.014805,-0.001661,0.005422,0.005194,0.009009,0.001927,0.005168,-0.005484
std,0.999911,0.995137,0.998461,0.999261,1.000343,0.979637,0.993868,0.999346,0.998988,1.001049,1.010348,1.000953,1.001016,1.001752
min,-2.492482,-5.430634,-3.161024,-2.674523,-0.933208,-0.303083,-1.494289,-0.643026,-2.025287,-1.03357,-4.670233,-0.651906,-0.734891,-1.574407
25%,-0.618843,-0.668842,-0.351505,-1.123405,-0.933208,-0.303083,-1.494289,-0.643026,-0.698957,-0.685002,-1.098191,-0.651906,-0.693775,-1.540527
50%,-0.045966,-0.086254,0.384194,0.427712,-0.933208,-0.303083,0.669215,-0.643026,0.076897,-0.674515,0.237872,-0.651906,-0.681493,0.671649
75%,0.574668,0.584781,0.384194,0.427712,1.071573,-0.303083,0.669215,1.555148,0.627374,1.522144,0.321129,1.533962,1.521764,0.682305
max,2.624948,6.825509,6.573883,5.988411,1.071573,3.299428,0.669215,1.555148,6.898158,1.821403,4.718773,1.533962,1.900324,1.043507


In [19]:
from sklearn.model_selection import train_test_split

train, valid = train_test_split(data, test_size=0.2, random_state=42)

# Define model

The architecture is defined as:
- FFN (Fully connected network): 100 neurons
- FFN: 100 neurons
- FFN: 1 neuron

All with ReLU activation function

In [20]:
model = torch.nn.Sequential()
model.add_module('W0', nn.Linear(data.shape[1] - 1, 100))
model.add_module("relu0", nn.ReLU())
model.add_module('W1', nn.Linear(100, 100))
model.add_module("relu1", nn.ReLU())
model.add_module('W2', nn.Linear(100, 1))

# Define the Early Stopping

In [21]:
class EarlyStopping:
    def __init__(self, patience=7, verbose=False, delta=0, model_path='model.pth'):
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_loss_min = float('inf')
        self.delta = delta
        self.model_path = model_path

    def __call__(self, val_loss, model, model_path):
        score = -val_loss

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            self.counter += 1
            if self.verbose:
                print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        if self.verbose:
            print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), self.model_path)
        self.val_loss_min = val_loss

# Define DataLoader 

In [22]:
class CustomDataset(Dataset):
    def __init__(self, dataframe:pd.DataFrame, labels:pd.DataFrame) -> None:
        self.data = torch.tensor(dataframe.values, dtype=torch.float32)
        self.labels = torch.tensor(labels.values, dtype=torch.float32)

    def __len__(self) -> int:
        return len(self.data)

    def __getitem__(self, idx) -> tuple:
        return self.data[idx], self.labels[idx]

In [23]:
train_dataset = CustomDataset(dataframe=train.drop(columns=["Log Price"]), labels=train[["Log Price"]])
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)

In [24]:
valid_dataset = CustomDataset(dataframe=valid.drop(columns=["Log Price"]), labels=valid[["Log Price"]])
valid_dataloader = DataLoader(valid_dataset, batch_size=32, shuffle=False)

# Training loop

In [25]:
early_stopping = EarlyStopping(patience=10, verbose=True, delta = 0.0001, model_path='model.pth')
num_epochs = 1000
loss_fn = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [26]:
model = model.to(device)

for epoch in range(num_epochs):
    running_loss = 0.0
    for inputs, labels in train_dataloader:
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()

        outputs = model(inputs)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print(f'Epoch {epoch+1}, Loss: {running_loss/len(train_dataloader)}')

    for inputs, labels in valid_dataloader:
        inputs, labels = inputs.to(device), labels.to(device)

        outputs = model(inputs)
        loss = loss_fn(outputs, labels)

        running_loss += loss.item()
    early_stopping(running_loss, model, model_path='model.pth')
    if early_stopping.early_stop:
        print("Early stopping")
        break

print("Done training")

Epoch 1, Loss: 0.6505088277161122
Validation loss decreased (inf --> 96.346773).  Saving model ...
Epoch 2, Loss: 0.5913411267101765
Validation loss decreased (96.346773 --> 88.730482).  Saving model ...
Epoch 3, Loss: 0.5792119644582272
Validation loss decreased (88.730482 --> 87.877509).  Saving model ...
Epoch 4, Loss: 0.5731042571365833
Validation loss decreased (87.877509 --> 87.008668).  Saving model ...
Epoch 5, Loss: 0.565469708542029
Validation loss decreased (87.008668 --> 85.901856).  Saving model ...
Epoch 6, Loss: 0.5533758183320363
Validation loss decreased (85.901856 --> 85.003110).  Saving model ...
Epoch 7, Loss: 0.5458584981039166
Validation loss decreased (85.003110 --> 83.087383).  Saving model ...
Epoch 8, Loss: 0.5438173890113831
Validation loss decreased (83.087383 --> 82.991355).  Saving model ...
Epoch 9, Loss: 0.5457047405342261
EarlyStopping counter: 1 out of 10
Epoch 10, Loss: 0.5329261425882578
Validation loss decreased (82.991355 --> 81.511733).  Saving mo

# Evaluate the model

In [27]:
# how to load model from .pth

model.load_state_dict(torch.load('model.pth'))
model.eval()

test_data = pd.read_csv("../data/test_set_scaled.csv", sep="\t", index_col=0)
test_data = test_data.rename(columns={"Log price (1 billion VND)": "Log Price", 'Log area (square meters)': "Log Area", '(Log) Bedrooms' : "Log Bedrooms", '(Log) WC': "Log WC", '(Log) Number of floors': "Log Floors"})

test_dataset = CustomDataset(dataframe=test_data.drop(columns=["Log Price"]), labels=test_data[["Log Price"]])
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

test_loss = 0.0
for inputs, labels in test_dataloader:
    inputs, labels = inputs.to(device), labels.to(device)

    outputs = model(inputs)
    loss = loss_fn(outputs, labels)

    test_loss += loss.item()

print(f'Test loss: {test_loss/len(test_dataloader)}')

Test loss: 0.433526189135225
