In [None]:
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils import data
from torch.utils.data import DataLoader, TensorDataset, SubsetRandomSampler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from pandas import DataFrame
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data

In [None]:
# Load the training table from the CSV that sits next to the notebook.
raw_data = pd.read_csv(filepath_or_buffer='./data/train.csv')

In [None]:
# Bare expression: the notebook renders the loaded frame as this cell's output.
raw_data

In [None]:
# Numeric columns that are left out of the model inputs:
#   - 'Id' is a row identifier,
#   - the other three were marked by the original author as "Nan columns".
# 'SalePrice' is intentionally kept here; it is split off as the target in a
# later cell.
excluded_numeric_columns = ['Id', 'LotFrontage', 'MasVnrArea', 'GarageYrBlt']

# Index.drop raises KeyError if any of the labels is missing, just like the
# one-at-a-time drops would.
numerical_columns = list(
    raw_data.select_dtypes(np.number).columns.drop(excluded_numeric_columns)
)

# Every object-dtype column is treated as a categorical / string feature.
string_columns = list(raw_data.select_dtypes(include=['object']).columns)

In [None]:
# Keep only fully populated columns, then only fully populated rows.
# After the column-wise pass no NaN can remain, so the row-wise pass removes
# nothing; it is kept as a safety net and leaves both frames with the same rows.
numerical_df = (
    raw_data[numerical_columns]
    .dropna(axis=1, how='any')
    .dropna(axis=0, how='any')
)
string_df = (
    raw_data[string_columns]
    .dropna(axis=1, how='any')
    .dropna(axis=0, how='any')
)

# Refresh the column lists so they only name columns that survived the filter.
numerical_columns = list(numerical_df.columns)
string_columns = list(string_df.columns)

In [None]:
# Per-column statistics, kept as plain dicts so predictions can later be mapped
# back to the original scale.
data_means = {col: numerical_df[col].mean() for col in numerical_df}
data_maxs = {col: numerical_df[col].max() for col in numerical_df}
data_mins = {col: numerical_df[col].min() for col in numerical_df}

# Mean-centre every column and scale it by its range (max - min).
# NOTE(review): the statistics are computed on the full table, i.e. before the
# train/validation split performed further down — confirm this is intended.
column_center = numerical_df.mean()
column_range = numerical_df.max() - numerical_df.min()
normalized_numerical_df = numerical_df.sub(column_center, axis='columns').div(column_range, axis='columns')


In [None]:
# Split the normalised table into model inputs (x) and the regression target (y).
numerical_y_columns = ['SalePrice']

# Rebuild the feature list instead of calling list.remove(): the in-place
# removal raised ValueError whenever this cell was executed a second time.
# The resulting list is the same as before (all numeric columns but the target).
numerical_columns = [col for col in numerical_columns if col not in numerical_y_columns]

normalized_numerical_x_df = normalized_numerical_df[numerical_columns]
normalized_numerical_y_df = normalized_numerical_df[numerical_y_columns]

# x has shape (rows, features); y has shape (rows, 1) because it is selected
# with a list of column names.
normalized_numerical_x = torch.tensor(normalized_numerical_x_df.values, dtype=torch.float)
normalized_numerical_y = torch.tensor(normalized_numerical_y_df.values, dtype=torch.float)

In [None]:
# Map every distinct categorical value to an integer id. Ids are keyed by the
# value alone, so equal strings that occur in different columns share one id.
node_indices = {}
for column in string_columns:
    for value in string_df[column].unique():
        node_indices.setdefault(value, len(node_indices))

# Edge construction.
# Contract (unchanged): for every column and every pair of rows i < j that hold
# the same value v, the edge list receives [id(v), id(v)] twice, ordered by i.
#
# The previous implementation walked all row pairs in Python and looked cells
# up by index *label* (string_df[column][i]), which is quadratic in the number
# of rows and breaks on a non-default index. The same edges, in the same
# order, are produced here in linear time per column: a row with k later rows
# sharing its value contributes 2 * k copies of [id(v), id(v)].
#
# NOTE(review): both endpoints of every edge are the id of the *same* value, so
# the graph consists only of self-loops on category ids, while the node
# features used later are indexed by sample row. Confirm whether row-to-row
# edges ([i, j]) were intended.
edge_blocks = []
for column in string_columns:
    node_ids = string_df[column].map(node_indices).to_numpy(dtype=np.int64)
    # cumcount(ascending=False) counts how many rows of the same group follow.
    later_matches = string_df.groupby(column, sort=False).cumcount(ascending=False).to_numpy()
    endpoints = np.repeat(node_ids, 2 * later_matches)
    edge_blocks.append(np.stack([endpoints, endpoints]))

if edge_blocks:
    # Shape (2, num_edges), dtype long, matching the previous .t().contiguous().
    edge_index = torch.from_numpy(np.concatenate(edge_blocks, axis=1)).contiguous()
else:
    # No categorical columns: an empty but correctly shaped edge index.
    edge_index = torch.empty((2, 0), dtype=torch.long)

# num_nodes = len(node_indices)
# edge_index = []

# for i in range(num_nodes):
#     for j in range(num_nodes):
#         if i != j:
#             edge_index.append([i, j])

# edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()

In [None]:
# Fraction of samples used for training; the remainder is used for validation.
train_ratio = 0.1
train_size = int(train_ratio * len(normalized_numerical_x))

# Shuffle with a dedicated, seeded generator so the split is identical on every
# Restart & Run All (the unseeded permutation changed the split on each run).
split_generator = torch.Generator().manual_seed(42)
indices = torch.randperm(len(normalized_numerical_x), generator=split_generator)
train_indices = indices[:train_size]
val_indices = indices[train_size:]

x_train, x_val = normalized_numerical_x[train_indices], normalized_numerical_x[val_indices]
y_train, y_val = normalized_numerical_y[train_indices], normalized_numerical_y[val_indices]

# NOTE(review): this selects *columns of edge_index* (i.e. edges) at the
# positions given by the sampled row indices; it does not restrict the graph to
# the training samples. Confirm the intended relation between edges and rows.
edge_index_train = edge_index[:, train_indices]

In [None]:
class GNNLinearRegression(nn.Module):
    """Stack of GCNConv layers that outputs one scalar per node.

    Args:
        num_features_list: Sequence whose first element is the number of input
            features per node. Only element 0 is read.
        hidden_channels_list: Output width of each hidden layer.
        num_layers: Total number of GCNConv layers, counting the final
            1-channel output layer. ``num_layers - 2`` hidden-to-hidden layers
            are created between the input layer and the output layer.
    """

    def __init__(self, num_features_list, hidden_channels_list, num_layers):
        super().__init__()

        # Input layer: raw features -> first hidden width.
        layers = [GCNConv(num_features_list[0], hidden_channels_list[0])]

        # Hidden-to-hidden layers chain consecutive hidden widths.
        for k in range(1, num_layers - 1):
            layers.append(GCNConv(hidden_channels_list[k - 1], hidden_channels_list[k]))

        # Output layer: last hidden width -> a single regression value.
        layers.append(GCNConv(hidden_channels_list[-1], 1))

        self.convs = nn.ModuleList(layers)

    def forward(self, x, edge_index):
        """Run all layers (ReLU after every layer except the last).

        Returns:
            A flattened 1-D tensor of the output layer's activations.
        """
        *hidden_layers, output_layer = self.convs
        for layer in hidden_layers:
            x = F.relu(layer(x, edge_index))
        return output_layer(x, edge_index).view(-1)

In [None]:
# Bundle the training tensors into a torch_geometric Data object.
# NOTE(review): the name `data` also shadows the `from torch.utils import data`
# import at the top of the notebook.
data = Data(x=x_train, edge_index=edge_index_train, y=y_train)

# Hyper-parameters (also printed on the loss plot below).
learning_rate = 0.003
num_features_list = [data.num_features, 16]
hidden_channels_list = [32, 16]
num_layers = len(hidden_channels_list) + 1
model = GNNLinearRegression(num_features_list, hidden_channels_list, num_layers)

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

num_epochs = 100
train_losses = []
val_losses = []

# The model returns a flat (N,) tensor while the targets are stored as (N, 1).
# Passing both to MSELoss as-is broadcasts them to an (N, N) matrix, so the
# loss compared every prediction with every target. Flatten the targets once
# so predictions and targets are compared element by element.
y_train_flat = data.y.reshape(-1)
y_val_flat = y_val.reshape(-1)

for epoch in range(num_epochs):
    # --- training step (full batch) ---
    model.train()
    out = model(data.x, data.edge_index)
    loss = criterion(out, y_train_flat)
    train_losses.append(loss.item())

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # --- validation step ---
    # The validation features are evaluated with the same edge index that is
    # used for training.
    model.eval()
    with torch.no_grad():
        val_out = model(x_val, edge_index_train)
        val_loss = criterion(val_out, y_val_flat)
        val_losses.append(val_loss.item())

    # print(f'Epoch [{epoch + 1} / {num_epochs}], Train Loss: {loss.item():.4f}, Val Loss: {val_loss.item():.4f}')

In [None]:
# Loss curves for the run above, drawn through the explicit figure/axes API.
fig, ax = plt.subplots()
epochs = range(num_epochs)
ax.plot(epochs, train_losses, label='Train Loss')
ax.plot(epochs, val_losses, label='Val Loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')

# Text box summarising the run configuration, anchored in axes coordinates.
run_summary = f'Edge: One-hot\nLearning rate: {learning_rate}\nFeatures: {num_features_list}\nChannels: {hidden_channels_list}\nTrain Ratio: {train_ratio}\nOptimizer: {str(type(optimizer).__name__)}\nModel: {model.__class__.__name__}'
ax.text(
    0.975, 0.75, run_summary,
    transform=ax.transAxes, ha='right', va='top', fontsize=12,
    bbox=dict(facecolor='white', alpha=0.8),
)

# Lowest loss reached by each curve and the epoch at which it occurred.
min_train_epoch = np.argmin(train_losses)
min_train_loss = np.min(train_losses)
min_val_epoch = np.argmin(val_losses)
min_val_loss = np.min(val_losses)


# if min_train_epoch < min_val_epoch:
#     train_annotation_pos = (min_train_epoch, min_train_loss - 0.01)
#     val_annotation_pos = (min_val_epoch, min_val_loss + 0.01)
# else:
#     train_annotation_pos = (min_train_epoch, min_train_loss + 0.01)
#     val_annotation_pos = (min_val_epoch, min_val_loss - 0.01)

# Label both minima directly at their data points.
arrow_style = dict(facecolor='black', arrowstyle='->')
ax.annotate(f'Min: {min_train_loss:.4f}', xy=(min_train_epoch, min_train_loss),
            arrowprops=arrow_style, fontsize=10)
ax.annotate(f'Min: {min_val_loss:.4f}', xy=(min_val_epoch, min_val_loss),
            arrowprops=arrow_style, fontsize=10)


ax.legend()
fig.tight_layout()
plt.show()

In [None]:
# test_data = pd.read_csv('./data/test.csv')

# test_numerical_df = test_data[numerical_columns]
# test_string_df = test_data[string_columns]

# test_numerical_df = test_numerical_df.fillna(test_numerical_df.mean())

# # Normalize the test data using the mean and range of the training data
# test_normalized_numerical_df = (test_numerical_df - numerical_df.mean()) / (numerical_df.max() - numerical_df.min())
# test_normalized_numerical_x_df = test_normalized_numerical_df[numerical_columns]

# test_normalized_numerical_x = torch.tensor(test_normalized_numerical_x_df.values, dtype=torch.float)

# # Create the edge index for test data based on string columns
# test_edge_index = []
# for column in string_columns:
#     for i in range(len(test_string_df)):
#         for j in range(i+1, len(test_string_df)):
#             if test_string_df[column][i] == test_string_df[column][j]:
#                 if test_string_df[column][i] in node_indices and test_string_df[column][j] in node_indices:
#                     test_edge_index.append([node_indices[test_string_df[column][i]], node_indices[test_string_df[column][j]]])
#                     test_edge_index.append([node_indices[test_string_df[column][j]], node_indices[test_string_df[column][i]]])

# test_edge_index = torch.tensor(test_edge_index, dtype=torch.long).t().contiguous()

# test_data = Data(x=test_normalized_numerical_x, edge_index=test_edge_index)

# model.eval()
# with torch.no_grad():
#     test_outputs = model(test_data.x, test_data.edge_index)

# test_predictions = test_outputs.numpy().flatten()
# test_predictions = test_predictions * (data_maxs['SalePrice'] - data_mins['SalePrice']) + data_means['SalePrice']

# submission_df = pd.DataFrame({'Id': test_data['Id'], 'SalePrice': test_predictions})
# submission_df.to_csv('./output/submission.csv', index=False)