In [None]:
from utils import *

# Single seed reused for every random source in this notebook
# (torch and NumPy here, train_test_split via random_state= further down).
RANDOM_STATE = 2001

# Seed NumPy's global RNG as well, so NumPy-based randomness is repeatable
# across "Restart & Run All".
np.random.seed(RANDOM_STATE)
# Trailing ';' suppresses the Generator repr that manual_seed returns.
torch.manual_seed(RANDOM_STATE);

# Data preprocessing and loading for fast training

## Load source data

In [None]:
# Columns carried through the pipeline: Stellensubart_1..4, T1..T34, the
# per-row attributes, and the Qid / GJ keys used for filtering and splitting.
CONST_COLUMNS = [
    "Stellensubart_1", "Stellensubart_2", "Stellensubart_3", "Stellensubart_4",
    *[f"T{i}" for i in range(1, 35)],
    "Preis", "Beleuchtet", "Laenge", "Breite", "Eigenfläche", "PPSVACWert",
    "Qid", "GJ",
]

# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
data = pd.read_pickle("data/df_gesamt_15_08_prepocessed_einworner_added.pkl")

# Qids that have a row with GJ == 2023; only those are kept.
target_year_having_qid = data.loc[data.GJ == 2023, "Qid"]

# Keep every row of those Qids, order by (Qid, GJ), index by Qid while also
# keeping Qid as a column, then restrict to the modelling columns.
cleaned_data = (
    data[data.Qid.isin(target_year_having_qid)]
    .sort_values(by=["Qid", "GJ"])
    .set_index("Qid", drop=False)
    .loc[:, CONST_COLUMNS]
)

## Load distance data

In [None]:
def _read_distances(csv_path):
    """Read one yearly distance CSV and index it by its ``Qid1`` column."""
    return pd.read_csv(csv_path).set_index("Qid1")


# One distance table per year, 2018 through 2022.
distance_2018 = _read_distances("./data/distances/distances_2018.csv")
distance_2019 = _read_distances("./data/distances/distances_2019.csv")
distance_2020 = _read_distances("./data/distances/distances_2020.csv")
distance_2021 = _read_distances("./data/distances/distances_2021.csv")
distance_2022 = _read_distances("./data/distances/distances_2022.csv")

## Normalize columns

In [None]:
# NOTE(review): this cell mutates cleaned_data in place. Re-running it rescales
# Laenge / Breite a second time, and the mean/std are taken over all rows,
# i.e. before the train/validation split further down.

# Z-score Preis and PPSVACWert using the whole frame's mean and std.
for zscore_col in ("Preis", "PPSVACWert"):
    col_values = cleaned_data[zscore_col]
    cleaned_data[zscore_col] = (col_values - col_values.mean()) / col_values.std()

# Affine rescale: Laenge -180 -> 0 and +180 -> 1; Breite -90 -> 0 and +90 -> 1
# (presumably longitude / latitude in degrees - confirm).
cleaned_data["Laenge"] = (cleaned_data["Laenge"] + 180.0) / 360.0
cleaned_data["Breite"] = (cleaned_data["Breite"] + 90.0) / 180.0

In [None]:
# Turn the -1 entries of columns T1..T22 into 0.
# NOTE(review): T23..T34 are left untouched - confirm that is intended.
t_columns = cleaned_data.loc[:, "T1":"T22"]
cleaned_data.loc[:, "T1":"T22"] = t_columns.replace(-1, 0)

## Split data into years

In [None]:
# One frame per year (GJ 2018..2023), with the Qid / GJ key columns removed.
# Qid remains available as the index of each frame.
data_2018, data_2019, data_2020, data_2021, data_2022, data_2023 = (
    cleaned_data[cleaned_data.GJ == year].drop(columns=["Qid", "GJ"]).copy()
    for year in range(2018, 2024)
)

## Calculate label

In [None]:
# Label = row-wise mean of columns T1..T22 of the 2023 frame.
# NOTE(review): data_2023 is also an element of all_year_data, so this extra
# column is visible there too - confirm preprocess_data does not feed it in
# as a feature.
data_2023["target"] = data_2023.loc[:, "T1":"T22"].mean(axis="columns")
# Currently disabled: z-score the target.
# data_2023["target"] = (data_2023["target"] - data_2023["target"].mean()) / data_2023["target"].std()

## Train / Validation split

In [None]:
# Split on the Qid level (90 % train / 10 % validation), so all years of one
# Qid end up on the same side of the split.
all_valid_qids = cleaned_data["Qid"].unique()
qid_train, qid_val = train_test_split(
    all_valid_qids,
    random_state=RANDOM_STATE,
    test_size=0.1,
)

## Data Loading

In [None]:
# Yearly feature frames, oldest first (2018..2023).
all_year_data = [data_2018, data_2019, data_2020, data_2021, data_2022, data_2023]

# Yearly distance tables, oldest first (2018..2022; there is none for 2023).
all_year_distances = [
    distance_2018, distance_2019, distance_2020, distance_2021, distance_2022
]

### Train data

In [None]:
# Build the training tensors for the training Qids and persist them to disk.
X_train, x_train, y_train = preprocess_data(
    qid_train, all_year_data, all_year_distances, data_2023
)

for train_tensor, train_path in (
    (X_train, "./data/proof_of_concept/X_train_lstm_year_price.pt"),
    (x_train, "./data/proof_of_concept/x_train_lstm_year_price.pt"),
    (y_train, "./data/proof_of_concept/y_train_lstm_year_price.pt"),
):
    torch.save(train_tensor, train_path)

### Validation data

In [None]:
# Build the validation tensors for the held-out Qids and persist them to disk.
X_val, x_val, y_val = preprocess_data(
    qid_val, all_year_data, all_year_distances, data_2023
)

for val_tensor, val_path in (
    (X_val, "./data/proof_of_concept/X_val_lstm_year_price.pt"),
    (x_val, "./data/proof_of_concept/x_val_lstm_year_price.pt"),
    (y_val, "./data/proof_of_concept/y_val_lstm_year_price.pt"),
):
    torch.save(val_tensor, val_path)

### Test data

In [None]:
# Hand-picked Qids used as a small "test" set.
# NOTE(review): qid_train and qid_val together cover every Qid in cleaned_data,
# so each of these Qids is also in the train or validation split (or absent
# from the data altogether) - this is not an independent hold-out set.
test_qids = [9860, 9673, 9855, 9333]
X_test, x_test, y_test = preprocess_data(
    test_qids, all_year_data, all_year_distances, data_2023
)

for test_tensor, test_path in (
    (X_test, "./data/proof_of_concept/X_test_lstm_year_price.pt"),
    (x_test, "./data/proof_of_concept/x_test_lstm_year_price.pt"),
    (y_test, "./data/proof_of_concept/y_test_lstm_year_price.pt"),
):
    torch.save(test_tensor, test_path)

# Preparation for training

In [None]:
# Training hyper-parameters: mini-batch size for both DataLoaders, and the
# AdamW step size and weight decay.
BATCH_SIZE, LEARNING_RATE, WEIGHT_DECAY = 64, 3e-4, 1e-4

In [None]:
# Element-wise L1 (no reduction) feeds the relative training loss; the
# mean-reduced L1 is the validation metric.
criterion = nn.L1Loss(reduction="none")
criterion_val = nn.L1Loss()

# 44 equals the number of CONST_COLUMNS left after dropping Qid and GJ
# (presumably the input size - confirm the argument order against LSTMModel).
model = LSTMModel(44, 2048, 16, 1).to(DEVICE)

In [None]:
# AdamW over all model parameters, configured from the hyper-parameter cell.
optimizer = torch.optim.AdamW(
    params=model.parameters(),
    lr=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
)

In [None]:
def relative_criterion(y_true, y_pred, eps=1/(4*22)):
    """Mean absolute error relative to the (floored) first argument.

    Computes ``mean(|y_true - y_pred| / max(y_true, eps))`` element-wise.
    The absolute error is what ``nn.L1Loss(reduction='none')`` returns for
    same-shaped inputs. Everything stays on the inputs' own device, so no
    module-level ``criterion`` or ``DEVICE`` is needed.

    Args:
        y_true: Tensor used both as one side of the absolute difference and
            as the denominator. NOTE(review): the caller (``train``) is not
            visible here - confirm it passes the labels first; otherwise the
            error is scaled by the prediction instead of the label.
        y_pred: Tensor compared against ``y_true`` (same shape).
        eps: Lower bound applied to the denominator so that zero or very
            small ``y_true`` values do not blow the ratio up.

    Returns:
        A 0-dim tensor holding the mean relative absolute error.
    """
    abs_error = torch.abs(y_true - y_pred)
    # clamp takes a Python scalar bound, so no CPU eps tensor is allocated.
    denominator = torch.clamp(y_true, min=eps)
    return (abs_error / denominator).mean()

In [None]:
# Tensor files written by the preprocessing cells, in (X, x, y) order.
train_paths = [
    "./data/proof_of_concept/X_train_lstm_year_price.pt",
    "./data/proof_of_concept/x_train_lstm_year_price.pt",
    "./data/proof_of_concept/y_train_lstm_year_price.pt",
]
val_paths = [
    "./data/proof_of_concept/X_val_lstm_year_price.pt",
    "./data/proof_of_concept/x_val_lstm_year_price.pt",
    "./data/proof_of_concept/y_val_lstm_year_price.pt",
]

# The extra options are passed for the training set only; the validation
# set is built with CustomDataset's defaults.
train_dataset = CustomDataset(
    *train_paths,
    upsample=True,
    distort_prob=0.1,
    smooth_labels=True,
    shuffle_neighbours=True,
)
val_dataset = CustomDataset(*val_paths)

# Same batching for both loaders; only the training loader shuffles.
loader_options = dict(batch_size=BATCH_SIZE, drop_last=False)
train_dataloader = DataLoader(train_dataset, shuffle=True, **loader_options)
val_dataloader = DataLoader(val_dataset, shuffle=False, **loader_options)

print(f"Total training instances:\t{train_dataset.data_X.shape[0]}")
print(f"Total validation instances:\t{val_dataset.data_X.shape[0]}")

# Training

In [None]:
num_epochs = 200
history = []  # one [train_loss, val_loss] pair is appended per epoch
val_min_loss = float("+inf")  # best validation loss seen so far

# Keep prompting until a non-empty model name is entered; it becomes the
# checkpoint file prefix. NOTE(review): input() blocks an unattended
# "Run All" - consider a constant in the config cell.
while True:
    model_name = input("Input proper model name:\t")
    if model_name != "":
        break

In [None]:
# One pass of train + validate per epoch. The best checkpoint (lowest
# validation loss) is written whenever the validation loss improves.
for epoch in range(num_epochs):
    mean_train_loss = train(model, train_dataloader, relative_criterion, optimizer)
    mean_val_loss = validate(model, val_dataloader, criterion_val)
    history.append([mean_train_loss, mean_val_loss])

    # Update the running best before reporting, so that "Best Val loss"
    # includes the current epoch rather than showing inf on the first epoch
    # and lagging one improvement behind afterwards.
    improved = mean_val_loss < val_min_loss
    if improved:
        val_min_loss = mean_val_loss

    print(
        f"Epoch [{epoch+1}/{num_epochs}]\nTrain Loss: {round(mean_train_loss, 4)}\nVal Loss: {round(mean_val_loss, 4)}"
        f"\nBest Val loss: {round(val_min_loss, 4)}"
    )

    if improved:
        print("Saving best model....")
        torch.save(model.state_dict(), f"./models/{model_name}_best.pt")

In [None]:
# Also keep the weights from the final epoch, next to the best checkpoint.
last_weights = model.state_dict()
torch.save(last_weights, f"./models/{model_name}_last.pt")

In [None]:
# Restore the best-validation checkpoint written by the training loop; the
# load_state_dict result is the cell's displayed output.
best_checkpoint_path = f"./models/{model_name}_best.pt"
best_weights = torch.load(best_checkpoint_path)
model.load_state_dict(best_weights)

In [None]:
# history holds one [train_loss, val_loss] pair per epoch, so plotting it
# draws one line per column.
loss_curves = plt.plot(history)
plt.legend(loss_curves, ["train", "val"])

# Evaluation

In [None]:
# Evaluate on the full validation set in one forward pass.
model.eval()

X_val = val_dataset.data_X
x_val = val_dataset.data_x
y_val = val_dataset.data_y

# Inference only: disable autograd so no graph is built for the whole
# validation set. The output then needs no .detach() before .numpy().
with torch.no_grad():
    predictions = model(X_val.to(DEVICE), x_val.to(DEVICE)).cpu().numpy()
ground_truth = y_val.numpy()

### MAE

In [None]:
# Mean absolute error on the validation set. Both arrays are flattened first
# (as the plotting cells below do), so an (N, 1) prediction and an (N,) label
# array cannot silently broadcast to an (N, N) difference matrix.
mae = np.abs(predictions.flatten() - ground_truth.flatten()).mean()
print(mae.round(4))

### $R^{2}$

In [None]:
# R^2 on the validation set; the bare expression is the cell's displayed
# output. NOTE(review): r2_score arrives via `from utils import *` -
# presumably sklearn.metrics.r2_score; confirm.
r2_score(ground_truth, predictions)

## Kernel Density Estimation plot

In [None]:
# Overlay the estimated densities of the labels and the predictions; the
# curves are drawn in the same order as the legend entries.
for values in (ground_truth, predictions):
    sns.kdeplot(values.flatten())
plt.legend(["Ground truth", "Prediction"])
plt.show()

## Scatter plot of ground truth vs. predictions

In [None]:
# Span of the y = x reference line: the joint range of both arrays.
lower_bound = min(predictions.min(), ground_truth.min())
upper_bound = max(predictions.max(), ground_truth.max())
x_values = np.linspace(lower_bound, upper_bound, 1_000)
y_values = x_values

plt.figure(figsize=(8, 6))

# Each point is one validation sample; points on the dashed diagonal are
# predicted exactly.
sns.scatterplot(x=ground_truth.flatten(), y=predictions.flatten())
plt.plot(x_values, y_values, color="red", linestyle="dashed", label="y = x")

plt.title("Scatter plot - Predictions vs. Ground Truth")
plt.xlabel("Ground Truth")
plt.ylabel("Predictions")
plt.legend()
plt.show()