# Dataloader

In [None]:
from utils import *
from tst import Transformer
from tqdm import tqdm

In [None]:
# Columns kept for modelling, in the order in which they are selected from the
# data. The last two entries ('Qid', 'GJ') are identifiers that are dropped
# again when the per-year frames are built.
CONST_COLUMNS = [
    'Stellensubart_1',
    'Stellensubart_2',
    'Stellensubart_3',
    'Stellensubart_4',
    # 'T1' .. 'T34'
    *[f"T{number}" for number in range(1, 35)],
    'Preis',
    'Beleuchtet',
    'Laenge',
    'Breite',
    # NOTE(review): this label looks like a mis-decoded 'Eigenfläche'. It is
    # left untouched because it has to match the DataFrame's column label
    # exactly -- confirm against the pickled data.
    'Eigenfl√§che',
    'PPSVACWert',
    'Qid',
    'GJ',
]

In [None]:
# Load the preprocessed dataset from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if its origin is trusted.
data = pd.read_pickle("data/df_gesamt_15_08_prepocessed_einworner_added.pkl")    

In [None]:
# Per-year distance tables, each indexed by its "Qid1" column. The files are
# read in chronological order and bound to one module-level name per year.
(
    distance_2018,
    distance_2019,
    distance_2020,
    distance_2021,
    distance_2022,
) = (
    pd.read_csv(csv_path).set_index("Qid1")
    for csv_path in (
        "./data/distances/distances_2018_truncated.csv",
        "./data/distances/distances_2019_truncated.csv",
        "./data/distances/distances_2020_truncated.csv",
        "./data/distances/distances_2021_truncated.csv",
        "./data/distances/distances_2022_truncated.csv",
    )
)

In [None]:
# Number of non-null "GJ" entries recorded for each Qid.
year_count = data.groupby(["Qid"])["GJ"].count()

In [None]:
# Keep only the Qids that have exactly 6 "GJ" entries (presumably one per year
# 2018-2023, the six years split out further below -- confirm). The copy
# decouples the result from `data`.
cleaned_data = data[data.Qid.isin(year_count[year_count == 6].index)].copy()

In [None]:
# Order rows by Qid, then GJ, and use Qid as the index while also keeping it
# as a regular column (drop=False).
cleaned_data = cleaned_data.sort_values(by=["Qid", "GJ"]).set_index("Qid", drop=False)

In [None]:
# Restrict the frame to the modelling columns, in CONST_COLUMNS order.
cleaned_data = cleaned_data.loc[:, CONST_COLUMNS]

In [None]:
# Standardise the three continuous columns (subtract the mean, divide by the
# standard deviation). The statistics are computed over all rows of
# cleaned_data.
for column_name in ("Laenge", "Breite", "PPSVACWert"):
    column_values = cleaned_data[column_name]
    cleaned_data[column_name] = (column_values - column_values.mean()) / column_values.std()

In [None]:
# Split cleaned_data into one independent frame per "GJ" value. The "Qid" and
# "GJ" columns are removed from each frame; Qid stays available as the index.
(
    data_2018,
    data_2019,
    data_2020,
    data_2021,
    data_2022,
    data_2023,
) = (
    cleaned_data[cleaned_data.GJ == year].drop(columns=["Qid", "GJ"]).copy()
    for year in (2018, 2019, 2020, 2021, 2022, 2023)
)

In [None]:
# In the 2023 frame, replace -1 by 0 in the columns "T1" through "T22"
# (label-based slice, both ends inclusive). Columns after "T22" are untouched.
# NOTE(review): presumably -1 marks a missing value here -- confirm.
data_2023.loc[:, "T1":"T22"] = data_2023.loc[:, "T1":"T22"].replace(-1, 0)

In [None]:
# Distinct Qids that survived the filtering above.
all_valid_qids = cleaned_data.Qid.unique()

In [None]:
# Hold out 5% of the Qids for validation; the fixed random_state makes the
# split reproducible.
# NOTE(review): train_test_split is presumably scikit-learn's, brought in by
# the star import from utils -- confirm.
qid_train, qid_val = train_test_split(all_valid_qids, test_size=0.05, random_state=666)

In [None]:
def _neighbour_ids(distances, qid):
    """Return the "Qid2" values listed for `qid` in `distances`, or None if the Qid is unusable.

    A Qid is unusable when it is absent from the table or matches at most one
    row. The Qid itself is removed from the returned values.
    """
    try:
        rows = distances.loc[qid]
    except KeyError:
        # No entry at all for this Qid: treat it like "too few neighbours".
        return None

    # A single matching row comes back as a Series instead of a DataFrame.
    if isinstance(rows, pd.Series) or rows.shape[0] <= 1:
        return None

    return rows[rows.Qid2 != qid].Qid2


def _year_data_point(features, qid, neighbour_ids, max_neigh):
    """Stack the Qid's own feature row on top of its zero-padded neighbour rows."""
    neighbour_rows = torch.from_numpy(features.loc[neighbour_ids.values].values)
    own_row = torch.from_numpy(features.loc[qid].values)

    # Pad only at the end of the row dimension so every year has `max_neigh`
    # neighbour rows; the feature dimension is left as is.
    padded_rows = pad(neighbour_rows, (0, 0, 0, max_neigh - neighbour_rows.shape[0]), "constant", 0)

    return torch.cat([own_row[None], padded_rows], dim=0)


def DataPreprocessor(qids):
    """Build model inputs and labels for the given Qids.

    Reads the module-level frames ``data_2018`` .. ``data_2023`` and
    ``distance_2018`` .. ``distance_2022``. For each Qid and each year
    2018-2022, the Qid's own feature row is stacked on top of the rows of its
    neighbours (zero-padded to ``MAX_NEIGH`` rows). Qids that are missing from
    a distance table, or that match at most one row in it, are skipped.

    NOTE(review): `pad` is presumably ``torch.nn.functional.pad`` from the
    star import -- confirm.

    Args:
        qids: Iterable of Qid labels to process.

    Returns:
        A tuple ``(X, x, y)`` of stacked tensors with one entry per kept Qid:
        ``X`` holds the five per-year neighbourhood blocks, ``x`` the Qid's
        2023 row without "PPSVACWert" and "T1".."T34", and ``y`` the mean of
        the 2023 columns "T1" through "T22".

    Raises:
        KeyError: If a Qid or one of its neighbours is missing from a yearly
            feature frame.
        RuntimeError: Raised by ``torch.stack`` when no Qid was kept.
    """
    MAX_NEIGH = 10

    # (distance table, feature frame) for every input year, in chronological order.
    yearly_sources = (
        (distance_2018, data_2018),
        (distance_2019, data_2019),
        (distance_2020, data_2020),
        (distance_2021, data_2021),
        (distance_2022, data_2022),
    )
    # Columns removed from the 2023 row before it is used as a model input.
    target_year_dropped = ["PPSVACWert", *[f"T{i}" for i in range(1, 35)]]

    X, x, y = [], [], []

    for qid in tqdm(qids):

        # Resolve the neighbours for all years first; give up on the first
        # year in which the Qid is unusable.
        neighbour_ids_by_year = []
        for distances, _ in yearly_sources:
            neighbour_ids = _neighbour_ids(distances, qid)
            if neighbour_ids is None:
                break
            neighbour_ids_by_year.append(neighbour_ids)

        if len(neighbour_ids_by_year) != len(yearly_sources):
            continue

        neighbours_features = torch.stack([
            _year_data_point(features, qid, neighbour_ids, MAX_NEIGH)
            for (_, features), neighbour_ids in zip(yearly_sources, neighbour_ids_by_year)
        ])

        self_data_2023 = torch.from_numpy(data_2023.loc[qid].drop(labels=target_year_dropped).values)
        label = torch.tensor(data_2023.loc[qid, "T1":"T22"].mean())

        X.append(neighbours_features)
        x.append(self_data_2023)
        y.append(label)

    X = torch.stack(X, dim=0)
    x = torch.stack(x, dim=0)
    y = torch.stack(y, dim=0)

    return X, x, y

In [None]:
class CustomDataset(torch.utils.data.Dataset):
    """Dataset over three tensors loaded from disk, with an optional random transform.

    Args:
        path_X: File holding the 4-D neighbourhood tensor (loaded with ``torch.load``).
        path_x: File holding the per-sample target-year features.
        path_y: File holding the per-sample labels.
        p: Probability with which ``__getitem__`` returns a transformed copy
            of the sample instead of the stored one.
    """

    def __init__(self, path_X, path_x, path_y, p=0.0):

        raw_X = torch.load(path_X)
        SH = raw_X.shape
        # Remember the stored per-sample layout; __transform needs it to
        # address individual features after the flattening below.
        self._item_shape = tuple(SH[1:])

        # NOTE(review): reshape only reinterprets the memory, it does not
        # permute axes. If the intent was one row per entry of axis 2 holding
        # that entry's axis-1 slices side by side, a permute(0, 2, 1, 3) would
        # be needed first. Left unchanged because changing the layout would
        # alter what an already-trained model sees -- confirm.
        self.data_X = raw_X.reshape(SH[0], SH[2], -1)

        self.data_x = torch.load(path_x)
        self.data_y = torch.load(path_y)

        self.p = p

    def __getitem__(self, index):
        """Return ``(X, x, y)`` for `index`, transformed with probability ``p``."""
        if torch.rand(1) < self.p:
            # Work on clones so the stored tensors are never modified.
            return self.__transform(self.data_X[index].clone(), self.data_x[index].clone(), self.data_y[index].clone())
        else:
            return self.data_X[index], self.data_x[index], self.data_y[index]

    def __len__(self):
        """Return the number of samples."""
        return len(self.data_X)

    def __transform(self, item_X, item_x, item_y, k=2):
        """Return the sample with ``item_x[-5]`` set to ``k`` times a maximum and a zero label.

        The maximum is taken over feature ``-6`` of row 0 across the first
        axis of the stored layout.
        NOTE(review): presumably this is the price column -- confirm.
        """
        # item_X arrives flattened by __init__; restore the stored layout
        # before indexing. (Indexing the flattened 2-D item with three indices
        # raised IndexError.)
        stored_layout = item_X.reshape(self._item_shape)
        max_price = stored_layout[:, 0, -6].max()
        item_x[-5] = k * max_price
        item_y = item_y * 0

        return item_X, item_x, item_y

In [None]:
# Build the training and validation tensors, then persist each of them.
X_train, x_train, y_train = DataPreprocessor(qid_train)
X_val, x_val, y_val = DataPreprocessor(qid_val)

for tensor, target_path in (
    (X_train, "./data/proof_of_concept/X_train.pt"),
    (x_train, "./data/proof_of_concept/x_train.pt"),
    (y_train, "./data/proof_of_concept/y_train.pt"),
    (X_val, "./data/proof_of_concept/X_val.pt"),
    (x_val, "./data/proof_of_concept/x_val.pt"),
    (y_val, "./data/proof_of_concept/y_val.pt"),
):
    torch.save(tensor, target_path)

In [None]:
# Build and persist a small test set from three hand-picked Qids.
X_test, x_test, y_test = DataPreprocessor([9860.0, 9673.0, 9855.0])

for tensor, target_path in (
    (X_test, "./data/proof_of_concept/X_test.pt"),
    (x_test, "./data/proof_of_concept/x_test.pt"),
    (y_test, "./data/proof_of_concept/y_test.pt"),
):
    torch.save(tensor, target_path)

In [None]:
class SimpleModel(nn.Module):
    """Regression model that fuses a Transformer encoding with target-year features.

    The Transformer output is flattened per sample, concatenated with a linear
    embedding of the target-year features and mapped to a single value.
    """

    def __init__(self):
        super().__init__()

        self.transformer = Transformer(
            d_input=220,
            d_model=1024,
            d_output=16,
            q=8,
            v=8,
            h=20,
            N=20,
            chunk_mode=None,
        )
        self.target_year_linear = nn.Linear(9, 16)
        # NOTE(review): 192 presumably equals 11 * 16 (flattened Transformer
        # output) + 16 (target-year embedding) -- confirm.
        self.intermediate_linear = nn.Linear(192, 32)
        self.prediction_head = nn.Linear(32, 1)
        self.activation = nn.ReLU()
        self.batchnorm = nn.BatchNorm1d(32)

    def forward(self, X, x):
        """Return one prediction per sample.

        Args:
            X: Input passed to the Transformer.
            x: Target-year features passed to ``target_year_linear``.

        Returns:
            The output of ``prediction_head`` (last dimension of size 1).
        """
        encoded = self.transformer(X)
        target_embedding = self.target_year_linear(x)

        # Collapse everything but the batch dimension.
        encoded = torch.reshape(encoded, (encoded.shape[0], -1))

        fused = torch.cat([encoded, target_embedding], dim=-1)

        hidden = self.intermediate_linear(fused)
        hidden = self.batchnorm(hidden)
        hidden = self.activation(hidden)

        return self.prediction_head(hidden)

In [None]:
# Training hyper-parameters.
BATCH_SIZE: int = 64           # samples per mini-batch in both data loaders
LEARNING_RATE: float = 3e-4    # optimiser learning rate and lower bound of the cyclic schedule

In [None]:
# Instantiate the model on the configured device, with a mean-squared-error
# loss and an RMSprop optimiser over all model parameters.
model = SimpleModel().to(device=DEVICE)
criterion = nn.MSELoss()
optimizer = torch.optim.RMSprop(model.parameters(), lr=LEARNING_RATE) 

In [None]:
# Datasets backed by the tensors saved during preprocessing.
train_dataset = CustomDataset("./data/proof_of_concept/X_train.pt", 
                              "./data/proof_of_concept/x_train.pt",
                              "./data/proof_of_concept/y_train.pt")

val_dataset = CustomDataset(  "./data/proof_of_concept/X_val.pt", 
                              "./data/proof_of_concept/x_val.pt",
                              "./data/proof_of_concept/y_val.pt")

# Training: reshuffle every epoch and keep every batch at full size.
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)

# Validation: evaluate every sample, in a fixed order. Dropping the last
# incomplete batch of a shuffled loader would ignore a different random subset
# each epoch (making epochs incomparable) and would produce zero batches when
# the validation set is smaller than BATCH_SIZE. The final batch may therefore
# be smaller than BATCH_SIZE.
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, drop_last=False)

In [None]:
# Train for a fixed number of epochs, validate after each one and keep the
# weights of the epoch with the lowest mean validation loss.
num_epochs = 50
history = []          # one [mean_train_loss, mean_val_loss] pair per epoch
val_min_loss = None   # best mean validation loss seen so far
model_name = input("Input proper model name:\t")

# The learning rate cycles between LEARNING_RATE and max_lr and is advanced
# once per batch. `verbose` is intentionally not passed: it is deprecated in
# recent PyTorch releases and its default already disables the printing.
scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer, 
                                              base_lr=LEARNING_RATE, 
                                              max_lr=0.003, 
                                              cycle_momentum=True,
                                              mode='triangular2')


for epoch in range(num_epochs):
    
    train_running_loss = 0.0
    model.train()
    
    for batch in tqdm(train_dataloader, position=0, leave=True):
        # Every data instance is an input + label pair
        X_train1 = batch[0].to(torch.float32).to(DEVICE)
        x_train1 = batch[1].to(torch.float32).to(DEVICE)
        y_train1 = batch[2].to(torch.float32).to(DEVICE)

        # Make predictions for this batch
        outputs = model(X_train1, x_train1)

        # Compute the loss and its gradients
        loss = criterion(outputs, y_train1.view(-1, 1))
        loss.backward()

        # Adjust learning weights
        optimizer.step()
        scheduler.step()
        
        # Zero your gradients for every batch!
        optimizer.zero_grad()

        # Gather data and report
        train_running_loss += loss.item()

    model.eval()
    val_running_loss = 0.
    
    with torch.no_grad():
        for batch in tqdm(val_dataloader, position=0, leave=True):

            # Every data instance is an input + label pair
            X_val1 = batch[0].to(torch.float32).to(DEVICE)
            x_val1 = batch[1].to(torch.float32).to(DEVICE)
            y_val1 = batch[2].to(torch.float32).to(DEVICE)

            # Make predictions for this batch
            outputs = model(X_val1, x_val1)
            # Compute the loss (no gradients are tracked here)
            loss = criterion(outputs, y_val1.view(-1, 1))
            
            # Gather data and report
            val_running_loss += loss.item()

    # Mean of the per-batch losses.
    mean_train_loss = train_running_loss/len(train_dataloader)
    mean_val_loss = val_running_loss/len(val_dataloader)
    
    # Checkpoint on every new best validation loss, including the very first
    # epoch; otherwise nothing would be saved if validation never improved
    # after epoch 1.
    if val_min_loss is None or mean_val_loss < val_min_loss:
        val_min_loss = mean_val_loss
        torch.save(model.state_dict(), f'./models/{model_name}.pth')
    
    
    history.append([mean_train_loss, mean_val_loss])
    print(f"Epoch [{epoch+1}/{num_epochs}]\nTrain Loss: {round(mean_train_loss, 4)}\nVal Loss: {round(mean_val_loss, 4)}")

In [None]:
# Restore model weights from a saved checkpoint.
# NOTE(review): the file name is hard-coded and independent of the
# `model_name` entered for training -- confirm this is the intended checkpoint.
model.load_state_dict(torch.load("./models/train_transformer_21_08.pth"))

In [None]:
# Run the model once over the complete validation set on the CPU.
model.eval().cpu()
X_val = val_dataset.data_X
x_val = val_dataset.data_x
y_val = val_dataset.data_y

# Inference only: disable autograd so no computation graph is built for the
# whole validation set. The predictions themselves are unchanged.
with torch.no_grad():
    preds = model(X_val.to(torch.float32), x_val.to(torch.float32)).detach().cpu().numpy()
ground_truth = (y_val).numpy()

In [None]:
# Overlay kernel density estimates of the labels (red) and of the predictions.
# The legend labels are given in plotting order.
sns.kdeplot(ground_truth, c="red")
sns.kdeplot(preds)
plt.legend(["ground_truth", "prediction"])
plt.show()

In [None]:
# Scatter plot of predictions against labels, with the identity line as a
# reference for a perfect prediction.
predicted = preds[..., 0]

# The identity line spans the joint value range of both series.
lower_bound = min(predicted.min(), ground_truth.min())
upper_bound = max(predicted.max(), ground_truth.max())
x_values = np.linspace(lower_bound, upper_bound, 100)
y_values = x_values

plt.figure(figsize=(8, 6))
sns.scatterplot(x=ground_truth, y=predicted)
plt.plot(x_values, y_values, color='red', linestyle='dashed', label='y = x')

plt.xlabel('Ground Truth')
plt.ylabel('Predictions')
plt.title('Scatter plot - Predictions vs. Ground Truth')

plt.legend()
plt.show()