In [100]:
from utils import *
import pickle

In [101]:
# Columns kept from the raw frame, in the order the later cells rely on:
# four site-subtype columns, T1..T34, TD01..TD34, then scalar attributes and
# the two key columns (Qid, GJ).
CONST_COLUMNS = (
    [f"Stellensubart_{n}" for n in range(1, 5)]
    + [f"T{n}" for n in range(1, 35)]
    + [f"TD{n:02d}" for n in range(1, 35)]
    + [
        "Preis",
        "Beleuchtet",
        "Laenge",
        "Breite",
        "Eigenfläche",
        "PPSVACWert",
        "Qid",
        "GJ",
    ]
)

In [102]:
# Load the combined price data frame from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
data = pd.read_pickle("data/decade_price_data_combined_01_09.pkl")

In [103]:
# One distance table per year 2018-2022, each read from its own CSV and
# indexed by the "Qid1" column.
distance_2018, distance_2019, distance_2020, distance_2021, distance_2022 = (
    pd.read_csv(
        f"./data/distances/distances_{year}_decade_price_data_01_09.csv"
    ).set_index("Qid1")
    for year in range(2018, 2023)
)

In [104]:
# Sort by site id and year, index by Qid (keeping Qid as a column as well),
# and keep only the columns listed in CONST_COLUMNS.
cleaned_data = (
    data.sort_values(by=["Qid", "GJ"])
    .set_index("Qid", drop=False)
    .loc[:, CONST_COLUMNS]
)

# Normalize columns

In [105]:
# TD01..TD34, generated instead of typed out by hand.
list_of_t = [f"TD{day:02d}" for day in range(1, 35)]

# Z-score each selected column independently: subtract the column mean and
# divide by the column standard deviation, both computed over all rows of
# cleaned_data.
for column_name in [*list_of_t, "Preis", "Laenge", "Breite", "PPSVACWert"]:
    column_values = cleaned_data[column_name]
    cleaned_data[column_name] = (
        column_values - column_values.mean()
    ) / column_values.std()

In [106]:
# One feature frame per year 2018-2023; the key columns Qid and GJ are removed
# (Qid is still available as the index).
data_2018, data_2019, data_2020, data_2021, data_2022, data_2023 = (
    cleaned_data.loc[cleaned_data.GJ == year].drop(columns=["Qid", "GJ"]).copy()
    for year in range(2018, 2024)
)

# In the 2023 frame only, replace -1 with 0 in columns T1..T22.
target_block_2023 = data_2023.loc[:, "T1":"T22"]
data_2023.loc[:, "T1":"T22"] = target_block_2023.replace(-1, 0)

# Qids with at least one row where Eigenfläche == 1, split 95 % / 5 % into
# train and validation ids with a fixed seed.
all_valid_qids = cleaned_data.loc[cleaned_data.Eigenfläche == 1, "Qid"].unique()
qid_train, qid_val = train_test_split(
    all_valid_qids, test_size=0.05, random_state=0
)

In [116]:
def _pad_rows(tensor, n_rows):
    """Zero-pad a 2D tensor at the bottom so that it has at least ``n_rows`` rows.

    Raises:
        ValueError: if ``tensor`` is not 2D.
    """
    if len(tensor.shape) != 2:
        raise ValueError("Input tensor should be 2D")

    missing_rows = max(0, n_rows - tensor.shape[0])
    # F.pad takes (left, right, top, bottom) for the last two dimensions.
    return torch.nn.functional.pad(tensor, (0, 0, 0, missing_rows), "constant", 0)


def _neighbours_by_qid(year_distances, qids):
    """Map each requested Qid1 to the array of its Qid2 values for one year.

    The table is filtered to the requested ids once, so the per-qid loop in
    ``DataPreprocessor`` does a dict lookup instead of scanning the whole
    table. Rows keep their original order inside each group.
    """
    relevant = year_distances.loc[year_distances.index.isin(qids), "Qid2"]
    return {
        qid1: group.to_numpy()
        for qid1, group in relevant.groupby(level=0, sort=False)
    }


def DataPreprocessor(qids):
    """Build per-site model inputs from the yearly frames defined in this notebook.

    Reads the module-level frames ``data_2018`` .. ``data_2023`` and
    ``distance_2018`` .. ``distance_2022``. A qid that is not in
    ``data_2023.index`` is skipped, so the outputs can be shorter than ``qids``.

    Args:
        qids: iterable of site ids (values of the frames' Qid index).

    Returns:
        A tuple ``(X, x, y)`` of equally long lists, one entry per kept qid:

        * ``X``: tensor of shape ``(5, max_rows, n_features)``, one slice per
          year 2018-2022. Row 0 of a slice is the site's own feature row for
          that year, followed by the rows of its neighbours (``Qid2`` values
          other than the qid itself). A year without any distance rows for
          the qid contributes a single all-zero row. Slices are zero-padded
          at the bottom to the largest row count among the five years.
        * ``x``: NumPy array with the site's 2023 values after dropping
          ``PPSVACWert``, ``T1``..``T34`` and ``TD01``..``TD34``.
        * ``y``: 0-dim tensor holding the mean of the 2023 columns
          ``T1``..``T22``.

    Raises:
        KeyError: if a qid or one of its neighbours has distance rows for a
            year but no row in that year's data frame.
    """
    columns_to_drop = [
        "PPSVACWert",
        *[f"T{i}" for i in range(1, 35)],
        *[f"TD{i:02d}" for i in range(1, 35)],
    ]

    all_year_data = [data_2018, data_2019, data_2020, data_2021, data_2022]
    all_year_distances = [
        distance_2018,
        distance_2019,
        distance_2020,
        distance_2021,
        distance_2022,
    ]

    # Materialise once: the ids are needed for the lookups and for the loop.
    qids = list(qids)
    neighbour_lookups = [
        _neighbours_by_qid(year_distances, qids)
        for year_distances in all_year_distances
    ]

    X, x, y = [], [], []

    for qid in tqdm(qids):
        if qid not in data_2023.index:
            continue

        neighbours_features = []
        for year_data, lookup in zip(all_year_data, neighbour_lookups):
            linked_qids = lookup.get(qid)
            if linked_qids is None:
                # No distance rows for this site in this year: one zero row,
                # as wide as the year's feature frame (no hard-coded width).
                year_block = np.zeros((1, year_data.shape[1]))
            else:
                neighbour_ids = linked_qids[linked_qids != qid]
                year_block = np.concatenate(
                    [
                        year_data.loc[qid].values[None],
                        year_data.loc[neighbour_ids].values,
                    ],
                    axis=0,
                )
            neighbours_features.append(year_block)

        max_rows = max(block.shape[0] for block in neighbours_features)
        n_features = neighbours_features[0].shape[1]

        # The empty float32 seed keeps torch.cat's dtype promotion exactly as
        # in the earlier incremental version, so stored tensors keep their dtype.
        seed = torch.zeros(0, max_rows, n_features)
        padded_blocks = [
            _pad_rows(torch.tensor(block), max_rows)[None]
            for block in neighbours_features
        ]
        pad_neighbours_features = torch.cat([seed, *padded_blocks], dim=0)

        self_data_2023 = data_2023.loc[qid].drop(labels=columns_to_drop).values
        label = torch.tensor(data_2023.loc[qid, "T1":"T22"].mean())

        X.append(pad_neighbours_features)
        x.append(self_data_2023)
        y.append(label)

    return X, x, y

In [125]:
# Build the training inputs for the ids in qid_train (the slowest cell here: about 4 minutes in the recorded run).
X_train, x_train, y_train = DataPreprocessor(qid_train)

100%|██████████████████████████████████████████████████████████| 63586/63586 [04:09<00:00, 255.03it/s]


In [126]:
# Persist the three training lists with pickle, one file per list.
for train_part, train_path in (
    (X_train, "./data/X_train_lstm_pad_06_09.pt"),
    (x_train, "./data/x_train_lstm_pad_06_09.pt"),
    (y_train, "./data/y_train_lstm_pad_06_09.pt"),
):
    with open(train_path, "wb") as sink:
        pickle.dump(train_part, sink)

In [127]:
# Build the validation inputs for the ids in qid_val.
X_val, x_val, y_val = DataPreprocessor(qid_val)

100%|███████████████████████████████████████████████████████████████| 3347/3347 [00:13<00:00, 246.58it/s]


In [128]:
# Persist the three validation lists with pickle, one file per list.
for val_part, val_path in (
    (X_val, "./data/X_val_lstm_pad_06_09.pt"),
    (x_val, "./data/x_val_lstm_pad_06_09.pt"),
    (y_val, "./data/y_val_lstm_pad_06_09.pt"),
):
    with open(val_path, "wb") as sink:
        pickle.dump(val_part, sink)

In [136]:
# Build inputs for four hand-picked Qids.
# NOTE(review): these ids are hard-coded and nothing here excludes them from
# qid_train / qid_val - confirm they are truly held out before using them as a test set.
X_test, x_test, y_test = DataPreprocessor([9333, 9855, 9673, 9860])

100%|█████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 145.92it/s]


In [137]:
# Persist the three test lists with pickle, one file per list.
for test_part, test_path in (
    (X_test, "./data/X_test_lstm_pad_06_09.pt"),
    (x_test, "./data/x_test_lstm_pad_06_09.pt"),
    (y_test, "./data/y_test_lstm_pad_06_09.pt"),
):
    with open(test_path, "wb") as sink:
        pickle.dump(test_part, sink)