In [None]:
from utils import *

In [None]:
class CustomDataLoader2:
    """Build fixed-size feature/target tensors per ``Qid`` and save them to disk.

    For every ``Qid`` the loader looks up its nearest neighbours for each
    history year (2018-2022), gathers the neighbours' ``T*`` columns from
    ``decades.csv`` and their constant columns from the main data frame, and
    arranges them in sliding windows of ``DECADE_WINDOWS_SIZE`` consecutive
    ``T`` columns. The target is derived from the 2023 rows of the same ``Qid``.

    Missing neighbours / missing rows are represented by ``-1`` everywhere.
    """

    # Sizes are class-level so they are available without running the
    # I/O-heavy constructor; they can still be overridden per instance.
    CHUNK_SIZE = 20_000
    MAX_NEIGHBOURS_SIZE = 10
    DECADE_WINDOWS_SIZE = 3

    # Number of ``T`` columns that belong to one year (T1..T34 -> first year).
    DECADES_PER_YEAR = 34
    # Years used as model input, in the order their ``T`` columns are laid out.
    HISTORY_YEARS = (2018, 2019, 2020, 2021, 2022)
    # Year the target and the "current" constant features are taken from.
    TARGET_YEAR = 2023

    CONST_COLUMNS = [
        *(f"Stellensubart_{i}" for i in range(15)),
        "Preis",
        "Beleuchtet",
        "Laenge",
        "Breite",
        "Eigenfläche",
        "PPSVACWert",
    ]

    def __init__(
        self, data_path="./data/df_gesamt_10_08_prepocessed.pkl", test_size=0.05
    ):
        """Load all inputs from disk and prepare the train/validation split.

        Args:
            data_path: Path of the pickled main data frame.
            test_size: Fraction of unique ``Qid`` values held out for validation.
        """
        # NOTE(review): read_pickle can execute arbitrary code while loading;
        # only point ``data_path`` at files from a trusted source.
        self.data = pd.read_pickle(data_path).reset_index(drop=True)

        # Normalize the columns (the fixed ranges look like longitude/latitude
        # bounds -- presumably Laenge/Breite are coordinates in degrees).
        self.data.Laenge = self.min_max_norm(self.data.Laenge, -180, 180)
        self.data.Breite = self.min_max_norm(self.data.Breite, -90, 90)
        self.data.PPSVACWert = self.gaussian_norm(self.data.PPSVACWert)

        # Train / Val split on unique Qids (fixed seed for reproducibility).
        self.Qid_train, self.Qid_val = train_test_split(
            self.data["Qid"].unique(), test_size=test_size, random_state=23
        )

        # Load distances: one ``distances_<year>`` attribute per year.
        for year in (*self.HISTORY_YEARS, self.TARGET_YEAR):
            setattr(
                self,
                f"distances_{year}",
                pd.read_csv(f"data/distances/distances_{year}.csv"),
            )

        self.decades = pd.read_csv("./data/decades.csv")

        self.data_2023 = self.data[self.data.GJ == self.TARGET_YEAR].copy()

        # Data-dependent constants: every year except one contributes
        # DECADES_PER_YEAR ``T`` columns.
        self.NUM_DECADES = self.DECADES_PER_YEAR * (
            self.data.GJ.unique().shape[0] - 1
        )
        # All sliding windows of DECADE_WINDOWS_SIZE consecutive T columns.
        self.DECADE_COLUMN_SLICES = [
            [f"T{i}" for i in range(k, k + self.DECADE_WINDOWS_SIZE)]
            for k in range(1, self.NUM_DECADES - self.DECADE_WINDOWS_SIZE + 2)
        ]

    def min_max_norm(self, x, min_, max_):
        """Linearly rescale ``x`` so that ``min_`` maps to 0 and ``max_`` to 1."""
        return (x - min_) / (max_ - min_)

    def gaussian_norm(self, x):
        """Standardise ``x`` with its own mean and standard deviation.

        A constant input has a standard deviation of 0, in which case the
        division yields NaN/inf values.
        """
        return (x - x.mean()) / x.std()

    def load_and_save(self):
        """Build all training chunks and the validation set and save them.

        Training Qids are processed in chunks of ``CHUNK_SIZE``; each chunk is
        written as ``X_train_<i>.pt`` / ``y_train_<i>.pt``. The validation Qids
        are written in one piece as ``X_val.pt`` / ``y_val.pt``.
        """
        import os

        out_dir = "./data/saved_train_val_data"
        # torch.save does not create missing directories.
        os.makedirs(out_dir, exist_ok=True)

        train_chunks = torch.split(torch.tensor(self.Qid_train), self.CHUNK_SIZE)

        for i, chunk in enumerate(tqdm(train_chunks, position=0, leave=True)):
            X, y = self.load(chunk)

            torch.save(X, f"{out_dir}/X_train_{i}.pt")
            torch.save(y, f"{out_dir}/y_train_{i}.pt")

            # Free the chunk before building the next one to limit peak memory.
            del X, y

        X, y = self.load(self.Qid_val)
        torch.save(X, f"{out_dir}/X_val.pt")
        torch.save(y, f"{out_dir}/y_val.pt")

        del X, y

    def load(self, qid_chunks):
        """Build the stacked feature and target tensors for the given Qids.

        Args:
            qid_chunks: Iterable of Qids (numpy array or 1-D tensor).

        Returns:
            ``(X, y)`` where ``X`` stacks the ``takeByQid`` result of every Qid
            and ``y`` stacks the ``takeTarget`` result of every Qid.
        """
        X, y = [], []

        for qid in tqdm(qid_chunks, position=0, leave=True):
            # Training chunks yield 0-d tensors, validation yields numpy
            # scalars; compare against pandas columns with a plain scalar.
            if hasattr(qid, "item"):
                qid = qid.item()

            neighbours = [
                self.takeTopNeighbours(getattr(self, f"distances_{year}"), qid)
                for year in self.HISTORY_YEARS
            ]

            # NOTE(review): ``isin`` keeps the row order of ``self.decades``,
            # not the nearest-first order returned by takeTopNeighbours --
            # confirm that the neighbour order is not meant to be preserved.
            year_frames = []
            for k, year_neighbours in enumerate(neighbours):
                first = k * self.DECADES_PER_YEAR + 1
                last = (k + 1) * self.DECADES_PER_YEAR
                year_frames.append(
                    self.decades[self.decades.Qid.isin(year_neighbours)]
                    .loc[:, f"T{first}":f"T{last}"]
                    .reset_index(drop=True)
                )

            # Years with fewer neighbour rows leave NaNs after the column-wise
            # concat; those are marked as missing with -1.
            qid_complete_T = pd.concat(
                year_frames, axis=1, ignore_index=True
            ).fillna(-1)
            qid_complete_T.columns = [
                f"T{i}" for i in range(1, self.NUM_DECADES + 1)
            ]

            X.append(self.takeByQid(qid, qid_complete_T, *neighbours))
            y.append(self.takeTarget(qid))

        return torch.stack(X, dim=0), torch.stack(y, dim=0)

    def pad_to_size(self, tensor):
        """Return ``tensor`` with exactly ``MAX_NEIGHBOURS_SIZE`` rows.

        Fewer rows are padded at the bottom with ``-1``; surplus rows are
        dropped (keeping the first ones) so that the result can always be
        concatenated with the other fixed-size parts.

        Raises:
            ValueError: If ``tensor`` is not 2D.
        """
        if len(tensor.shape) != 2:
            raise ValueError("Input tensor should be 2D")

        rows = tensor.shape[0]
        if rows >= self.MAX_NEIGHBOURS_SIZE:
            return tensor[: self.MAX_NEIGHBOURS_SIZE]

        # Pad spec is (left, right, top, bottom): only add rows at the bottom.
        missing_rows = self.MAX_NEIGHBOURS_SIZE - rows
        return torch.nn.functional.pad(
            tensor, (0, 0, 0, missing_rows), "constant", -1
        )

    def takeTarget(self, qid):
        """Return the target for ``qid`` as a one-element tensor.

        The target is the mean over the per-column means of ``T1``..``T22`` of
        the Qid's 2023 rows, with ``-1`` entries counted as ``0``. Qids without
        2023 rows get a target of ``0.0``.
        """
        targetRows = self.data_2023[self.data_2023.Qid == qid]

        if targetRows.shape[0] == 0:
            return torch.tensor([0.0])

        T = targetRows.loc[:, "T1":"T22"].copy()
        T[T == -1] = 0  # TODO: check later
        meanOf_T = T.mean().mean()
        return torch.tensor([meanOf_T])

    def takeConstPart(self, qids, year):
        """Return the constant columns of ``qids`` for ``year``.

        Returns:
            Tensor with ``MAX_NEIGHBOURS_SIZE`` rows and one column per entry
            of ``CONST_COLUMNS``; all ``-1`` when no row matches.
        """
        # NOTE(review): ``isin`` returns rows in the data frame's own order,
        # not in the order of ``qids``.
        data_part = self.data[self.data["GJ"] == year]
        one_year_data = data_part[data_part.Qid.isin(qids)]

        if one_year_data.shape[0] == 0:
            return -torch.ones(self.MAX_NEIGHBOURS_SIZE, len(self.CONST_COLUMNS))

        values = one_year_data.loc[:, self.CONST_COLUMNS].astype(float).values
        return self.pad_to_size(torch.tensor(values))

    def get2023data(self, qid):
        """Return the 2023 constant columns of ``qid`` repeated for every row.

        ``PPSVACWert`` is replaced by ``-1``. When several 2023 rows exist for
        the Qid only the first one is used, so the result always has exactly
        ``MAX_NEIGHBOURS_SIZE`` rows; without any 2023 row it is all ``-1``.
        """
        rows_2023 = self.data_2023[self.data_2023.Qid == qid]

        if rows_2023.shape[0] == 0:
            return -torch.ones(self.MAX_NEIGHBOURS_SIZE, len(self.CONST_COLUMNS))

        features = rows_2023.loc[:, self.CONST_COLUMNS].iloc[:1].copy()
        features["PPSVACWert"] = -1  # TODO: check later.
        single_row = torch.tensor(features.astype(float).values)
        return single_row.repeat(self.MAX_NEIGHBOURS_SIZE, 1)

    def get_year(self, index):
        """Map a 1-based ``T`` column index to the history year it belongs to.

        Indices below the first block map to the first history year and
        indices beyond the last block map to the last history year.
        """
        block = (index - 1) // self.DECADES_PER_YEAR
        block = max(0, min(block, len(self.HISTORY_YEARS) - 1))
        return self.HISTORY_YEARS[block]

    def takeByQid(
        self, qid, decades, qid2_2018, qid2_2019, qid2_2020, qid2_2021, qid2_2022
    ):
        """Assemble the windowed feature tensor for one Qid.

        Args:
            qid: The Qid the sample is built for.
            decades: Frame with columns ``T1``..``T<NUM_DECADES>`` holding the
                neighbours' values.
            qid2_2018..qid2_2022: Neighbour Qids per history year.

        Returns:
            Tensor of shape ``(windows, MAX_NEIGHBOURS_SIZE, W + 2 * W * C)``
            with ``W = DECADE_WINDOWS_SIZE`` and ``C = len(CONST_COLUMNS)``:
            the window's ``T`` values followed, per window column, by that
            column's year constants and the Qid's 2023 constants.
        """
        history_parts = {
            2018: self.takeConstPart(qid2_2018, 2018),
            2019: self.takeConstPart(qid2_2019, 2019),
            2020: self.takeConstPart(qid2_2020, 2020),
            2021: self.takeConstPart(qid2_2021, 2021),
            2022: self.takeConstPart(qid2_2022, 2022),
        }
        target_year_part = self.get2023data(qid)

        windows = []

        for decade_columns in self.DECADE_COLUMN_SLICES:
            decade_slice = torch.tensor(
                decades.loc[:, decade_columns].astype(float).values
            )
            # Force the fixed neighbour count (pads with -1 / truncates).
            decade_slice = self.pad_to_size(decade_slice)

            parts = [decade_slice]
            for column in decade_columns:
                # Column names look like "T<index>"; the index picks the year.
                year = self.get_year(int(column[1:]))
                parts.extend([history_parts[year], target_year_part])

            windows.append(torch.cat(parts, dim=-1))

        return torch.stack(windows, dim=0)

    def takeTopNeighbours(self, df, qid, top=10):
        """Return up to ``top`` neighbour Qids of ``qid``, nearest first.

        Args:
            df: Frame with columns ``Qid1``, ``Qid2`` and ``distance``.
            qid: Qid whose neighbours (rows with ``Qid1 == qid``) are wanted.
            top: Maximum number of neighbours to return.

        Returns:
            Array of ``Qid2`` values sorted by ``distance``; if the Qid is
            paired with itself, that pair is always placed first.
        """
        candidates = df[df.Qid1 == qid].sort_values(by="distance")

        is_self = candidates.Qid2 == qid
        if is_self.any():
            candidates = pd.concat(
                [candidates[is_self], candidates[~is_self]], ignore_index=True
            )

        return candidates.iloc[:top].Qid2.values

In [None]:
# Build the loader with its default arguments; the constructor reads the
# pickled data frame, the per-year distance CSVs and decades.csv from disk.
dataset = CustomDataLoader2()

In [None]:
# Generate the train chunks and the validation set and write them as .pt
# files under ./data/saved_train_val_data/.
dataset.load_and_save()