In [None]:
import time
import math
import pickle
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

# from haversine import haversine, Unit #It can also be used to calculate the haversine distance, but a loop is needed to iterate over lat and lon.

In [None]:
# set option for checking full data rows

# pd.set_option('display.max_rows', None)

In [None]:
# Load the preprocessed frame and make sure the year column `GJ` is numeric
# (float); a value that cannot be converted raises instead of passing silently.
data = pd.read_pickle('./data/df_gesamt_15_08_prepocessed_einworner_added.pkl')
data = data.assign(GJ=data['GJ'].astype(float, errors='raise'))

# Drop Qids that are missing in the target year (2023)

In [None]:
# Qids that have at least one row for 2023; every other Qid is removed from
# `data`, for all years.
target_year_having_qid = data.loc[data["GJ"] == 2023, "Qid"]
data = data.loc[data["Qid"].isin(target_year_having_qid)]

In [None]:
# One frame per year, selected on the `GJ` column.
data_2018 = data.loc[data["GJ"].eq(2018)]
data_2019 = data.loc[data["GJ"].eq(2019)]
data_2020 = data.loc[data["GJ"].eq(2020)]
data_2021 = data.loc[data["GJ"].eq(2021)]
data_2022 = data.loc[data["GJ"].eq(2022)]
data_2023 = data.loc[data["GJ"].eq(2023)]

In [None]:
# # haversine distance calculator function 

def haversine_distance(lat1, lon1, lat2, lon2):
    """
    Calculate the Haversine (great-circle) distance between points on the Earth.

    The four inputs are combined element-wise following NumPy broadcasting
    rules, so each of them may be a scalar or an array (e.g. one fixed point
    against an array of candidate points).

    Parameters:
    - lat1 (float or array-like): Latitude of the first point(s) in degrees.
    - lon1 (float or array-like): Longitude of the first point(s) in degrees.
    - lat2 (float or array-like): Latitude of the second point(s) in degrees.
    - lon2 (float or array-like): Longitude of the second point(s) in degrees.

    Returns:
    - float or np.ndarray: Distance between the points in kilometers.

    Note:
    This function assumes the Earth is a perfect sphere with a radius of 6,371 kilometers.
    """

    # Convert each input separately (instead of stacking them into one array)
    # so that scalars and arrays of different shapes can be mixed.
    lat1, lon1, lat2, lon2 = (
        np.radians(np.asarray(value, dtype=float))
        for value in (lat1, lon1, lat2, lon2)
    )

    # Haversine formula
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2
    # Rounding can push `a` marginally outside [0, 1] (near-antipodal points),
    # which would make sqrt(1 - a) return NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    # Radius of Earth in kilometers
    r = 6371.0
    return r * c


In [None]:
def calc_distances(data, max_dist=0.2):
    """
    Find all pairs of Qids that share a PLZ prefix and lie within `max_dist`.

    The frame is first reduced to one row per Qid (maximum of PLZ, Laenge and
    Breite), then joined with itself on the first three digits of the PLZ, and
    finally filtered by the Haversine distance between the two locations.

    Parameters:
    - data (pd.DataFrame): Must contain the columns "Qid", "PLZ", "Laenge"
      (longitude, degrees) and "Breite" (latitude, degrees). Not modified.
    - max_dist (float): Largest distance to keep, in the unit returned by
      `haversine_distance`.

    Returns:
    - pd.DataFrame: Columns "Qid1", "Qid2" and "distance". Every Qid is paired
      with itself (distance 0) and each pair appears in both orders.

    Note:
    `haversine_distance` must be the element-wise variant (same-length inputs
    give a same-length result) for the column assignment below to work.
    """

    def _plz_prefix(plz):
        # PLZ is aggregated as a float, which drops leading zeros ("01067" ->
        # 1067.0). Pad back to five digits before cutting the prefix so that
        # e.g. 01067 and 01099 both end up in "010".
        # Assumes 5-digit postal codes, as in the 34678 -> 346 example.
        if pd.isna(plz):
            return "nan"
        return f"{int(plz):05d}"[:3]

    # changing type of data for agg (on a derived frame, the input stays untouched)
    # taking unique "Qid"s and max of other (PLZ, Laenge, Breite) columns
    per_qid = (
        data.assign(PLZ=data["PLZ"].astype('float64'))
        .groupby("Qid")
        .agg({'PLZ': 'max', 'Laenge': 'max', 'Breite': 'max'})
        .reset_index()
    )

    # cutting `plz`, e.g. 34678 becomes 346 (clustering regions with zip code)
    per_qid['join'] = per_qid['PLZ'].map(_plz_prefix)

    # merging the frame with itself: every pair of Qids inside one PLZ prefix
    ergebnis = pd.merge(per_qid, per_qid, on='join', suffixes=['1', '2'])

    # "Breite" is the latitude and "Laenge" the longitude; passing them the
    # other way round distorts the distance because the formula is not
    # symmetric in latitude and longitude.
    ergebnis['distance'] = haversine_distance(
        ergebnis['Breite1'].to_numpy(),
        ergebnis['Laenge1'].to_numpy(),
        ergebnis['Breite2'].to_numpy(),
        ergebnis['Laenge2'].to_numpy(),
    )

    # clustering data with max_dist param (NaN distances are dropped as well)
    within_range = ergebnis['distance'].between(0, max_dist)

    # dropping unnecessary columns; a non-inplace drop avoids modifying a
    # filtered view of `ergebnis`
    return ergebnis.loc[within_range].drop(
        columns=["PLZ1", "Laenge1", "Breite1", "join", "PLZ2", "Laenge2", "Breite2"]
    )

In [None]:
# Run the pairwise distance calculation once per year, each time on a copy of
# the yearly frame.
(
    distances_2018,
    distances_2019,
    distances_2020,
    distances_2021,
    distances_2022,
    distances_2023,
) = [
    calc_distances(year_frame.copy())
    for year_frame in (data_2018, data_2019, data_2020, data_2021, data_2022, data_2023)
]

In [None]:
# Show (rows, columns) of every yearly distance table.
for _label, _frame in (
    ("2018: ", distances_2018),
    ("2019: ", distances_2019),
    ("2020: ", distances_2020),
    ("2021: ", distances_2021),
    ("2022: ", distances_2022),
    ("2023: ", distances_2023),
):
    print(_label, _frame.shape)

In [None]:
# Persist every yearly distance table as CSV (without the frame index).
for _frame, _path in (
    (distances_2018, "data/distances/distances_2018.csv"),
    (distances_2019, "data/distances/distances_2019.csv"),
    (distances_2020, "data/distances/distances_2020.csv"),
    (distances_2021, "data/distances/distances_2021.csv"),
    (distances_2022, "data/distances/distances_2022.csv"),
    (distances_2023, "data/distances/distances_2023.csv"),
):
    _frame.to_csv(_path, index=False)

# GeoDaten_Calculation_03_08_2023

In [None]:
# Start of the second calculation: `data` is rebound to a different pickle than
# the one loaded at the top of the notebook, so the frame used by the first
# section is no longer reachable under this name.
data = pd.read_pickle("./data/df_gesamt_10_08_prepocessed.pkl")

In [None]:
# One independent copy per year, selected on the `GJ` column. Copies are used
# because the neighbour search further down modifies these frames in place.
data_2018 = data.loc[data["GJ"].eq(2018)].copy()
data_2019 = data.loc[data["GJ"].eq(2019)].copy()
data_2020 = data.loc[data["GJ"].eq(2020)].copy()
data_2021 = data.loc[data["GJ"].eq(2021)].copy()
data_2022 = data.loc[data["GJ"].eq(2022)].copy()
data_2023 = data.loc[data["GJ"].eq(2023)].copy()

#### For each Qid1, up to ten nearest neighbours (Qid2) are stored, together with their distance and the original data index

In [None]:
all_distances = []  # not filled anywhere in this cell; kept because the name may be used elsewhere
MAX_DISTANCE = 0.200  # cut-off, in the unit returned by haversine_distance
MAX_NEIGHBOURS = 10   # at most this many nearest neighbours are kept per row

# NOTE(review): 2020 and 2021 are not processed here although frames for them
# exist — confirm that this is intended.
for current_data, name in zip([data_2018, data_2019, data_2022, data_2023], ["2018", "2019", "2022", "2023"]):

    # Index the frame by the 3-character PLZ prefix and keep the previous index
    # in the "index" column. The guard makes the cell safe to re-run: without
    # it, a second run fails because "PLZ_short" is already the index.
    if current_data.index.name != "PLZ_short":
        current_data["PLZ_short"] = current_data["PLZ"].apply(lambda x: x[:3])
        current_data.reset_index(inplace=True)
        current_data.set_index("PLZ_short", inplace=True)

    # Pull the needed columns out once instead of slicing the frame per row.
    plz_keys = current_data.index.to_numpy()
    lat_all = current_data["Breite"].to_numpy(dtype=float)
    lon_all = current_data["Laenge"].to_numpy(dtype=float)
    qid_all = current_data["Qid"].to_numpy()
    orig_index_all = current_data["index"].to_numpy()

    # Row positions of every PLZ prefix, in frame order.
    rows_by_plz = {}
    for position, key in enumerate(plz_keys):
        rows_by_plz.setdefault(key, []).append(position)
    rows_by_plz = {key: np.asarray(rows) for key, rows in rows_by_plz.items()}

    # Collected per-row results; concatenated once after the loop instead of
    # growing an array on every iteration.
    chunks = []

    for idx in tqdm(range(len(current_data)), total=len(current_data)):

        # Candidates are all other rows with the same PLZ prefix. The current
        # row is removed by position, so a neighbour with identical coordinates
        # (distance 0) is not mistaken for the row itself.
        group_rows = rows_by_plz[plz_keys[idx]]
        others = group_rows[group_rows != idx]

        if len(others) == 0:
            continue

        # One point (scalars) against all candidates (arrays): this requires a
        # haversine_distance that broadcasts scalars against arrays.
        distances = np.ravel(
            haversine_distance(lat_all[idx], lon_all[idx], lat_all[others], lon_all[others])
        )

        # Nearest candidates first, then apply the distance cut-off.
        order = np.argsort(distances, kind="stable")[:MAX_NEIGHBOURS]
        kept = order[distances[order] <= MAX_DISTANCE]

        if len(kept) == 0:
            continue

        # `kept` holds positions inside `others`; map them back to frame rows
        # so that Qid2 and the original index belong to the rows whose
        # distances were selected.
        neighbours = others[kept]
        N = len(neighbours)

        tmp = np.stack(
            [np.ones(N) * qid_all[idx], qid_all[neighbours], distances[kept], orig_index_all[neighbours]],
            axis=-1,
        )
        chunks.append(tmp)

    distances_current = np.concatenate(chunks, axis=0) if chunks else np.zeros((0, 4))

    # The neighbour's original index becomes the frame index; it is not part
    # of the CSV because the file is written with index=False.
    df = pd.DataFrame(columns=["Qid1", "Qid2", "distance"], data=distances_current[:, :-1], index=distances_current[:, -1])
    df.to_csv(f"./data/distances_10_neighbours/distances_{name}.csv", index=False)

In [None]:
def haversine_distance(lat1, lon1, lat2, lon2):
    """
    Haversine distance of every first point against every second point.

    Unlike an element-wise distance, the first point(s) get an extra trailing
    axis before being combined with the second point(s): arrays of length m
    and n give an (m, n) matrix, a scalar first point against n second points
    gives an array of length n, and four scalars give an array of length 1.

    NOTE: this definition reuses the name of the element-wise
    `haversine_distance` defined earlier in the notebook and replaces it as
    soon as this cell has been executed. Code that passes equal-length arrays
    and expects an equal-length result must not run after this cell.

    Parameters:
    - lat1 (float or array-like): Latitude of the first point(s) in degrees.
    - lon1 (float or array-like): Longitude of the first point(s) in degrees.
    - lat2 (float or array-like): Latitude of the second point(s) in degrees.
    - lon2 (float or array-like): Longitude of the second point(s) in degrees.

    Returns:
    - np.ndarray: Distances in kilometers (sphere of radius 6371.0 km).
    """

    lat1 = np.radians(np.asarray(lat1, dtype=float))
    lon1 = np.radians(np.asarray(lon1, dtype=float))
    lat2 = np.radians(np.asarray(lat2, dtype=float))
    lon2 = np.radians(np.asarray(lon2, dtype=float))

    # Do broadcasting: the new trailing axis on the first point(s) pairs each
    # of them with every second point.
    dlat = lat2 - lat1[..., None]
    dlon = lon2 - lon1[..., None]

    a = np.sin(dlat/2.0)**2 + np.cos(lat1[..., None]) * np.cos(lat2) * np.sin(dlon/2.0)**2
    # Rounding can push `a` marginally outside [0, 1] (near-antipodal points),
    # which would make sqrt(1 - a) return NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))

    # Radius of Earth in kilometers
    r = 6371.0
    return r * c