This is a `152 bit` version of the hash code, so it allows tighter clustering of the flows.

The code is essentially similar to the file `clustering_desgments5.ipynb`.

This notebook creates one `.hsb` file per routes subdirectory (`hash_master_<subdirectory>.hsb`), and the combined `hash_master.hashb` file.

In [14]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [15]:
import numpy as np
from matplotlib import pyplot as plt
import os
import pandas as pd

In [16]:
# Open a waypoint_master file
# (the `ident`, `lat` and `lon` columns are what the route-processing code below looks up)
waypoints_master_df = pd.read_csv('../data/osstate/waypoints_master.csv')
# Preview the first rows of the table
waypoints_master_df.head()
    

Unnamed: 0,ident,lat,lon
0,H882U2Xn5VOd,50.404884,-98.62603
1,P8ToOqoOwy3c,32.375521,130.66352
2,oZYSGKN80id2,41.086054,-77.855456
3,uPuRzZYavxw1,40.908554,-78.007263
4,lEf9PODPqPxe,40.573301,-78.509674


In [17]:
import cartopy.crs as ccrs
import matplotlib.pyplot as plt
import cartopy

def viz_wp(lats: np.ndarray, lons: np.ndarray) -> None:
    """Show the given waypoints as labelled red markers on a PlateCarree map.

    Args:
        lats: Waypoint latitudes, indexed in parallel with `lons`.
        lons: Waypoint longitudes.
    """
    plate_carree = ccrs.PlateCarree()

    # Small figure whose single axes uses the map projection
    _, ax = plt.subplots(figsize=(4, 4), subplot_kw={'projection': plate_carree})

    # Reference layers, added in this order: borders, coastline, land
    reference_layers = (
        (cartopy.feature.BORDERS, {'linestyle': ':'}),
        (cartopy.feature.COASTLINE, {}),
        (cartopy.feature.LAND, {'edgecolor': 'black'}),
    )
    for layer, layer_style in reference_layers:
        ax.add_feature(layer, **layer_style)

    # One marker plus a "WP <index>" label per waypoint
    for idx in range(len(lats)):
        lon, lat = lons[idx], lats[idx]
        ax.plot(lon, lat, 'ro', markersize=10, transform=plate_carree)
        ax.text(lon, lat, f'WP {idx}', transform=plate_carree)

    # The map extent is not restricted; the original zoom-in code is kept disabled:
    #longitude = np.min(lons)
    #latitude = np.min(lats)
    #ax.set_extent([longitude-5, longitude+5, latitude-5, latitude+5])

    ax.gridlines()
    plt.show()

In [18]:
from hash_segments import hash_with_hpp, compute_segment_position_hash, latlon2xyz

In [19]:
def strip_file_extension(filename: str) -> str:
    """Return the base name of `filename` with every extension removed.

    The result is the text after the last '/' and before the first '.' of
    that remainder, e.g. 'a/b/c.tar.gz' -> 'c'.
    """
    _, _, basename = filename.rpartition('/')
    stem, _, _ = basename.partition('.')
    return stem

In [20]:
def get_normal_vectors(wp_lats: np.ndarray, wp_lon: np.ndarray) -> np.ndarray:
    """Compute one unit normal vector per pair of consecutive waypoints.

    Each normal is the normalised cross product of the xyz positions of a
    segment's start and end waypoints, sign-flipped where needed so that its
    third (z) component is non-negative.

    Args:
        wp_lats: Waypoint latitudes.
        wp_lon: Waypoint longitudes.

    Returns:
        Array with one row per segment (number of waypoints minus one).
    """
    # Assumes latlon2xyz returns a 3 x n array, so the transpose is n x 3 - TODO confirm
    xyz = latlon2xyz(wp_lats, wp_lon).T
    normals = np.cross(xyz[:-1], xyz[1:])
    # Scale every row to unit length
    normals /= np.linalg.norm(normals, axis=1, keepdims=True)
    # Fix the orientation: keep rows whose z component is >= 0, negate the others
    keep_sign = normals[:, 2:3] >= 0
    return np.where(keep_sign, normals, -normals)

In [21]:
def get_midpoints(wp_lats: np.ndarray, wp_lon: np.ndarray) -> "tuple[np.ndarray, np.ndarray]":
    """Compute the lat/lon midpoint of every pair of consecutive waypoints.

    Latitudes are averaged arithmetically. Longitudes are averaged
    arithmetically as well, except for segments whose endpoints are more than
    180 degrees apart in longitude: those cross the +/-180 meridian, where the
    plain mean would land on the opposite side of the globe (e.g. 179 and -179
    would give 0), so the midpoint is moved onto the shorter arc and wrapped
    into (-180, 180].

    Args:
        wp_lats: Waypoint latitudes in degrees.
        wp_lon: Waypoint longitudes in degrees.

    Returns:
        A `(midpoints_lats, midpoints_lons)` tuple, each with one entry per
        segment (number of waypoints minus one).
    """
    wp_lats = np.asarray(wp_lats, dtype=float)
    wp_lon = np.asarray(wp_lon, dtype=float)

    midpoints_lats = (wp_lats[:-1] + wp_lats[1:]) / 2

    segment_start_lons = wp_lon[:-1]
    segment_end_lons = wp_lon[1:]
    # Plain mean; left untouched for every segment that does not cross the antimeridian
    midpoints_lons = (segment_start_lons + segment_end_lons) / 2

    # A longitude gap above 180 degrees means the short way round crosses the wrap-around
    crosses_wrap = np.abs(segment_end_lons - segment_start_lons) > 180
    shifted_lons = midpoints_lons + 180
    shifted_lons = np.where(shifted_lons > 180, shifted_lons - 360, shifted_lons)
    midpoints_lons = np.where(crosses_wrap, shifted_lons, midpoints_lons)

    return midpoints_lats, midpoints_lons

In [22]:
# Start from a clean slate: delete the per-subdirectory .hsb files and the
# hash_master.hashb file left over from a previous run. Done with `os` rather
# than `!rm -rf` so the cell does not depend on a POSIX shell.
routes_hashes_dir = '../data/osstate/routes_hashes'
os.makedirs(routes_hashes_dir, exist_ok=True)  # the writers below need the directory to exist
for stale_name in os.listdir(routes_hashes_dir):
    stale_path = os.path.join(routes_hashes_dir, stale_name)
    if not os.path.isfile(stale_path):
        continue
    if stale_name.endswith('.hsb') or stale_name == 'hash_master.hashb':
        os.remove(stale_path)

# Create a hash_master file that contains only the CSV header
with open(os.path.join(routes_hashes_dir, 'hash_master.hashb'), 'w') as f:
    f.write('wpf,wpt,hash\n')


# Single-thread code

In [None]:
# # List all the subdirectories in the routes directory
# date_dirs = os.listdir('../data/osstate/routes') # states_2022-01-03-00.csv
# # sort the date_dirs
# date_dirs = sorted(date_dirs)
# for date_dir in date_dirs[:1]:
#     print(f"Processing {date_dir}")
#     # List all the files in the subdirectory whose filename ends with .zarr_route.csv
#     files = os.listdir(f'../data/osstate/routes/{date_dir}')
#     files = [f for f in files if f.endswith('.zarr_route.csv')]
#     files = sorted(files)
#     # files: AEE882_46b823.zarr_route.csv
#     for route_file in files[:500]:
#         # print(f"Processing {route_file}")
#         # Open the route file
#         route_df = pd.read_csv(f'../data/osstate/routes/{date_dir}/{route_file}')
#         # route_df contains two columns: ident and to (time over)
#         route_wps = route_df['ident'].values
#         if len(route_wps) < 2:
#             # print(f"Warning: Route {route_file} has 0 or 1 waypoint, which is insufficient to determine segments.")
#             continue
#         # find lons and lats of route_wps waypoints from the waypoints_master_df
#         route_lons = []
#         route_lats = []
#         for wp in route_wps:
#             wp_df = waypoints_master_df[waypoints_master_df['ident'] == wp]
#             if len(wp_df) > 0:
#                 route_lons.append(wp_df['lon'].values[0])
#                 route_lats.append(wp_df['lat'].values[0])

#         # extract the segments from the waypoints
#         route_normals = get_normal_vectors(np.array(route_lats), np.array(route_lons))
#         route_midpoints = get_midpoints(np.array(route_lats), np.array(route_lons))

#         # hash the segments
#         try:
#             normals_hash = hash_with_hpp(route_normals)
#             midpoints_hash = compute_segment_position_hash(latlon2xyz(route_midpoints[0], route_midpoints[1]), route_normals)
#         except:
#             print(f"Warning: Route {route_file} has an error in hashing the segments.")
#             continue

#         # Concatenate the hashes
#         route_hash = [f"{normals_hash[i]}_{midpoints_hash[i]}" for i in range(len(normals_hash))]

#         # Save the route hash
#         # Open a new file in ../data/routes_hashes
#         with open(f'../data/osstate/routes_hashes/hash_master.hash', 'a') as f:
#             for item in route_hash:
#                 wp_from = route_wps[route_hash.index(item)]
#                 wp_to = route_wps[route_hash.index(item)+1]
#                 f.write(f'{wp_from},{wp_to},{item}\n')


# Multiprocessing code

In [None]:
import os
import pandas as pd
import numpy as np
import multiprocessing
from functools import partial

# Hashcode's length parameters
# NOTE(review): the notebook intro describes a `152 bit` hash; confirm how these values relate to that figure.
# Passed as `num_planes` to hash_with_hpp in process_subdirectory
segment_bit_length_custom = 480
# Passed as `random_angles` to compute_segment_position_hash in process_subdirectory
localization_bit_length_custom = 12

# Define the function to process each subdirectory
def process_subdirectory(date_dir, waypoints_master_df):
    """Hash every route of one routes subdirectory into a single `.hsb` file.

    For each `*.zarr_route.csv` file in `../data/osstate/routes/{date_dir}` the
    route's waypoints are resolved to coordinates through `waypoints_master_df`,
    every pair of consecutive resolved waypoints (a segment) is hashed, and one
    `wpf,wpt,hash` line per segment is written to
    `../data/osstate/routes_hashes/hash_master_{tag}.hsb`, where `tag` is
    `strip_file_extension(date_dir)`.

    Args:
        date_dir: Name of a subdirectory of `../data/osstate/routes`.
        waypoints_master_df: DataFrame with `ident`, `lat` and `lon` columns.

    Notes:
        Reads the module-level `segment_bit_length_custom` and
        `localization_bit_length_custom` settings.
    """
    route_dir = f'../data/osstate/routes/{date_dir}'
    # Only the files whose name ends with .zarr_route.csv are routes
    files = [f for f in os.listdir(route_dir) if f.endswith('.zarr_route.csv')]

    # Build the ident -> (lat, lon) lookup once instead of scanning the master
    # table for every waypoint. For duplicated idents the first row wins.
    unique_wps = waypoints_master_df.drop_duplicates(subset='ident', keep='first')
    wp_coords = dict(zip(unique_wps['ident'], zip(unique_wps['lat'], unique_wps['lon'])))

    hash_file_tag = strip_file_extension(date_dir)
    hash_file_path = f'../data/osstate/routes_hashes/hash_master_{hash_file_tag}.hsb'

    # `with` guarantees the output file is closed even if a route raises
    with open(hash_file_path, 'w') as hash_file_writer:
        for route_file in files:
            # route_df contains two columns: ident and to (time over)
            route_df = pd.read_csv(f'{route_dir}/{route_file}')

            # Keep only the waypoints found in the master table, so that the
            # identifiers stay aligned with the coordinates (and therefore
            # with the segments) derived from them.
            route_wps = [wp for wp in route_df['ident'].values if wp in wp_coords]
            if len(route_wps) < 2:
                # Fewer than two resolved waypoints: no segment to hash
                continue
            route_lats = np.array([wp_coords[wp][0] for wp in route_wps])
            route_lons = np.array([wp_coords[wp][1] for wp in route_wps])

            # extract the segments from the waypoints
            route_normals = get_normal_vectors(route_lats, route_lons)
            route_midpoints = get_midpoints(route_lats, route_lons)

            # hash the segments
            try:
                normals_hash = hash_with_hpp(route_normals, num_planes = segment_bit_length_custom)
                midpoints_hash = compute_segment_position_hash(latlon2xyz(route_midpoints[0], route_midpoints[1]), route_normals, random_angles = localization_bit_length_custom)
            except Exception as exc:
                # Best effort: report the failing route and carry on, but let
                # KeyboardInterrupt / SystemExit propagate.
                print(f"Warning: Route {route_file} has an error in hashing the segments: {exc!r}")
                continue

            # Segment i joins resolved waypoint i to waypoint i + 1. Indexing by
            # position (not by searching for the hash value) keeps the labels
            # correct when two segments of a route share the same hash.
            for i in range(len(normals_hash)):
                segment_hash = f"{normals_hash[i]}_{midpoints_hash[i]}"
                hash_file_writer.write(f'{route_wps[i]},{route_wps[i + 1]},{segment_hash}\n')

# Define the main function to use multiprocessing
def main():
    """Run `process_subdirectory` over every routes subdirectory in parallel.

    Entries of `../data/osstate/routes` that are not directories (stray files
    such as `.DS_Store`) are skipped, because `process_subdirectory` lists the
    contents of each entry and one failure would abort the whole pool.
    """
    routes_root = '../data/osstate/routes'

    # List all the subdirectories in the routes directory, in a stable order
    date_dirs = sorted(
        entry for entry in os.listdir(routes_root)
        if os.path.isdir(os.path.join(routes_root, entry))
    )

    # Read the waypoints master dataframe
    waypoints_master_df = pd.read_csv('../data/osstate/waypoints_master.csv')

    # Create a partial function to pass the waypoints_master_df
    process_func = partial(process_subdirectory, waypoints_master_df=waypoints_master_df)

    # Use multiprocessing to process each subdirectory in parallel
    with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
        pool.map(process_func, date_dirs)

if __name__ == "__main__":
    main()


In [None]:
# Remove the existing hash_master.hashb; write_hash_master() in the next cell recreates it from the .hsb files
!rm ../data/osstate/routes_hashes/hash_master.hashb

In [None]:
import os
import pandas as pd
import numpy as np

def write_hash_master(n_top_hashes: int = 18, hashes_dir: str = '../data/osstate/routes_hashes'):
    """Merge the most frequent hashes of every `.hsb` file into `hash_master.hashb`.

    For each `.hsb` file in `hashes_dir` (processed in sorted name order), the
    hashes that occur more than once are ranked by frequency and at most
    `n_top_hashes` of them are kept. Every row of the file carrying one of
    the kept hashes is appended, in file order, to
    `{hashes_dir}/hash_master.hashb` as a `wpf,wpt,hash` line. The master
    file is overwritten and has no header line.

    Args:
        n_top_hashes: Maximum number of distinct hashes kept per `.hsb` file.
        hashes_dir: Directory holding the `.hsb` files; the master file is
            written to the same directory.
    """
    # List all .hsb files in the routes_hashes directory
    hash_files = sorted(f for f in os.listdir(hashes_dir) if f.endswith('.hsb'))

    # `with` guarantees the master file is flushed and closed even on error
    with open(os.path.join(hashes_dir, 'hash_master.hashb'), 'w') as hash_master_file:
        for hf in hash_files:
            try:
                # Read every field as a plain string so identifiers such as
                # "NA" or "007" are written back exactly as they were read.
                hash_file = pd.read_csv(
                    os.path.join(hashes_dir, hf),
                    header=None,
                    names=['wpf', 'wpt', 'hash'],
                    dtype=str,
                    keep_default_na=False,
                )
            except pd.errors.EmptyDataError:
                # A subdirectory that produced no segment leaves an empty file
                continue

            # Count the occurrences of each hash, most frequent first
            hash_file_count = hash_file['hash'].value_counts().sort_values(ascending=False)
            # Keep the hashes that appear more than once, capped at n_top_hashes
            hash_file_count = hash_file_count[hash_file_count > 1].iloc[:n_top_hashes]

            if len(hash_file_count) == 0:
                continue

            # The kept hashes are the index of the counts Series
            selected = hash_file[hash_file['hash'].isin(hash_file_count.index)]
            for wpf, wpt, segment_hash in zip(selected['wpf'], selected['wpt'], selected['hash']):
                hash_master_file.write(f"{wpf},{wpt},{segment_hash}\n")
    

# Build hash_master.hashb, keeping at most 24 repeated hashes per .hsb file
write_hash_master(24)