In [1]:
from dotenv import load_dotenv
import os 

load_dotenv()

True

# Make Catalog Directory

In [2]:
import pandas as pd 
# Get directory paths
PROJECT_ROOT = os.getenv('PROJECT_ROOT')
HOURLY_FLOWS_DIR = os.path.join(PROJECT_ROOT, 'data/hourly')
CATALOG_DIR = os.path.join(PROJECT_ROOT, 'data/catalog')

# Create catalog directory if it doesn't exist
os.makedirs(CATALOG_DIR, exist_ok=True)

def process_csv_file(csv_path):
    # Read CSV file
    df = pd.read_csv(csv_path)
    
    # Get unique IDs, strip whitespace, and convert to lowercase
    unique_ids = df['id'].str.strip().str.lower().unique()
    
    # Create catalog filename
    base_name = os.path.splitext(os.path.basename(csv_path))[0]
    catalog_path = os.path.join(CATALOG_DIR, f"{base_name}.txt")
    
    # Write unique IDs to catalog file
    with open(catalog_path, 'w') as f:
        for id in unique_ids:
            f.write(f"{id}\n")

In [3]:
# Process all CSV files in the hourly flows directory
for filename in os.listdir(HOURLY_FLOWS_DIR):
    if filename.endswith('.csv') and not filename.startswith('._'):
        csv_path = os.path.join(HOURLY_FLOWS_DIR, filename)
        process_csv_file(csv_path)
        print(f"Processed {filename}")

Processed 1716771600.csv
Processed 1717981200.csv
Processed 1717423200.csv
Processed 1717848000.csv
Processed 1718193600.csv
Processed 1716537600.csv
Processed 1716987600.csv
Processed 1718863200.csv
Processed 1717801200.csv
Processed 1719068400.csv
Processed 1718035200.csv
Processed 1718949600.csv
Processed 1717578000.csv
Processed 1718560800.csv
Processed 1717743600.csv
Processed 1717318800.csv
Processed 1717092000.csv
Processed 1717045200.csv
Processed 1716598800.csv
Processed 1718013600.csv
Processed 1717293600.csv
Processed 1716652800.csv
Processed 1717534800.csv
Processed 1717542000.csv
Processed 1719176400.csv
Processed 1717059600.csv
Processed 1716840000.csv
Processed 1716638400.csv
Processed 1716624000.csv
Processed 1718982000.csv
Processed 1719010800.csv
Processed 1717426800.csv
Processed 1717984800.csv
Processed 1718586000.csv
Processed 1716710400.csv
Processed 1717419600.csv
Processed 1717779600.csv
Processed 1719140400.csv
Processed 1717837200.csv
Processed 1717002000.csv


# Create Merge Instructions

In [5]:
# Read all catalog files into a dictionary
catalog = {}
for filename in os.listdir(CATALOG_DIR):
    if filename.endswith('.txt') and not filename.startswith('._'):
        catalog_path = os.path.join(CATALOG_DIR, filename)
        with open(catalog_path, 'r') as f:
            # Strip newlines and store content
            content = f.read().strip()
            # Replace \n with a comma
            content = content.replace('\n', ',')
            # Store with filename (without .txt) as key
            catalog[filename[:-4]] = content


In [36]:

import numpy as np
def find_id_locations(data_dict):
    # Dictionary to store id -> list of keys mapping
    id_locations = {}
    
    # Split values and track which ids appear in which keys
    for key, value in data_dict.items():
        ids = value.split(',')
        for id in ids:
            if id:  # Skip empty strings
                if id not in id_locations:
                    id_locations[id] = []
                id_locations[id].append(int(key)) # convert to int

    # Sort each id's list of locations in ascending order
    for id in id_locations:
        id_locations[id].sort()

    print(f'Found {len(id_locations)} unique flights')

    # Create a list of keys to remove
    keys_to_remove = []
    remove_from_time = []
    for key in id_locations:
        # Get the sorted times for this flight
        times = id_locations[key]
        
        # Calculate gaps between consecutive times
        gaps = np.diff(times)
        
        # Find where gaps are larger than 4000
        large_gaps = np.where(gaps > 4000)[0]
        
        if len(large_gaps) > 0:
            # Get the time right before the first large gap
            split_time = times[large_gaps[0]]
            keys_to_remove.append(key)
            remove_from_time.append(split_time)

    print(f'Adjusting {len(keys_to_remove)} keys due to large gaps')

    # Remove keys that only have a single time value
    single_value_keys = [key for key in id_locations if len(id_locations[key]) == 1]
    print(f'Removing {len(single_value_keys)} keys with only single time value')
    for key in single_value_keys:
        del id_locations[key]
    
    # Update id_locations for flights with large gaps
    for key, split_time in zip(keys_to_remove, remove_from_time):
        # Keep only times up to the split time
        id_locations[key] = [t for t in id_locations[key] if t <= split_time]

    # Remove keys that only have a single time value
    single_value_keys = [key for key in id_locations if len(id_locations[key]) == 1]
    print(f'Removing {len(single_value_keys)} keys with only single time value')
    for key in single_value_keys:
        del id_locations[key]
    
    return id_locations

In [38]:
id_locations = find_id_locations(catalog)

Found 574278 unique flights
Adjusting 194270 keys due to large gaps
Removing 47644 keys with only single time value
Removing 29318 keys with only single time value


In [39]:
id_locations

{'461f31fin7py': [1716768000, 1716771600, 1716775200, 1716778800, 1716782400],
 '04015ceth3725': [1716771600, 1716775200, 1716778800, 1716782400],
 '440bcdbcs12p': [1716771600, 1716775200],
 '48c121rys3908': [1716764400, 1716768000, 1716771600],
 '4baa75thy3ux': [1716771600, 1716775200],
 '4bb4eemgh706': [1716760800, 1716764400, 1716768000, 1716771600],
 '4bcdf6sxs76k': [1716771600, 1716775200, 1716778800, 1716782400],
 '3965a2afr703': [1716771600, 1716775200, 1716778800],
 '4baa88thy4hb': [1716771600, 1716775200],
 '020118mac337e': [1716771600, 1716775200],
 '7815f9chh491': [1716512400, 1716516000, 1716519600, 1716523200],
 '4ac9c6blx178': [1716757200, 1716760800, 1716764400, 1716768000, 1716771600],
 '4bb192thy41j': [1716760800, 1716764400, 1716768000, 1716771600],
 'a4ffb7dal46': [1716771600, 1716775200, 1716778800],
 '3a2e22reu973': [1716512400, 1716516000, 1716519600, 1716523200],
 'a27c78ual83d': [1716764400, 1716768000, 1716771600, 1716775200],
 '4bcdd2sxs3y': [1716771600, 17167

In [40]:
import yaml
with open(f'{PROJECT_ROOT}/data/catalog/merge_instructions.yml', 'w') as f:
    yaml.dump(id_locations, f)