In [None]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell executes so edits to imported modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import logging
import os
import pandas as pd
from geo.drift_compensation import get_track_drift_rate
from get_turn import get_turning_points, plot_changepoints, TurnAndRise, write_turnandrise_to_zarr
import csv
import random
from typing import List

In [None]:
def get_filename_from_filepath(filepath: str) -> str:
    """Return the last path component of ``filepath`` minus its final extension.

    Only the last suffix is removed, so a double extension keeps its first
    part: ``'dir/states.csv.gz'`` gives ``'states.csv'``.
    """
    base_name = os.path.basename(filepath)
    stem, _last_suffix = os.path.splitext(base_name)
    return stem


In [None]:
def _select_idents(available: List[str], n_idents: int, ident_filter: List[str],
                   ident_mandatory: List[str], logger: logging.Logger) -> List[str]:
    """Pick the idents to process: mandatory ones first, topped up with a random sample.

    Args:
        available: Unique idents present in the data.
        n_idents: Target number of idents; 0 means "all" when there are no
            mandatory idents.
        ident_filter: If non-empty, only idents listed here are eligible.
        ident_mandatory: Idents that must be included whenever they are eligible.
        logger: Logger that receives an error if mandatory idents are missing.

    Returns:
        The selected idents (mandatory ones first, in the order they occur in
        ``available``).

    Raises:
        ValueError: If mandatory idents are given but ``n_idents`` is not positive.
    """
    candidates = list(available)
    if len(ident_filter) > 0:
        allowed = set(ident_filter)
        candidates = [ident for ident in candidates if ident in allowed]

    if len(ident_mandatory) == 0:
        if n_idents > 0:
            # Cap the sample size: random.sample raises ValueError when asked
            # for more items than the population holds.
            return random.sample(candidates, min(n_idents, len(candidates)))
        return candidates

    wanted = set(ident_mandatory)
    mandatory_found = [ident for ident in candidates if ident in wanted]
    found = set(mandatory_found)

    if len(found) < len(wanted):
        missing = [ident for ident in ident_mandatory if ident not in found]
        print("Could not find all mandatory idents")
        print(f"Found only {len(found)} out of {len(wanted)} mandatory idents")
        logger.error(f"Could not find all mandatory idents, missing: {missing}")

    if n_idents <= 0:
        raise ValueError("n_idents must be greater than 0")

    if len(mandatory_found) >= n_idents:
        # Too many mandatory idents: keep only the first n_idents.
        return mandatory_found[:n_idents]

    # Top up with random idents. The number of free slots is based on the
    # mandatory idents actually found, so missing ones do not shrink the batch.
    pool = [ident for ident in candidates if ident not in found]
    free_slots = min(n_idents - len(mandatory_found), len(pool))
    return mandatory_found + random.sample(pool, free_slots)


def process_csv_file(filepath: str, logtag: str, n_idents: int = 0, ident_filter: List[str] = [], ident_mandatory: List[str] = []):
    """Extract per-ident turning points from one gzipped state CSV and store them as zarr.

    The CSV is read, rows containing any NaN are dropped, and an ``ident``
    column (``callsign`` + ``'_'`` + ``icao24``, both stripped) plus an
    ``rtime`` column (``time`` relative to the file's minimum ``time``) are
    added. For each selected ident the turning points are computed with
    ``get_turning_points`` and written to
    ``../data/osstate/routes/<filename>/<ident>.zarr``. Idents whose result has
    a falsy ``'landed'`` entry are additionally listed in
    ``../data/osstate/dangling/<filename>.csv``.

    Args:
        filepath: Path to the gzip-compressed CSV. It must contain the columns
            ``callsign``, ``icao24``, ``time``, ``onground``, ``alert``,
            ``spi`` and ``squawk``.
        logtag: Logger name; log records are written to ``<logtag>.log``.
        n_idents: Number of idents to process. 0 processes all idents when no
            mandatory idents are given.
        ident_filter: If non-empty, restrict processing to these idents.
            The default list is only read, never mutated.
        ident_mandatory: Idents that must be processed if present in the file;
            the rest of the batch is filled randomly up to ``n_idents``.
            The default list is only read, never mutated.

    Raises:
        ValueError: If ``ident_mandatory`` is non-empty and ``n_idents`` is not
            positive.
    """
    # Set up logging
    logger = logging.getLogger(logtag)
    logger.setLevel(logging.DEBUG)
    fh = logging.FileHandler(f'{logtag}.log')
    fh.setLevel(logging.DEBUG)
    fh.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
    logger.addHandler(fh)

    try:
        # Read the file and preprocess it
        df: pd.DataFrame = pd.read_csv(filepath, compression='gzip')
        logger.info(f"Read {len(df)} rows from {filepath}")
        df.dropna(how='any', inplace=True)
        logger.info(f"Dropped NaN rows, {len(df)} rows remaining")
        df['ident'] = df['callsign'].str.strip() + '_' + df['icao24'].str.strip()
        df['rtime'] = df['time'] - df['time'].min()
        # Drop the columns we don't need
        df.drop(columns=['onground', 'alert', 'spi', 'squawk'], inplace=True)

        idents = _select_idents(df['ident'].unique().tolist(), n_idents,
                                ident_filter, ident_mandatory, logger)
        logger.info(f"Processing {len(idents)} unique idents")

        filename = get_filename_from_filepath(filepath)
        routes_dir = f'../data/osstate/routes/{filename}'
        dangling_dir = '../data/osstate/dangling'
        os.makedirs(routes_dir, exist_ok=True)
        # The dangling folder may not exist yet on a fresh checkout.
        os.makedirs(dangling_dir, exist_ok=True)

        # Flights that have not landed yet are recorded in a separate CSV file
        with open(f'{dangling_dir}/{filename}.csv', 'w', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(['filename', 'ident'])
            for ident in idents:
                try:
                    df_ident = df[df['ident'] == ident]
                    if len(df_ident) == 0:
                        logger.error(f"Ident {ident} not found in the dataframe")
                        continue
                    turns: TurnAndRise = get_turning_points(df_ident)
                    if not turns['landed']:
                        writer.writerow([filename, ident])

                    write_turnandrise_to_zarr(turns, f'{routes_dir}/{ident}.zarr')
                    logger.info(f"Processed {ident}")
                except Exception:
                    # Best effort per ident: log with traceback and continue.
                    logger.exception(f"Error processing {ident}")
    finally:
        # Detach and close the handler so repeated calls neither leak file
        # descriptors nor duplicate log lines when a logtag is reused.
        logger.removeHandler(fh)
        fh.close()



In [None]:
def get_dangling_idents(filepath: str) -> List[str]:
    """Return the unique idents listed in the dangling CSV that belongs to ``filepath``.

    The CSV is looked up as ``../data/osstate/dangling/<filename>.csv`` where
    ``<filename>`` comes from ``get_filename_from_filepath``. If that file does
    not exist, an empty list is returned.
    """
    dangling_csv = f'../data/osstate/dangling/{get_filename_from_filepath(filepath)}.csv'
    try:
        dangling_table = pd.read_csv(dangling_csv)
    except FileNotFoundError:
        return []
    else:
        unique_idents = dangling_table['ident'].unique()
        return unique_idents.tolist()

In [None]:
def get_data_file_list() -> List[str]:
    """List the ``.csv.gz`` files in ``../data/osstate/extracted`` as sorted relative paths.

    Prints how many files were found and returns paths of the form
    ``../data/osstate/extracted/<name>.csv.gz`` in alphabetical order.
    """
    extracted_dir = '../data/osstate/extracted'
    data_files = sorted(
        f'{extracted_dir}/{entry}'
        for entry in os.listdir(extracted_dir)
        if entry.endswith('.csv.gz')
    )
    print(f'Found {len(data_files)} files')
    return data_files

In [None]:
# Single-process smoke test: uncomment and run this before starting the multiprocessing run

# file_list = get_data_file_list()
# process_csv_file(file_list[0], 'test', n_idents=100, ident_mandatory=[])
# print('Dangling idents: ', get_dangling_idents(file_list[0]))
# process_csv_file(filepath=file_list[1], logtag='test2', n_idents=100, ident_mandatory=get_dangling_idents(file_list[0]))


In [None]:
# Multiprocessing
# ================

In [None]:
import multiprocessing as mp

In [None]:
def process_file(file_list, thread_number, n_idents = 750):
    """Process a sequence of files in order, carrying dangling idents forward.

    The first file is processed without mandatory idents. Every later file is
    processed with the dangling idents recorded for the file just before it as
    ``ident_mandatory``. Each file's path doubles as its log tag.

    Args:
        file_list: Paths to process, in order.
        thread_number: Label of this worker; only used in the progress message.
        n_idents: Number of idents to process per file.
    """
    print(f"Processing {len(file_list)} files in thread {thread_number}")
    previous_file = None
    for current_file in file_list:
        if previous_file is None:
            process_csv_file(filepath=current_file, logtag=current_file, n_idents=n_idents)
        else:
            carried_over = get_dangling_idents(previous_file)
            process_csv_file(filepath=current_file, logtag=current_file, n_idents=n_idents, ident_mandatory=carried_over)
        previous_file = current_file

def chunk_file_list(files: List[str], num_chunks: int) -> List[List[str]]:
    """Split ``files`` into at most ``num_chunks`` contiguous, near-equal chunks.

    Every file ends up in exactly one chunk and the original order is kept, so
    consecutive files stay with the same worker wherever possible. Empty chunks
    are omitted, so fewer than ``num_chunks`` chunks are returned when there
    are fewer files than chunks.

    Raises:
        ValueError: If ``num_chunks`` is smaller than 1.
    """
    if num_chunks < 1:
        raise ValueError("num_chunks must be at least 1")
    base_size, remainder = divmod(len(files), num_chunks)
    chunks = []
    start = 0
    for chunk_index in range(num_chunks):
        # The first `remainder` chunks take one extra file each.
        size = base_size + (1 if chunk_index < remainder else 0)
        if size == 0:
            break
        chunks.append(files[start:start + size])
        start += size
    return chunks


do_not_allow_delete = True

if __name__ == '__main__':
    file_list = get_data_file_list()
    num_processes = 4

    # Divide the file list into at most num_processes contiguous chunks.
    # Floor-division slicing would create surplus chunks that are never
    # started (dropping trailing files) and fails when there are fewer files
    # than processes.
    file_chunks = chunk_file_list(file_list, num_processes)

    # NOTE(review): functions defined in a notebook are presumably only
    # reachable by the workers under the 'fork' start method — confirm on
    # platforms that default to 'spawn'.
    processes = []
    for i, chunk in enumerate(file_chunks):
        p = mp.Process(target=process_file, args=(chunk, i))
        processes.append(p)
        p.start()

    for i, p in enumerate(processes):
        p.join()
        if p.exitcode != 0:
            # Surface worker crashes instead of finishing silently.
            print(f"Worker {i} exited with code {p.exitcode}")

# CAUTION: wipe_slate() below permanently DELETES generated data and logs

In [None]:
def wipe_slate():
    """Delete generated routes, dangling CSVs and log files to start from scratch.

    DESTRUCTIVE: every step runs ``rm -rf`` through an IPython shell escape
    (``!``), so this only works inside an IPython/Jupyter session.
    NOTE(review): the module-level ``do_not_allow_delete`` flag is not checked
    here — confirm whether this function was meant to honour it.
    """
    # Everything matched by the glob inside the routes folder
    !rm -rf ../data/osstate/routes/*
    # Everything matched by the glob inside the dangling folder
    !rm -rf ../data/osstate/dangling/*
    # The .ipynb_checkpoints folder inside dangling/: contents first, then the folder
    !rm -rf ../data/osstate/dangling/.ipynb_checkpoints/*
    !rm -rf ../data/osstate/dangling/.ipynb_checkpoints
    # *.log files next to the extracted inputs (process_file uses each file path as logtag)
    !rm -rf ../data/osstate/extracted/*.log
    # *.log files in the current working directory
    !rm -rf *.log