In [1]:
# find the most common airport 1-, 2- and 3-grams
# per call-sign from ADS-B “event” files like the sample below.
#
# Each input file contains one JSON object per line:
#
# {"time":"2025-04-21T00:00:25.445Z", "callSign":"N1872J  ",
#  "eventDetail":{"airport":{"airportIdent":"KLGB"}}, ... }
#

from __future__ import annotations

import glob
import json
import re
from collections import Counter, defaultdict
from typing import Iterable, List

import pandas as pd

In [2]:
# $ head landings-takeoffs-2025-05-01.json | head
# {"time":"2025-05-01T09:42:44.829Z","icao":"4050dc","callSign":"GCEGU   ","typeCode":"P28A","typeDesc":null,"reg":"G-CEGU","lon":-0.806467,"lat":51.493347,"eventDetail":{"type":"takeoff","heading":321.20697,"airport":{"airportIdent":"EGLM","runwayIdent":"25L","runwayScore":-3960801.2},"confidence":"medium"},"reverseGeo":null,"url":"https://globe.adsbexchange.com/?icao=4050dc&zoom=13&lat=51.493347&lon=-0.806467&showTrace=2025-05-01&trackLabels&startTime=09:41&endTime=09:44"}
# {"time":"2025-05-01T09:42:54.829Z","icao":"39280f","callSign":"FGKAP   ","typeCode":"P28A","typeDesc":null,"reg":"F-GKAP","lon":7.455943,"lat":47.73506,"eventDetail":{"type":"takeoff","heading":28.28523,"airport":{"airportIdent":"LFGB","runwayIdent":"02R","runwayScore":-24712820.0},"confidence":"medium"},"reverseGeo":null,"url":"https://globe.adsbexchange.com/?icao=39280f&zoom=13&lat=47.73506&lon=7.455943&showTrace=2025-05-01&trackLabels&startTime=09:41&endTime=09:44"}
# {"time":"2025-05-01T09:42:57.829Z","icao":"04019e","callSign":"ETH204  ","typeCode":"DH8D","typeDesc":null,"reg":"ET-AYH","lon":38.823174,"lat":8.98393,"eventDetail":{"type":"takeoff","heading":74.57696,"airport":{"airportIdent":"HAAB","runwayIdent":"07R","runwayScore":-205.102},"confidence":"medium"},"reverseGeo":null,"url":"https://globe.adsbexchange.com/?icao=04019e&zoom=13&lat=8.98393&lon=38.823174&showTrace=2025-05-01&trackLabels&startTime=09:41&endTime=09:44"}

def read_landings_takeoffs(path: str) -> pd.DataFrame:
    """Reads a JSONL file line by line and returns a pandas DataFrame."""
    data = []
    with open(path, 'r') as f:
        for line in f:
            try:
                # Skip empty lines
                if line.strip():
                    data.append(json.loads(line))
            except json.JSONDecodeError as e:
                print(f"Skipping invalid JSON line in {path}: {e}")
                print(f"Problematic line: {line.strip()}") # Print the line causing trouble
    if not data:
        # Return an empty DataFrame if no valid JSON was found
        return pd.DataFrame()
    return pd.DataFrame(data)


def read_all_landings_takeoffs()-> pd.DataFrame:
    # Read all "landings-takeoffs-*.json" files using the new function.
    files = sorted(glob.glob("landings-takeoffs-*.json"))
    if not files:
        print("No 'landings-takeoffs-*.json' files found.")
        df = pd.DataFrame() # Create an empty DataFrame
    else:
        df = pd.DataFrame()
        for f in files:
            print(f"Reading file: {f}")
            df = pd.concat([df, read_landings_takeoffs(f)], ignore_index=True)

    # Convert the 'time' column to datetime objects
    df['time'] = pd.to_datetime(df['time'], format='ISO8601')
    # Sort the DataFrame by the 'time' column
    df = df.sort_values(by='time').reset_index(drop=True)

    event_details_normalized = pd.json_normalize(df['eventDetail'])

    # Drop the original eventDetail column from df
    df = df.drop(columns=['eventDetail'])

    # Concatenate the original DataFrame (without eventDetail) and the normalized details
    # Ensure indices align correctly, especially after sorting df earlier
    df = pd.concat([df.reset_index(drop=True),
                            event_details_normalized.reset_index(drop=True)], axis=1)
    print(f"Successfully loaded {len(df)} records.")
    return df

df = read_all_landings_takeoffs()
df    

Reading file: landings-takeoffs-2025-04-08.json
Reading file: landings-takeoffs-2025-04-09.json
Reading file: landings-takeoffs-2025-04-10.json
Reading file: landings-takeoffs-2025-04-11.json
Reading file: landings-takeoffs-2025-04-12.json
Reading file: landings-takeoffs-2025-04-13.json
Reading file: landings-takeoffs-2025-04-14.json
Reading file: landings-takeoffs-2025-04-15.json
Reading file: landings-takeoffs-2025-04-16.json
Reading file: landings-takeoffs-2025-04-17.json
Reading file: landings-takeoffs-2025-04-18.json
Reading file: landings-takeoffs-2025-04-19.json
Reading file: landings-takeoffs-2025-04-20.json
Reading file: landings-takeoffs-2025-04-21.json
Reading file: landings-takeoffs-2025-04-22.json
Reading file: landings-takeoffs-2025-04-23.json
Reading file: landings-takeoffs-2025-04-24.json
Reading file: landings-takeoffs-2025-04-25.json
Reading file: landings-takeoffs-2025-04-26.json
Reading file: landings-takeoffs-2025-04-27.json
Reading file: landings-takeoffs-2025-04-

Unnamed: 0,time,icao,callSign,typeCode,typeDesc,reg,lon,lat,reverseGeo,url,type,heading,confidence,airport.airportIdent,airport.runwayIdent,airport.runwayScore,airport
0,2025-04-07 23:59:08.286000+00:00,a1a740,N2052M,P28A,,N2052M,-98.24469,30.719402,,https://globe.adsbexchange.com/?icao=a1a740&zo...,takeoff,14.534644,medium,KBMQ,01,-2533.046100,
1,2025-04-07 23:59:26.286000+00:00,a2aa66,CAP4271,C172,,N271CA,-98.84658,29.330610,,https://globe.adsbexchange.com/?icao=a2aa66&zo...,takeoff,340.896150,medium,KCVB,33,-23.047781,
2,2025-04-07 23:59:32.286000+00:00,a5e01a,BKN78,P28A,,N478PC,-88.76141,37.070340,,https://globe.adsbexchange.com/?icao=a5e01a&zo...,takeoff,224.077600,medium,KPAH,22,-2.746204,
3,2025-04-07 23:59:38.286000+00:00,7c43d7,NOP,SR20,,VH-NOP,153.26329,-28.840897,,https://globe.adsbexchange.com/?icao=7c43d7&zo...,takeoff,166.458620,medium,YLIS,15,-118.746360,
4,2025-04-07 23:59:48.286000+00:00,aa4c26,N7626F,P28A,,N7626F,-97.19913,46.848038,,https://globe.adsbexchange.com/?icao=aa4c26&zo...,takeoff,314.065200,medium,K5N8,31,-3551.608000,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6513930,2025-05-05 23:01:16.465000+00:00,c822f9,ZKICH,EC30,,ZK-ICH,168.39911,-44.654537,,https://globe.adsbexchange.com/?icao=c822f9&zo...,takeoff,183.943740,high,,,,
6513931,2025-05-05 23:01:16.465000+00:00,a8b3d0,N66VW,,,N66VW,-94.11987,36.172348,,https://globe.adsbexchange.com/?icao=a8b3d0&zo...,takeoff,11.225867,high,KASG,36,-79.660965,
6513932,2025-05-05 23:01:16.465000+00:00,a60c09,SWA3373,B737,,N489WN,-104.64142,39.856537,,https://globe.adsbexchange.com/?icao=a60c09&zo...,takeoff,180.708680,high,KDEN,17L,-27.364378,
6513933,2025-05-05 23:01:16.465000+00:00,ab6316,SWA1891,B738,,N8327A,-96.85684,32.845642,,https://globe.adsbexchange.com/?icao=ab6316&zo...,takeoff,135.803450,high,KDAL,13R,-4.773046,


In [3]:
def count_sequence_violations(df: pd.DataFrame) -> tuple[int, float]:
    """
    Counts the number of times the takeoff/landing sequence is violated for each aircraft
    and calculates the percentage of violations relative to total consecutive pairs.

    A violation occurs when two consecutive events (sorted by time) for the same aircraft
    have the same type ('takeoff' or 'landing').

    Args:
        df: DataFrame containing flight events, must include 'icao', 'time', and 'type' columns.
            The DataFrame should ideally be pre-sorted by time globally, but the function
            will sort within each group for safety.

    Returns:
        A tuple containing:
          - The total number of sequence violations (int).
          - The percentage of consecutive pairs that are violations (float).
    """
    if not {'icao', 'time', 'type'}.issubset(df.columns):
        raise ValueError("DataFrame must contain 'icao', 'time', and 'type' columns.")

    if df.empty:
        return 0, 0.0

    total_violations = 0
    total_pairs = 0
    # Ensure 'type' column exists after flattening. It should be derived from eventDetail.type
    if 'type' not in df.columns:
         print("Warning: 'type' column not found. Returning (0, 0.0).")
         return 0, 0.0

    # Filter out rows where 'type' is NaN before grouping if necessary
    df_filtered = df.dropna(subset=['type'])

    # Group by aircraft
    grouped = df_filtered.groupby('icao')

    for name, group in grouped:
        if len(group) < 2:
            continue # Need at least two events to form a pair

        # Sort events for the aircraft by time (important for sequence checking)
        # Make sure NaT times don't cause issues, though sorting should handle them
        group_sorted = group.sort_values(by='time')

        # Calculate pairs for this group
        num_pairs_in_group = len(group_sorted) - 1
        total_pairs += num_pairs_in_group

        # Get the event type of the previous record within the group
        previous_type = group_sorted['type'].shift(1)

        # Find rows where the current type is the same as the previous type
        # This automatically handles the first row (previous_type is NaN)
        violations = group_sorted['type'] == previous_type

        # Sum the violations (True counts as 1)
        total_violations += violations.sum()

    # Calculate percentage
    if total_pairs == 0:
        percentage = 0.0
    else:
        percentage = (total_violations / total_pairs) * 100

    return int(total_violations), percentage # Cast count to int for clarity

# --- Now, let's use the function in a new cell ---

# Assuming 'df' is your flattened and sorted DataFrame from the previous steps
violation_count, violation_percentage = count_sequence_violations(df.copy()) # Use .copy() if you might modify df later

print(f"Total takeoff/landing sequence violations found: {violation_count}")
print(f"Percentage of consecutive pairs violating sequence: {violation_percentage:.2f}%")

# Example check for a specific aircraft (optional)
# try:
#     # Filter out NaNs in 'icao' before selecting unique values
#     valid_icaos = df['icao'].dropna().unique()
#     if len(valid_icaos) > 0:
#         specific_icao = valid_icaos[0] # Get the first valid ICAO
#         print(f"\\nChecking sequence for ICAO: {specific_icao}")
#         # Filter for the specific ICAO and ensure it's sorted
#         specific_df = df[df['icao'] == specific_icao].sort_values('time')
#         # Display only relevant columns and non-NaN types if possible
#         print(specific_df[['time', 'type']].dropna(subset=['type']).head(10))
#         specific_violations, specific_perc = count_sequence_violations(specific_df)
#         print(f"Violations for {specific_icao}: {specific_violations} ({specific_perc:.2f}%)")
#     else:
#          print("\\nCould not find any valid ICAOs to check.")
# except Exception as e: # Catch potential errors during the example check
#      print(f"\\nError during example check: {e}")


Total takeoff/landing sequence violations found: 1260501
Percentage of consecutive pairs violating sequence: 20.07%


In [4]:
# In generate_airport_pairs:
import pandas as pd
from typing import Optional, Pattern, Iterable, Tuple, Any
from tqdm.notebook import tqdm
import re

def generate_airport_pairs(
    df: pd.DataFrame,
    callsign_filter: Optional[str | Pattern] = None
) -> Iterable[Tuple[Any, Any, str, str, str, str | None, str | None]]: # New (Takeoff Time, Landing Time, ...)
    """
    Generates (takeoff_timestamp, landing_timestamp, icao, origin, destination, call_sign) tuples from flight event data,
    handling sequence violations and optional callsign filtering.

    Handles takeoff/landing sequence violations by inferring intermediate events:
    - T(A) -> T(B) implies landing at B, yields flight (A -> B). Takeoff time is from T(A), Landing time estimated as T(A) + 2 hours.
    - L(A) -> L(B) implies takeoff at A, yields flight (A -> B). Landing time is from L(B), Takeoff time estimated as L(B) - 2 hours.
    - T(A) -> L(B) is a standard flight (A -> B). Takeoff time from T(A), Landing time from L(B).
    - L(A) -> T(B) marks the end of one leg and start of another, doesn't yield a pair itself.

    Filters flights based on a callsign regex: A pair (A -> B) is generated if
    EITHER the event determining A as origin OR the event determining B as
    destination matches the filter (if a filter is provided).

    Args:
        df: DataFrame with flight events. Must include 'icao', 'time', 'type',
            'callSign', and 'airport.airportIdent' columns. The DataFrame
            should be sorted by 'time' for correct sequencing.
        callsign_filter: An optional regex string or compiled pattern to filter
                         callsigns.

    Yields:
        Tuples of (takeoff_timestamp, landing_timestamp, icao, origin_airport_id, destination_airport_id, call_sign, registration).
        The call_sign is typically from the event determining the destination.
    """
    # --- Pre-compile regex if it's a string ---
    compiled_filter: Optional[Pattern] = None
    if isinstance(callsign_filter, str):
        try:
            compiled_filter = re.compile(callsign_filter)
        except re.error as e:
            raise ValueError(f"Invalid regex provided for callsign_filter: {e}")
    elif isinstance(callsign_filter, re.Pattern):
        compiled_filter = callsign_filter
    elif callsign_filter is not None:
        raise TypeError("callsign_filter must be a string or compiled regex pattern.")

    # --- Group by aircraft ---
    df_filtered = df.dropna(subset=['icao', 'type', 'time']) # Also drop NaN time
    grouped = df_filtered.groupby('icao')

    print(f"Processing {len(grouped)} aircraft groups...")
    count = 0

    for icao, group in tqdm(grouped, desc="Processing Aircraft Groups", total=len(grouped), unit="group"):
        if len(group) < 2:
            continue

        group_sorted = group.sort_values(by='time').reset_index(drop=True)

        for i in range(len(group_sorted) - 1):
            event_prev = group_sorted.iloc[i]
            event_curr = group_sorted.iloc[i+1]

            prev_airport = event_prev['airport.airportIdent']
            prev_type = event_prev['type']
            prev_callsign = event_prev['callSign']
            prev_time = event_prev['time']
            curr_airport = event_curr['airport.airportIdent']
            curr_type = event_curr['type']
            curr_callsign = event_curr['callSign']
            curr_time = event_curr['time']
            reg = event_curr['reg']

            if pd.isna(prev_airport) or pd.isna(curr_airport):
                continue

            origin: Optional[str] = None
            destination: Optional[str] = None
            takeoff_timestamp: Optional[Any] = None
            landing_timestamp: Optional[Any] = None
            origin_event_cs = None
            dest_event_cs = None

            # Ensure times are valid Timestamps before doing arithmetic
            if not isinstance(prev_time, pd.Timestamp) or not isinstance(curr_time, pd.Timestamp):
                 # print(f"Skipping pair due to invalid time types for ICAO {icao}: prev={prev_time}, curr={curr_time}")
                 continue # Skip if times aren't valid

            if prev_type == 'takeoff':
                origin = prev_airport
                origin_event_cs = prev_callsign
                takeoff_timestamp = prev_time
                if curr_type == 'landing': # T(A) -> L(B)
                    destination = curr_airport
                    dest_event_cs = curr_callsign
                    landing_timestamp = curr_time
                elif curr_type == 'takeoff': # T(A) -> T(B)
                    destination = curr_airport
                    dest_event_cs = curr_callsign
                    landing_timestamp = takeoff_timestamp + pd.Timedelta(hours=2) # Estimate
            elif prev_type == 'landing':
                if curr_type == 'landing': # L(A) -> L(B)
                    origin = prev_airport
                    origin_event_cs = prev_callsign
                    destination = curr_airport
                    dest_event_cs = curr_callsign
                    landing_timestamp = curr_time
                    takeoff_timestamp = landing_timestamp - pd.Timedelta(hours=2) # Estimate

            if origin and destination and takeoff_timestamp and landing_timestamp:
                filter_match = False
                if compiled_filter:
                    origin_match = pd.notna(origin_event_cs) and bool(compiled_filter.match(str(origin_event_cs)))
                    dest_match = pd.notna(dest_event_cs) and bool(compiled_filter.match(str(dest_event_cs)))
                    filter_match = origin_match or dest_match
                else:
                    filter_match = True

                if filter_match:
                    yield (takeoff_timestamp, landing_timestamp, icao, origin, destination, dest_event_cs, reg)
                    count += 1

    print(f"Finished processing. Total pairs yielded: {count}")


In [5]:
# tyson_filter = r"^(TYSON|TYS|ON)[0-9]+"
# pair_counts = Counter(generate_airport_pairs(df.copy(), callsign_filter=tyson_filter))
# print("\nTop 20 most frequent pairs:")
# for pair, count in pair_counts.most_common(20):
#    print(f"  {pair[0]} -> {pair[1]} : {count}")


In [6]:
# In the cell with write_origin_destination_csv:
import csv
import datetime
from typing import Iterable, Tuple, Any, Optional

tyson_filter = r"^(TYSON|TYS|ON)[0-9]+"


def write_origin_destination_csv(
    tuples: Iterable[tuple[Any, Any, str, str, str, Optional[str]]], filename: str
):
    with open(filename, "w", newline="") as f:  # Added newline=''
        writer = csv.writer(f)
        writer.writerow(
            [
                "takeoff_time",
                "landing_time",
                "icao",
                "origin",
                "destination",
                "callsign",
                "registration",
            ]
        )  # New header
        for (
            takeoff_time,
            landing_time,
            icao,
            origin,
            destination,
            callsign,
            registration,
        ) in tuples:  # New unpacking
            if callsign is None:
                callsign = ""
            writer.writerow(
                [
                    takeoff_time,
                    landing_time,
                    icao,
                    origin,
                    destination,
                    callsign,
                    registration,
                ]
            )  # New write


write_origin_destination_csv(
    generate_airport_pairs(df, callsign_filter=tyson_filter), "ice_air2.csv"
)

Processing 233197 aircraft groups...


Processing Aircraft Groups:   0%|          | 0/233197 [00:00<?, ?group/s]

Finished processing. Total pairs yielded: 504


In [7]:


def ngrams(seq: List[str], n: int) -> Iterable[str]:
    """Yield sliding n-grams as 'A-B-C' strings."""
    for i in range(len(seq) - n + 1):
        yield "-".join(seq[i:i + n])

def read_files(files: list[str], cs_re: re.Pattern) -> list:
    visits = []
    for f in files:
        with open(f) as fh:
            for line in fh:
                rec = json.loads(line)
                cs = rec["callSign"]
                if not cs:
                    continue
                cs_stripped = cs.strip()
                if not cs_stripped:
                    continue
                if not cs_re.match(cs_stripped):
                    continue
                icao24 = rec["icao"]
                airport = rec["eventDetail"]["airport"]
                if airport is None:
                    airportId = 'OFFAIRPORT'
                else:
                    airportId = rec["eventDetail"]["airport"]["airportIdent"]
                ts = rec["time"]
                visits.append((ts, icao24, airportId, cs_stripped))
    return visits


In [8]:
# Find landings-*.json in this directory.
TYSON_RE = re.compile(r"^(TYSON|TYS|ON)[0-9]+")
# TYSON_RE = re.compile(r"^GXA6[0-9]+")
# TYSON_RE = re.compile(r"^TYSON[0-9]+")
FILES = glob.glob("landings-*.json")
print(f"Found {len(FILES)} files")
VISITS = read_files(FILES, TYSON_RE)
print(f"Found {len(VISITS)} visits")


Found 49 files
Found 1125 visits


In [9]:
# Print VISITS as csv.
print("time,icao24,airportId,callSign")
for visit in VISITS:
    print(f"{visit[0]},{visit[1]},{visit[2]},{visit[3]}")


time,icao24,airportId,callSign
2025-04-28T11:45:28.348Z,a8d231,KHRL,TYSON51
2025-04-28T12:42:49.005Z,a2f9dd,KAEX,TYSON46
2025-04-28T13:03:32.882Z,a835d1,KELP,TYSON45
2025-04-28T14:39:34.929Z,a2c4e0,KAEX,TYSON44
2025-04-28T15:06:16.486Z,a2f9dd,KCSG,TYSON46
2025-04-28T15:09:02.760Z,a2bd72,KNYL,TYSON48
2025-04-28T15:42:07.947Z,a8b479,KMIA,TYSON52
2025-04-28T15:39:02.689Z,a2f016,KCSG,TYSON41
2025-04-28T17:12:22.765Z,a8d231,MGGT,TYSON51
2025-04-28T17:41:16.212Z,a2c4e0,KHRL,TYSON44
2025-04-28T19:01:49.661Z,a2bd72,KHRL,TYSON48
2025-04-28T19:04:09.881Z,a2f016,KHRL,TYSON41
2025-04-28T20:14:19.514Z,a8b479,KHRL,TYSON52
2025-04-28T21:08:42.361Z,a2f016,KAEX,TYSON41
2025-04-28T21:24:13.865Z,a8d231,KSAT,TYSON51
2025-04-28T21:44:20.897Z,a835d1,KHRL,TYSON45
2025-04-28T22:17:17.475Z,a75013,KAEX,TYSON42
2025-04-28T22:24:50.345Z,ad7274,KAEX,TYSON43
2025-04-28T22:31:00.472Z,a8b479,KSAT,TYSON52
2025-04-28T23:56:04.328Z,a6b0f5,KIWA,TYSON47
2025-04-12T01:35:58.767Z,a2d25e,KELP,TYSON45
2025-04-12T04:29:40.906Z

In [10]:

# For each call-sign, sort & create 1-,2-,3-gram frequency tables

TOP_N = 10
# for cs, tuples in VISITS.items():
#     tuples.sort(key=lambda t: t[0])                  # sort by time
#     airports = [ap for _, ap in tuples]
#     counters = {
#         n: Counter(ngrams(airports, n))
#         for n in (1, 2, 3)
#     }
#     print(f"\n=== {cs} ===")
#     for n in (1, 2, 3):
#         common = counters[n].most_common(TOP_N)
#         label = "-".join(["gram"] * n) if TOP_N == 1 else f"top {n}-grams"
#         for gram, cnt in common:
#             print(f"{n}-gram: {gram:<20}  ({cnt}×)")
#         if not common:
#             print(f"{n}-gram: <none recorded>")


In [11]:
# Print the TOP-N 1-,2-,3-grams.
counters: dict[int, Counter[str]] = defaultdict(Counter)

for icao, tuples in VISITS.items():
    tuples.sort(key=lambda t: t[0])
    airports = [ap for _, ap in tuples]
    for n in (1, 2, 3):
        for ngram in ngrams(airports, n):
            counters[n][ngram] += 1

TOP_N = 12
for n in (1, 2, 3):
    common = counters[n].most_common(TOP_N)
    label = "-".join(["gram"] * n) if TOP_N == 1 else f"top {n}-grams"
    for gram, cnt in common:
        print(f"{n}-gram: {gram:<20}  ({cnt}×)")
    if not common:
        print(f"{n}-gram: <none recorded>")

AttributeError: 'list' object has no attribute 'items'

In [110]:
# Write a csv of the form origin,destination,count.
# Only write bigrams, and only the top 15.
import csv


def write_ngrams_csv(counters: dict[int, Counter[str]], filename: str, top_n: int = 15):
    """Write the top N bigrams to a CSV file."""
    bigram_counter = counters.get(2)
    if not bigram_counter:
        print("No bigrams found to write.")
        return

    top_bigrams = bigram_counter.most_common(top_n)

    with open(filename, "w", newline="") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["origin", "destination", "count"])  # Write header
        for gram, count in top_bigrams:
            parts = gram.split("-")
            if len(parts) == 2:
                origin, destination = parts
                writer.writerow([origin, destination, count])
            else:
                # This case might happen if an airport code itself contains a hyphen
                # or if something unexpected occurred during ngram generation.
                print(f"Skipping potentially malformed bigram: {gram}")


# Call the function to write the CSV
# Ensure the 'counters' variable is defined and populated from the previous cell
csv_filename = "top_bigrams.csv"
write_ngrams_csv(counters, csv_filename)
print(f"Wrote top {15} bigrams to {csv_filename}")

Wrote top 15 bigrams to top_bigrams.csv


In [111]:
write_ngrams_csv(counters, "bigrams.csv")

In [15]:
import pandas as pd
import numpy as np

# Load the runways CSV
runways_df = pd.read_csv('runways.csv')

# Initialize empty lists to store airport data
airport_idents = []
airport_lats = []
airport_lons = []

# Group by airport_ident
grouped = runways_df.groupby('airport_ident')

for airport_ident, group in grouped:
    # Extract latitude and longitude values, filtering out missing values
    le_lats = group['le_latitude_deg'].dropna().values
    le_lons = group['le_longitude_deg'].dropna().values
    he_lats = group['he_latitude_deg'].dropna().values
    he_lons = group['he_longitude_deg'].dropna().values
    
    # Combine all available lat/lon values
    all_lats = np.concatenate([le_lats, he_lats])
    all_lons = np.concatenate([le_lons, he_lons])
    
    # Only proceed if we have valid lat/lon values
    if len(all_lats) > 0 and len(all_lons) > 0:
        # Calculate the centroid
        avg_lat = np.mean(all_lats)
        avg_lon = np.mean(all_lons)
        
        # Store the data
        airport_idents.append(airport_ident)
        airport_lats.append(avg_lat)
        airport_lons.append(avg_lon)

# Create a new DataFrame for airports
airports_df = pd.DataFrame({
    'airport_ident': airport_idents,
    'latitude': airport_lats,
    'longitude': airport_lons
})

# Save to CSV
airports_df.to_csv('airports.csv', index=False)

# Display the first few rows of the new dataset
print(airports_df.head())

  airport_ident   latitude   longitude
0          00CA  35.354799 -116.885498
1          00WI  44.304300  -89.050102
2           01C  43.341702  -85.775101
3          01ID  42.607349 -112.033001
4           01J  30.686300  -81.905701


In [17]:
import pandas as pd
import requests
import time

# Read ice_air2.csv
ice_air2_df = pd.read_csv('ice_air2.csv')
# get the unique icaos.
icaos = ice_air2_df['icao'].unique()

# Create a list to store aircraft info
aircraft_data = []

# Iterate through each ICAO code and get aircraft information
for icao in tqdm(icaos):
    if not pd.isna(icao):  # Skip NaN values
        try:
            url = f"https://hexdb.io/api/v1/aircraft/{icao}"
            response = requests.get(url)
            
            if response.status_code == 200:
                data = response.json()
                # Add ICAO to the data for reference
                data['icao'] = icao
                aircraft_data.append(data)
            else:
                print(f"Failed to get data for ICAO {icao}: Status code {response.status_code}")
                
            # Add a small delay to avoid overloading the API
            time.sleep(0.5)
            
        except Exception as e:
            print(f"Error getting data for ICAO {icao}: {str(e)}")

# Create a DataFrame from the collected aircraft data
aircraft_df = pd.DataFrame(aircraft_data)

# Write the aircraft DataFrame to CSV
aircraft_df.to_csv('aircraft.csv', index=False)

print(f"Created aircraft.csv with information for {len(aircraft_data)} unique ICAO codes")

AttributeError: type object 'tqdm_notebook' has no attribute 'tqdm'