## Project Phase 1 - Aviation Accident Data Integration
### Group 03:
- Tommaso Tragno - fc64699
- Manuel Cardoso - fc56274
- Chen Cheng - fc64872
- Cristian Tedesco - fc65149

#### Setup

In [None]:
import pandas as pd
import numpy as np
import json
import requests
import matplotlib.pyplot as plt
from IPython.display import display, HTML
import os

### 1. Data Loading

In [None]:
# Base directory of the raw input files; the cells below also write their
# outputs (filtered/, profiling/, weather cache) underneath it.
PATH = 'data_sources/'

# File names of the three raw data sources, relative to PATH.
NTSB_DATA = 'ntsb-us-2003-2023.json'
AIR_TRAFFIC_DATA = 'u-s-airline-traffic-data.csv'
AIRCRAFT_DATA = 'aircraft_data.csv'

#### Load NTSB JSON Data

In [None]:
# Load the raw NTSB export: a JSON list with one entry per accident/incident.
with open(PATH+NTSB_DATA, 'r', encoding='utf-8') as f:
    ntsb_raw_data = json.load(f)

print(f'\n--- NTSB JSON loaded: {len(ntsb_raw_data)} total records found ---')

# Event-level fields that are repeated on every vehicle row created below.
ntsb_meta_fields = [
    'Oid', 'MKey', 'Closed', 'CompletionStatus', 'HasSafetyRec',
    'HighestInjury', 'IsStudy', 'Mode', 'NtsbNumber',
    'OriginalPublishedDate', 'MostRecentReportType', 'ProbableCause',
    'City', 'Country', 'EventDate', 'State', 'Agency', 'BoardLaunch',
    'BoardMeetingDate', 'DocketDate', 'EventType', 'Launch', 'ReportDate',
    'ReportNum', 'ReportType', 'AirportId', 'AirportName', 'AnalysisNarrative',
    'FactualNarrative', 'PrelimNarrative', 'FatalInjuryCount', 'MinorInjuryCount',
    'SeriousInjuryCount', 'InvestigationClass', 'AccidentSiteCondition',
    'Latitude', 'Longitude', 'DocketOriginalPublishDate',
]

# Flatten the nested 'Vehicles' array: the result has one row per vehicle,
# with the vehicle fields prefixed by 'Vehicles.' and the event fields copied
# onto each row.
df_ntsb = pd.json_normalize(
    ntsb_raw_data,
    meta=ntsb_meta_fields,
    record_path=['Vehicles'],
    record_prefix='Vehicles.',
)

print('\n--- Flattened NTSB DataFrame (including Vehicles info): ---')

# Combine the three injury counts into a single column.
injury_count_cols = ['FatalInjuryCount', 'MinorInjuryCount', 'SeriousInjuryCount']
df_ntsb['TotalInjuryCount'] = df_ntsb[injury_count_cols].sum(axis=1)

# Columns that are not needed downstream (the individual injury counts are
# replaced by TotalInjuryCount).
ntsb_unused_cols = [
    # event-level
    'AnalysisNarrative', 'FactualNarrative', 'PrelimNarrative', 'InvestigationClass',
    'BoardLaunch', 'BoardMeetingDate', 'Launch', 'IsStudy', 'OriginalPublishedDate',
    'DocketOriginalPublishDate', 'ReportType', 'ReportNum', 'ReportDate',
    'MostRecentReportType', 'FatalInjuryCount', 'MinorInjuryCount',
    'SeriousInjuryCount', 'DocketDate', 'Mode', 'HasSafetyRec', 'CompletionStatus',
    'Closed', 'AccidentSiteCondition',
    # vehicle-level
    'Vehicles.AircraftCategory', 'Vehicles.AmateurBuilt', 'Vehicles.EventID',
    'Vehicles.AirMedical', 'Vehicles.AirMedicalType', 'Vehicles.flightScheduledType',
    'Vehicles.flightServiceType', 'Vehicles.flightTerminalType',
    'Vehicles.RegisteredOwner', 'Vehicles.RegulationFlightConductedUnder',
    'Vehicles.RepGenFlag', 'Vehicles.RevenueSightseeing',
    'Vehicles.SecondPilotPresent', 'Vehicles.Damage',
]
df_ntsb.drop(columns=ntsb_unused_cols, inplace=True)

# Rows without an event date are dropped.
df_ntsb = df_ntsb.dropna(subset=['EventDate'])

# Type conversion.
# NOTE(review): errors='coerce' turns unparseable values into NaN, which a
# plain .astype(int) then rejects. Only NumberOfEngines has a fallback (0);
# the other integer columns fail loudly if a value cannot be parsed.
df_ntsb['EventDate'] = pd.to_datetime(df_ntsb['EventDate']).dt.tz_localize(None)  # drop timezone info
df_ntsb['Vehicles.VehicleNumber'] = pd.to_numeric(df_ntsb['Vehicles.VehicleNumber'], errors='coerce').astype(int)
df_ntsb['MKey'] = pd.to_numeric(df_ntsb['MKey'], errors='coerce').astype(int)
df_ntsb['Vehicles.NumberOfEngines'] = pd.to_numeric(df_ntsb['Vehicles.NumberOfEngines'], errors='coerce').fillna(0).astype(int)
df_ntsb['Latitude'] = pd.to_numeric(df_ntsb['Latitude'], errors='coerce').astype(float)
df_ntsb['Longitude'] = pd.to_numeric(df_ntsb['Longitude'], errors='coerce').astype(float)
df_ntsb['TotalInjuryCount'] = pd.to_numeric(df_ntsb['TotalInjuryCount'], errors='coerce').astype(int)

# 'AccidentSiteCondition' used to be listed here as well, but it is dropped
# above, so that entry never had any effect.
categorical_cols = [
    'Vehicles.DamageLevel',
    'Vehicles.ExplosionType',
    'Vehicles.FireType',
    'HighestInjury',
    'EventType',
]

for col in categorical_cols:
    if col in df_ntsb.columns:
        df_ntsb[col] = df_ntsb[col].astype('category')

# Lowercase every string value; non-string values are left untouched.
df_ntsb = df_ntsb.map(lambda x: x.lower() if isinstance(x, str) else x)

# info() prints its report itself and returns None, so it is not wrapped in print().
df_ntsb.info()

print('\n--- Saving filtered NTSB DataFrame... ---')
# Create the output folder on first run; to_pickle does not create it.
os.makedirs(PATH+'filtered', exist_ok=True)
df_ntsb.to_pickle(PATH+'filtered/ntsb.pkl')


--- NTSB JSON loaded: 22992 total records found ---

--- Flattened NTSB DataFrame (including Vehicles info): ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23403 entries, 0 to 23402
Data columns (total 27 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   Vehicles.VehicleNumber        23403 non-null  int64         
 1   Vehicles.DamageLevel          23400 non-null  category      
 2   Vehicles.ExplosionType        21880 non-null  category      
 3   Vehicles.FireType             23321 non-null  category      
 4   Vehicles.SerialNumber         23283 non-null  object        
 5   Vehicles.Make                 23402 non-null  object        
 6   Vehicles.Model                23398 non-null  object        
 7   Vehicles.NumberOfEngines      23403 non-null  int64         
 8   Vehicles.RegistrationNumber   23397 non-null  object        
 9   Vehicles.FlightOperationType  21593 non-null  

#### Load Airline Traffic CSV Data

In [113]:
# Load the airline traffic table.
df_airline_traffic = pd.read_csv(PATH+AIR_TRAFFIC_DATA, encoding='utf-8')

print(f'\n--- Airline CSV loaded: {df_airline_traffic.shape[0]} rows, {df_airline_traffic.shape[1]} columns ---')

# Drop the columns that are not used; Year, Month and the *_Pax / *_Flt
# columns remain.
df_airline_traffic.drop(columns=['Dom_RPM','Int_RPM','RPM','Dom_ASM','Int_ASM','ASM','Dom_LF','Int_LF','LF'], inplace=True)

# Strip commas from every string cell so the values can be parsed as numbers.
df_airline_traffic = df_airline_traffic.replace(',', '', regex=True)

# Convert every column to numeric. Anything that cannot be parsed becomes NaN;
# report the affected columns explicitly instead of letting the integer cast
# fail with a generic "cannot convert NaN" error.
airline_numeric = df_airline_traffic.apply(pd.to_numeric, errors='coerce')
unparsed_cols = airline_numeric.columns[airline_numeric.isna().any()].tolist()
if unparsed_cols:
    raise ValueError(f'Missing or non-numeric values in airline columns: {unparsed_cols}')
df_airline_traffic = airline_numeric.astype(int)

# info() prints its report itself and returns None, so it is not wrapped in print().
df_airline_traffic.info()

print('\n--- Saving filtered Airline DataFrame... ---')
# Create the output folder on first run; to_pickle does not create it.
os.makedirs(PATH+'filtered', exist_ok=True)
df_airline_traffic.to_pickle(PATH+'filtered/airline.pkl')


--- Airline CSV loaded: 249 rows, 17 columns ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 249 entries, 0 to 248
Data columns (total 8 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   Year     249 non-null    int64
 1   Month    249 non-null    int64
 2   Dom_Pax  249 non-null    int64
 3   Int_Pax  249 non-null    int64
 4   Pax      249 non-null    int64
 5   Dom_Flt  249 non-null    int64
 6   Int_Flt  249 non-null    int64
 7   Flt      249 non-null    int64
dtypes: int64(8)
memory usage: 15.7 KB
None

--- Saving filtered Airline DataFrame... ---


#### Load Aircraft CSV Data

In [114]:
# Load the aircraft table.
df_aircraft = pd.read_csv(PATH+AIRCRAFT_DATA, encoding='utf-8')

print(f'\n--- Aircraft CSV loaded: {df_aircraft.shape[0]} rows, {df_aircraft.shape[1]} columns ---')

# Drop the unnamed first column of the CSV and the unused 'retired' column.
df_aircraft.drop(columns=['Unnamed: 0', 'retired'], inplace=True)

# Lowercase the aircraft names.
df_aircraft['aircraft'] = df_aircraft['aircraft'].str.lower()

# Type conversion.
# NOTE(review): nbBuilt and startDate use a plain int cast, which fails if any
# value is missing or unparseable; endDate may be missing and therefore uses
# the nullable 'Int64' dtype.
df_aircraft['nbBuilt'] = pd.to_numeric(df_aircraft['nbBuilt'], errors='coerce').astype(int)
df_aircraft['startDate'] = pd.to_numeric(df_aircraft['startDate'], errors='coerce').astype(int)
df_aircraft['endDate'] = pd.to_numeric(df_aircraft['endDate'], errors='coerce').astype('Int64')

# info() prints its report itself and returns None, so it is not wrapped in print().
df_aircraft.info()

print('\n--- Saving filtered Aircraft DataFrame... ---')
# Create the output folder on first run; to_pickle does not create it.
os.makedirs(PATH+'filtered', exist_ok=True)
df_aircraft.to_pickle(PATH+'filtered/aircraft.pkl')


--- Aircraft CSV loaded: 1266 rows, 6 columns ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1266 entries, 0 to 1265
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   aircraft   1266 non-null   object
 1   nbBuilt    1266 non-null   int64 
 2   startDate  1266 non-null   int64 
 3   endDate    997 non-null    Int64 
dtypes: Int64(1), int64(2), object(1)
memory usage: 40.9+ KB
None

--- Saving filtered Aircraft DataFrame... ---


### 2. Data Profiling

In [61]:
def profile_dataframe(df, name='DataFrame'):
    """Build a per-column profile of a DataFrame.

    Prints a short header (name, row count, column count) and returns one
    profile row per column of ``df``.

    Args:
        df: The DataFrame to profile.
        name: Label used in the printed header only.

    Returns:
        A DataFrame with the columns ``Column``, ``DataType``, ``TotalCount``,
        ``NonNullCount``, ``NumMissing``, ``MissingPerc``, ``Cardinality``
        (distinct values, missing counted as one value), ``Mode``,
        ``ModeFreq``, ``Mean``, ``Min``, ``Q25``, ``Q50``, ``Q75``, ``Max``
        and ``Std``. The statistics from ``Mean`` onwards are filled for
        numeric and datetime columns (``Std`` for numeric only) and are NaN
        otherwise.
    """
    print(f'\n=== Profiling {name} ===')
    print(f'Total Rows: {len(df)}')
    print(f'Total Columns: {len(df.columns)}\n')

    profile_results = []

    for col in df.columns:
        series = df[col]
        col_dtype = series.dtype

        # Basic counts
        total_count = len(series)
        missing_vals = series.isna().sum()
        non_null_count = total_count - missing_vals
        # An empty frame has no meaningful percentage; avoid the 0/0 division.
        missing_perc = (missing_vals / total_count) * 100 if total_count else np.nan
        unique_vals = series.nunique(dropna=False)

        # Mode & frequency. Best effort: values that pandas cannot count or
        # compare leave both fields as NaN. Only these errors are swallowed,
        # so KeyboardInterrupt and unrelated bugs still surface.
        try:
            modes = series.mode(dropna=True)
            mode_val = modes.iloc[0] if len(modes) > 0 else np.nan
            mode_freq = (series == mode_val).sum(skipna=True)
        except (TypeError, ValueError):
            mode_val, mode_freq = np.nan, np.nan

        # Distribution statistics default to NaN (categorical/object columns
        # keep these defaults).
        mean_ = min_ = q25 = q50 = q75 = max_ = std_ = np.nan

        is_numeric = pd.api.types.is_numeric_dtype(series)
        is_datetime = pd.api.types.is_datetime64_any_dtype(series)

        # Datetime columns get the same statistics as numeric ones (as
        # Timestamps), but only when they hold at least one value, and
        # without a standard deviation.
        if is_numeric or (is_datetime and non_null_count > 0):
            mean_ = series.mean(skipna=True)
            min_ = series.min(skipna=True)
            q25 = series.quantile(0.25)
            q50 = series.quantile(0.50)
            q75 = series.quantile(0.75)
            max_ = series.max(skipna=True)
            if is_numeric:
                std_ = series.std(skipna=True)

        profile_results.append((
            col,
            str(col_dtype),
            total_count,
            non_null_count,
            missing_vals,
            round(missing_perc, 2),
            unique_vals,
            mode_val,
            mode_freq,
            mean_,
            min_,
            q25,
            q50,
            q75,
            max_,
            std_
        ))

    columns = [
        'Column', 'DataType', 'TotalCount', 'NonNullCount', 'NumMissing',
        'MissingPerc', 'Cardinality', 'Mode', 'ModeFreq',
        'Mean', 'Min', 'Q25', 'Q50', 'Q75', 'Max', 'Std'
    ]

    prof_df = pd.DataFrame(profile_results, columns=columns)

    return prof_df

#### NTSB Data Profile

In [62]:
# Profile the NTSB table, show it inline and keep a CSV copy.
ntsb_profile = profile_dataframe(df_ntsb, name='NTSB Data')
display(HTML(ntsb_profile.to_html()))
# Create the output folder on first run; to_csv does not create it.
os.makedirs(PATH+'profiling', exist_ok=True)
ntsb_profile.to_csv(PATH+'profiling/ntsb_profile.csv', index=False)


=== Profiling NTSB Data ===
Total Rows: 23403
Total Columns: 27



Unnamed: 0,Column,DataType,TotalCount,NonNullCount,NumMissing,MissingPerc,Cardinality,Mode,ModeFreq,Mean,Min,Q25,Q50,Q75,Max,Std
0,Vehicles.VehicleNumber,int64,23403,23403,0,0.0,3,1,22986,1.018117,1,1.0,1.0,1.0,3,0.135603
1,Vehicles.DamageLevel,category,23403,23400,3,0.01,6,substantial,19718,,,,,,,
2,Vehicles.ExplosionType,category,23403,21880,1523,6.51,6,none,21306,,,,,,,
3,Vehicles.FireType,category,23403,23321,82,0.35,7,none,20993,,,,,,,
4,Vehicles.SerialNumber,object,23403,23283,120,0.51,21514,001,19,,,,,,,
5,Vehicles.Make,object,23403,23402,1,0.0,1098,cessna,8191,,,,,,,
6,Vehicles.Model,object,23403,23398,5,0.02,3362,172,762,,,,,,,
7,Vehicles.NumberOfEngines,int64,23403,23403,0,0.0,5,1,19416,1.13823,0,1.0,1.0,1.0,4,0.422117
8,Vehicles.RegistrationNumber,object,23403,23397,6,0.03,22386,unreg,27,,,,,,,
9,Vehicles.FlightOperationType,object,23403,21593,1810,7.73,22,pers,14516,,,,,,,


Insights from the data profile results:

- there are some `null` values for Latitude and Longitude --> we keep them as they are, but they must be handled when calling the Open-Meteo API
- there are fewer unique `NtsbNumber` values than rows --> for incidents involving more than one aircraft there is one row per aircraft: the `Vehicles.*` columns differ, while the incident-level columns are repeated (see the following example)

In [22]:
# Show every row recorded under one NtsbNumber, as an example of an event
# that spans more than one row.
df_ntsb.loc[df_ntsb['NtsbNumber']=='ops24la011']

Unnamed: 0,Vehicles.VehicleNumber,Vehicles.DamageLevel,Vehicles.ExplosionType,Vehicles.FireType,Vehicles.SerialNumber,Vehicles.Make,Vehicles.Model,Vehicles.NumberOfEngines,Vehicles.RegistrationNumber,Vehicles.FlightOperationType,Vehicles.OperatorName,Oid,MKey,HighestInjury,NtsbNumber,ProbableCause,City,Country,EventDate,State,Agency,EventType,AirportId,AirportName,Latitude,Longitude,TotalInjuryCount
39,1,none,none,none,c0218,diamond aircraft ind inc,da20-c1,1,n857pa,,diamond aircraft sales of kentucky llc,67ee2dab017de3d12ee03758,193529,,ops24la011,,north las vegas,usa,2023-12-09 13:06:00,nv,ntsb,occ,vgt,north las vegas,36.211268,-115.19968,0
40,2,none,none,none,1955,robinson helicopter,r44,1,n744af,,skyline helicopter tours llc,67ee2dab017de3d12ee03758,193529,,ops24la011,,north las vegas,usa,2023-12-09 13:06:00,nv,ntsb,occ,vgt,north las vegas,36.211268,-115.19968,0


#### Air Traffic Data Profile

In [84]:
# Profile the airline traffic table, show it inline and keep a CSV copy.
airline_profile = profile_dataframe(df_airline_traffic, name='Airline Data')
display(HTML(airline_profile.to_html()))
# Create the output folder on first run; to_csv does not create it.
os.makedirs(PATH+'profiling', exist_ok=True)
airline_profile.to_csv(PATH+'profiling/airline_profile.csv', index=False)


=== Profiling Airline Data ===
Total Rows: 249
Total Columns: 8



Unnamed: 0,Column,DataType,TotalCount,NonNullCount,NumMissing,MissingPerc,Cardinality,Mode,ModeFreq,Mean,Min,Q25,Q50,Q75,Max,Std
0,Year,int64,249,249,0,0.0,21,2003,12,2012.88,2003,2008.0,2013.0,2018.0,2023,6.002817
1,Month,int64,249,249,0,0.0,12,1,21,6.445783,1,3.0,6.0,9.0,12,3.442803
2,Dom_Pax,int64,249,249,0,0.0,249,2877290,1,55209710.0,2877290,50982170.0,56200104.0,60892131.0,75378157,10440550.0
3,Int_Pax,int64,249,249,0,0.0,249,136609,1,7392209.0,136609,6395022.0,7419187.0,8567847.0,12432615,2020273.0
4,Pax,int64,249,249,0,0.0,249,3013899,1,62601920.0,3013899,57664576.0,63899130.0,69447429.0,87810772,12292110.0
5,Dom_Flt,int64,249,249,0,0.0,249,217262,1,706750.7,217262,662000.0,709933.0,781804.0,890938,107055.0
6,Int_Flt,int64,249,249,0,0.0,248,63469,2,64736.17,4996,61615.0,66557.0,71924.0,82681,12129.62
7,Flt,int64,249,249,0,0.0,249,222280,1,771486.9,222280,727898.0,779011.0,848650.0,964102,115686.7


#### Aircraft Data Profile

In [100]:
# Profile the aircraft table, show it inline and keep a CSV copy.
aircraft_profile = profile_dataframe(df_aircraft, name='Aircraft Data')
display(HTML(aircraft_profile.to_html()))
# Create the output folder on first run; to_csv does not create it.
os.makedirs(PATH+'profiling', exist_ok=True)
aircraft_profile.to_csv(PATH+'profiling/aircraft_profile.csv', index=False)


=== Profiling Aircraft Data ===
Total Rows: 1266
Total Columns: 4



Unnamed: 0,Column,DataType,TotalCount,NonNullCount,NumMissing,MissingPerc,Cardinality,Mode,ModeFreq,Mean,Min,Q25,Q50,Q75,Max,Std
0,aircraft,object,1266,1266,0,0.0,1265,mcdonnell douglas f-4 phantom ii in uk service,2,,,,,,,
1,nbBuilt,int64,1266,1266,0,0.0,633,1,69,1252.535545,0.0,32.25,185.0,703.0,43400.0,3618.899938
2,startDate,int64,1266,1266,0,0.0,123,1936,33,1929.436019,1.0,1937.0,1951.0,1974.75,2015.0,224.918816
3,endDate,Int64,1266,997,269,21.25,119,1945,51,1930.302909,1.0,1938.0,1949.0,1979.0,2016.0,227.826756


Insights from the data profile results:

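- some `startDate` and `endDate` values are implausibly small (e.g. 1, 2, 22, 222, 500) although they are supposed to be years; in the rows listed below many of them coincide with a number in the aircraft name (e.g. `bell 222`, `embraer legacy 500`, `lockheed martin f-22 raptor`), which suggests a parsing error in the source data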

In [110]:
# Rows whose start or end "year" is implausibly small (< 1000).
df_filtered = df_aircraft[(df_aircraft['startDate'] < 1000) | (df_aircraft['endDate'] < 1000)]
# Highlight the offending cells. endDate is a nullable Int64 column, so a
# missing value arrives here as pd.NA; `pd.NA < 1000` is itself NA and cannot
# be used as a condition, hence the explicit notna() check first.
df_filtered.style.map(
    lambda val: 'background-color: red' if pd.notna(val) and val < 1000 else '',
    subset=['startDate', 'endDate']
)

Unnamed: 0,aircraft,nbBuilt,startDate,endDate
82,lockheed c-5 galaxy,131,5,5.0
86,british aerospace nimrod aew3,8,11,11.0
171,schneider es-57 kingfisher,11,2,
190,bell 222,230,222,1991.0
284,flitfire,49,10,10.0
308,grumman c-2 greyhound,58,2,2.0
498,chu hummingbird,2,2,2.0
514,embraer legacy 500,500,500,
518,lockheed martin f-22 raptor,195,22,22.0
536,gallaudet d-4,2,2,2.0


### 3. Data Cleaning

In [None]:
# todo: fix the data issue

## Weather Data Query

In [None]:
PKL_FILE = PATH+"filtered/ntsb.pkl"
WEATHER_CACHE_FILE = PATH+"weather_results.json"
MAX_CALLS_PER_DAY = 10000   # daily budget, in the same units as COST_PER_CALL
COST_PER_CALL = 1.9         # budget consumed by one request
REQUEST_TIMEOUT = 30        # seconds; without a timeout a stalled connection blocks forever
SAVE_EVERY = 100            # write the cache to disk after this many requests

WEATHER_ENDPOINT = "https://archive-api.open-meteo.com/v1/archive"
# Hourly variables requested for every accident (identical for all requests).
HOURLY_VARIABLES = ",".join([
    "temperature_2m",
    "relative_humidity_2m",
    "dew_point_2m",
    "pressure_msl",
    "surface_pressure",
    "precipitation",
    "rain",
    "snowfall",
    "cloud_cover",
    "cloud_cover_low",
    "cloud_cover_mid",
    "cloud_cover_high",
    "wind_speed_10m",
    "wind_speed_100m",
    "wind_direction_10m",
    "wind_direction_100m",
    "wind_gusts_10m",
    "weather_code",
    "snow_depth"
])


def _save_weather_cache(cache):
    """Write the weather cache to WEATHER_CACHE_FILE as JSON."""
    with open(WEATHER_CACHE_FILE, 'w', encoding='utf-8') as f:
        json.dump(cache, f)


# --------------------------------------------------------------------------
# 1) Read NTSB dataframe
# --------------------------------------------------------------------------

df_ntsb = pd.read_pickle(PKL_FILE)

# --------------------------------------------------------------------------
# 2) Identify unique accidents that need API calls
#    (multi-vehicle events have one row per vehicle but need one request)
# --------------------------------------------------------------------------
unique_accidents = df_ntsb[['NtsbNumber', 'Latitude', 'Longitude', 'EventDate']].drop_duplicates()

# --------------------------------------------------------------------------
# 3) Load partial results from disk if they exist
#    This lets us resume without losing progress
# --------------------------------------------------------------------------
weather_results_file = WEATHER_CACHE_FILE
if os.path.exists(weather_results_file):
    with open(weather_results_file, 'r', encoding='utf-8') as f:
        weather_data_cache = json.load(f)
else:
    # key = "<NtsbNumber>_<date>_<lat>_<lon>", value = the 'hourly' block of the API response
    weather_data_cache = {}

# Budget used in this run, the number of requests actually sent, and the
# number of accidents skipped because they have no coordinates.
calls_made_today = 0
requests_sent = 0
skipped_no_coords = 0

# --------------------------------------------------------------------------
# 4) Loop over each unique accident, calling the weather API if needed
# --------------------------------------------------------------------------
for idx, row in unique_accidents.iterrows():
    ntsb_number = row["NtsbNumber"]
    date_str = pd.to_datetime(row["EventDate"]).strftime('%Y-%m-%d')
    lat, lon = row["Latitude"], row["Longitude"]
    cache_key = f"{ntsb_number}_{date_str}_{lat}_{lon}"

    # Skip if we already have an entry for this accident
    if cache_key in weather_data_cache:
        continue

    # Without coordinates there is nothing to query; do not spend budget on a
    # request that cannot succeed.
    if pd.isna(lat) or pd.isna(lon):
        skipped_no_coords += 1
        continue

    if calls_made_today + COST_PER_CALL > MAX_CALLS_PER_DAY:
        print("Reached daily limit. Saving partial results and exiting.")
        break

    # ----------------------------------------------------------------------
    # 5) Call the Open-Meteo API for the date & coordinates
    # ----------------------------------------------------------------------
    params = {
        "latitude": lat,
        "longitude": lon,
        "start_date": date_str,
        "end_date": date_str,
        "hourly": HOURLY_VARIABLES,
        "timezone": "GMT"
    }

    # Every attempt counts against the budget, whatever its outcome.
    calls_made_today += COST_PER_CALL
    requests_sent += 1

    try:
        response = requests.get(WEATHER_ENDPOINT, params=params, timeout=REQUEST_TIMEOUT)
        status = response.status_code
        if status == 200:
            # Store the whole 'hourly' block; the hour of interest is picked later
            weather_data_cache[cache_key] = response.json().get('hourly', {})
            print(f"Success for {cache_key}")
        elif status == 429:
            # HTTP 429 = too many requests: further calls would fail as well.
            # Nothing is cached, so this accident is retried on the next run.
            print(f"Rate limited by the API at {cache_key}. Saving partial results and exiting.")
            break
        else:
            print(f"API call failed for {cache_key}: status {status}")
            if 400 <= status < 500:
                # The request itself was rejected; retrying would not help,
                # so remember the failure as an empty entry.
                weather_data_cache[cache_key] = {}
            # Server-side errors (5xx) are not cached so they are retried on
            # the next run.
    except (requests.RequestException, ValueError) as e:
        # Network problem, timeout or undecodable body: treated as transient
        # and not cached, so the accident is retried on the next run.
        print(f"Error calling API for {cache_key}: {e}")

    # ----------------------------------------------------------------------
    # 6) Write partial results every SAVE_EVERY requests to disk so we don't
    #    lose them in case we break out or crash
    # ----------------------------------------------------------------------
    if requests_sent % SAVE_EVERY == 0:
        print(f"Sent {requests_sent} requests. Saving partial results.")
        _save_weather_cache(weather_data_cache)

# ------------------------------------------------------------------
# Finished or stopped by limit. Write final cache to disk.
# ------------------------------------------------------------------
_save_weather_cache(weather_data_cache)
if skipped_no_coords:
    print(f"Skipped {skipped_no_coords} accidents without coordinates.")
print("All done. Weather data cached in:", WEATHER_CACHE_FILE)

## Steps for the next phase (tentative)

In [None]:
# --------------------------------------------------------------------------
# 7) Once finished, we have a cache (weather_data_cache) with keys for each
#    unique NTSB event and the associated weather data
# --------------------------------------------------------------------------

# Example of merging the cached weather back into df_ntsb. For illustration
# only the 17:00 values of two variables are extracted, giving a table with
#   [NtsbNumber, temperature_2m_17h, precipitation_17h]
weather_columns = ["NtsbNumber", "temperature_2m_17h", "precipitation_17h"]

# One record per NtsbNumber, so the left merge below cannot multiply rows.
records_by_event = {}
for key, hourly_data in weather_data_cache.items():
    # Cache keys look like "<NtsbNumber>_<date>_<lat>_<lon>". Split from the
    # right so an NtsbNumber that itself contains '_' is kept whole.
    ntsb_number = key.rsplit("_", 3)[0]

    weather_record = {
        "NtsbNumber": ntsb_number,
        "temperature_2m_17h": None,
        "precipitation_17h": None
    }

    # Empty entries (failed requests) and responses without a time axis keep
    # the None placeholders.
    times = hourly_data.get("time", []) if hourly_data else []
    if times:
        # Build the 17:00 timestamp from the date of the first entry,
        # e.g. "2023-12-31T17:00".
        target_time = f"{times[0][:10]}T17:00"
        try:
            hour_idx = times.index(target_time)
            weather_record["temperature_2m_17h"] = hourly_data["temperature_2m"][hour_idx]
            weather_record["precipitation_17h"] = hourly_data["precipitation"][hour_idx]
        except (ValueError, KeyError, IndexError):
            # 17:00 is not in the list, or a variable is missing or shorter
            # than the time axis: leave the remaining fields as None.
            pass

    # If an event appears under several cache keys, keep the first record
    # that actually carries data.
    previous = records_by_event.get(ntsb_number)
    if previous is None or previous["temperature_2m_17h"] is None:
        records_by_event[ntsb_number] = weather_record

records_for_merge = list(records_by_event.values())

# Passing the columns explicitly keeps 'NtsbNumber' available for the merge
# even when the cache is empty.
df_weather = pd.DataFrame(records_for_merge, columns=weather_columns)

# Left merge: every NTSB row is kept, with NaN weather where none is cached.
df_merged = df_ntsb.merge(df_weather, on="NtsbNumber", how="left")

print(df_merged.head())