In [17]:
import pandas as pd
import numpy as np
from pathlib import Path
import re

In [82]:
df = pd.read_pickle("data_sources/filtered/ntsb.pkl")
df.sample(10)

Unnamed: 0,Vehicles.VehicleNumber,Vehicles.DamageLevel,Vehicles.ExplosionType,Vehicles.FireType,Vehicles.SerialNumber,Vehicles.Make,Vehicles.Model,Vehicles.NumberOfEngines,Vehicles.RegistrationNumber,Vehicles.FlightOperationType,...,Country,EventDate,State,Agency,EventType,AirportId,AirportName,Latitude,Longitude,TotalInjuryCount
11914,1,substantial,none,none,17276639,cessna,172p,1,n9853l,inst,...,usa,2011-10-06 01:30:00,,ntsb,acc,pgum,guam international airport,13.483611,144.791381,0
20610,1,substantial,on-ground,on-ground,18263383,cessna,182p,1,n9187g,pers,...,usa,2004-12-19 10:48:00,ca,ntsb,acc,ful,fullerton municipal airport,33.888053,-118.023612,2
8973,1,substantial,none,none,5109,christen industries inc,pitts s,1,n50xv,inst,...,usa,2014-08-12 11:52:00,fl,ntsb,acc,,,27.232221,-82.524169,2
18729,1,substantial,none,none,2235,aviat,a-1b,1,n166ma,pers,...,usa,2006-05-26 20:05:00,tx,ntsb,acc,,,28.941667,-96.536941,2
4934,1,substantial,none,none,18-8140,piper,pa18,1,n5373y,pers,...,usa,2018-08-31 19:00:00,ak,ntsb,acc,,,62.095554,-148.212219,0
7111,1,substantial,none,none,172s8937,cessna,172,1,n128rm,inst,...,usa,2016-07-18 16:00:00,mo,ntsb,acc,sus,spirit of st louis,38.660278,-90.645835,0
15775,1,substantial,none,none,12-2288,piper,pa-12,1,n3424m,bant,...,usa,2008-08-01 15:46:00,sc,ntsb,acc,cre,grand strand airport,33.811668,-78.723892,1
17558,1,substantial,,none,28-7990479,piper,pa-28-181,1,n2834u,pers,...,usa,2007-05-04 20:30:00,nv,ntsb,acc,las,mc carran intl,36.080001,-115.152221,1
17612,1,none,none,none,23724,boeing,757-24apf,2,n402up,,...,usa,2007-04-24 07:20:00,ca,ntsb,acc,bur,bob hope airport,34.200553,-118.358612,1
17014,1,destroyed,unknown,unknown,32-7200030,piper,pa-32-260,1,n5067t,pers,...,usa,2007-08-25 14:15:00,ny,ntsb,acc,,,40.7,-72.583335,1


In [None]:
ntsb_copied = df.copy()
ntsb_copied = ntsb_copied.rename(columns={"EventDate": "Date", "NtsbNumber": "ID", "State": "Location"}) # for schema matching 

seed = 10
np.random.seed(seed)

n = len(ntsb_copied)

# select 40% indices for later use
n_forty = int(np.floor(0.4 * n))
random_indices = np.random.choice(ntsb_copied.index, n_forty, replace=False)
print(f"random_indices count (40%): {len(random_indices)}")  # Should be ~0.4 * n

# select half of these indices to assign NaN in 'ID'
n_missing = int(np.floor(0.5 * n_forty))
missing_indices = np.random.choice(random_indices, n_missing, replace=False)

# assign NaN only to these missing_indices
ntsb_copied.loc[missing_indices, "ID"] = np.nan  # for slot filling

print(f"Number of NaNs assigned (should be 20% of n): {len(missing_indices)}")

# count total NaNs in 'ID' (including existing NaNs)
total_nans = ntsb_copied['ID'].isna().sum()
print(f"Total NaNs in 'ID' column: {total_nans}")

conflict_indices = np.setdiff1d(random_indices, missing_indices)

# transform strings for conflict resolution
for index, row in ntsb_copied.iterrows():
    airport = row["AirportName"]
    if pd.notna(airport):
        result = ' '.join([word[0] + '.' for word in airport.split()])
        ntsb_copied.loc[index, "AirportName"] = result

random_indices count (40%): 9361
Number of NaNs assigned (should be 20% of n): 4680
Total NaNs in 'ID' column: 4680


11914       g. i. a.
20610       f. m. a.
8973            None
18729           None
4934            None
7111     s. o. s. l.
15775       g. s. a.
17558       m. c. i.
17612       b. h. a.
17014           None
Name: AirportName, dtype: object

## Weather Data Fusion

In [None]:
# spatial & temporal thresholds
LAT_LON_EPS   = 0.10       # ≈ 11 km at mid-latitudes
MAX_TIME_DIFF = pd.Timedelta('3h')   # reject candidates > 3 h away

# --- 1. Load data -------------------------------------------------------------
ntsb_path    = Path("data_sources/filtered/ntsb.pkl")
weather_path = Path("data_sources/filtered/weather.pkl")

ntsb    = pd.read_pickle(ntsb_path)
weather = pd.read_pickle(weather_path)

# ensure correct dtypes
ntsb["EventDate"] = pd.to_datetime(ntsb["EventDate"], errors="coerce")
weather["time"]   = pd.to_datetime(weather["time"],   errors="coerce")

# --- 2. Blocking on event *date* ---------------------------------------------
ntsb["event_day"]    = ntsb["EventDate"].dt.date
weather["weather_day"] = weather["time"].dt.date

weather_by_day = {d: w.reset_index(drop=True)
                  for d, w in weather.groupby("weather_day")}

# --- 3. Similarity matching & temporal precedence -----------------------------
best_rows = []       # stores best-matching weather rows (or None)

for _, acc in ntsb.iterrows():
    day_candidates = weather_by_day.get(acc["event_day"], pd.DataFrame())
    if day_candidates.empty:
        best_rows.append(None); continue
    
    # coarse spatial filter  |lat/lon diff| < LAT_LON_EPS
    spatial = day_candidates[
        (day_candidates["time"].notna()) &
        (day_candidates["AccidentID"].notna()) &       # keeps malformed rows out
        (day_candidates["AccidentID"].str.contains('_'))  # quick sanity
    ].copy()

    spatial = spatial[
        (np.abs(spatial["AccidentID"].str.split('_').str[-2].astype(float) - acc["Latitude" ] ) < LAT_LON_EPS) &
        (np.abs(spatial["AccidentID"].str.split('_').str[-1].astype(float) - acc["Longitude"]) < LAT_LON_EPS)
    ]

    if spatial.empty:
        best_rows.append(None); continue
    
    # temporal distance to the accident moment
    spatial["time_diff"] = (spatial["time"] - acc["EventDate"]).abs()
    
    # keep the closest hour that is still within MAX_TIME_DIFF
    spatial = spatial[spatial["time_diff"] <= MAX_TIME_DIFF]
    
    best_rows.append(spatial.nsmallest(1, "time_diff").iloc[0] if not spatial.empty else None)

# --- 4. Assemble the fused dataset -------------------------------------------
weather_match_df = pd.DataFrame.from_records(
    [row.to_dict() if row is not None else {}          # convert None into an empty dict {}
     for row in best_rows],
    index=ntsb.index                                   # keeps row-alignment
)

accident_weather = pd.concat(
    [ntsb.reset_index(drop=True),
     weather_match_df.add_prefix("wx_")],              # prefix to avoid clashes
    axis=1
)

# --- 5. Quick diagnostics -----------------------------------------------------
total_accidents = len(ntsb)
matched         = accident_weather["wx_time"].notna().sum()
print(f"Matched {matched} of {total_accidents} accidents "
      f"({matched / total_accidents:.1%})")

if matched:
    print("\nTime difference (min) for matched rows:")
    print((accident_weather.loc[accident_weather.wx_time.notna(), "wx_time_diff"]
           .dt.total_seconds().div(60)
           .describe().round(2)))

    print("\nSpatial deltas (deg lat/lon) for matched rows:")
    lat_delta = np.abs(accident_weather["Latitude"] - accident_weather["wx_AccidentID"]
                       .str.split('_').str[-2].astype(float))
    lon_delta = np.abs(accident_weather["Longitude"] - accident_weather["wx_AccidentID"]
                       .str.split('_').str[-1].astype(float))
    print(pd.concat({"lat": lat_delta, "lon": lon_delta}, axis=1).describe().round(4))

accident_weather.drop(columns=["event_day","wx_AccidentID","wx_weather_day"], errors='ignore', inplace=True)
accident_weather.to_pickle("data_sources/fused/accident_weather.pkl")
accident_weather.to_csv("data_sources/fused/accident_weather.csv", index=False)

Matched 20858 of 23403 accidents (89.1%)

Time difference (min) for matched rows:
count    20858.00
mean        14.80
std         10.81
min          0.00
25%          5.00
50%         15.00
75%         25.00
max         58.00
Name: wx_time_diff, dtype: float64

Spatial deltas (deg lat/lon) for matched rows:
              lat         lon
count  20858.0000  20858.0000
mean       0.0000      0.0000
std        0.0010      0.0010
min        0.0000      0.0000
25%        0.0000      0.0000
50%        0.0000      0.0000
75%        0.0000      0.0000
max        0.0753      0.0881


## Matched Aircraft Data Fusion

In [None]:
# Load the datasets
accident_weather_path = 'data_sources/fused/accident_weather.pkl'
matched_results_path = 'data_sources/binding/matched_results.csv'

accident_weather_df = pd.read_pickle(accident_weather_path)
matched_results_df = pd.read_csv(matched_results_path)


def clean_text(s):
    """ Normalizzazione del testo: rimozione di caratteri speciali, lowercase e spazi extra. """
    return re.sub(r'\W+', ' ', str(s)).lower().strip()

# Pulizia dei dati
accident_weather_df['Vehicles.Model'] = accident_weather_df['Vehicles.Model'].apply(clean_text)
accident_weather_df['Vehicles.Make'] = accident_weather_df['Vehicles.Make'].apply(clean_text)

# Normalize casing for matching
matched_results_df['NtsbNumber'] = matched_results_df['NtsbNumber'].str.lower()
matched_results_df['EventDate'] = pd.to_datetime(matched_results_df['EventDate'], errors='coerce')
matched_results_df['Vehicles.SerialNumber'] = matched_results_df['Vehicles.SerialNumber'].str.lower()
matched_results_df['Vehicles.RegistrationNumber'] = matched_results_df['Vehicles.RegistrationNumber'].str.lower()
matched_results_df['Vehicles.Make'] = matched_results_df['Vehicles.Make'].str.lower()
matched_results_df['Vehicles.Model'] = matched_results_df['Vehicles.Model'].str.lower()

matched_results_df.drop(columns=["JW_Score","LEV_Score","Jac_Score","SimilarityScore","Matched_Aircraft_Model"], errors='ignore', inplace=True)


accident_weather_df['NtsbNumber'] = accident_weather_df['NtsbNumber'].str.lower()
accident_weather_df['EventDate'] = pd.to_datetime(accident_weather_df['EventDate'], errors='coerce')
accident_weather_df['Vehicles.SerialNumber'] = accident_weather_df['Vehicles.SerialNumber'].astype(str).str.lower()
accident_weather_df['Vehicles.RegistrationNumber'] = accident_weather_df['Vehicles.RegistrationNumber'].astype(str).str.lower()
accident_weather_df['Vehicles.Make'] = accident_weather_df['Vehicles.Make'].astype(str).str.lower()
accident_weather_df['Vehicles.Model'] = accident_weather_df['Vehicles.Model'].astype(str).str.lower()

accident_weather_df.drop(columns=["Vehicles.VehicleNumber"], errors='ignore', inplace=True)
accident_weather_df.rename(columns={"wx_time": "weather_time"}, inplace=True)

for key in accident_weather_df.columns:
    if key.startswith('wx_'):
        accident_weather_df.rename(columns={key: key[3:]}, inplace=True)

# Define the merge keys
merge_keys = ['NtsbNumber','EventDate','Vehicles.SerialNumber', 'Vehicles.RegistrationNumber', 'Vehicles.Make', 'Vehicles.Model']

# Perform the merge
fused_df = accident_weather_df.merge(
    matched_results_df,
    how='left',
    left_on=merge_keys,
    right_on=merge_keys
)

# Drop the duplicate matching columns from the right
for key in merge_keys:
    fused_df.drop(columns=[f"{key}_y"], errors='ignore', inplace=True)
    fused_df.rename(columns={f"{key}_x": key}, inplace=True)

# Save the resulting dataframe
fused_df.to_pickle('data_sources/fused/accident_weather_enriched.pkl')
fused_df.to_csv("data_sources/fused/accident_weather_enriched.csv", index=False)

# Compute matching stats
total_records = len(accident_weather_df)
matched_records = fused_df['Matched_Aircraft_Model'].notna().sum()
unmatched_records = total_records - matched_records
match_percentage = (matched_records / total_records) * 100

# Print statistics
print("Fusion complete. Enriched dataset saved to: data_sources/fused/accident_weather_enriched.pkl")
print("\n--- Matching Statistics ---")
print(f"Total records in original dataset: {total_records}")
print(f"Total records matched with binding CSV: {matched_records}")
print(f"Total unmatched records: {unmatched_records}")
print(f"Match percentage: {match_percentage:.2f}%")


Fusion complete. Enriched dataset saved to: data_sources/fused/accident_weather_enriched.pkl

--- Matching Statistics ---
Total records in original dataset: 23403
Total records matched with binding CSV: 4962
Total unmatched records: 18441
Match percentage: 21.20%


In [25]:
fused_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23403 entries, 0 to 23402
Data columns (total 49 columns):
 #   Column                        Non-Null Count  Dtype          
---  ------                        --------------  -----          
 0   Vehicles.DamageLevel          23400 non-null  category       
 1   Vehicles.ExplosionType        21880 non-null  category       
 2   Vehicles.FireType             23321 non-null  category       
 3   Vehicles.SerialNumber         23403 non-null  object         
 4   Vehicles.Make                 23403 non-null  object         
 5   Vehicles.Model                23403 non-null  object         
 6   Vehicles.NumberOfEngines      23403 non-null  int64          
 7   Vehicles.RegistrationNumber   23403 non-null  object         
 8   Vehicles.FlightOperationType  21593 non-null  object         
 9   Vehicles.OperatorName         11290 non-null  object         
 10  Oid                           23403 non-null  object         
 11  MKey           

## fixing issue between `engine_count` and `Vehicles.NumberOfEngines`

In [32]:
# Load the dataset
df_path = 'data_sources/fused/accident_weather_enriched.pkl'
df = pd.read_pickle(df_path)

# Convert columns to nullable integers
engine_count_int = df['engine_count'].astype('Int64')
vehicle_engines = df['Vehicles.NumberOfEngines'].astype('Int64')

# Rule 1: Fill NaNs in Vehicles.NumberOfEngines with engine_count
df['Vehicles.NumberOfEngines'] = vehicle_engines.combine_first(engine_count_int)

# Rule 2: If Vehicles.NumberOfEngines == 0 and engine_count > 0 → trust engine_count
mask_replace_zero = (
    (df['Vehicles.NumberOfEngines'] == 0) &
    (engine_count_int > 0)
)
df.loc[mask_replace_zero, 'Vehicles.NumberOfEngines'] = engine_count_int[mask_replace_zero]

# Rule 3: Overwrite in case of real conflict (≠ 0 and ≠ each other)
conflict_mask = (
    engine_count_int.notna() &
    df['Vehicles.NumberOfEngines'].notna() &
    (df['Vehicles.NumberOfEngines'] != engine_count_int) &
    (df['Vehicles.NumberOfEngines'] != 0) &
    (engine_count_int != 0)
)
df.loc[conflict_mask, 'Vehicles.NumberOfEngines'] = engine_count_int[conflict_mask]

# Drop auxiliary column
df.drop(columns=['engine_count'], inplace=True)

# Save cleaned and final dataset
final_pkl_path = 'data_sources/fused/accident_weather_final.pkl'
final_csv_path = 'data_sources/fused/accident_weather_final.csv'

df.to_pickle(final_pkl_path)
df.to_csv(final_csv_path, index=False)

print(f"✅ Fusion complete. Cleaned dataset saved to:\n  • {final_pkl_path}\n  • {final_csv_path}")
print(f"🔄 {conflict_mask.sum()} engine count conflicts were resolved by trusting the 'engine_count' value.")

✅ Fusion complete. Cleaned dataset saved to:
  • data_sources/fused/accident_weather_final.pkl
  • data_sources/fused/accident_weather_final.csv
🔄 8 engine count conflicts were resolved by trusting the 'engine_count' value.
