In [None]:
import pandas as pd
import json
#import pymongo as pm
#import mysql.connector
import time
import requests
import json
import py_stringmatching as sm
import re

## Data cleaning
1. Load the `.csv` and `.json` dataset;
2. Drop the rows that do not contain the required data
3. Fill the `na` cells with a predefined value
4. Drop any duplicates
5. Convert the string data into the proper data type

In [4]:
PATH = 'data_sources'

# The three per-severity injury counters that get merged into one total
INJURY_COUNT_COLUMNS = ['FatalInjuryCount', 'MinorInjuryCount', 'SeriousInjuryCount']

# NTSB columns removed from the frame (the injury counters are removed only
# after their total has been computed)
NTSB_DROPPED_COLUMNS = [
    'AnalysisNarrative', 'FactualNarrative', 'PrelimNarrative',
    'InvestigationClass', 'BoardLaunch', 'BoardMeetingDate', 'Launch', 'IsStudy',
    'OriginalPublishedDate', 'DocketOriginalPublishDate',
    'ReportType', 'ReportNum', 'ReportDate', 'MostRecentReportType',
    'FatalInjuryCount', 'MinorInjuryCount', 'SeriousInjuryCount',
    'DocketDate', 'Mode', 'HasSafetyRec', 'CompletionStatus', 'Closed',
]

# Airline-traffic columns removed from the frame
TRAFFIC_DROPPED_COLUMNS = [
    'Dom_RPM', 'Int_RPM', 'RPM',
    'Dom_ASM', 'Int_ASM', 'ASM',
    'Dom_LF', 'Int_LF', 'LF',
]

# --- Load both raw datasets ---
df_airline_traffic = pd.read_csv(f'{PATH}/u-s-airline-traffic-data.csv')
df_ntsb = pd.read_json(f'{PATH}/ntsb-us-2003-2023.json')

# Report whether either raw frame contains at least one missing value
print('Check NA values presence before data validation')
print(f'Airline traffic data frame: {df_airline_traffic.isna().to_numpy().any()}')
print(f'NTSB data frame: {df_ntsb.isna().to_numpy().any()}')

# --- Clean df_ntsb ---
# Parse EventDate and strip the timezone so it compares against naive timestamps
df_ntsb['EventDate'] = pd.to_datetime(df_ntsb['EventDate']).dt.tz_localize(None)

# No drop_duplicates step: per the original author's note there are no duplicates.
df_ntsb = (
    df_ntsb
    # lowercase every string cell, leave all other values untouched
    .map(lambda value: value.lower() if isinstance(value, str) else value)
    # one row-wise total replaces the three injury counters
    .assign(TotalInjuryCount=lambda frame: frame[INJURY_COUNT_COLUMNS].sum(axis=1))
    .drop(columns=NTSB_DROPPED_COLUMNS)
    # rows whose EventDate is missing (NaT) are discarded
    .dropna(subset=['EventDate'])
)

# --- Clean df_airline_traffic ---
df_airline_traffic = df_airline_traffic.drop(columns=TRAFFIC_DROPPED_COLUMNS)

df_airline_traffic

Check NA values presence before data validation
Airline traffic data frame: False
NTSB data frame: True


Unnamed: 0,Year,Month,Dom_Pax,Int_Pax,Pax,Dom_Flt,Int_Flt,Flt
0,2003,1,43032450,4905830,47938280,785160,57667,842827
1,2003,2,41166780,4245366,45412146,690351,51259,741610
2,2003,3,49992700,5008613,55001313,797194,58926,856120
3,2003,4,47033260,4345444,51378704,766260,55005,821265
4,2003,5,49152352,4610834,53763186,789397,55265,844662
...,...,...,...,...,...,...,...,...
244,2023,5,71423653,10358666,81782319,667331,71924,739255
245,2023,6,72482621,11544505,84027126,661293,75279,736572
246,2023,7,75378157,12432615,87810772,684939,79738,764677
247,2023,8,71477988,11572149,83050137,691482,77137,768619


In [None]:
# Keep only the events inside the study window that happened in the USA.

# Sanity check: span of event dates actually present in the data
print("Earliest Date:", df_ntsb['EventDate'].min())
print("Latest Date:", df_ntsb['EventDate'].max())

# Study window, timezone-naive and inclusive on both ends
start_date = pd.Timestamp('2003-01-01')
end_date = pd.Timestamp('2023-12-31')

# One boolean mask per criterion, combined when selecting rows
within_window = df_ntsb['EventDate'].between(start_date, end_date)
in_usa = df_ntsb['Country'] == 'usa'
filtered_df = df_ntsb[within_window & in_usa]

print(filtered_df['State'].tolist())

# Last expression of the cell: rendered as the cell output
filtered_df

### open-meteo API call test

In [None]:
# Smoke test for the Open-Meteo archive API: request one day of hourly
# weather for a single coordinate and print the values for one chosen hour.

# Define the endpoint
endpoint = "https://archive-api.open-meteo.com/v1/archive"

# Hour to extract from the response, written the way it is looked up in the
# response's "time" list (the request asks for timezone GMT)
target_hour = "2023-12-31T17:00"

# Seconds to wait for the server; without a timeout requests.get() can block
# forever on an unresponsive connection
request_timeout_s = 30

# Define the parameters
params = {
    "latitude": 41.610278,
    "longitude": -90.588361,
    "start_date": "2023-12-31",
    "end_date": "2023-12-31",
    # the API takes the hourly variables as one comma-separated string
    "hourly": ",".join([
        "temperature_2m",
        "relative_humidity_2m",
        "dew_point_2m",
        "pressure_msl",
        "surface_pressure",
        "precipitation",
        "rain",
        "snowfall",
        "cloud_cover",
        "cloud_cover_low",
        "cloud_cover_mid",
        "cloud_cover_high",
        "wind_speed_10m",
        "wind_speed_100m",
        "wind_direction_10m",
        "wind_direction_100m",
        "wind_gusts_10m",
        "weather_code",
        "snow_depth"
    ]),
    "timezone": "GMT"
}

# Make the request (raises requests.Timeout if the server does not answer in time)
response = requests.get(endpoint, params=params, timeout=request_timeout_s)

# Check if the request was successful
if response.status_code == 200:
    data = response.json()
    # Process the data as needed
    print(data)
    time_series = data["hourly"]["time"]
    try:
        # every hourly variable is a list aligned with "time", so a single
        # index selects the same hour in all of them
        idx = time_series.index(target_hour)
        selected_data = {k: v[idx] for k, v in data["hourly"].items() if k != "time"}
        print(f"Weather data at {target_hour}Z:")
        for key, val in selected_data.items():
            print(f"{key}: {val}")
    except ValueError:
        # list.index() raises ValueError when the hour is absent
        print("Selected hour not found in response.")
else:
    print(f"Error: {response.status_code}")


{'latitude': 41.581722, 'longitude': -90.64935, 'generationtime_ms': 0.3064870834350586, 'utc_offset_seconds': 0, 'timezone': 'GMT', 'timezone_abbreviation': 'GMT', 'elevation': 228.0, 'hourly_units': {'time': 'iso8601', 'temperature_2m': '°C', 'relative_humidity_2m': '%', 'dew_point_2m': '°C', 'pressure_msl': 'hPa', 'surface_pressure': 'hPa', 'precipitation': 'mm', 'rain': 'mm', 'snowfall': 'cm', 'cloud_cover': '%', 'cloud_cover_low': '%', 'cloud_cover_mid': '%', 'cloud_cover_high': '%', 'wind_speed_10m': 'km/h', 'wind_speed_100m': 'km/h', 'wind_direction_10m': '°', 'wind_direction_100m': '°', 'wind_gusts_10m': 'km/h', 'weather_code': 'wmo code', 'snow_depth': 'm'}, 'hourly': {'time': ['2023-12-31T00:00', '2023-12-31T01:00', '2023-12-31T02:00', '2023-12-31T03:00', '2023-12-31T04:00', '2023-12-31T05:00', '2023-12-31T06:00', '2023-12-31T07:00', '2023-12-31T08:00', '2023-12-31T09:00', '2023-12-31T10:00', '2023-12-31T11:00', '2023-12-31T12:00', '2023-12-31T13:00', '2023-12-31T14:00', '202

In [None]:

##### Rebuild df_ntsb from the raw JSON, one row per entry of each record's 'Vehicles' list
# NOTE(review): the first cell reads this same file name from f'{PATH}/...';
# confirm which location is the intended one.
NTSB_DATA = "ntsb-us-2003-2023.json"

with open(NTSB_DATA, 'r', encoding='utf-8') as f:
    ntsb_raw_data = json.load(f)

# Each record is one accident/incident entry in a list
print(f'\n--- NTSB JSON loaded: {len(ntsb_raw_data)} total records found ---')

# Flatten: every element of a record's 'Vehicles' array becomes a row
# (columns prefixed with 'Vehicles.'), and the listed top-level fields are
# repeated on each of those rows as metadata.
df_ntsb = pd.json_normalize(ntsb_raw_data,
                            meta=[
                                'Oid','MKey','Closed','CompletionStatus','HasSafetyRec',
                                'HighestInjury','IsStudy','Mode','NtsbNumber',
                                'OriginalPublishedDate','MostRecentReportType','ProbableCause',
                                'City','Country','EventDate','State','Agency','BoardLaunch',
                                'BoardMeetingDate','DocketDate','EventType','Launch','ReportDate',
                                'ReportNum','ReportType','AirportId','AirportName','AnalysisNarrative',
                                'FactualNarrative','PrelimNarrative','FatalInjuryCount','MinorInjuryCount',
                                'SeriousInjuryCount','InvestigationClass','AccidentSiteCondition',
                                'Latitude','Longitude','DocketOriginalPublishDate'
                            ],
                            record_path=['Vehicles'],  # This flattens out the 'Vehicles' array
                            record_prefix='Vehicles.'
                           )

print('\n--- Flattened NTSB DataFrame (including Vehicles info): ---')

# combines all injury counts to 1 column
df_ntsb['TotalInjuryCount'] = df_ntsb[['FatalInjuryCount', 'MinorInjuryCount', 'SeriousInjuryCount']].sum(axis=1)

# dropping unnecessary columns
df_ntsb.drop(columns=['AnalysisNarrative','FactualNarrative','PrelimNarrative','InvestigationClass','BoardLaunch'
                      ,'BoardMeetingDate','Launch','IsStudy','OriginalPublishedDate','DocketOriginalPublishDate'
                      ,'ReportType','ReportNum','ReportDate','MostRecentReportType','FatalInjuryCount','MinorInjuryCount'
                      ,'SeriousInjuryCount','DocketDate','Mode','HasSafetyRec','CompletionStatus','Closed'
                      ,'Vehicles.AircraftCategory','Vehicles.AmateurBuilt','Vehicles.EventID','Vehicles.AirMedical'
                      ,'Vehicles.AirMedicalType','Vehicles.flightScheduledType','Vehicles.flightServiceType'
                      ,'Vehicles.flightTerminalType','Vehicles.RegisteredOwner','Vehicles.RegulationFlightConductedUnder'
                      ,'Vehicles.RepGenFlag','Vehicles.RevenueSightseeing','Vehicles.SecondPilotPresent','Vehicles.Damage'
                      ,'AccidentSiteCondition'], inplace=True)

# drop rows with a missing EventDate (the column still holds the raw JSON
# values here; it is converted to datetime just below)
df_ntsb = df_ntsb.dropna(subset=['EventDate'])

# Type Conversion
# EventDate: parse, then strip the timezone so the column is timezone-naive
df_ntsb['EventDate'] = pd.to_datetime(df_ntsb['EventDate']).dt.tz_localize(None)
# errors='coerce' turns unparsable values into NaN; where no fillna follows,
# .astype(int) then raises on that NaN, so bad values fail loudly instead of
# being stored.
df_ntsb['Vehicles.VehicleNumber'] = pd.to_numeric(df_ntsb['Vehicles.VehicleNumber'], errors='coerce').astype(int)
df_ntsb['MKey'] = pd.to_numeric(df_ntsb['MKey'], errors='coerce').astype(int)
# a missing/unparsable engine count is stored as 0
df_ntsb['Vehicles.NumberOfEngines'] = pd.to_numeric(df_ntsb['Vehicles.NumberOfEngines'], errors='coerce').fillna(0).astype(int)
# unparsable coordinates stay NaN
df_ntsb['Latitude'] = pd.to_numeric(df_ntsb['Latitude'], errors='coerce').astype(float)
df_ntsb['Longitude'] = pd.to_numeric(df_ntsb['Longitude'], errors='coerce').astype(float)
df_ntsb['TotalInjuryCount'] = pd.to_numeric(df_ntsb['TotalInjuryCount'], errors='coerce').astype(int)

# make all appropriate values lowercase
# This runs AFTER the datetime/numeric conversions (so the raw date strings are
# parsed unmodified) and BEFORE the categorical conversion: mapping a
# categorical column maps its categories, and the category dtype can be lost
# when two categories collapse to the same lowercase value.
df_ntsb = df_ntsb.map(lambda x: x.lower() if isinstance(x, str) else x)

categorical_cols = [
    'Vehicles.DamageLevel',
    'Vehicles.ExplosionType',
    'Vehicles.FireType',
    'HighestInjury',
    'EventType',
    'AccidentSiteCondition'  # dropped above, so the membership check below skips it
]

for col in categorical_cols:
    if col in df_ntsb.columns:
        df_ntsb[col] = df_ntsb[col].astype('category')


print(df_ntsb.head())

print('\n--- DataFrame Info ---')
df_ntsb.info()

In [None]:
def clean_text(s):
    """Normalise a value into a lowercase, space-separated token string.

    Every run of non-word characters (anything other than letters, digits
    and underscore) is replaced by a single space, the text is lowercased
    and surrounding whitespace is removed.

    Parameters
    ----------
    s : Any
        Value to normalise. Non-string values are converted with ``str()``.

    Returns
    -------
    str or None
        The normalised text, or ``None`` when ``s`` is a missing value
        (``None``, ``NaN``, ``pd.NA``, ``NaT``).
    """
    # Missing values must stay missing: str(None) / str(nan) would otherwise
    # become the literal text 'none' / 'nan', which a later dropna() cannot
    # detect. is_scalar() guards pd.isna() against list-like input, for which
    # it would return an array.
    if pd.api.types.is_scalar(s) and pd.isna(s):
        return None
    return re.sub(r'\W+', ' ', str(s)).lower().strip()

In [None]:

# Fuzzy-match the aircraft model of every NTSB vehicle row against the names
# in aircraft_data.csv and collect the pairs whose combined score is > 0.75.

# === 1. Dataset Loading ===
df_aircraft = pd.read_csv('aircraft_data.csv')  # path is relative to the working directory; make sure it is correct
df_ntsb_model = df_ntsb[['NtsbNumber', 'EventDate', 'Vehicles.SerialNumber',
                         'Vehicles.RegistrationNumber', 'Vehicles.Make', 'Vehicles.Model']].copy()

# === 2. Data Cleaning and Normalization ===
# Drop missing names BEFORE normalising: the cleaning step converts values to
# text, after which a missing value may no longer be recognised by dropna().
df_ntsb_model.dropna(subset=['Vehicles.Model'], inplace=True)
df_aircraft.dropna(subset=['aircraft'], inplace=True)

df_ntsb_model['Vehicles.Model'] = df_ntsb_model['Vehicles.Model'].apply(clean_text)
df_aircraft['aircraft'] = df_aircraft['aircraft'].apply(clean_text)

# === 3. Similarity Setup ===
jw = sm.JaroWinkler()
lev = sm.Levenshtein()
jac = sm.Jaccard()
qgram = sm.QgramTokenizer(qval=3)

# === 4. Matching with Q-gram and Numeric Filtering ===
df_ntsb_model['qgrams'] = df_ntsb_model['Vehicles.Model'].apply(lambda x: set(qgram.tokenize(x)))
df_aircraft['qgrams'] = df_aircraft['aircraft'].apply(lambda x: set(qgram.tokenize(x)))

# Per-aircraft values that do not depend on the NTSB row are computed once
# here instead of once per (NTSB row, aircraft row) pair.
aircraft_candidates = [
    (name, grams, re.findall(r'\d+', name), aircraft_start)
    for name, grams, aircraft_start in zip(
        df_aircraft['aircraft'], df_aircraft['qgrams'], df_aircraft['startDate']
    )
]

# Cache of accepted candidates per distinct cleaned NTSB model string. The
# q-grams, digit groups and all three scores depend only on that string, so
# rows sharing a model reuse the same result.
accepted_by_model = {}

matches = []

for _, ntsb_row in df_ntsb_model.iterrows():
    model_ntsb = ntsb_row['Vehicles.Model']

    if model_ntsb not in accepted_by_model:
        grams_ntsb = ntsb_row['qgrams']
        nums_ntsb = re.findall(r'\d+', model_ntsb)
        accepted = []

        for model_aircraft, grams_aircraft, nums_aircraft, aircraft_start in aircraft_candidates:
            # BLOCKING: at least 2 shared q-grams or a substring match
            blocking_pass = (
                len(grams_ntsb & grams_aircraft) >= 2 or
                model_aircraft in model_ntsb or
                model_ntsb in model_aircraft
            )
            if not blocking_pass:
                continue

            # Numeric Filter: numbers must match if present
            if nums_ntsb and nums_aircraft and nums_ntsb != nums_aircraft:
                continue  # the numbers do not match -> discard the pair

            # Computing the Three Similarity Scores
            # (Jaccard is computed on whitespace-separated word tokens)
            jw_score = jw.get_sim_score(model_ntsb, model_aircraft)
            lev_score = lev.get_sim_score(model_ntsb, model_aircraft)
            jac_score = jac.get_sim_score(model_ntsb.split(), model_aircraft.split())

            # Linear Rule
            final_score = 0.4 * jw_score + 0.3 * lev_score + 0.3 * jac_score

            if final_score > 0.75:
                accepted.append((aircraft_start, model_aircraft,
                                 jw_score, lev_score, jac_score, final_score))

        accepted_by_model[model_ntsb] = accepted

    # One output row per accepted aircraft, in aircraft-file order
    for aircraft_start, model_aircraft, jw_score, lev_score, jac_score, final_score in accepted_by_model[model_ntsb]:
        matches.append({
            'NtsbNumber': ntsb_row['NtsbNumber'],
            'startDate': aircraft_start,
            'Vehicles.SerialNumber': ntsb_row['Vehicles.SerialNumber'],
            'Vehicles.RegistrationNumber': ntsb_row['Vehicles.RegistrationNumber'],
            'Vehicles.Make': ntsb_row['Vehicles.Make'],
            'Vehicles.Model': model_ntsb,
            'Matched_Aircraft': model_aircraft,
            'JW_Score': round(jw_score, 3),
            'LEV_Score': round(lev_score, 3),
            'JAC_Score': round(jac_score, 3),
            'SimilarityScore': round(final_score, 4)
        })

# === 5. Final Output ===
# df_matches is always created (empty, but with the expected columns, when
# nothing matched) so that later cells can reference it without a NameError.
match_columns = [
    'NtsbNumber', 'startDate', 'Vehicles.SerialNumber', 'Vehicles.RegistrationNumber',
    'Vehicles.Make', 'Vehicles.Model', 'Matched_Aircraft',
    'JW_Score', 'LEV_Score', 'JAC_Score', 'SimilarityScore'
]
df_matches = pd.DataFrame(matches, columns=match_columns)

if df_matches.empty:
    print("⚠️ No matches found with the current rules.")
else:
    print(f"✅ Matches Found: {len(df_matches)}")
    print("📦 Columns:", df_matches.columns.tolist())
    df_matches = df_matches.sort_values(by='SimilarityScore', ascending=False)
    display(df_matches.head(10))


In [None]:
# Render the first 38 rows of the match table
display(df_matches.iloc[:38])