## Project Phase 1 - Aviation Accident Data Integration
### Group 03:
- Tommaso Tragno - fc64699
- Manuel Cardoso - fc56274
- Chen Cheng - fc64872
- Cristian Tedesco - fc65149

#### Setup

In [1]:
import pandas as pd
import numpy as np
import json
import requests
import matplotlib.pyplot as plt
from IPython.display import display, HTML
import os
import time
import seaborn as sns
import calendar
#import py_stringmatching as sm
import re

### 1. Data Loading

In [5]:
# Base directory of every input file; the paths built in this notebook are prefixed with it.
PATH = 'data_sources/'
# Sub-directory (relative to PATH) from which the pickled, filtered DataFrames are read back.
FILTERED_PATH = 'filtered/'

# File names of the raw data sources, resolved against PATH when they are loaded.
NTSB_DATA = 'ntsb-us-2003-2023.json'
AIR_TRAFFIC_DATA = 'u-s-airline-traffic-data.csv'
AIRCRAFT_DATA = 'aircraft_data.csv' #'aircraft_data_cleaned.csv' # the "cleaned" one contains the data cleaning part
WEATHER_DATA = 'weather_results.json'

#### Load NTSB JSON Data

In [None]:
# Load the raw NTSB export and build the filtered, typed `df_ntsb` frame (one row per vehicle).
with open(PATH+NTSB_DATA, 'r', encoding='utf-8') as f:
    ntsb_raw_data = json.load(f)

# Each record is one accident/incident entry in a list
print(f'\n--- NTSB JSON loaded: {len(ntsb_raw_data)} total records found ---')

# Flatten the nested 'Vehicles' array: every vehicle becomes its own row and the
# event-level fields listed in `meta` are repeated on each of those rows.
df_ntsb = pd.json_normalize(ntsb_raw_data,
                            meta=[
                                'Oid','MKey','Closed','CompletionStatus','HasSafetyRec',
                                'HighestInjury','IsStudy','Mode','NtsbNumber',
                                'OriginalPublishedDate','MostRecentReportType','ProbableCause',
                                'City','Country','EventDate','State','Agency','BoardLaunch',
                                'BoardMeetingDate','DocketDate','EventType','Launch','ReportDate',
                                'ReportNum','ReportType','AirportId','AirportName','AnalysisNarrative',
                                'FactualNarrative','PrelimNarrative','FatalInjuryCount','MinorInjuryCount',
                                'SeriousInjuryCount','InvestigationClass','AccidentSiteCondition',
                                'Latitude','Longitude','DocketOriginalPublishDate'
                            ],
                            record_path=['Vehicles'],  # This flattens out the 'Vehicles' array
                            record_prefix='Vehicles.'
                           )

print('\n--- Flattened NTSB DataFrame (including Vehicles info): ---')

# combines all injury counts to 1 column
df_ntsb['TotalInjuryCount'] = df_ntsb[['FatalInjuryCount', 'MinorInjuryCount', 'SeriousInjuryCount']].sum(axis=1)

# dropping unnecessary columns (re-assignment instead of inplace keeps the cell re-runnable as a unit)
df_ntsb = df_ntsb.drop(columns=['AnalysisNarrative','FactualNarrative','PrelimNarrative','InvestigationClass','BoardLaunch'
                      ,'BoardMeetingDate','Launch','IsStudy','OriginalPublishedDate','DocketOriginalPublishDate'
                      ,'ReportType','ReportNum','ReportDate','MostRecentReportType','FatalInjuryCount','MinorInjuryCount'
                      ,'SeriousInjuryCount','DocketDate','Mode','HasSafetyRec','CompletionStatus','Closed'
                      ,'Vehicles.AircraftCategory','Vehicles.AmateurBuilt','Vehicles.EventID','Vehicles.AirMedical'
                      ,'Vehicles.AirMedicalType','Vehicles.flightScheduledType','Vehicles.flightServiceType'
                      ,'Vehicles.flightTerminalType','Vehicles.RegisteredOwner','Vehicles.RegulationFlightConductedUnder'
                      ,'Vehicles.RepGenFlag','Vehicles.RevenueSightseeing','Vehicles.SecondPilotPresent','Vehicles.Damage'
                      ,'AccidentSiteCondition'])

# dropping entries without an EventDate; .copy() makes the column assignments below
# operate on an independent frame rather than on a possible view of the previous one
df_ntsb = df_ntsb.dropna(subset=['EventDate']).copy()

# Type Conversion
df_ntsb['EventDate'] = pd.to_datetime(df_ntsb['EventDate']).dt.tz_localize(None)
# NOTE(review): errors='coerce' yields NaN for unparsable values and .astype(int) then raises;
# this is only safe while these columns are fully numeric in the source data.
df_ntsb['Vehicles.VehicleNumber'] = pd.to_numeric(df_ntsb['Vehicles.VehicleNumber'], errors='coerce').astype(int)
df_ntsb['MKey'] = pd.to_numeric(df_ntsb['MKey'], errors='coerce').astype(int)
df_ntsb['Vehicles.NumberOfEngines'] = pd.to_numeric(df_ntsb['Vehicles.NumberOfEngines'], errors='coerce').fillna(0).astype(int)
df_ntsb['Latitude'] = pd.to_numeric(df_ntsb['Latitude'], errors='coerce').astype(float)
df_ntsb['Longitude'] = pd.to_numeric(df_ntsb['Longitude'], errors='coerce').astype(float)
df_ntsb['TotalInjuryCount'] = pd.to_numeric(df_ntsb['TotalInjuryCount'], errors='coerce').astype(int)

# make all string values lowercase; this runs BEFORE the categorical conversion so that
# the categories are built from the already-normalised values and the dtype is kept
df_ntsb = df_ntsb.map(lambda x: x.lower() if isinstance(x, str) else x)

# 'AccidentSiteCondition' is not listed here: it was dropped above
categorical_cols = [
    'Vehicles.DamageLevel',
    'Vehicles.ExplosionType',
    'Vehicles.FireType',
    'HighestInjury',
    'EventType'
]

for col in categorical_cols:
    if col in df_ntsb.columns:
        df_ntsb[col] = df_ntsb[col].astype('category')

print(df_ntsb.info())

print('\n--- Saving filtered NTSB DataFrame... ---')
# Same location as before ('data_sources/filtered/ntsb.pkl'), built from the shared constants
df_ntsb.to_pickle(PATH + FILTERED_PATH + 'ntsb.pkl')

#### Load Weather JSON Data
(after fetching the data from open-meteo API)

In [None]:
# Read the cached open-meteo responses and reshape them into one row per (accident, hour).
with open(PATH+WEATHER_DATA, 'r', encoding='utf-8') as weather_file:
    weather_raw_data = json.load(weather_file)

# The JSON root is a dict keyed by accident id, so this counts accidents
print(f'\n--- Weather JSON loaded: {len(weather_raw_data)} total records found ---')

# Layout of weather_raw_data, e.g.:
# {
#   "cen24la079_2023-12-31_41.610278_-90.588361": {
#       "time": [...],
#       "temperature_2m": [...],
#       ...
#   }
# }

# Flatten into a tabular structure
all_rows = []
num_skip = 0

for accident_id, hourly in weather_raw_data.items():
    # Entries without a "time" array carry no hourly data and are skipped
    if hourly.get("time", None) is None:
        print(f'Skipping {accident_id}: no "time" found.')
        num_skip += 1
        continue
    # One row per index of the "time" array; every parameter array is read at that same index
    all_rows.extend(
        {"AccidentID": accident_id, **{param: values_array[hour] for param, values_array in hourly.items()}}
        for hour in range(len(hourly["time"]))
    )

df_weather = pd.DataFrame(all_rows)

# The missing values exist because not all accidents have position data,
# which causes the API to return empty data.
print("Skipped {} records over {} accidents.".format(num_skip, len(weather_raw_data.items())))

# Type conversion
df_weather["time"] = pd.to_datetime(df_weather["time"], errors="coerce")

int_columns = [
    "relative_humidity_2m",
    "cloud_cover_low",
    "cloud_cover_mid",
    "cloud_cover_high",
    "wind_direction_10m",
    "wind_direction_100m",
    "weather_code"
]
float_columns = [
    "temperature_2m",
    "dew_point_2m",
    "pressure_msl",
    "surface_pressure",
    "precipitation",
    "rain",
    "snowfall",
    "wind_speed_10m",
    "wind_speed_100m",
    "wind_gusts_10m",
    "snow_depth"
]
# Integer columns are converted first, then float columns
for target_dtype, column_group in ((int, int_columns), (float, float_columns)):
    for col in column_group:
        df_weather[col] = pd.to_numeric(df_weather[col], errors="coerce").astype(target_dtype)


print("\n--- Weather DataFrame sample ---")
print(df_weather.info())

print('\n--- Saving filtered Weather DataFrame... ---')
df_weather.to_pickle(PATH+'filtered/weather.pkl')

#### Load Airline Traffic CSV Data

In [None]:
# Load the airline traffic CSV, keep the passenger/flight columns and convert them to integers.
df_airline_traffic = pd.read_csv(PATH+AIR_TRAFFIC_DATA, encoding='utf-8')

print(f'\n--- Airline CSV loaded: {df_airline_traffic.shape[0]} rows, {df_airline_traffic.shape[1]} columns ---')

# Columns that are not needed downstream
unused_traffic_columns = ['Dom_RPM','Int_RPM','RPM','Dom_ASM','Int_ASM','ASM']

df_airline_traffic = (
    df_airline_traffic
    .drop(columns=unused_traffic_columns)
    # strip the thousands separators so the values can be parsed as numbers
    .replace(',', '', regex=True)
    # unparsable cells become NaN here, which makes the integer cast below raise
    .apply(pd.to_numeric, errors='coerce')
    .astype(int)
)

print(df_airline_traffic.info())

print('\n--- Saving filtered Airline DataFrame... ---')
df_airline_traffic.to_pickle(PATH+'filtered/airline.pkl')

#### Load Aircraft CSV Data

In [None]:
# Load the aircraft CSV, drop the unused columns and give the remaining ones proper dtypes.
df_aircraft = pd.read_csv(PATH+AIRCRAFT_DATA, encoding='utf-8')

print(f'\n--- Aircraft CSV loaded: {df_aircraft.shape[0]} rows, {df_aircraft.shape[1]} columns ---')

# 'Unnamed: 0' and 'retired' are not used further on
df_aircraft = df_aircraft.drop(columns=['Unnamed: 0', 'retired'])

# aircraft names are stored lowercase
df_aircraft['aircraft'] = df_aircraft['aircraft'].str.lower()

# Type conversion; endDate uses the nullable 'Int64' dtype so that missing values are allowed
aircraft_dtypes = (('nbBuilt', int), ('startDate', int), ('endDate', 'Int64'))
for column_name, target_dtype in aircraft_dtypes:
    df_aircraft[column_name] = pd.to_numeric(df_aircraft[column_name], errors='coerce').astype(target_dtype)

print(df_aircraft.info())

print('\n--- Saving filtered Aircraft DataFrame... ---')
df_aircraft.to_pickle(PATH+'filtered/aircraft.pkl')

### 2. Data Profiling

In [None]:
def profile_dataframe(df, name='DataFrame'):
    """Build a per-column profile of ``df``.

    Prints a short header (name, row count, column count) and returns one row
    per column of ``df`` with these fields:

    - ``Column``, ``DataType``: column label and its dtype as a string.
    - ``TotalCount``, ``NonNullCount``, ``NumMissing``, ``MissingPerc``:
      size and missing-value statistics (``MissingPerc`` is NaN for an empty frame).
    - ``Cardinality``: number of distinct values, counting NaN as a value.
    - ``Mode``, ``ModeFreq``: first most frequent non-null value and how often it occurs.
    - ``Mean``, ``Min``, ``Q25``, ``Q50``, ``Q75``, ``Max``: filled for numeric and
      datetime columns, NaN otherwise.
    - ``Std``: filled for numeric columns only.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to profile.
    name : str, default 'DataFrame'
        Label used in the printed header.

    Returns
    -------
    pandas.DataFrame
        The profile table described above.
    """
    print(f'\n=== Profiling {name} ===')
    print(f'Total Rows: {len(df)}')
    print(f'Total Columns: {len(df.columns)}\n')

    profile_results = []

    for col in df.columns:
        series = df[col]
        col_dtype = series.dtype

        # Basic counts
        total_count = len(series)
        missing_vals = series.isna().sum()
        non_null_count = total_count - missing_vals
        # Guard the division: an empty frame has no meaningful missing percentage
        missing_perc = (missing_vals / total_count) * 100 if total_count else np.nan
        unique_vals = series.nunique(dropna=False)

        # Mode & frequency. Best effort: a column whose values cannot be compared
        # simply gets NaN. `Exception` (not a bare except) keeps KeyboardInterrupt
        # and SystemExit propagating.
        try:
            modes = series.mode(dropna=True)
            mode_val = modes.iloc[0] if len(modes) > 0 else np.nan
            mode_freq = (series == mode_val).sum(skipna=True)
        except Exception:
            mode_val, mode_freq = np.nan, np.nan

        # Placeholders; object/categorical columns keep them as NaN
        mean_ = min_ = q25 = q50 = q75 = max_ = np.nan
        std_ = np.nan  # only for numeric columns

        is_numeric = pd.api.types.is_numeric_dtype(series)
        # Datetime columns get the same order statistics (as Timestamps) but no std,
        # and only when they hold at least one value
        is_nonempty_datetime = (
            not is_numeric
            and pd.api.types.is_datetime64_any_dtype(series)
            and non_null_count > 0
        )

        if is_numeric or is_nonempty_datetime:
            mean_ = series.mean(skipna=True)
            min_  = series.min(skipna=True)
            q25   = series.quantile(0.25)
            q50   = series.quantile(0.50)
            q75   = series.quantile(0.75)
            max_  = series.max(skipna=True)
        if is_numeric:
            std_ = series.std(skipna=True)

        profile_results.append((
            col,
            str(col_dtype),
            total_count,
            non_null_count,
            missing_vals,
            round(missing_perc, 2),
            unique_vals,
            mode_val,
            mode_freq,
            mean_,
            min_,
            q25,
            q50,
            q75,
            max_,
            std_
        ))

    columns = [
        'Column', 'DataType', 'TotalCount', 'NonNullCount', 'NumMissing',
        'MissingPerc', 'Cardinality', 'Mode', 'ModeFreq',
        'Mean', 'Min', 'Q25', 'Q50', 'Q75', 'Max', 'Std'
    ]

    prof_df = pd.DataFrame(profile_results, columns=columns)

    return prof_df

#### NTSB Data Profile

In [None]:
# Profile the NTSB frame, render the table in the notebook and keep a CSV copy of it.
ntsb_profile = profile_dataframe(df_ntsb, name='NTSB Data')
ntsb_profile_html = HTML(ntsb_profile.to_html())
display(ntsb_profile_html)
ntsb_profile.to_csv(PATH+'profiling/ntsb_profile.csv', index=False)

Insights from the data profile results:

- there are some `null` values for Latitude and Longitude --> we keep them as they are, but they must be handled during the API calls to open-meteo
- there are fewer unique `NtsbNumber` values than rows --> for incidents involving more than one aircraft, the row is repeated once per aircraft, with different values in the `Vehicles.*` columns and the same values in the incident-level columns (see the following example)

In [None]:
# Show every row that belongs to the event 'ops24la011'
is_example_event = df_ntsb['NtsbNumber'] == 'ops24la011'
df_ntsb.loc[is_example_event]

#### Weather Data Profile

In [None]:
# Profile the weather frame, render the table in the notebook and keep a CSV copy of it.
weather_profile = profile_dataframe(df_weather, name='Weather Data')
weather_profile_html = HTML(weather_profile.to_html())
display(weather_profile_html)
weather_profile.to_csv(PATH+'profiling/weather_profile.csv', index=False)

#### Air Traffic Data Profile

In [None]:
# Profile the airline traffic frame, render the table in the notebook and keep a CSV copy of it.
airline_profile = profile_dataframe(df_airline_traffic, name='Airline Data')
airline_profile_html = HTML(airline_profile.to_html())
display(airline_profile_html)
airline_profile.to_csv(PATH+'profiling/airline_profile.csv', index=False)

#### Aircraft Data Profile

In [None]:
# Profile the aircraft frame, render the table in the notebook and keep a CSV copy of it.
aircraft_profile = profile_dataframe(df_aircraft, name='Aircraft Data')
aircraft_profile_html = HTML(aircraft_profile.to_html())
display(aircraft_profile_html)
aircraft_profile.to_csv(PATH+'profiling/aircraft_profile.csv', index=False)

Insights from the data profile results:

- there are some `startDate` and `endDate` equal to 1 --> it is supposed to be a year

In [None]:
def _highlight_invalid_year(val):
    """Return the CSS for a cell: red background when the year is below 1000, nothing otherwise."""
    # 'endDate' is a nullable Int64 column: `pd.NA < 1000` evaluates to pd.NA, whose truth
    # value raises TypeError, so missing values must be ruled out before comparing.
    return 'background-color: red' if pd.notna(val) and val < 1000 else ''

# Rows whose start or end year is implausible (below 1000)
df_filtered = df_aircraft[(df_aircraft['startDate'] < 1000) | (df_aircraft['endDate'] < 1000)]
df_filtered.style.map(_highlight_invalid_year, subset=['startDate', 'endDate'])

### Charts

In [None]:
# Two views of the flights per calendar month: total (bar chart) and distribution (box plot).
df_airline = pd.read_pickle(PATH + FILTERED_PATH + 'airline.pkl')

# Total of 'Flt' per month number, ordered by month
monthly_flt_sum = (
    df_airline
    .groupby('Month')['Flt']
    .sum()
    .reset_index()
    .sort_values('Month')
)

# Month numbers -> abbreviated names (Jan, Feb, ...)
monthly_flt_sum['Month_Name'] = monthly_flt_sum['Month'].map(lambda m: calendar.month_abbr[m]).tolist()

# Display result
print(monthly_flt_sum)

# Bar chart of the monthly totals
fig_totals, ax_totals = plt.subplots(figsize=(10, 6))
ax_totals.bar(monthly_flt_sum['Month_Name'], monthly_flt_sum['Flt'], color='skyblue', edgecolor='black')
ax_totals.set_title('Total Flights per Month (All Years)')
ax_totals.set_xlabel('Month')
ax_totals.set_ylabel('Total Flights')
fig_totals.tight_layout()
plt.show()

# Box plot of the per-row flight counts, grouped by month
df_airline['Month_Name'] = df_airline['Month'].map(lambda x: calendar.month_abbr[x])

# Calendar order for the x axis: ['Jan', 'Feb', ..., 'Dec']
month_order = [calendar.month_abbr[month_number] for month_number in range(1, 13)]

fig_dist, ax_dist = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df_airline, x='Month_Name', y='Flt', order=month_order, palette='pastel', ax=ax_dist)
ax_dist.set_title('Distribution of Flights per Month (All Years)')
ax_dist.set_xlabel('Month')
ax_dist.set_ylabel('Number of Flights')
fig_dist.tight_layout()
plt.show()

## A blocking strategy

In [None]:

# Blocking + matching of NTSB vehicles against the aircraft reference table.
# NOTE(review): this cell uses the name `sm` (py_stringmatching), but the matching import in
# the setup cell is commented out; it must be enabled for this cell to run on a fresh kernel.

# === 1. Data Loading ===
# NOTE: this rebinds `df_aircraft` and `df_ntsb` to different data than the cells above.
df_aircraft = pd.read_csv('data_sources/combined_aircraft_data.csv')
df_ntsb = pd.read_pickle(PATH + FILTERED_PATH + 'ntsb.pkl')

# Keep only the columns needed for matching
df_ntsb_model = df_ntsb[['NtsbNumber', 'EventDate', 'Vehicles.SerialNumber',
                         'Vehicles.RegistrationNumber', 'Vehicles.Make', 'Vehicles.Model']].copy()

# === 2. Data Cleaning and Normalization ===
def clean_text(s):
    """Normalise text: replace runs of non-word characters with a space, lowercase, trim."""
    return re.sub(r'\W+', ' ', str(s)).lower().strip()

# Missing models must be dropped BEFORE cleaning: clean_text() stringifies its input, so a
# NaN would turn into the literal text 'nan' and a later dropna() would never remove it.
df_ntsb_model = df_ntsb_model.dropna(subset=['Vehicles.Model'])
df_aircraft = df_aircraft.dropna(subset=['model_no'])

df_ntsb_model['Vehicles.Model'] = df_ntsb_model['Vehicles.Model'].apply(clean_text)
# NOTE(review): a missing Make still becomes the string 'nan' here — confirm whether such
# rows should be excluded as well.
df_ntsb_model['Vehicles.Make'] = df_ntsb_model['Vehicles.Make'].apply(clean_text)

df_aircraft['model_no'] = df_aircraft['model_no'].apply(clean_text)
df_aircraft['manufacturer_code'] = df_aircraft['manufacturer_code'].apply(clean_text)

# === 3. Similarity Setup ===
jw = sm.JaroWinkler()
lev = sm.Levenshtein()
jac = sm.Jaccard()

# === 4. q-gram precomputation ===
def generate_qgrams(model):
    """Return the set of trigrams (q-grams of length 3) of the given string."""
    qgrams = [model[i:i+3] for i in range(len(model) - 2)]
    return set(qgrams)

# Model text and q-grams of every aircraft row, computed once instead of once per NTSB row
aircraft_models = df_aircraft['model_no'].to_dict()
aircraft_qgrams = {idx: generate_qgrams(model) for idx, model in aircraft_models.items()}

# Make-based blocking depends only on the make string, so its result is cached per make
_make_block_cache = {}

def _aircraft_index_for_make(make_ntsb):
    """Return the df_aircraft index labels whose manufacturer is compatible with `make_ntsb`."""
    if make_ntsb not in _make_block_cache:
        mask = df_aircraft['manufacturer_code'].apply(
            lambda x: make_ntsb in x or x in make_ntsb or jw.get_sim_score(x, make_ntsb) > 0.85
        )
        _make_block_cache[make_ntsb] = df_aircraft.index[mask.to_numpy(dtype=bool)]
    return _make_block_cache[make_ntsb]

# === 5. Matching with Optimized Loop ===
matches = []
matched_set = set()  # NtsbNumber + SerialNumber + model combinations already recorded
serial_set = set()   # SerialNumbers already recorded

def _register_match(ntsb_row, aircraft_idx, jw_score, lev_score, jac_score, final_score):
    """Record a match unless the same combination or the same serial number was already recorded."""
    serial_number = ntsb_row['Vehicles.SerialNumber']
    model_aircraft = aircraft_models[aircraft_idx]
    match_id = f"{ntsb_row['NtsbNumber']}_{serial_number}_{model_aircraft}"
    if match_id in matched_set or serial_number in serial_set:
        return
    matches.append({
        'NtsbNumber': ntsb_row['NtsbNumber'],
        'EventDate': ntsb_row['EventDate'],
        'Vehicles.SerialNumber': serial_number,
        'Vehicles.RegistrationNumber': ntsb_row['Vehicles.RegistrationNumber'],
        'Vehicles.Make': ntsb_row['Vehicles.Make'],
        'Vehicles.Model': ntsb_row['Vehicles.Model'],
        'Matched_Aircraft_Model': model_aircraft,
        'engine_count': df_aircraft.loc[aircraft_idx, 'engine_count'],
        'engine_type': df_aircraft.loc[aircraft_idx, 'engine_type'],
        'JW_Score': jw_score,
        'LEV_Score': lev_score,
        'Jac_Score': jac_score,
        'SimilarityScore': final_score
    })
    matched_set.add(match_id)
    serial_set.add(serial_number)

for _, ntsb_row in df_ntsb_model.iterrows():
    model_ntsb = ntsb_row['Vehicles.Model']
    make_ntsb = ntsb_row['Vehicles.Make']
    grams_ntsb = generate_qgrams(model_ntsb)

    # Preliminary filter on the Make (substring in either direction, or Jaro-Winkler > 0.85)
    make_block = _aircraft_index_for_make(make_ntsb)

    # No candidates for this make: go to the next NTSB row
    if len(make_block) == 0:
        continue

    # Preliminary filter on the q-grams: at least two shared trigrams
    candidate_matches = [idx for idx in make_block if len(grams_ntsb & aircraft_qgrams[idx]) >= 2]

    if not candidate_matches:
        continue  # no possible match, go to the next NTSB row

    # Direct check: an exact model match among the candidates wins outright
    exact_matches = [idx for idx in candidate_matches if aircraft_models[idx] == model_ntsb]

    if exact_matches:
        for idx in exact_matches:
            _register_match(ntsb_row, idx, 1.0, 1.0, 1.0, 1.0)

        continue  # skip the fuzzy matching loop

    # Generic variant check
    # Digits of the NTSB model do not depend on the candidate, so they are extracted once
    nums_ntsb = re.findall(r'\d+', model_ntsb)

    for idx in candidate_matches:
        model_aircraft = aircraft_models[idx]

        # Numeric Filter: numbers must match if present
        nums_aircraft = re.findall(r'\d+', model_aircraft)

        if nums_ntsb and nums_aircraft and nums_ntsb != nums_aircraft:
            continue

        # Computing the Three Similarity Scores
        jw_score = jw.get_sim_score(model_ntsb, model_aircraft)
        lev_score = lev.get_sim_score(model_ntsb, model_aircraft)
        jac_score = jac.get_sim_score(list(grams_ntsb), list(aircraft_qgrams[idx]))

        # Linear Rule
        final_score = 0.4 * jw_score + 0.3 * lev_score + 0.3 * jac_score

        # Keep only confident matches; duplicates are rejected inside _register_match
        if final_score > 0.75:
            _register_match(ntsb_row, idx,
                            round(jw_score, 3), round(lev_score, 3), round(jac_score, 3),
                            round(final_score, 4))

# === 6. Final Output ===
# df_matches is always defined so that the following cells run even when nothing matched
df_matches = pd.DataFrame(matches)
if df_matches.empty:
    print("No matches found with the current rules.")
else:
    print(f"Matches Found: {len(df_matches)}")
    print("Columns:", df_matches.columns.tolist())
    df_matches = df_matches.sort_values(by='SimilarityScore', ascending=False)
    display(df_matches.head(25))


#### All Match

In [None]:
# Render the complete match table (not just the top 25 shown by the previous cell).
display(df_matches)

In [None]:
# Persist the match table as CSV, without the DataFrame index.
matched_results_csv = PATH+"binding/matched_results.csv"
df_matches.to_csv(matched_results_csv, index=False)

***Individual Challenge: Data Cleaning Expert***

*Manuel Cardoso 56274*

Duplicated dataset: NTSB

In [10]:
# Re-read the raw NTSB JSON so the individual challenge starts from the untouched source data.
ic_ntsb_path = "data_sources/ntsb-us-2003-2023.json"
with open(ic_ntsb_path, mode='r', encoding='utf-8') as ic_ntsb_file:
    ic_ntsb = json.load(ic_ntsb_file)

##### The code at the start of the cell below is copied from the beginning of the project so that the dataset is identical; the individual challenge is carried out on the data that remains.

In [21]:
# Rebuild the filtered NTSB frame (one row per vehicle) as the starting point of the challenge.
# Each record of `ic_ntsb` is one accident/incident entry in a list.

# Flatten the nested 'Vehicles' array: every vehicle becomes its own row and the
# event-level fields listed in `meta` are repeated on each of those rows.
messy_ntsb = pd.json_normalize(ic_ntsb,
                            meta=[
                                'Oid','MKey','Closed','CompletionStatus','HasSafetyRec',
                                'HighestInjury','IsStudy','Mode','NtsbNumber',
                                'OriginalPublishedDate','MostRecentReportType','ProbableCause',
                                'City','Country','EventDate','State','Agency','BoardLaunch',
                                'BoardMeetingDate','DocketDate','EventType','Launch','ReportDate',
                                'ReportNum','ReportType','AirportId','AirportName','AnalysisNarrative',
                                'FactualNarrative','PrelimNarrative','FatalInjuryCount','MinorInjuryCount',
                                'SeriousInjuryCount','InvestigationClass','AccidentSiteCondition',
                                'Latitude','Longitude','DocketOriginalPublishDate'
                            ],
                            record_path=['Vehicles'],  # This flattens out the 'Vehicles' array
                            record_prefix='Vehicles.'
                           )

# combines all injury counts to 1 column
messy_ntsb['TotalInjuryCount'] = messy_ntsb[['FatalInjuryCount', 'MinorInjuryCount', 'SeriousInjuryCount']].sum(axis=1)

# dropping unnecessary columns
messy_ntsb = messy_ntsb.drop(columns=['AnalysisNarrative','FactualNarrative','PrelimNarrative','InvestigationClass','BoardLaunch'
                      ,'BoardMeetingDate','Launch','IsStudy','OriginalPublishedDate','DocketOriginalPublishDate'
                      ,'ReportType','ReportNum','ReportDate','MostRecentReportType','FatalInjuryCount','MinorInjuryCount'
                      ,'SeriousInjuryCount','DocketDate','Mode','HasSafetyRec','CompletionStatus','Closed'
                      ,'Vehicles.AircraftCategory','Vehicles.AmateurBuilt','Vehicles.EventID','Vehicles.AirMedical'
                      ,'Vehicles.AirMedicalType','Vehicles.flightScheduledType','Vehicles.flightServiceType'
                      ,'Vehicles.flightTerminalType','Vehicles.RegisteredOwner','Vehicles.RegulationFlightConductedUnder'
                      ,'Vehicles.RepGenFlag','Vehicles.RevenueSightseeing','Vehicles.SecondPilotPresent','Vehicles.Damage'
                      ,'AccidentSiteCondition'])

# dropping entries without an EventDate; .copy() makes the column assignments below
# operate on an independent frame rather than on a possible view of the previous one
messy_ntsb = messy_ntsb.dropna(subset=['EventDate']).copy()

# Type Conversion
messy_ntsb['EventDate'] = pd.to_datetime(messy_ntsb['EventDate']).dt.tz_localize(None)
messy_ntsb['Vehicles.VehicleNumber'] = pd.to_numeric(messy_ntsb['Vehicles.VehicleNumber'], errors='coerce').astype(int)
messy_ntsb['MKey'] = pd.to_numeric(messy_ntsb['MKey'], errors='coerce').astype(int)
messy_ntsb['Vehicles.NumberOfEngines'] = pd.to_numeric(messy_ntsb['Vehicles.NumberOfEngines'], errors='coerce').fillna(0).astype(float) # only change from the original Data Cleaning so it was easier to manipulate
messy_ntsb['Latitude'] = pd.to_numeric(messy_ntsb['Latitude'], errors='coerce').astype(float)
messy_ntsb['Longitude'] = pd.to_numeric(messy_ntsb['Longitude'], errors='coerce').astype(float)
messy_ntsb['TotalInjuryCount'] = pd.to_numeric(messy_ntsb['TotalInjuryCount'], errors='coerce').astype(int)

# make all string values lowercase; this runs BEFORE the categorical conversion so that
# the categories are built from the already-normalised values and the dtype is kept
messy_ntsb = messy_ntsb.map(lambda x: x.lower() if isinstance(x, str) else x)

# 'AccidentSiteCondition' is not listed here: it was dropped above
categorical_cols = [
    'Vehicles.DamageLevel',
    'Vehicles.ExplosionType',
    'Vehicles.FireType',
    'HighestInjury',
    'EventType'
]

for col in categorical_cols:
    if col in messy_ntsb.columns:
        messy_ntsb[col] = messy_ntsb[col].astype('category')

#################################################################################### CODE ABOVE IS COPIED ####################################################################################

# Deliberately corrupt `messy_ntsb` (missing values, duplicates, invalid engine counts,
# coordinate outliers). The statement order matters: every np.random call below consumes
# the seeded generator, so reordering them changes which rows are affected.
starting_entries = len(messy_ntsb)
print(f"Number of starting entries: {starting_entries}")

# Manipulating records
seed = 5 # for reproducibility

# Introducing missing values on Vehicles.Make column
# (the count printed first is the number of values already missing before the injection)
missing_count = messy_ntsb['Vehicles.Make'].isna().sum()
print(missing_count)
np.random.seed(seed) 
n = starting_entries
n_missing = int(np.floor(0.1 * n)) # 0.1 = 10% missing values
missing_indices = np.random.choice(messy_ntsb.index, n_missing, replace=False)
messy_ntsb.loc[missing_indices, "Vehicles.Make"] = np.nan
print(f"Number of entries after introducing missing values: {len(messy_ntsb)}")

# Introducing duplicate records
n_dup = int(np.floor(0.05 * n)) # 0.05 = 5% duplicated records
# Randomly choose rows to duplicate
dup_indices = np.random.choice(messy_ntsb.index, n_dup, replace=False)
duplicates = messy_ntsb.loc[dup_indices].copy()
# Append duplicates to original DataFrame (ignore_index renumbers the rows from 0)
messy_ntsb = pd.concat([messy_ntsb, duplicates], ignore_index=True)
print(f"Number of entries after introducing duplicate records: {len(messy_ntsb)}")
n = len(messy_ntsb) # reset: the percentages below are taken over the enlarged frame

# Introducing negative and incorrect values for Vehicles.NumberOfEngines
n_invalid = int(np.floor(0.05 * n)) # 0.05 = 5% induced negatives and incorrect 
# Randomly choose rows
invalid_indices = np.random.choice(messy_ntsb.index, n_invalid, replace=False)
# Flip values to negative (ensure they're numeric first)
messy_ntsb.loc[invalid_indices, "Vehicles.NumberOfEngines"] = -messy_ntsb.loc[invalid_indices, "Vehicles.NumberOfEngines"].abs()
# Randomly choose a second, independent set of rows (it may overlap with the negated ones)
invalid_indices = np.random.choice(messy_ntsb.index, n_invalid, replace=False)
# Add a fractional part; np.random.rand() is called once without a size, so every
# selected row receives the same offset
messy_ntsb.loc[invalid_indices, "Vehicles.NumberOfEngines"] = (messy_ntsb.loc[invalid_indices, "Vehicles.NumberOfEngines"] + (0.01*np.random.rand())) # add decimals to the Number of Engines
print(f"Number of entries after introducing invalid values: {len(messy_ntsb)}")

# Introducing Outliers in 'Longitude' and 'Latitude' columns
# (the same rows are shifted in both columns; rows whose coordinate is NaN stay NaN)
n_outliers = int(np.floor(0.025 * n)) # 0.025 = 2.5% induced outliers
outlier_indices = np.random.choice(messy_ntsb.index, n_outliers, replace=False)
# Longitude: valid range ~ -180 to 180
messy_ntsb.loc[outlier_indices, 'Longitude'] = (messy_ntsb.loc[outlier_indices, 'Longitude'] + np.random.uniform(400, 500, size=n_outliers)) # clearly invalid, just to induce
# Latitude: valid range ~ -90 to 90
messy_ntsb.loc[outlier_indices, 'Latitude'] = (messy_ntsb.loc[outlier_indices, 'Latitude'] + np.random.uniform(200, 300, size=n_outliers)) # clearly invalid, just to induce
print(f"Number of entries after introducing outliers in 'Longitude' and 'Latitude': {len(messy_ntsb)}")

#messy_ntsb

Number of starting entries: 23403
1
Number of entries after introducing missing values: 23403
Number of entries after introducing duplicate records: 24573
Number of entries after introducing invalid values: 24573
Number of entries after introducing outliers in 'Longitude' and 'Latitude': 24573


### Data Cleaning
#### For this challenge, I'm going to assume that the user noticed the errors on the specific columns and dealt with them (Qualitative Cleaning - "Manual crafting of rules and transform function")

In [22]:
# This block of code needs the previous block of code to be run first

# First, start by dropping duplicate rows (this will have to be done again at the end, we do it at the start anyway to minimize computing needs)
messy_ntsb = messy_ntsb.drop_duplicates()
print(f"Number of entries after dropping duplicates: {len(messy_ntsb)}")

print("----------")

# Fixing missing values in column Vehicles.Make
# First pass: learn a Model -> Make mapping from the rows where 'Vehicles.Make' is present
model_dict = {}
missing_count = messy_ntsb['Vehicles.Make'].isna().sum()
print(f"Number of NaN: {missing_count}")
for index, row in messy_ntsb.iterrows():
    make = row['Vehicles.Make']
    model = row['Vehicles.Model']
    
    if pd.notna(make):  # Only build dict from known makes (a later row overwrites an earlier one for the same model)
        model_dict[model] = make

# Second pass: replace the missing makes with the one learned for the same model
for index, row in messy_ntsb.iterrows():
    make = row['Vehicles.Make']
    model = row['Vehicles.Model']
    
    if pd.isna(make):  # Only rows with a missing make are touched; unknown models stay missing
        messy_ntsb.loc[index, 'Vehicles.Make'] = model_dict.get(model, None)


print(f"Number of entries after fixing Vehicles.Make: {len(messy_ntsb)}")

missing_count = messy_ntsb['Vehicles.Make'].isna().sum()
print(f"Number of NaN after trying to fix Vehicles.Make: {missing_count}") 
# We can check that not all was fixed, I tried to fix by checking with other entries that had the same Model
# but if a Model never has a Make to begin with, this can't be done

print("----------")

# Fixing invalid values in Vehicles.NumberOfEngines
# engines_dict maps a model to the last whole, >= 1 engine count seen for it
engines_dict = {}
negative_count = (messy_ntsb['Vehicles.NumberOfEngines'] < 0).sum()
decimal_count = (messy_ntsb['Vehicles.NumberOfEngines'] % 1 != 0).sum() - messy_ntsb['Vehicles.NumberOfEngines'].isna().sum() # NaNs count here if not for the subtraction
print(f"Number of negatives: {negative_count}")
print(f"Number of numbers with decimal parts: {decimal_count}")
# First pass: collect trusted values per model and turn negative values positive
for index, row in messy_ntsb.iterrows():
    engines = row['Vehicles.NumberOfEngines']
    model = row['Vehicles.Model']

    if engines.is_integer() and engines >= 1:
        engines_dict[model] = engines

    elif engines < 0:
        messy_ntsb.loc[index, 'Vehicles.NumberOfEngines'] = abs(engines)

# Second pass: EVERY row (not only the invalid ones) is overwritten with the trusted value of its
# model; rows whose model has no trusted value are blanked
for index, row in messy_ntsb.iterrows():
    model = row['Vehicles.Model']

    if model in engines_dict:
        messy_ntsb.loc[index, 'Vehicles.NumberOfEngines'] = engines_dict[model]
    else:
        messy_ntsb.loc[index, 'Vehicles.NumberOfEngines'] = None # I could round the number to the closest whole number but that wouldn't be trustworthy 
                                                                 # so I think it's better to replace it with None


negative_count = (messy_ntsb['Vehicles.NumberOfEngines'] < 0).sum()
decimal_count = (messy_ntsb['Vehicles.NumberOfEngines'] % 1 != 0).sum() - messy_ntsb['Vehicles.NumberOfEngines'].isna().sum()
print(f"Number of negatives after trying to fix Vehicles.NumberOfEngines: {negative_count}")
print(f"Number of numbers with decimal parts after trying to fix Vehicles.NumberOfEngines: {decimal_count}")

# Blanked values become 0 here, and the column goes back to an integer dtype
messy_ntsb['Vehicles.NumberOfEngines'] = pd.to_numeric(messy_ntsb['Vehicles.NumberOfEngines'], errors='coerce').fillna(0).astype(int) # fix Type Conversion

print("----------")

# Removing outliers from Longitude (valid range -180 to 180) and Latitude (valid range -90 to 90).
# The same bounds are used for counting and for blanking; the previous row-by-row loop
# tested longitude against 190, so values in (180, 190] were counted but never removed.
longitude_outliers = (messy_ntsb['Longitude'] < -180) | (messy_ntsb['Longitude'] > 180)
latitude_outliers = (messy_ntsb['Latitude'] < -90) | (messy_ntsb['Latitude'] > 90)
longitude_count = longitude_outliers.sum()
latitude_count = latitude_outliers.sum()
print(f"Number of outliers in Longitude: {longitude_count}")
print(f"Number of outliers in Latitude: {latitude_count}")

# Out-of-range coordinates are replaced with NaN; missing coordinates compare False
# against both bounds and are therefore left untouched
messy_ntsb.loc[longitude_outliers, 'Longitude'] = np.nan
messy_ntsb.loc[latitude_outliers, 'Latitude'] = np.nan

longitude_count = (messy_ntsb['Longitude'] < -180).sum() + (messy_ntsb['Longitude'] > 180).sum()
latitude_count = (messy_ntsb['Latitude'] < -90).sum() + (messy_ntsb['Latitude'] > 90).sum()
print(f"Number of outliers in Longitude after fix: {longitude_count}")
print(f"Number of outliers in Latitude after fix: {latitude_count}")

print("----------")

# Second de-duplication pass: rows that only differed through an injected error can be
# identical again once the cleaning steps above have repaired them.
messy_ntsb = messy_ntsb.drop_duplicates()
print(f"Number of entries after dropping duplicates at the end: {len(messy_ntsb)}") # we do this again because there may exist rows that had records manipulated and weren't 
                                                                                    # duplicated because of that, but at the end of the Data Cleaning could be duplicated again
print(f"Number of starting entries for comparison: {starting_entries}") # this value being different is natural, as the order of the manipulation has the duplication happening before other
                                                                        # data issue insertions; I tested with the duplication as the last manipulation and the number of entries
                                                                        # at the end of the Data Cleaning coincides with the starting entries (23403) 

Number of entries after dropping duplicates: 23657
----------
Number of NaN: 2359
Number of entries after fixing Vehicles.Make: 23657
Number of NaN after trying to fix Vehicles.Make: 154
----------
Number of negatives: 1198
Number of numbers with decimal parts: 1226
Number of negatives after trying to fix Vehicles.NumberOfEngines: 0
Number of numbers with decimal parts after trying to fix Vehicles.NumberOfEngines: 0
----------
Number of outliers in Longitude: 613
Number of outliers in Latitude: 612
Number of outliers in Longitude after fix: 0
Number of outliers in Latitude after fix: 0
----------
Number of entries after dropping duplicates at the end: 23466
Number of starting entries for comparison: 23403
