## Project Phase 1 - Aviation Accident Data Integration
### Group 03:
- Tommaso Tragno - fc64699
- Manuel Cardoso - fc56274
- Chen Cheng - fc64872
- Cristian Tedesco - fc65149

#### Setup

In [2]:
import pandas as pd
import numpy as np
import json
import os
import requests

### 1. Load NTSB JSON Data

In [19]:
PATH = 'data_sources/'
NTSB_DATA = 'ntsb-us-2003-2023.json'

with open(PATH+NTSB_DATA, 'r', encoding='utf-8') as f:
    ntsb_raw_data = json.load(f)

# Each record is one accident/incident entry in a list
print(f'\n--- NTSB JSON loaded: {len(ntsb_raw_data)} total records found ---')

# Convert to a DataFrame (this will flatten top-level fields)
# For nested fields like 'Vehicles', we might do a separate flatten later
df_ntsb = pd.json_normalize(ntsb_raw_data, 
                            meta=[
                                'Oid','MKey','Closed','CompletionStatus','HasSafetyRec',
                                'HighestInjury','IsStudy','Mode','NtsbNumber',
                                'OriginalPublishedDate','MostRecentReportType','ProbableCause',
                                'City','Country','EventDate','State','Agency','BoardLaunch',
                                'BoardMeetingDate','DocketDate','EventType','Launch','ReportDate',
                                'ReportNum','ReportType','AirportId','AirportName','AnalysisNarrative',
                                'FactualNarrative','PrelimNarrative','FatalInjuryCount','MinorInjuryCount',
                                'SeriousInjuryCount','InvestigationClass','AccidentSiteCondition',
                                'Latitude','Longitude','DocketOriginalPublishDate'
                            ],
                            record_path=['Vehicles'],  # This flattens out the 'Vehicles' array
                            record_prefix='Vehicles.'
                           )

print('\n--- Flattened NTSB DataFrame (including Vehicles info): ---')

# combines all injury counts to 1 column
df_ntsb['TotalInjuryCount'] = df_ntsb[['FatalInjuryCount', 'MinorInjuryCount', 'SeriousInjuryCount']].sum(axis=1)

# dropping unnecessary columns
df_ntsb.drop(columns=['AnalysisNarrative','FactualNarrative','PrelimNarrative','InvestigationClass','BoardLaunch'
                      ,'BoardMeetingDate','Launch','IsStudy','OriginalPublishedDate','DocketOriginalPublishDate'
                      ,'ReportType','ReportNum','ReportDate','MostRecentReportType','FatalInjuryCount','MinorInjuryCount'
                      ,'SeriousInjuryCount','DocketDate','Mode','HasSafetyRec','CompletionStatus','Closed'
                      ,'Vehicles.AircraftCategory','Vehicles.AmateurBuilt','Vehicles.EventID','Vehicles.AirMedical'
                      ,'Vehicles.AirMedicalType','Vehicles.flightScheduledType','Vehicles.flightServiceType'
                      ,'Vehicles.flightTerminalType','Vehicles.RegisteredOwner','Vehicles.RegulationFlightConductedUnder'
                      ,'Vehicles.RepGenFlag','Vehicles.RevenueSightseeing','Vehicles.SecondPilotPresent','Vehicles.Damage'
                      ,'AccidentSiteCondition'], inplace=True) 

# dropping NaT entries from EventDate
df_ntsb = df_ntsb.dropna(subset=['EventDate'])

# Type Conversion
df_ntsb['EventDate'] = pd.to_datetime(df_ntsb['EventDate']).dt.tz_localize(None)
df_ntsb['Vehicles.VehicleNumber'] = pd.to_numeric(df_ntsb['Vehicles.VehicleNumber'], errors='coerce').astype(int)
df_ntsb['MKey'] = pd.to_numeric(df_ntsb['MKey'], errors='coerce').astype(int)
df_ntsb['Vehicles.NumberOfEngines'] = pd.to_numeric(df_ntsb['Vehicles.NumberOfEngines'], errors='coerce').fillna(0).astype(int)
df_ntsb['Latitude'] = pd.to_numeric(df_ntsb['Latitude'], errors='coerce').astype(float)
df_ntsb['Longitude'] = pd.to_numeric(df_ntsb['Longitude'], errors='coerce').astype(float)
df_ntsb['TotalInjuryCount'] = pd.to_numeric(df_ntsb['TotalInjuryCount'], errors='coerce').astype(int)

categorical_cols = [
    'Vehicles.DamageLevel',
    'Vehicles.ExplosionType',
    'Vehicles.FireType',
    'HighestInjury',
    'EventType',
    'AccidentSiteCondition'
]

for col in categorical_cols:
    if col in df_ntsb.columns:
        df_ntsb[col] = df_ntsb[col].astype('category')

df_ntsb = df_ntsb.map(lambda x: x.lower() if isinstance(x, str) else x) # make all appropriate values lowercase


print(df_ntsb.head())

print('\n--- DataFrame Info ---')
df_ntsb.info()




--- NTSB JSON loaded: 22992 total records found ---

--- Flattened NTSB DataFrame (including Vehicles info): ---
   Vehicles.VehicleNumber Vehicles.DamageLevel Vehicles.ExplosionType  \
0                       1          substantial                   none   
1                       1          substantial                   none   
2                       1          substantial                   none   
3                       1          substantial                   none   
4                       1          substantial                   none   

  Vehicles.FireType Vehicles.SerialNumber       Vehicles.Make Vehicles.Model  \
0              none                  0702  cirrus design corp          sr22t   
1              none                 004ce   golden avio s r l       f30 brio   
2              none               4692139  piper aircraft inc     pa46r-350t   
3              none              79-30941            bellanca         17-30a   
4              none                 7452c      

### 2. Data Profiling

In [None]:
def profile_dataframe(df, name='DataFrame'):
    print(f'\n=== Profiling {name} ===')
    print(f'Total Rows: {len(df)}')
    print(f'Total Columns: {len(df.columns)}\n')
    
    # Basic stats: cardinality, missing, data type
    profile_results = []
    for col in df.columns:
        unique_vals = df[col].nunique(dropna=False)
        missing_vals = df[col].isna().sum()
        dtype_ = str(df[col].dtype)
        profile_results.append(
            (col, dtype_, unique_vals, missing_vals)
        )
    prof_df = pd.DataFrame(
        profile_results,
        columns=['Column', 'DataType', 'Cardinality', 'NumMissing']
    )
    return prof_df

ntsb_profile = profile_dataframe(df_ntsb, name='NTSB Data')
print(ntsb_profile)


=== Profiling NTSB Data ===
Total Rows: 23403
Total Columns: 27

                          Column        DataType  Cardinality  NumMissing
0         Vehicles.VehicleNumber           int64            3           0
1           Vehicles.DamageLevel        category            6           3
2         Vehicles.ExplosionType        category            6        1523
3              Vehicles.FireType        category            7          82
4          Vehicles.SerialNumber          object        21514         120
5                  Vehicles.Make          object         1098           1
6                 Vehicles.Model          object         3362           5
7       Vehicles.NumberOfEngines           int64            5           0
8    Vehicles.RegistrationNumber          object        22386           6
9   Vehicles.FlightOperationType          object           22        1810
10         Vehicles.OperatorName          object         9289       12113
11                           Oid          obje

In [24]:
df_ntsb.describe(include='all')

Unnamed: 0,Vehicles.VehicleNumber,Vehicles.DamageLevel,Vehicles.ExplosionType,Vehicles.FireType,Vehicles.SerialNumber,Vehicles.Make,Vehicles.Model,Vehicles.NumberOfEngines,Vehicles.RegistrationNumber,Vehicles.FlightOperationType,Vehicles.OperatorName,Oid,MKey,HighestInjury,NtsbNumber,ProbableCause,City,Country,EventDate,State,Agency,EventType,AirportId,AirportName,Latitude,Longitude,TotalInjuryCount
count,23403.0,23400,21880,23321,23283.0,23402,23398.0,23403.0,23397,21593,11290,23403,23403.0,23307,23403,23205,23403,23403,23403,23356,22495,23403,17179,17208,23107.0,23106.0,23403.0
unique,,5,5,6,21513.0,1097,3361.0,,22385,21,9288,22992,,4,22992,20889,6092,1,,57,2,3,5358,8773,,,
top,,substantial,none,none,1.0,cessna,172.0,,unreg,pers,pilot,67ee2d96017de3d12edfdd84,,none,cen10fa115,a loss of engine power for undetermined reasons.,anchorage,usa,,ca,ntsb,acc,none,private,,,
freq,,19718,21306,20993,19.0,8191,762.0,,27,14516,176,3,,13920,3,77,161,23403,,2244,22459,22679,333,187,,,
mean,1.018117,,,,,,,1.13823,,,,,84032.497543,,,,,,2012-08-07 07:34:48.078451456,,,,,,1941.001,-264.209,0.763706
min,1.0,,,,,,,0.0,,,,,56260.0,,,,,,2003-01-01 13:12:00,,,,,,13.29278,-1004241.0,0.0
25%,1.0,,,,,,,1.0,,,,,65775.5,,,,,,2007-05-06 02:55:00,,,,,,33.81167,-115.3719,0.0
50%,1.0,,,,,,,1.0,,,,,82662.0,,,,,,2011-12-31 12:11:00,,,,,,38.60833,-95.67028,0.0
75%,1.0,,,,,,,1.0,,,,,96128.0,,,,,,2017-09-27 07:47:30,,,,,,42.65556,-83.05994,1.0
max,3.0,,,,,,,4.0,,,,,195527.0,,,,,,2023-12-31 17:40:00,,,,,,43511180.0,174.1242,190.0


In [22]:
df_ntsb.loc[df_ntsb['NtsbNumber']=='ops24la011']

Unnamed: 0,Vehicles.VehicleNumber,Vehicles.DamageLevel,Vehicles.ExplosionType,Vehicles.FireType,Vehicles.SerialNumber,Vehicles.Make,Vehicles.Model,Vehicles.NumberOfEngines,Vehicles.RegistrationNumber,Vehicles.FlightOperationType,Vehicles.OperatorName,Oid,MKey,HighestInjury,NtsbNumber,ProbableCause,City,Country,EventDate,State,Agency,EventType,AirportId,AirportName,Latitude,Longitude,TotalInjuryCount
39,1,none,none,none,c0218,diamond aircraft ind inc,da20-c1,1,n857pa,,diamond aircraft sales of kentucky llc,67ee2dab017de3d12ee03758,193529,,ops24la011,,north las vegas,usa,2023-12-09 13:06:00,nv,ntsb,occ,vgt,north las vegas,36.211268,-115.19968,0
40,2,none,none,none,1955,robinson helicopter,r44,1,n744af,,skyline helicopter tours llc,67ee2dab017de3d12ee03758,193529,,ops24la011,,north las vegas,usa,2023-12-09 13:06:00,nv,ntsb,occ,vgt,north las vegas,36.211268,-115.19968,0


In [None]:
for _ wehfiher:
    if null:
     pass

SyntaxError: invalid syntax (620194965.py, line 1)

## Data cleaning
1. Load the `.csv` and `.json` dataset;
2. Drop the rows that do not contains required data
3. Fill the `na` cells with a predefined value
4. Drop eventualy doplicates
5. Convert the string data into the proper data type

In [None]:
PATH = 'data_sources'

# Load dataset into pandas dataframe
df_airline_traffic = pd.read_csv(f'{PATH}/u-s-airline-traffic-data.csv')
df_ntsb = pd.read_json(f'{PATH}/ntsb-us-2003-2023.json')

print('Check NA values presence before data validation')
print(f'Airline traffic data frame: {df_airline_traffic.isna().any().any()}')
print(f'NTSB data frame: {df_ntsb.isna().any().any()}')



print(df_ntsb.columns.tolist())
#print(df_ntsb.describe())  # Summary statistics
print(df_ntsb.info())  # Data types and missing values
#print(df_ntsb.isnull().sum())  # Check missing values

df_ntsb

Check NA values presence before data validation
Airline traffic data frame: False
NTSB data frame: True
['Oid', 'MKey', 'HighestInjury', 'NtsbNumber', 'ProbableCause', 'City', 'Country', 'EventDate', 'State', 'Agency', 'EventType', 'Vehicles', 'AirportId', 'AirportName', 'AccidentSiteCondition', 'Latitude', 'Longitude', 'TotalInjuryCount']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22992 entries, 0 to 22991
Data columns (total 18 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   Oid                    22992 non-null  object        
 1   MKey                   22992 non-null  int64         
 2   HighestInjury          22908 non-null  object        
 3   NtsbNumber             22992 non-null  object        
 4   ProbableCause          22800 non-null  object        
 5   City                   22992 non-null  object        
 6   Country                22992 non-null  object        
 7   EventDate    

Unnamed: 0,Oid,MKey,HighestInjury,NtsbNumber,ProbableCause,City,Country,EventDate,State,Agency,EventType,Vehicles,AirportId,AirportName,AccidentSiteCondition,Latitude,Longitude,TotalInjuryCount
0,67ee2dab017de3d12ee0378c,193595,none,cen24la079,,davenport,usa,2023-12-31 17:40:00,ia,ntsb,acc,"[{'VehicleNumber': 1, 'DamageLevel': 'Substant...",dvn,,vmc,41.610278,-90.588361,0
1,67ee2dab017de3d12ee03793,193603,none,era24la084,the pilot’s improper recovery from a bounced l...,midland,usa,2023-12-31 16:13:00,va,ntsb,acc,"[{'VehicleNumber': 1, 'DamageLevel': 'Substant...",hwy,warrenton/fauquier,vmc,38.586285,-77.710631,0
2,67ee2dab017de3d12ee0377f,193581,fatal,era24fa078,,mooresville,usa,2023-12-31 14:13:00,nc,ntsb,acc,"[{'VehicleNumber': 1, 'DamageLevel': 'Substant...",14a,lake norman airpark,vmc,35.624637,-80.912255,1
3,67ee2dab017de3d12ee03783,193585,serious,cen24la077,,washington,usa,2023-12-30 17:00:00,mo,ntsb,acc,"[{'VehicleNumber': 1, 'DamageLevel': 'Substant...",fyg,washington regional,vmc,38.587583,-90.993806,1
4,67ee2dab017de3d12ee03794,193605,none,cen24la081,the pilot’s failure to maintain directional co...,beaumont,usa,2023-12-29 16:27:00,tx,ntsb,acc,"[{'VehicleNumber': 1, 'DamageLevel': 'Substant...",bpt,jack brooks rgnl,vmc,30.070603,-94.215837,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22987,67ee2d86017de3d12edf9ae9,56292,none,chi03la045,failure of the pilot-in-command to maintain di...,alton,usa,2003-01-03 13:00:00,il,ntsb,acc,"[{'VehicleNumber': 1, 'DamageLevel': 'Substant...",aln,st louis regional,vmc,38.890277,-90.046112,0
22988,67ee2d86017de3d12edf9b2f,56373,none,mia03la044,the loss of engine power for undetermined reas...,jessup,usa,2003-01-03 10:12:00,ga,ntsb,acc,"[{'VehicleNumber': 1, 'DamageLevel': 'Substant...",,,vmc,31.431667,-81.819999,0
22989,67ee2d86017de3d12edf9ad1,56264,fatal,ftw03fa071,the pilot's failure to maintain adequate airsp...,shreveport,usa,2003-01-02 14:46:00,la,ntsb,acc,"[{'VehicleNumber': 1, 'DamageLevel': 'Destroye...",kdtn,shreveport downtown airport,vmc,32.593055,-93.765556,2
22990,67ee2d86017de3d12edf9acf,56260,serious,den03la027,the inadequate altitude chosen by the flight i...,milford,usa,2003-01-01 23:00:00,ut,ntsb,acc,"[{'VehicleNumber': 1, 'DamageLevel': 'Destroye...",,,vmc,38.283332,-112.900001,2


In [None]:
# Code to filter to the date we want

# Debug: Check min and max dates
print('Earliest Date:', df_ntsb['EventDate'].min())
print('Latest Date:', df_ntsb['EventDate'].max())

# Define the date range (without timezone)
start_date = pd.to_datetime('2003-01-01')
end_date = pd.to_datetime('2023-12-31')

# Filter the dataset
filtered_df = df_ntsb[(df_ntsb['EventDate'] >= start_date) & (df_ntsb['EventDate'] <= end_date) & (df_ntsb['Country'] == 'usa')]
print(filtered_df['State'].tolist())
filtered_df
# Display results
#print(f'Total Records Found: {len(filtered_df)}')
#print(filtered_df[['EventDate', 'HighestInjury', 'Country']].sample(10))  # Show 50 random dates

### open-meteo API call test

In [None]:
# Define the endpoint
endpoint = 'https://archive-api.open-meteo.com/v1/archive'

# Define the parameters
params = {
    'latitude': 41.610278,
    'longitude': -90.588361,
    'start_date': '2023-12-31',
    'end_date': '2023-12-31',
    'hourly': ','.join([
        'temperature_2m',
        'relative_humidity_2m',
        'dew_point_2m',
        'pressure_msl',
        'surface_pressure',
        'precipitation',
        'rain',
        'snowfall',
        'cloud_cover',
        'cloud_cover_low',
        'cloud_cover_mid',
        'cloud_cover_high',
        'wind_speed_10m',
        'wind_speed_100m',
        'wind_direction_10m',
        'wind_direction_100m',
        'wind_gusts_10m',
        'weather_code',
        'snow_depth'
    ]),
    'timezone': 'GMT'
}

# Make the request
response = requests.get(endpoint, params=params)

# Check if the request was successful
if response.status_code == 200:
    data = response.json()
    # Process the data as needed
    print(data)
    time_series = data['hourly']['time']
    try:
        idx = time_series.index('2023-12-31T17:00')
        selected_data = {k: v[idx] for k, v in data['hourly'].items() if k != 'time'}
        print(f'Weather data at 2023-12-31T17:00Z:')
        for key, val in selected_data.items():
            print(f'{key}: {val}')
    except ValueError:
        print('Selected hour not found in response.')
else:
    print(f'Error: {response.status_code}')


{'latitude': 41.581722, 'longitude': -90.64935, 'generationtime_ms': 0.3064870834350586, 'utc_offset_seconds': 0, 'timezone': 'GMT', 'timezone_abbreviation': 'GMT', 'elevation': 228.0, 'hourly_units': {'time': 'iso8601', 'temperature_2m': '°C', 'relative_humidity_2m': '%', 'dew_point_2m': '°C', 'pressure_msl': 'hPa', 'surface_pressure': 'hPa', 'precipitation': 'mm', 'rain': 'mm', 'snowfall': 'cm', 'cloud_cover': '%', 'cloud_cover_low': '%', 'cloud_cover_mid': '%', 'cloud_cover_high': '%', 'wind_speed_10m': 'km/h', 'wind_speed_100m': 'km/h', 'wind_direction_10m': '°', 'wind_direction_100m': '°', 'wind_gusts_10m': 'km/h', 'weather_code': 'wmo code', 'snow_depth': 'm'}, 'hourly': {'time': ['2023-12-31T00:00', '2023-12-31T01:00', '2023-12-31T02:00', '2023-12-31T03:00', '2023-12-31T04:00', '2023-12-31T05:00', '2023-12-31T06:00', '2023-12-31T07:00', '2023-12-31T08:00', '2023-12-31T09:00', '2023-12-31T10:00', '2023-12-31T11:00', '2023-12-31T12:00', '2023-12-31T13:00', '2023-12-31T14:00', '202