## Project Phase 1 - Aviation Accident Data Integration
### Group 03:
- Tommaso Tragno - fc64699
- Manuel Cardoso - fc56274
- Chen Cheng - fc64872
- Cristian Tedesco - fc65149

#### Setup

In [None]:
import pandas as pd
import json
import pymongo as pm
import mysql.connector
import time
import requests

## Data cleaning
1. Load the `.csv` and `.json` datasets;
2. Drop the rows that do not contain the required data;
3. Fill the `na` cells with a predefined value;
4. Drop any duplicates;
5. Convert the string data into the proper data types.

In [None]:
PATH = 'data_sources'

# Columns whose values are folded into a single TotalInjuryCount column.
_INJURY_COUNT_COLUMNS = ['FatalInjuryCount', 'MinorInjuryCount', 'SeriousInjuryCount']

# Columns removed from the NTSB frame (in addition to the per-severity
# injury counts, which are dropped once they have been summed).
_DISCARDED_COLUMNS = [
    'AnalysisNarrative', 'FactualNarrative', 'PrelimNarrative',
    'InvestigationClass', 'BoardLaunch', 'BoardMeetingDate', 'Launch', 'IsStudy',
    'OriginalPublishedDate', 'DocketOriginalPublishDate',
    'ReportType', 'ReportNum', 'ReportDate', 'MostRecentReportType',
    'DocketDate', 'Mode', 'HasSafetyRec', 'CompletionStatus', 'Closed',
]


def _lowercase_if_text(value):
    """Return `value` lower-cased when it is a str; any other value unchanged."""
    if isinstance(value, str):
        return value.lower()
    return value


# Read both raw sources into pandas data frames
df_airline_traffic = pd.read_csv(f'{PATH}/u-s-airline-traffic-data.csv')
df_ntsb = pd.read_json(f'{PATH}/ntsb-us-2003-2023.json')

# Report whether either frame contains at least one missing value
print('Check NA values presence before data validation')
print(f'Airline traffic data frame: {df_airline_traffic.isna().any().any()}')
print(f'NTSB data frame: {df_ntsb.isna().any().any()}')

# Parse EventDate as datetime, then strip the timezone so the column is tz-naive
parsed_event_dates = pd.to_datetime(df_ntsb['EventDate'])
df_ntsb['EventDate'] = parsed_event_dates.dt.tz_localize(None)

# NOTE: duplicate removal is skipped on purpose; per the original author's
# check, the dataset contains no duplicates.

# Normalise every string cell to lower case (non-string cells are untouched)
df_ntsb = df_ntsb.map(_lowercase_if_text)

# Collapse the three per-severity injury counts into one row-wise total
df_ntsb['TotalInjuryCount'] = df_ntsb[_INJURY_COUNT_COLUMNS].sum(axis=1)

# Remove the columns that are not needed, including the now-redundant injury counts
df_ntsb = df_ntsb.drop(columns=[*_DISCARDED_COLUMNS, *_INJURY_COUNT_COLUMNS])

# Discard rows whose EventDate is missing (NaT)
df_ntsb = df_ntsb.dropna(subset=['EventDate'])

print(df_ntsb.columns.tolist())

# Optional diagnostics, left disabled:
#   df_ntsb.describe()      -> summary statistics
#   df_ntsb.info()          -> data types and missing values
#   df_ntsb.isnull().sum()  -> missing values per column

df_ntsb

Check NA values presence before data validation
Airline traffic data frame: False
NTSB data frame: True


In [None]:
# Keep only the NTSB events that happened in the USA within the study window.

# Debug: check the earliest and latest event timestamps present in the data
print("Earliest Date:", df_ntsb['EventDate'].min())
print("Latest Date:", df_ntsb['EventDate'].max())

# Calendar-day bounds of the study window (timezone-naive, like EventDate)
start_date = pd.to_datetime('2003-01-01')
end_date = pd.to_datetime('2023-12-31')

# `end_date` is midnight at the START of the last day, so comparing with
# `EventDate <= end_date` would silently drop every event that happened
# later on 2023-12-31. Use the start of the following day as an exclusive
# upper bound so the whole last day is included.
end_exclusive = end_date + pd.Timedelta(days=1)

in_window = (df_ntsb['EventDate'] >= start_date) & (df_ntsb['EventDate'] < end_exclusive)
in_usa = df_ntsb['Country'] == 'usa'  # string values were lower-cased during cleaning

# .copy() gives an independent frame, so later column assignments on
# filtered_df do not raise SettingWithCopyWarning or touch df_ntsb.
filtered_df = df_ntsb[in_window & in_usa].copy()
print(filtered_df['State'].tolist())
filtered_df
# Optional diagnostics, left disabled:
#print(f"Total Records Found: {len(filtered_df)}")
#print(filtered_df[['EventDate', 'HighestInjury', 'Country']].sample(10))  # Show 10 random rows

### open-meteo API call test

In [None]:
# Smoke test of the Open-Meteo historical archive API: request one day of
# hourly weather for a single coordinate and print the values of one hour.

# Define the endpoint
endpoint = "https://archive-api.open-meteo.com/v1/archive"

# Day being requested and the hour inspected below. Both are defined once so
# the query, the lookup and the printed label cannot drift apart.
target_day = "2023-12-31"
target_hour = f"{target_day}T17:00"

# Hourly variables requested from the API
hourly_variables = [
    "temperature_2m",
    "relative_humidity_2m",
    "dew_point_2m",
    "pressure_msl",
    "surface_pressure",
    "precipitation",
    "rain",
    "snowfall",
    "cloud_cover",
    "cloud_cover_low",
    "cloud_cover_mid",
    "cloud_cover_high",
    "wind_speed_10m",
    "wind_speed_100m",
    "wind_direction_10m",
    "wind_direction_100m",
    "wind_gusts_10m",
    "weather_code",
    "snow_depth",
]

# Define the parameters
params = {
    "latitude": 41.610278,
    "longitude": -90.588361,
    "start_date": target_day,
    "end_date": target_day,
    "hourly": ",".join(hourly_variables),  # the API expects a comma-separated list
    "timezone": "GMT",
}

# Make the request. Without a timeout, requests waits forever on a stalled
# connection and would freeze the notebook; fail after 30 seconds instead.
response = requests.get(endpoint, params=params, timeout=30)

# Check if the request was successful
if response.status_code == 200:
    data = response.json()
    print(data)
    time_series = data["hourly"]["time"]
    try:
        # list.index raises ValueError when the requested hour is absent
        idx = time_series.index(target_hour)
        # Take the value at that hour from every hourly series except the time axis
        selected_data = {k: v[idx] for k, v in data["hourly"].items() if k != "time"}
        print(f"Weather data at {target_hour}Z:")
        for key, val in selected_data.items():
            print(f"{key}: {val}")
    except ValueError:
        print("Selected hour not found in response.")
else:
    print(f"Error: {response.status_code}")


{'latitude': 41.581722, 'longitude': -90.64935, 'generationtime_ms': 0.3064870834350586, 'utc_offset_seconds': 0, 'timezone': 'GMT', 'timezone_abbreviation': 'GMT', 'elevation': 228.0, 'hourly_units': {'time': 'iso8601', 'temperature_2m': '°C', 'relative_humidity_2m': '%', 'dew_point_2m': '°C', 'pressure_msl': 'hPa', 'surface_pressure': 'hPa', 'precipitation': 'mm', 'rain': 'mm', 'snowfall': 'cm', 'cloud_cover': '%', 'cloud_cover_low': '%', 'cloud_cover_mid': '%', 'cloud_cover_high': '%', 'wind_speed_10m': 'km/h', 'wind_speed_100m': 'km/h', 'wind_direction_10m': '°', 'wind_direction_100m': '°', 'wind_gusts_10m': 'km/h', 'weather_code': 'wmo code', 'snow_depth': 'm'}, 'hourly': {'time': ['2023-12-31T00:00', '2023-12-31T01:00', '2023-12-31T02:00', '2023-12-31T03:00', '2023-12-31T04:00', '2023-12-31T05:00', '2023-12-31T06:00', '2023-12-31T07:00', '2023-12-31T08:00', '2023-12-31T09:00', '2023-12-31T10:00', '2023-12-31T11:00', '2023-12-31T12:00', '2023-12-31T13:00', '2023-12-31T14:00', '202