# Mapping Aircraft Accidents

Using the folium library in addition to the accident information from the NTSB database, I made an interactive graph
to look at the distribution of accidents around the US, to get a sense of the types of accidents that were occuring and 
where they were occuring.

In [13]:
import folium
from folium import plugins
from folium import Popup
from folium.plugins import MarkerCluster
import pandas as pd
import numpy as np
import pyodbc

In [14]:
# Connect to Access Database downloaded from 'https://app.ntsb.gov/avdata/'
# This database contains all accident data from 1982 to the present, in a
# relational database accessed with SQL commands.
access_database = r'C:\Users\thwhi\Dropbox\Personal\Programming\Aircraft Stuff\avall_full_aircraft_crash_database\avall.mdb'
driver = r'Microsoft Access Driver (*.mdb, *.accdb)'
conn = pyodbc.connect(r'DRIVER={' + driver + '};DBQ=' + access_database + ';')

In [15]:
# Mapping accidents by lat lon gps coordinates. 
# Most accidents occurring during or after 2002 included GPS data, however most accidents occuring before 2002 did not.

sql_event_location = '''
SELECT ev_id, ntsb_no, ev_type, ev_date, latitude, longitude, ev_site_zipcode, 
ev_city, ev_state, inj_tot_f, inj_tot_s, inj_tot_m
FROM events
WHERE ev_country='USA'
'''

# Read data from database with pandas read_sql_query
df_event_locations = pd.read_sql_query(sql_event_location,conn)

In [16]:
# Data cleaning:

df_event_locations['ev_date'] = pd.to_datetime(df_event_locations['ev_date'])

for col in df_event_locations.columns:
    if df_event_locations[col].dtype == 'O':
        df_event_locations[col] = df_event_locations[col].str.strip()

# Latitudes and longitudes included a letter at the end, and needed to be transformed so they could be used for mapping       
lat_regex = r'(\d{6}N)'
lon_regex = r'(\d{7}W)'

df_event_locations = (df_event_locations[df_event_locations['latitude']
                     .astype(str)
                     .str.match(lat_regex)])
df_event_locations = (df_event_locations[df_event_locations['longitude']
                     .astype(str)                     
                     .str.match(lon_regex)])

df_event_locations['lat'] = (df_event_locations['latitude']
                            .str[:6]
                            .astype(int)/10000)

df_event_locations['lon'] = (df_event_locations['longitude']
                            .str[:7]
                            .astype(int)/-10000)

In [17]:
# Aircraft type information to include:

sql_aircraft_type = '''
SELECT ev_id, acft_make, acft_model, damage
FROM aircraft
'''
df_aircraft_type = pd.read_sql_query(sql_aircraft_type,conn)

df_event_locations = df_event_locations.merge(df_aircraft_type,how='left',
                                              left_on='ev_id',right_on='ev_id')

In [18]:
# Accident Cluster Map
# Due to large file size the default is to only look at accidents that took place during or after 2018

def make_cluster_map(start_date = '2018',end_date='2050'):

    accident_map = folium.Map(location = [45, -102], zoom_start=3)
    marker_cluster = MarkerCluster().add_to(accident_map)

    df_date_range = df_event_locations[(df_event_locations['ev_date'] >= start_date) & (df_event_locations['ev_date'] <= end_date)]
    
    for _, row in df_date_range.iterrows():
        lat = row['lat']
        lon = row['lon']
        if not pd.isnull(row['ev_date']):
            date = row['ev_date'].strftime('%B %d, %Y')
        else: date='Unknown'
        injuries = 'Injuries:'
        if not (pd.isnull(row['inj_tot_f'])):
            injuries = injuries + ' Fatal = ' + str(int(row['inj_tot_f']))
        if not (pd.isnull(row['inj_tot_s'])):
            injuries = injuries + ' Serious = ' + str(int(row['inj_tot_s']))
        if not (pd.isnull(row['inj_tot_m'])):
            injuries = injuries + ' Minor = ' + str(int(row['inj_tot_m']))
        if injuries == 'Injuries:':
            injuries = 'Injuries: None'
        if not (pd.isnull(row['acft_make']) or pd.isnull(row['acft_model'])): 
            aircraft = row['acft_make'] + ' ' + row['acft_model']
        popup_html = '<div style="min-width: 150px;">' + injuries + '<br/>NTSB #: ' + row['ntsb_no'] + '<br/>Date: ' + date + '<br/>Aircraft: ' + aircraft + '</div>'
        folium.Marker([lat,lon],popup=Popup(html=popup_html)).add_to(marker_cluster)
    return(accident_map)

m = make_cluster_map('2010')
m.save('aircraft_accident_map_2010_to_present.html')

In the above generated cluster map, it's quite easy to dive into different areas of the country to see where accidents occur, and to get additional information about those accidents by clicking on the popups. 

In [19]:
# Accident heat map

def make_heatmap(start_date = '2018',end_date='2050'):
    
    df_date_range = df_event_locations[(df_event_locations['ev_date'] >= start_date) & (df_event_locations['ev_date'] <= end_date)]
    
    accident_heatmap = folium.Map(location = [45, -102], zoom_start=3)
    heat_data = [[lat, lon] for lat, lon in zip(df_date_range['lat'],df_date_range['lon'])]
    plugins.HeatMap(heat_data,min_opacity=.5,radius=10).add_to(accident_heatmap)
    return(accident_heatmap)
make_heatmap(start_date='2010')

In the heatmap shown above, it's clear which parts of the country have higher accident rates.

In [20]:
# Accident heat map with time

def make_heatmap_over_time(start_date = '2002',end_date='2050'):
    
    df_date_range = df_event_locations[(df_event_locations['ev_date'] >= start_date) & (df_event_locations['ev_date'] <= end_date)]

#     df_date_range['year_month'] = df_date_range['ev_date'].dt.strftime('%Y%m')
    df_date_range['year_month'] = df_date_range['ev_date'].dt.strftime('%Y')
    yearmonths = df_date_range[['lat','lon','year_month']].groupby('year_month').agg(lambda x: list(x))
    heatdata = []
    for _, row in yearmonths.iterrows():
        heatdata.append([list(a) for a in zip(row['lat'],row['lon'])])

    accident_heatmap_over_time = folium.Map(location = [45, -102], zoom_start=3)
    index = list(yearmonths.index)
    plugins.HeatMapWithTime(heatdata,index=index,min_opacity=5,radius=5).add_to(accident_heatmap_over_time)

    return(accident_heatmap_over_time)

make_heatmap_over_time()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In the heatmap over time shown above, one can see how the rate of aircraft accidents varies over time around the country.