# Import Libraries

In [1]:
# Import necessary libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import requests
import html5lib
from bs4 import BeautifulSoup

# Grab and Parse Webpages for Data

In [2]:
# Grab webpages with usable data
#
# For each year, download the yearly index page and then every page linked
# from the first cell of each table row (the first row is skipped). The
# responses are collected in `pages`.
years = range(1921, 2026)
pages = []

def _fetch_page(session, url):
  """Return the response for `url`, or None if the request fails.

  Network errors and HTTP error statuses are reported and swallowed so that a
  single bad page does not abort the whole crawl. The response text is decoded
  as windows-1252.
  """
  try:
    r = session.get(url, timeout = 10)
    r.raise_for_status()
  except requests.RequestException as err:
    print(f'Could not load {url}: {err}')
    return None
  r.encoding = 'windows-1252'
  return r

# One session for the whole crawl so connections to the host are reused
with requests.Session() as session:

  for year in years:

    index_page = _fetch_page(session, f'https://www.planecrashinfo.com/{year}/{year}.htm')
    table = None
    if index_page is not None:
      table = BeautifulSoup(index_page.text, 'html5lib').find('table')

    if table is None:
      print(f"{year} didn't load properly")
      continue

    for row in table.find_all('tr')[1:]:

      # First cell contains the hyperlink; rows without a cell or link are skipped
      first_cell = row.find('td')
      link = first_cell.find('a', href = True) if first_cell is not None else None
      if link is None:
        continue

      href = link['href']
      page = _fetch_page(session, f'https://www.planecrashinfo.com/{year}/{href}')
      if page is not None:
        pages.append(page)

print(f'{len(pages)} pages loaded') # Should be 5011 if all pages load

1957
1979
1992
4817


In [6]:
# Parse webpages for relevant data
#
# `data` maps each field name to a list with exactly one entry per parsed page,
# so all lists keep the same length and can be turned into a DataFrame.
data = {
    'Date': [],
    'Time': [],
    'Location': [],
    'Operator': [],
    'Flight #': [],
    'Route': [],
    'AC Type': [],
    'Registration': [],
    'cn / ln': [],
    'Aboard': [],
    'Fatalities': [],
    'Ground': [],
    'Summary': []
}

unknown_features = set() # Labels found on pages that have no column in `data`

for page in pages:

  soup = BeautifulSoup(page.text, 'html5lib')
  table = soup.find('table')

  if table is None: # Page without a data table: nothing to record
    continue

  # Every field starts as missing; fields absent from the page stay None
  record = dict.fromkeys(data)

  for row in table.find_all('tr')[1:]:

    cells = row.find_all('td') # Each row should have 2 cells: label and value
    if len(cells) < 2:
      continue

    # Collapse line breaks / runs of whitespace inside the label, drop the trailing colon
    feature = ' '.join(cells[0].text.split()).strip(': ')
    if feature not in record:
      unknown_features.add(feature)
      continue

    raw_value = cells[1].text
    record[feature] = None if raw_value.strip() == '?' else raw_value # '?' represents missing data

  for feature, value in record.items():
    data[feature].append(value)

if unknown_features:
  print(f'Ignored unrecognised fields: {sorted(unknown_features)}')

# Create and Clean DataFrame

In [None]:
# Create DataFrame from dictionary
df = pd.DataFrame(data=data)

# Clean and reorganize data

# Remove commas from 'Date' and split it on spaces into month, day and year
df[['Month', 'Day', 'Year']] = df['Date'].str.replace(',', '', regex = False).str.split(' ', expand = True)

# 'Aboard' and 'Fatalities' each hold three whitespace-separated parts:
# the total, a passenger part and a crew part. The total is kept as-is; the
# other two are reduced to their digits ('?' is kept so it can be treated as
# missing further down).
count_columns = {
    'Aboard': ['Aboard', 'Passengers Aboard', 'Crew Aboard'],
    'Fatalities': ['Fatalities', 'Passenger Fatalities', 'Crew Fatalities']
}
for source_col, target_cols in count_columns.items():
  df[target_cols] = df[source_col].str.replace(r'\s+', ' ', regex = True).str.split(' ', expand = True)
  for detail_col in target_cols[1:]:
    df[detail_col] = df[detail_col].str.replace(r'[^\d?]', '', regex = True)

df = df.drop('Date', axis = 1)

# Use an explicit NaN: replace('?', None) is interpreted differently across
# pandas versions (older releases forward-fill instead of inserting a missing value)
df = df.replace('?', np.nan)

# Keep only incidents where every field is known.
# NOTE(review): this also drops rows whose only missing field is 'Operator',
# which is not part of the final column selection below - confirm that is intended.
df = df.dropna()

numerical_cols = [
    'Aboard', 'Passengers Aboard', 'Crew Aboard',
    'Fatalities', 'Passenger Fatalities', 'Crew Fatalities',
    'Ground'
]
# 'Day' and 'Year' are cast as well so the in-memory frame has the same dtypes
# as the frame obtained by reading the CSV back
df = df.astype({col: np.int64 for col in numerical_cols + ['Day', 'Year']})

new_index = ['Day', 'Month', 'Year', 'Time', 'Location', 'Route', 'Aboard',
             'Passengers Aboard', 'Crew Aboard', 'Fatalities',
             'Passenger Fatalities', 'Crew Fatalities', 'Ground', 'Flight #',
             'AC Type', 'Registration', 'cn / ln', 'Summary'
]
df = df[new_index]

df.to_csv('Global Aviation Incidents.csv', index = False)
df.info()

In [2]:
# Read the saved CSV back into `df` and print a summary of its columns
df = pd.read_csv(filepath_or_buffer = 'Global Aviation Incidents.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1223 entries, 0 to 1222
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Day                   1223 non-null   int64 
 1   Month                 1223 non-null   object
 2   Year                  1223 non-null   int64 
 3   Time                  1223 non-null   object
 4   Location              1223 non-null   object
 5   Route                 1223 non-null   object
 6   Aboard                1223 non-null   int64 
 7   Passengers Aboard     1223 non-null   int64 
 8   Crew Aboard           1223 non-null   int64 
 9   Fatalities            1223 non-null   int64 
 10  Passenger Fatalities  1223 non-null   int64 
 11  Crew Fatalities       1223 non-null   int64 
 12  Ground                1223 non-null   int64 
 13  Flight #              1223 non-null   object
 14  AC Type               1223 non-null   object
 15  Registration          1223 non-null   

# Exploratory Data Analysis (EDA)

In [3]:
# Preview the first five rows of the DataFrame
df.head(n = 5)

Unnamed: 0,Day,Month,Year,Time,Location,Route,Aboard,Passengers Aboard,Crew Aboard,Fatalities,Passenger Fatalities,Crew Fatalities,Ground,Flight #,AC Type,Registration,cn / ln,Summary
0,19,January,1930,1823,"Oceanside, California","Aqua Caliente, Mexico - Los Angeles",16,14,2,16,14,2,0,7,Ford 5-AT-C Tri Motor,NC9689,5-AT-046,"While en route to Los Angeles, the pilot, flyi..."
1,31,March,1931,1045,"Bazaar, Kansas",Kansas City - Wichita - Los Angeles,8,6,2,8,6,2,0,599,Fokker F10A Trimotor,NC-999,1063,"Shortly after taking off from Kansas City, one..."
2,31,August,1934,2245,"Amazonia, Missouri",Kansas City - Saint Joseph - Omaha,5,4,1,5,4,1,0,6,Stinson SM-6000B,NC11118,5004,The plane crashed about 11 miles from St. Jose...
3,6,May,1935,330,"Atlanta, Missouri",Los Angeles - Albuquerque - Kanasas City - Wa...,14,12,2,5,3,2,0,6,Douglas DC-2-112,NC13785,1295,The plane crashed while en route from Albuquer...
4,14,August,1935,2345,"Near Gilmer, Texas",Dallas - Atlanta,4,3,1,4,3,1,0,4,Stinson Model A,NC14599,9103,Crashed 3 miles south of Gilmer. The outboard ...


In [9]:
# Incidents whose summary mentions lightning.
# case = False also matches "Lightning" at the start of a sentence;
# na = False keeps the mask boolean if a summary is missing.
df[df['Summary'].str.contains('lightning', case = False, na = False)]

Unnamed: 0,Day,Month,Year,Time,Location,Route,Aboard,Passengers Aboard,Crew Aboard,Fatalities,Passenger Fatalities,Crew Fatalities,Ground,Flight #,AC Type,Registration,cn / ln,Summary
27,31,August,1940,1441,"Near Lovettsville, Virginia",Washington D.C. - Detroit,25,21,4,25,21,4,0,19,Douglas DC-3,NC21789,2188,The aircraft was flying through a thunderstorm...
142,14,June,1953,1635,"Near Zugdidi, Georgia",Krasnodar - Tbilisi,18,12,6,18,12,6,0,229,Ilyushin IL-2,CCCP-L1375,30103,Crashed after being struck by lightning. Poor ...
199,7,September,1958,318,"Kazakh, Russia",Frunze - Aktyubinsk - Uralsk - Moscow,27,22,5,27,22,5,0,164,Aeroflot,CCCP-L1692,7342304,The aircraft crashed shortly after being struc...
213,26,June,1959,1735,"Near Varese, Lombardia, 20 miles NW of Milan, ...",Milan - Paris,68,59,9,68,59,9,0,891,Lockheed 1649A Starliner,N7313C,1015,"While flying from Athens to Chicago, the No. 6..."
242,29,August,1960,647,"Off Dakar, Senegal",Paris - Dakar - Monrovia - Abidjan,63,55,8,63,55,8,0,343,Lockheed 1049G Super Constellation,F-BHBC,4622,The aircraft crashed into the Atlantic Ocean a...
319,12,August,1963,1419,"Lyon, France",Lille - Lyon,20,16,4,20,16,4,1,2611,Vickers Viscount 708,F-BGNV,039,Struck by lightning while on initial approach ...
325,8,December,1963,2059,"Elkton, Maryland",Washington - Baltimore - Philadelphia,81,73,8,81,73,8,0,214,Boeing B-707-121,N709PA,17588/3,The airliner was on a flight from Baltimore to...
421,3,May,1968,1548,"Near Dawson, Texas",Houston - Dallas-Fort Worth,85,80,5,85,80,5,0,352,Lockheed L188A Electra,N9707C,1099,On a flight from Houston to Dallas the airline...
501,24,December,1971,1236,"Near Puerto Inca, Huanuco, Peru",Lima - Pucallpa,92,86,6,91,85,6,0,508,Lockheed 188A Electra,OB-R-941,1086,The aircraft was struck by lightning after it ...
540,23,July,1973,1643,"St. Louis, Missouri","Marion, IL - St. Louis",44,41,3,38,37,1,0,809,Fairchild-Hiller FH-227B,N4215,513,The aircraft crashed after attempting to land ...
