In [1]:
import pandas as pd
import warnings
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('data/Aviation_Data.csv')
df.head()

Unnamed: 0,Event.Id,Investigation.Type,Accident.Number,Event.Date,Location,Country,Latitude,Longitude,Airport.Code,Airport.Name,...,Purpose.of.flight,Air.carrier,Total.Fatal.Injuries,Total.Serious.Injuries,Total.Minor.Injuries,Total.Uninjured,Weather.Condition,Broad.phase.of.flight,Report.Status,Publication.Date
0,20001218X45444,Accident,SEA87LA080,1948-10-24,"MOOSE CREEK, ID",United States,,,,,...,Personal,,2.0,0.0,0.0,0.0,UNK,Cruise,Probable Cause,
1,20001218X45447,Accident,LAX94LA336,1962-07-19,"BRIDGEPORT, CA",United States,,,,,...,Personal,,4.0,0.0,0.0,0.0,UNK,Unknown,Probable Cause,19-09-1996
2,20061025X01555,Accident,NYC07LA005,1974-08-30,"Saltville, VA",United States,36.9222,-81.8781,,,...,Personal,,3.0,,,,IMC,Cruise,Probable Cause,26-02-2007
3,20001218X45448,Accident,LAX96LA321,1977-06-19,"EUREKA, CA",United States,,,,,...,Personal,,2.0,0.0,0.0,0.0,IMC,Cruise,Probable Cause,12-09-2000
4,20041105X01764,Accident,CHI79FA064,1979-08-02,"Canton, OH",United States,,,,,...,Personal,,1.0,2.0,,0.0,VMC,Approach,Probable Cause,16-04-1980


In [3]:
#drop columns with more than 20% nans 
nan_cols = list(df.loc[:,df.isna().sum()/len(df) > .2].columns) 

#drop uninformative columns
nan_cols.extend(['Event.Id', 'Investigation.Type', 'Accident.Number', 'Injury.Severity', 'Registration.Number', 'Report.Status', 'Publication.Date'])
nan_cols

['Latitude',
 'Longitude',
 'Airport.Code',
 'Airport.Name',
 'Aircraft.Category',
 'FAR.Description',
 'Schedule',
 'Air.carrier',
 'Broad.phase.of.flight',
 'Event.Id',
 'Investigation.Type',
 'Accident.Number',
 'Injury.Severity',
 'Registration.Number',
 'Report.Status',
 'Publication.Date']

In [4]:
# get a list of columns to keep
non_nan_cols = [x for x in df.columns if x not in nan_cols]
non_nan_cols

['Event.Date',
 'Location',
 'Country',
 'Aircraft.damage',
 'Make',
 'Model',
 'Amateur.Built',
 'Number.of.Engines',
 'Engine.Type',
 'Purpose.of.flight',
 'Total.Fatal.Injuries',
 'Total.Serious.Injuries',
 'Total.Minor.Injuries',
 'Total.Uninjured',
 'Weather.Condition']

In [5]:
# remove columns from df
df = df[non_nan_cols].copy()
df.head()

Unnamed: 0,Event.Date,Location,Country,Aircraft.damage,Make,Model,Amateur.Built,Number.of.Engines,Engine.Type,Purpose.of.flight,Total.Fatal.Injuries,Total.Serious.Injuries,Total.Minor.Injuries,Total.Uninjured,Weather.Condition
0,1948-10-24,"MOOSE CREEK, ID",United States,Destroyed,Stinson,108-3,No,1.0,Reciprocating,Personal,2.0,0.0,0.0,0.0,UNK
1,1962-07-19,"BRIDGEPORT, CA",United States,Destroyed,Piper,PA24-180,No,1.0,Reciprocating,Personal,4.0,0.0,0.0,0.0,UNK
2,1974-08-30,"Saltville, VA",United States,Destroyed,Cessna,172M,No,1.0,Reciprocating,Personal,3.0,,,,IMC
3,1977-06-19,"EUREKA, CA",United States,Destroyed,Rockwell,112,No,1.0,Reciprocating,Personal,2.0,0.0,0.0,0.0,IMC
4,1979-08-02,"Canton, OH",United States,Destroyed,Cessna,501,No,,,Personal,1.0,2.0,,0.0,VMC


In [6]:
# Check if any of the injury columns have true values. If any do, fill nans with 0s. 
# If all columns are nans, assume the data was not logged and keep them as nans. 
injury_cols = ['Total.Fatal.Injuries', 'Total.Serious.Injuries', 'Total.Minor.Injuries', 'Total.Uninjured']
injury_data_exists = df[injury_cols].apply(lambda col: any(col), axis = 1)

for col in injury_cols:
    df.loc[injury_data_exists & df[col].isna(), col] = 0

In [7]:
#Make note of dividing by zero situation in pecert.Injured and Percent.Died / Number of Passengers

df['Total.Passengers'] = df['Total.Fatal.Injuries'] + df['Total.Serious.Injuries'] + df['Total.Minor.Injuries']+ df['Total.Uninjured']
df['Total.Injured'] = df['Total.Fatal.Injuries'] + df['Total.Serious.Injuries'] + df['Total.Minor.Injuries']
df['Percent.Injured'] = (df['Total.Injured'] / df['Total.Passengers']) * 100
df['Percent.Uninjured'] = (df['Total.Uninjured'] / df['Total.Passengers']) * 100
df['Percent.Died'] = (df['Total.Fatal.Injuries'] / df['Total.Passengers']) * 100
df.loc[df['Total.Passengers'] == 0, ['Percent.Injured','Percent.Died','Percent.Uninjured']] = 0

In [8]:
df['Airplane.Type'] = df['Total.Passengers'].apply(lambda x: 'Private' if x < 50 else 'Commercial')

In [9]:
# Remove rows where Amateur.Built is Yes or NaN, then remove Amateur.Built column
df = df.drop(df.loc[(df['Amateur.Built']=='Yes') |( df['Amateur.Built'].isna())].index)
df.reset_index(drop = True, inplace = True)
df.drop(columns = 'Amateur.Built', inplace = True)
df.head()

Unnamed: 0,Event.Date,Location,Country,Aircraft.damage,Make,Model,Number.of.Engines,Engine.Type,Purpose.of.flight,Total.Fatal.Injuries,Total.Serious.Injuries,Total.Minor.Injuries,Total.Uninjured,Weather.Condition,Total.Passengers,Total.Injured,Percent.Injured,Percent.Uninjured,Percent.Died,Airplane.Type
0,1948-10-24,"MOOSE CREEK, ID",United States,Destroyed,Stinson,108-3,1.0,Reciprocating,Personal,2.0,0.0,0.0,0.0,UNK,2.0,2.0,100.0,0.0,100.0,Private
1,1962-07-19,"BRIDGEPORT, CA",United States,Destroyed,Piper,PA24-180,1.0,Reciprocating,Personal,4.0,0.0,0.0,0.0,UNK,4.0,4.0,100.0,0.0,100.0,Private
2,1974-08-30,"Saltville, VA",United States,Destroyed,Cessna,172M,1.0,Reciprocating,Personal,3.0,0.0,0.0,0.0,IMC,3.0,3.0,100.0,0.0,100.0,Private
3,1977-06-19,"EUREKA, CA",United States,Destroyed,Rockwell,112,1.0,Reciprocating,Personal,2.0,0.0,0.0,0.0,IMC,2.0,2.0,100.0,0.0,100.0,Private
4,1979-08-02,"Canton, OH",United States,Destroyed,Cessna,501,,,Personal,1.0,2.0,0.0,0.0,VMC,3.0,3.0,100.0,0.0,33.333333,Private


In [10]:
# Clean Weather Condition
df['Weather.Condition'].replace({'Unk':'Unknown','UNK':'Unknown','VMC':'Visual Meteorological Conditions',\
                                 'IMC':'Instrumental Meteorological Conditions'}, inplace = True)
df['Weather.Condition'].fillna('Unknown', inplace = True)
df['Weather.Condition'].value_counts(dropna=False)

Visual Meteorological Conditions          69084
Instrumental Meteorological Conditions     5831
Unknown                                    5397
Name: Weather.Condition, dtype: int64

In [11]:
# Check if Number.of.Engines = 0 corresponds to no Engine.Type
df[(df['Number.of.Engines']==0) & ((df['Engine.Type']!='NONE') & (df['Engine.Type'].notna())\
                                   & (df['Engine.Type'] != 'Unknown'))]

Unnamed: 0,Event.Date,Location,Country,Aircraft.damage,Make,Model,Number.of.Engines,Engine.Type,Purpose.of.flight,Total.Fatal.Injuries,Total.Serious.Injuries,Total.Minor.Injuries,Total.Uninjured,Weather.Condition,Total.Passengers,Total.Injured,Percent.Injured,Percent.Uninjured,Percent.Died,Airplane.Type
21149,1989-03-31,"GREAT HARBOR, Bahamas",Bahamas,Destroyed,Cessna,421B,0.0,Reciprocating,Unknown,1.0,0.0,0.0,1.0,Visual Meteorological Conditions,2.0,1.0,50.0,50.0,50.0,Private
67210,2013-09-02,"Hampshire, IL",United States,Substantial,SCHWEIZER,SGS 2 33A,0.0,,Instructional,0.0,0.0,0.0,1.0,Visual Meteorological Conditions,1.0,0.0,0.0,100.0,0.0,Private
67790,2014-03-15,"Wrightwood, CA",United States,Substantial,DG FLUGZEUGBAU GMBH,DG 1000S,0.0,,Personal,0.0,1.0,0.0,1.0,Visual Meteorological Conditions,2.0,1.0,50.0,50.0,0.0,Private
68251,2014-07-04,"Nephi, UT",United States,Substantial,CENTRAIR,101A,0.0,,Personal,0.0,1.0,0.0,0.0,Visual Meteorological Conditions,1.0,1.0,100.0,0.0,0.0,Private
69487,2015-06-14,"Pell City, AL",United States,Substantial,WEATHERLY,201B,0.0,Reciprocating,Aerial Application,0.0,0.0,0.0,1.0,Visual Meteorological Conditions,1.0,0.0,0.0,100.0,0.0,Private
72160,2017-04-22,"Park City, UT",United States,Substantial,BALONY KUBICEK SPOL SRO,BB70Z,0.0,,Business,0.0,0.0,0.0,9.0,Visual Meteorological Conditions,9.0,0.0,0.0,100.0,0.0,Private
72596,2017-07-21,"Deatsville, AL",United States,Substantial,LET,L23,0.0,,Instructional,0.0,1.0,0.0,0.0,Visual Meteorological Conditions,1.0,1.0,100.0,0.0,0.0,Private
73491,2018-03-28,"Cave Creek, AZ",United States,Destroyed,ULTRAMAGIC SA,N 300,0.0,,Business,0.0,0.0,0.0,13.0,Visual Meteorological Conditions,13.0,0.0,0.0,100.0,0.0,Private
73682,2018-05-13,"Tunbridge, VT",United States,Substantial,SCHWEIZER,SGS123,0.0,,Personal,0.0,0.0,0.0,1.0,Visual Meteorological Conditions,1.0,0.0,0.0,100.0,0.0,Private
77888,2021-05-16,"Hampshire, IL",United States,Substantial,I.C.A.-BRASOV (ROMANIA),IS-28B2,0.0,,Instructional,0.0,1.0,1.0,0.0,Visual Meteorological Conditions,2.0,2.0,100.0,0.0,0.0,Private


In [12]:
# Replace Number.of.Engines = 0 with Number.of.Engines = Unknown where Engine.Type exists.
#Drop Unknown or null values for number of Engines
replace_indx = list(df[(df['Number.of.Engines']==0) 
                    & ((df['Engine.Type']!='NONE') 
                       & (df['Engine.Type'].notna()) 
                       & (df['Engine.Type'] != 'Unknown'))].index)

df.loc[replace_indx, 'Number.of.Engines'] = 'Unknown'
df.dropna(subset=['Number.of.Engines'],axis=0,inplace=True)
df = df[df['Number.of.Engines'] != 'Unknown']

In [13]:
# Clean Engine Type
df['Engine.Type'] = df.apply(lambda row: 'None' if row['Number.of.Engines']==0 else row['Engine.Type'], axis = 1)
df['Engine.Type'].fillna('UNK', inplace = True)
df['Engine.Type'].replace({'UNK':'Unknown'}, inplace = True)
df['Engine.Type'].value_counts(dropna=False)

Reciprocating      61261
Turbo Shaft         3501
Turbo Prop          3249
Turbo Fan           2369
Unknown             2353
None                1159
Turbo Jet            682
Geared Turbofan       11
Electric               8
LR                     2
Hybrid Rocket          1
Name: Engine.Type, dtype: int64

In [14]:
# Aircraft Damage
df['Aircraft.damage'].fillna('Unknown', inplace = True)
df['Aircraft.damage'].value_counts(dropna = False)

Substantial    55477
Destroyed      14888
Minor           2175
Unknown         2056
Name: Aircraft.damage, dtype: int64

In [15]:
#Transform the Event Date into three separate Date columns that specify day, month and year of the incident.
#Dropping 'Event.Date' once the columns are created:

df['Event.Date'] = pd.to_datetime(df['Event.Date'])
df['Event.Day'] = df['Event.Date'].map(lambda x: x.day)
df['Event.Month'] = df['Event.Date'].map(lambda x: x.month)
df['Event.Year'] = df['Event.Date'].map(lambda x: x.year)
df.drop(['Event.Date'], axis=1, inplace=True)

In [16]:
#Grouping all the Countries outside of the US as "Foreign Country"
df.loc[df['Country'] != 'United States', 'Country'] = 'Foreign Country'

In [17]:
#Filling the Null Values with "UNKNOWN" the the Location Column
df['Location'] = df['Location'].fillna('UNKNOWN')

# Adjusting the Column so it shows code of the corresponding State, if it's Unknown or outside of the US, label accordingly
df['Location'] = df['Location'].apply(lambda location: location.split(', ')[-1] if \
                                      len(location.split(', ')) > 1 and len(location.split(', ')[-1]) == 2 else \
                                      'Unknown/Foreign Location')

In [18]:
#Cleaning the 'Make' Column
characters_to_remove = ['(', ')', ',', '.', '%', '?','-']
df['Make'] = df['Make'].str.title().fillna('UNKNOWN')
df['Make'] = df['Make'].map(lambda x: ''.join(char for char in x if char not in characters_to_remove))
names_var = {'Boeing': 'Boeing', 'Airbus':'Airbus','Douglas':'Mcdonnel Douglas', \
             'Air Tractor':'Air Tractor','Embraer':'Embraer','Bombardier':'Bombardier'}
for key, value in names_var.items():
    df.loc[df['Make'].str.contains(key), 'Make'] = value
top_50_makes = df['Make'].value_counts().index[:50]
df = df[df['Make'].isin(top_50_makes)]

In [19]:
#Replacing NaN values for 'Unknown in 'Purpose of flight':
df['Purpose.of.flight'].fillna('Unknown', inplace=True)

In [20]:
#Cleaning the "Model" column

df['Model'] = df['Model'].str.title().fillna('UNKNOWN')

In [21]:
df_priv = df[df['Airplane.Type'] == 'Private'].reset_index(drop = True)
df_comm = df[df['Airplane.Type'] == 'Commercial'].reset_index(drop = True)

In [22]:
df_priv.to_csv('Private Airplanes cleaned.csv')
df_comm.to_csv('Commercial Airplanes cleaned.csv')
df.to_csv('Aviation Data Cleaned.csv')