## Phase 1 Project: Initial Doodling with the Dataset

This notebook is a first look at the dataset: which columns it contains, how much of it is missing, and a few quick value counts. Data cleaning and the real EDA will be done in separate notebooks.

In [2]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns


In [3]:
import os

In [8]:
# Read the raw CSV once and keep that frame untouched in `df_original`;
# the exploration in this notebook runs on the working copy `df`.
# The file is decoded as Latin-1. low_memory=False tells pandas not to parse
# the file in chunks, which avoids mixed-type inference within a column.
df_original = pd.read_csv(
    '../Data/AviationData.csv',
    encoding='latin1',
    low_memory=False,
)
df = df_original.copy()

# Column names, non-null counts and dtypes at a glance.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88889 entries, 0 to 88888
Data columns (total 31 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Event.Id                88889 non-null  object 
 1   Investigation.Type      88889 non-null  object 
 2   Accident.Number         88889 non-null  object 
 3   Event.Date              88889 non-null  object 
 4   Location                88837 non-null  object 
 5   Country                 88663 non-null  object 
 6   Latitude                34382 non-null  object 
 7   Longitude               34373 non-null  object 
 8   Airport.Code            50132 non-null  object 
 9   Airport.Name            52704 non-null  object 
 10  Injury.Severity         87889 non-null  object 
 11  Aircraft.damage         85695 non-null  object 
 12  Aircraft.Category       32287 non-null  object 
 13  Registration.Number     87507 non-null  object 
 14  Make                    88826 non-null

In [19]:
# Peek at the first five rows (DataFrame.head() defaults to n=5).
df.head()

Unnamed: 0,Event.Id,Investigation.Type,Accident.Number,Event.Date,Location,Country,Latitude,Longitude,Airport.Code,Airport.Name,...,Purpose.of.flight,Air.carrier,Total.Fatal.Injuries,Total.Serious.Injuries,Total.Minor.Injuries,Total.Uninjured,Weather.Condition,Broad.phase.of.flight,Report.Status,Publication.Date
0,20001218X45444,Accident,SEA87LA080,1948-10-24,"MOOSE CREEK, ID",United States,,,,,...,Personal,,2.0,0.0,0.0,0.0,UNK,Cruise,Probable Cause,
1,20001218X45447,Accident,LAX94LA336,1962-07-19,"BRIDGEPORT, CA",United States,,,,,...,Personal,,4.0,0.0,0.0,0.0,UNK,Unknown,Probable Cause,19-09-1996
2,20061025X01555,Accident,NYC07LA005,1974-08-30,"Saltville, VA",United States,36.922223,-81.878056,,,...,Personal,,3.0,,,,IMC,Cruise,Probable Cause,26-02-2007
3,20001218X45448,Accident,LAX96LA321,1977-06-19,"EUREKA, CA",United States,,,,,...,Personal,,2.0,0.0,0.0,0.0,IMC,Cruise,Probable Cause,12-09-2000
4,20041105X01764,Accident,CHI79FA064,1979-08-02,"Canton, OH",United States,,,,,...,Personal,,1.0,2.0,,0.0,VMC,Approach,Probable Cause,16-04-1980


In [20]:
# Peek at the last five rows (DataFrame.tail() defaults to n=5).
df.tail()

Unnamed: 0,Event.Id,Investigation.Type,Accident.Number,Event.Date,Location,Country,Latitude,Longitude,Airport.Code,Airport.Name,...,Purpose.of.flight,Air.carrier,Total.Fatal.Injuries,Total.Serious.Injuries,Total.Minor.Injuries,Total.Uninjured,Weather.Condition,Broad.phase.of.flight,Report.Status,Publication.Date
88884,20221227106491,Accident,ERA23LA093,2022-12-26,"Annapolis, MD",United States,,,,,...,Personal,,0.0,1.0,0.0,0.0,,,,29-12-2022
88885,20221227106494,Accident,ERA23LA095,2022-12-26,"Hampton, NH",United States,,,,,...,,,0.0,0.0,0.0,0.0,,,,
88886,20221227106497,Accident,WPR23LA075,2022-12-26,"Payson, AZ",United States,341525N,1112021W,PAN,PAYSON,...,Personal,,0.0,0.0,0.0,1.0,VMC,,,27-12-2022
88887,20221227106498,Accident,WPR23LA076,2022-12-26,"Morgan, UT",United States,,,,,...,Personal,MC CESSNA 210N LLC,0.0,0.0,0.0,0.0,,,,
88888,20221230106513,Accident,ERA23LA097,2022-12-29,"Athens, GA",United States,,,,,...,Personal,,0.0,1.0,0.0,1.0,,,,30-12-2022


In [18]:
# List every column label in the frame.
df.columns

Index(['Event.Id', 'Investigation.Type', 'Accident.Number', 'Event.Date',
       'Location', 'Country', 'Latitude', 'Longitude', 'Airport.Code',
       'Airport.Name', 'Injury.Severity', 'Aircraft.damage',
       'Aircraft.Category', 'Registration.Number', 'Make', 'Model',
       'Amateur.Built', 'Number.of.Engines', 'Engine.Type', 'FAR.Description',
       'Schedule', 'Purpose.of.flight', 'Air.carrier', 'Total.Fatal.Injuries',
       'Total.Serious.Injuries', 'Total.Minor.Injuries', 'Total.Uninjured',
       'Weather.Condition', 'Broad.phase.of.flight', 'Report.Status',
       'Publication.Date'],
      dtype='object')

In [23]:
# Summary statistics. With default arguments describe() only covers the
# numeric columns, so the object (string) columns are not summarised here.
df.describe()

Unnamed: 0,Number.of.Engines,Total.Fatal.Injuries,Total.Serious.Injuries,Total.Minor.Injuries,Total.Uninjured
count,82805.0,77488.0,76379.0,76956.0,82977.0
mean,1.146585,0.647855,0.279881,0.357061,5.32544
std,0.44651,5.48596,1.544084,2.235625,27.913634
min,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.0,0.0,0.0,0.0
50%,1.0,0.0,0.0,0.0,1.0
75%,1.0,0.0,0.0,0.0,2.0
max,8.0,349.0,161.0,380.0,699.0


In [21]:
# Dataset-wide totals for the two most severe injury columns.
# Series.sum() skips NaN by default, so rows with a missing count contribute nothing.
fatal_total = df['Total.Fatal.Injuries'].sum()
serious_total = df['Total.Serious.Injuries'].sum()

print("Total fatal injuries:", fatal_total)
print("Total serious injuries:", serious_total)

Total fatal injuries: 50201.0
Total serious injuries: 21377.0


In [22]:
# Frequency of each aircraft category. value_counts() drops NaN by default,
# so rows with a missing category are not counted here.
df['Aircraft.Category'].value_counts()

Aircraft.Category
Airplane             27617
Helicopter            3440
Glider                 508
Balloon                231
Gyrocraft              173
Weight-Shift           161
Powered Parachute       91
Ultralight              30
Unknown                 14
WSFT                     9
Powered-Lift             5
Blimp                    4
UNK                      2
Rocket                   1
ULTR                     1
Name: count, dtype: int64

In [9]:
# How many rows have a missing Aircraft.Category (True) versus a recorded one (False).
df['Aircraft.Category'].isna().value_counts()

Aircraft.Category
True     56602
False    32287
Name: count, dtype: int64

In [10]:
# Rows whose Aircraft.Category is missing. isna() already returns a boolean
# mask, so it is used directly as the row selector (no `== True` comparison).
df.loc[df['Aircraft.Category'].isna()]

Unnamed: 0,Event.Id,Investigation.Type,Accident.Number,Event.Date,Location,Country,Latitude,Longitude,Airport.Code,Airport.Name,...,Purpose.of.flight,Air.carrier,Total.Fatal.Injuries,Total.Serious.Injuries,Total.Minor.Injuries,Total.Uninjured,Weather.Condition,Broad.phase.of.flight,Report.Status,Publication.Date
0,20001218X45444,Accident,SEA87LA080,1948-10-24,"MOOSE CREEK, ID",United States,,,,,...,Personal,,2.0,0.0,0.0,0.0,UNK,Cruise,Probable Cause,
1,20001218X45447,Accident,LAX94LA336,1962-07-19,"BRIDGEPORT, CA",United States,,,,,...,Personal,,4.0,0.0,0.0,0.0,UNK,Unknown,Probable Cause,19-09-1996
2,20061025X01555,Accident,NYC07LA005,1974-08-30,"Saltville, VA",United States,36.922223,-81.878056,,,...,Personal,,3.0,,,,IMC,Cruise,Probable Cause,26-02-2007
3,20001218X45448,Accident,LAX96LA321,1977-06-19,"EUREKA, CA",United States,,,,,...,Personal,,2.0,0.0,0.0,0.0,IMC,Cruise,Probable Cause,12-09-2000
4,20041105X01764,Accident,CHI79FA064,1979-08-02,"Canton, OH",United States,,,,,...,Personal,,1.0,2.0,,0.0,VMC,Approach,Probable Cause,16-04-1980
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88883,20221228106502,Accident,GAA23WA046,2022-12-22,"Brasnorte,",Brazil,,,,,...,,,1.0,0.0,0.0,0.0,,,,28-12-2022
88884,20221227106491,Accident,ERA23LA093,2022-12-26,"Annapolis, MD",United States,,,,,...,Personal,,0.0,1.0,0.0,0.0,,,,29-12-2022
88885,20221227106494,Accident,ERA23LA095,2022-12-26,"Hampton, NH",United States,,,,,...,,,0.0,0.0,0.0,0.0,,,,
88887,20221227106498,Accident,WPR23LA076,2022-12-26,"Morgan, UT",United States,,,,,...,Personal,MC CESSNA 210N LLC,0.0,0.0,0.0,0.0,,,,


In [15]:
# Keep only the rows whose Investigation.Type is exactly 'Incident'.
incident_mask = df['Investigation.Type'] == 'Incident'
df.loc[incident_mask]

Unnamed: 0,Event.Id,Investigation.Type,Accident.Number,Event.Date,Location,Country,Latitude,Longitude,Airport.Code,Airport.Name,...,Purpose.of.flight,Air.carrier,Total.Fatal.Injuries,Total.Serious.Injuries,Total.Minor.Injuries,Total.Uninjured,Weather.Condition,Broad.phase.of.flight,Report.Status,Publication.Date
23,20020917X02333,Incident,LAX82IA034,1982-01-03,"VAN NUYS, CA",United States,,,VNY,VAN NUYS,...,Personal,,0.0,0.0,0.0,1.0,VMC,Approach,Probable Cause,03-01-1983
40,20020917X01764,Incident,ATL82IA029,1982-01-05,"PENSACOLA, FL",United States,,,,,...,Business,,0.0,0.0,0.0,1.0,VMC,Cruise,Probable Cause,05-01-1983
79,20020917X01897,Incident,CHI82IA026,1982-01-12,"CHICAGO, IL",United States,,,ORD,CHICAGO O'HARE INTER'L,...,Unknown,Trans World Airlines,0.0,0.0,0.0,149.0,UNK,Cruise,Probable Cause,12-01-1983
80,20020917X01765,Incident,ATL82IA034,1982-01-12,"CLARKSBURG, WV",United States,,,CKB,BENEDUM,...,Unknown,Aeromech Incorporated,0.0,0.0,0.0,2.0,VMC,Taxi,Probable Cause,12-01-1983
119,20020917X01766,Incident,ATL82IA038,1982-01-19,"WASHINGTON, DC",United States,,,IAD,,...,Ferry,,0.0,0.0,0.0,1.0,IMC,Descent,Probable Cause,19-01-1983
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88809,20221125106356,Incident,DCA23WA074,2022-11-21,"Maturin,",Venezuela,,,,,...,,,0.0,0.0,0.0,71.0,,,,25-11-2022
88819,20221125106362,Incident,DCA23WA076,2022-11-24,"Maiquetía,",Venezuela,,,,,...,,"Turpial Airlines, C.A.",0.0,0.0,0.0,73.0,,,,25-11-2022
88821,20221125106357,Incident,DCA23WA075,2022-11-25,"Breslau,",Canada,,,,,...,,Flair Airlines Ltd,0.0,0.0,0.0,140.0,,,,25-11-2022
88826,20221222106484,Incident,DCA23WA099,2022-11-26,"Bangkok,",Thailand,,,,,...,,,0.0,0.0,0.0,186.0,,,,22-12-2022


In [24]:
# Count the missing (NaN) entries in every column, then display the result.
missing_per_column = df.isna().sum()
missing_per_column

Event.Id                      0
Investigation.Type            0
Accident.Number               0
Event.Date                    0
Location                     52
Country                     226
Latitude                  54507
Longitude                 54516
Airport.Code              38757
Airport.Name              36185
Injury.Severity            1000
Aircraft.damage            3194
Aircraft.Category         56602
Registration.Number        1382
Make                         63
Model                        92
Amateur.Built               102
Number.of.Engines          6084
Engine.Type                7096
FAR.Description           56866
Schedule                  76307
Purpose.of.flight          6192
Air.carrier               72241
Total.Fatal.Injuries      11401
Total.Serious.Injuries    12510
Total.Minor.Injuries      11933
Total.Uninjured            5912
Weather.Condition          4492
Broad.phase.of.flight     27165
Report.Status              6384
Publication.Date          13771
dtype: i

In [32]:
# Checking number of different aircraft manufacturers in the data:
# value_counts() gives the row count per Make, and the length of the result
# is the number of distinct makes.
# The counts are case-sensitive, so e.g. 'Cessna' and 'CESSNA' are tallied
# as separate makes.
df['Make'].value_counts()

Make
Cessna             22227
Piper              12029
CESSNA              4922
Beech               4330
PIPER               2841
                   ...  
Leonard Walters        1
Maule Air Inc.         1
Motley Vans            1
Perlick                1
ROYSE RALPH L          1
Name: count, Length: 8237, dtype: int64

In [50]:
# String cleaning test: lower-case every Make value so that spellings which
# differ only by case collapse into one, then write the count per make
# (with the make as the index column) to aircraft_makes.csv.
df_make = df['Make'].str.lower()

df_make.value_counts().to_csv('aircraft_makes.csv', index=True)

In [52]:
# Engine number value counts: how many records report each engine count.
# value_counts() skips NaN by default, so rows with no engine count are not listed.
df['Number.of.Engines'].value_counts()

Number.of.Engines
1.0    69582
2.0    11079
0.0     1226
3.0      483
4.0      431
8.0        3
6.0        1
Name: count, dtype: int64

In [61]:
# Rows where Total.Uninjured is exactly 0, i.e. nobody was recorded as uninjured.
# NOTE: this is not the same as "no injuries" -- these rows can (and do) carry
# fatal, serious or minor injuries. Rows with a missing Total.Uninjured are
# excluded, because NaN == 0 evaluates to False.
df.loc[df['Total.Uninjured'] == 0]

Unnamed: 0,Event.Id,Investigation.Type,Accident.Number,Event.Date,Location,Country,Latitude,Longitude,Airport.Code,Airport.Name,...,Purpose.of.flight,Air.carrier,Total.Fatal.Injuries,Total.Serious.Injuries,Total.Minor.Injuries,Total.Uninjured,Weather.Condition,Broad.phase.of.flight,Report.Status,Publication.Date
0,20001218X45444,Accident,SEA87LA080,1948-10-24,"MOOSE CREEK, ID",United States,,,,,...,Personal,,2.0,0.0,0.0,0.0,UNK,Cruise,Probable Cause,
1,20001218X45447,Accident,LAX94LA336,1962-07-19,"BRIDGEPORT, CA",United States,,,,,...,Personal,,4.0,0.0,0.0,0.0,UNK,Unknown,Probable Cause,19-09-1996
3,20001218X45448,Accident,LAX96LA321,1977-06-19,"EUREKA, CA",United States,,,,,...,Personal,,2.0,0.0,0.0,0.0,IMC,Cruise,Probable Cause,12-09-2000
4,20041105X01764,Accident,CHI79FA064,1979-08-02,"Canton, OH",United States,,,,,...,Personal,,1.0,2.0,,0.0,VMC,Approach,Probable Cause,16-04-1980
6,20001218X45446,Accident,CHI81LA106,1981-08-01,"COTTON, MN",United States,,,,,...,Personal,,4.0,0.0,0.0,0.0,IMC,Unknown,Probable Cause,06-11-2001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88881,20221221106483,Accident,CEN23LA067,2022-12-21,"Auburn Hills, MI",United States,,,,,...,Personal,Pilot,0.0,1.0,0.0,0.0,,,,22-12-2022
88883,20221228106502,Accident,GAA23WA046,2022-12-22,"Brasnorte,",Brazil,,,,,...,,,1.0,0.0,0.0,0.0,,,,28-12-2022
88884,20221227106491,Accident,ERA23LA093,2022-12-26,"Annapolis, MD",United States,,,,,...,Personal,,0.0,1.0,0.0,0.0,,,,29-12-2022
88885,20221227106494,Accident,ERA23LA095,2022-12-26,"Hampton, NH",United States,,,,,...,,,0.0,0.0,0.0,0.0,,,,


In [62]:
# Null value check: rows where Total.Uninjured is missing (NaN).
df.loc[df['Total.Uninjured'].isna()]

Unnamed: 0,Event.Id,Investigation.Type,Accident.Number,Event.Date,Location,Country,Latitude,Longitude,Airport.Code,Airport.Name,...,Purpose.of.flight,Air.carrier,Total.Fatal.Injuries,Total.Serious.Injuries,Total.Minor.Injuries,Total.Uninjured,Weather.Condition,Broad.phase.of.flight,Report.Status,Publication.Date
2,20061025X01555,Accident,NYC07LA005,1974-08-30,"Saltville, VA",United States,36.922223,-81.878056,,,...,Personal,,3.0,,,,IMC,Cruise,Probable Cause,26-02-2007
293,20020917X02136,Accident,FTW82FRA16,1982-02-11,"BRENHAM, TX",United States,,,,BRENHAM MUNICIPAL,...,Personal,,1.0,,,,VMC,Takeoff,Probable Cause,30-10-2017
593,20080417X00504,Accident,MIA08CA076,1982-03-16,"MOBILE, AL",United States,30.757778,-88.355555,MOB,MOBILE REGIONAL,...,Unknown,,1.0,,,,IMC,Landing,Probable Cause,30-04-2008
1970,20020917X02985,Accident,CHI82DA251,1982-07-16,"WASHINGTON ISLAND, WI",United States,,,3W12,WASHINGTON IS,...,Personal,,,1.0,3.0,,VMC,Landing,Probable Cause,01-02-2016
2099,20020917X02847,Accident,ATL82FA212,1982-07-25,"LOUISBURG, NC",United States,,,LFN,FRANKLIN COUNTY,...,Skydiving,,4.0,1.0,,,VMC,Takeoff,Probable Cause,17-10-2016
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63892,20080229X00252,Accident,MIA08WA058,2007-12-26,"OCUMARE DEL TUY, VENEZUELA",Venezuela,10.121667,-66.771667,,,...,Personal,,,,1.0,,VMC,,Foreign,29-02-2008
63899,20080108X00030,Accident,CHI08FA055,2007-12-27,"Traverse City, MI",United States,44.737222,-85.5275,TVC,Cherry Capital Airport,...,Personal,,1.0,1.0,,,IMC,Approach,Probable Cause,05-08-2011
63902,20080117X00071,Accident,LAX08WA045,2007-12-29,"WESTGATE BRIDGE, AUSTRALIA",Australia,-37.829723,144.896389,,,...,Instructional,,1.0,1.0,,,VMC,,Foreign,17-01-2008
63907,20071231X02014,Accident,DFW08FA053,2007-12-29,"Venice, LA",United States,28.958056,-89.258889,,South Pass 38,...,,,1.0,1.0,2.0,,IMC,Approach,Probable Cause,15-07-2009


#### More value-count checks

In [63]:
# Number of records per Investigation.Type value.
df['Investigation.Type'].value_counts()

Investigation.Type
Accident    85015
Incident     3874
Name: count, dtype: int64

In [65]:
# Number of records per Injury.Severity value. Fatal events show up both as
# plain 'Fatal' and as 'Fatal(N)' variants, and each distinct string is
# counted as its own category.
df['Injury.Severity'].value_counts()

Injury.Severity
Non-Fatal     67357
Fatal(1)       6167
Fatal          5262
Fatal(2)       3711
Incident       2219
              ...  
Fatal(270)        1
Fatal(60)         1
Fatal(43)         1
Fatal(143)        1
Fatal(230)        1
Name: count, Length: 109, dtype: int64

In [72]:
# Maybe clean the injury severity column?
# For now, just write the raw Injury.Severity values (together with the row
# index) to a CSV file.
# NOTE(review): the output filename is spelled 'injurity_severity.csv' --
# probably a typo for 'injury'; confirm nothing reads this file before renaming.

injury_severity_column = df['Injury.Severity']

injury_severity_column.to_csv('injurity_severity.csv', index=True)