## Final Project Submission

Please fill out:
* Student name: 
* Student pace: self paced / part time / full time
* Scheduled project review date/time: 
* Instructor name: 
* Blog post URL:


In [1]:
# Your code here - remember to use markdown cells for comments as well!

In [2]:
import pandas as pd
import warnings
import seaborn as sns
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv('data/Aviation_Data.csv')
df.head()

Unnamed: 0,Event.Id,Investigation.Type,Accident.Number,Event.Date,Location,Country,Latitude,Longitude,Airport.Code,Airport.Name,...,Purpose.of.flight,Air.carrier,Total.Fatal.Injuries,Total.Serious.Injuries,Total.Minor.Injuries,Total.Uninjured,Weather.Condition,Broad.phase.of.flight,Report.Status,Publication.Date
0,20001218X45444,Accident,SEA87LA080,1948-10-24,"MOOSE CREEK, ID",United States,,,,,...,Personal,,2.0,0.0,0.0,0.0,UNK,Cruise,Probable Cause,
1,20001218X45447,Accident,LAX94LA336,1962-07-19,"BRIDGEPORT, CA",United States,,,,,...,Personal,,4.0,0.0,0.0,0.0,UNK,Unknown,Probable Cause,19-09-1996
2,20061025X01555,Accident,NYC07LA005,1974-08-30,"Saltville, VA",United States,36.9222,-81.8781,,,...,Personal,,3.0,,,,IMC,Cruise,Probable Cause,26-02-2007
3,20001218X45448,Accident,LAX96LA321,1977-06-19,"EUREKA, CA",United States,,,,,...,Personal,,2.0,0.0,0.0,0.0,IMC,Cruise,Probable Cause,12-09-2000
4,20041105X01764,Accident,CHI79FA064,1979-08-02,"Canton, OH",United States,,,,,...,Personal,,1.0,2.0,,0.0,VMC,Approach,Probable Cause,16-04-1980


In [4]:
df.columns

Index(['Event.Id', 'Investigation.Type', 'Accident.Number', 'Event.Date',
       'Location', 'Country', 'Latitude', 'Longitude', 'Airport.Code',
       'Airport.Name', 'Injury.Severity', 'Aircraft.damage',
       'Aircraft.Category', 'Registration.Number', 'Make', 'Model',
       'Amateur.Built', 'Number.of.Engines', 'Engine.Type', 'FAR.Description',
       'Schedule', 'Purpose.of.flight', 'Air.carrier', 'Total.Fatal.Injuries',
       'Total.Serious.Injuries', 'Total.Minor.Injuries', 'Total.Uninjured',
       'Weather.Condition', 'Broad.phase.of.flight', 'Report.Status',
       'Publication.Date'],
      dtype='object')

In [5]:
# Event.Id #drop, uninformative
# Investigation.Type #drop, uninformative
# Accident.Number 
# Event.Date #split into month and year
# Location #keep to view specifics of US incidents
# Country #keep but change to US and non-US
# Injury.Severity  #drop b/c repeats info from the later injury categories 
# Aircraft.damage #keep
# Registration.Number #drop, uninformative
# Make #keep, maybe use to engineer a new feature using LLM 
# Model #keep, maybe use to engineer a new feature using LLM 
# Amateur.Built #keep: few nan (1.5k) vals, most are not amateur built (80k)
# Number.of.Engines #keep: few NAN values (7.5k), most are have 1 (70k)
# Engine.Type #keep, few unknown (2k) and nan vals (8.5k), most are reciprocating (70k)
# Purpose.of.flight #keep: few unknown (6.8k) and nan (7.6k) vals, most are personal (50k)
# Total.Fatal.Injuries #keep
# Total.Serious.Injuries #keep
# Total.Minor.Injuries #keep
# Total.Uninjured #keep
# Weather.Condition #keep, mostly split between low and high visibility 
# Report.Status #DROP, about 70k are probable cause or nan 
# Publication.Date #DROP, less important than event date; also, skewed towards 2020

In [6]:
#drop columns with more than 20% nans 
nan_cols = list(df.loc[:,df.isna().sum()/len(df) > .2].columns) 

#drop uninformative columns
nan_cols.extend(['Event.Id', 'Investigation.Type', 'Accident.Number', 'Injury.Severity', 'Registration.Number', 'Report.Status', 'Publication.Date'])
nan_cols

['Latitude',
 'Longitude',
 'Airport.Code',
 'Airport.Name',
 'Aircraft.Category',
 'FAR.Description',
 'Schedule',
 'Air.carrier',
 'Broad.phase.of.flight',
 'Event.Id',
 'Investigation.Type',
 'Accident.Number',
 'Injury.Severity',
 'Registration.Number',
 'Report.Status',
 'Publication.Date']

In [7]:
# get a list of columns to keep
non_nan_cols = [x for x in df.columns if x not in nan_cols]
non_nan_cols

['Event.Date',
 'Location',
 'Country',
 'Aircraft.damage',
 'Make',
 'Model',
 'Amateur.Built',
 'Number.of.Engines',
 'Engine.Type',
 'Purpose.of.flight',
 'Total.Fatal.Injuries',
 'Total.Serious.Injuries',
 'Total.Minor.Injuries',
 'Total.Uninjured',
 'Weather.Condition']

In [8]:
# remove columns from df
df = df[non_nan_cols].copy()
df.head()

Unnamed: 0,Event.Date,Location,Country,Aircraft.damage,Make,Model,Amateur.Built,Number.of.Engines,Engine.Type,Purpose.of.flight,Total.Fatal.Injuries,Total.Serious.Injuries,Total.Minor.Injuries,Total.Uninjured,Weather.Condition
0,1948-10-24,"MOOSE CREEK, ID",United States,Destroyed,Stinson,108-3,No,1.0,Reciprocating,Personal,2.0,0.0,0.0,0.0,UNK
1,1962-07-19,"BRIDGEPORT, CA",United States,Destroyed,Piper,PA24-180,No,1.0,Reciprocating,Personal,4.0,0.0,0.0,0.0,UNK
2,1974-08-30,"Saltville, VA",United States,Destroyed,Cessna,172M,No,1.0,Reciprocating,Personal,3.0,,,,IMC
3,1977-06-19,"EUREKA, CA",United States,Destroyed,Rockwell,112,No,1.0,Reciprocating,Personal,2.0,0.0,0.0,0.0,IMC
4,1979-08-02,"Canton, OH",United States,Destroyed,Cessna,501,No,,,Personal,1.0,2.0,,0.0,VMC


## Engineered Features

In [9]:
# Check if any of the injury columns have true values. If any do, fill nans with 0s. 
# If all columns are nans, assume the data was not logged and keep them as nans. 
injury_cols = ['Total.Fatal.Injuries', 'Total.Serious.Injuries', 'Total.Minor.Injuries', 'Total.Uninjured']
injury_data_exists = df[injury_cols].apply(lambda col: any(col), axis = 1)

for col in injury_cols:
    df.loc[injury_data_exists & df[col].isna(), col] = 0

In [10]:
#Make note of dividing by zero situation in pecert.Injured and Percent.Died / Number of Passengers

df['Total.Passengers'] = df['Total.Fatal.Injuries'] + df['Total.Serious.Injuries'] + df['Total.Minor.Injuries']+ df['Total.Uninjured']
df['Total.Injured'] = df['Total.Fatal.Injuries'] + df['Total.Serious.Injuries'] + df['Total.Minor.Injuries']
df['Percent.Injured'] = (df['Total.Injured'] / df['Total.Passengers']) * 100
df['Percent.Died'] = (df['Total.Fatal.Injuries'] / df['Total.Passengers']) * 100
df.loc[df['Total.Passengers'] == 0, ['Percent.Injured','Percent.Died']] = 0

## We need to decide how are we measuring the size of the plane, by "Passenger load" or type of plane, having that one being just "Private" and Commercial, or if you want to just keep both

In [11]:
#Aircraft Capacity
print(df['Total.Passengers'].apply(lambda x: 'Small' if x < 10 else \
                                   ('Mid-Size' if x < 40 else 'Large')).value_counts(dropna=False))
print("----------------------------")
df['Passenger.Load'] = df['Total.Passengers'].apply(lambda x: 'Small' if x < 10 else ('Mid-Size' if x < 40 else 'Large'))
df.head()

Small       86808
Large        2433
Mid-Size     1107
Name: Total.Passengers, dtype: int64
----------------------------


Unnamed: 0,Event.Date,Location,Country,Aircraft.damage,Make,Model,Amateur.Built,Number.of.Engines,Engine.Type,Purpose.of.flight,Total.Fatal.Injuries,Total.Serious.Injuries,Total.Minor.Injuries,Total.Uninjured,Weather.Condition,Total.Passengers,Total.Injured,Percent.Injured,Percent.Died,Passenger.Load
0,1948-10-24,"MOOSE CREEK, ID",United States,Destroyed,Stinson,108-3,No,1.0,Reciprocating,Personal,2.0,0.0,0.0,0.0,UNK,2.0,2.0,100.0,100.0,Small
1,1962-07-19,"BRIDGEPORT, CA",United States,Destroyed,Piper,PA24-180,No,1.0,Reciprocating,Personal,4.0,0.0,0.0,0.0,UNK,4.0,4.0,100.0,100.0,Small
2,1974-08-30,"Saltville, VA",United States,Destroyed,Cessna,172M,No,1.0,Reciprocating,Personal,3.0,0.0,0.0,0.0,IMC,3.0,3.0,100.0,100.0,Small
3,1977-06-19,"EUREKA, CA",United States,Destroyed,Rockwell,112,No,1.0,Reciprocating,Personal,2.0,0.0,0.0,0.0,IMC,2.0,2.0,100.0,100.0,Small
4,1979-08-02,"Canton, OH",United States,Destroyed,Cessna,501,No,,,Personal,1.0,2.0,0.0,0.0,VMC,3.0,3.0,100.0,33.333333,Small


In [12]:
df['Airplane.Type'] = df['Total.Passengers'].apply(lambda x: 'Private' if x < 50 else 'Commercial')

## Cleaning

In [13]:
# Remove rows where Amateur.Built is Yes or NaN, then remove Amateur.Built column
df = df.drop(df.loc[(df['Amateur.Built']=='Yes') |( df['Amateur.Built'].isna())].index)
df.reset_index(drop = True, inplace = True)
df.drop(columns = 'Amateur.Built', inplace = True)
df.head()

Unnamed: 0,Event.Date,Location,Country,Aircraft.damage,Make,Model,Number.of.Engines,Engine.Type,Purpose.of.flight,Total.Fatal.Injuries,Total.Serious.Injuries,Total.Minor.Injuries,Total.Uninjured,Weather.Condition,Total.Passengers,Total.Injured,Percent.Injured,Percent.Died,Passenger.Load,Airplane.Type
0,1948-10-24,"MOOSE CREEK, ID",United States,Destroyed,Stinson,108-3,1.0,Reciprocating,Personal,2.0,0.0,0.0,0.0,UNK,2.0,2.0,100.0,100.0,Small,Private
1,1962-07-19,"BRIDGEPORT, CA",United States,Destroyed,Piper,PA24-180,1.0,Reciprocating,Personal,4.0,0.0,0.0,0.0,UNK,4.0,4.0,100.0,100.0,Small,Private
2,1974-08-30,"Saltville, VA",United States,Destroyed,Cessna,172M,1.0,Reciprocating,Personal,3.0,0.0,0.0,0.0,IMC,3.0,3.0,100.0,100.0,Small,Private
3,1977-06-19,"EUREKA, CA",United States,Destroyed,Rockwell,112,1.0,Reciprocating,Personal,2.0,0.0,0.0,0.0,IMC,2.0,2.0,100.0,100.0,Small,Private
4,1979-08-02,"Canton, OH",United States,Destroyed,Cessna,501,,,Personal,1.0,2.0,0.0,0.0,VMC,3.0,3.0,100.0,33.333333,Small,Private


In [14]:
# Clean Weather Condition
df['Weather.Condition'].replace({'Unk':'Unknown','UNK':'Unknown','VMC':'Visual Meteorological Conditions',\
                                 'IMC':'Instrumental Meteorological Conditions'}, inplace = True)
df['Weather.Condition'].fillna('Unknown', inplace = True)
df['Weather.Condition'].value_counts(dropna=False)

Visual Meteorological Conditions          69084
Instrumental Meteorological Conditions     5831
Unknown                                    5397
Name: Weather.Condition, dtype: int64

In [15]:
# Check if Number.of.Engines = 0 corresponds to no Engine.Type
df[(df['Number.of.Engines']==0) & ((df['Engine.Type']!='NONE') & (df['Engine.Type'].notna())\
                                   & (df['Engine.Type'] != 'Unknown'))]

Unnamed: 0,Event.Date,Location,Country,Aircraft.damage,Make,Model,Number.of.Engines,Engine.Type,Purpose.of.flight,Total.Fatal.Injuries,Total.Serious.Injuries,Total.Minor.Injuries,Total.Uninjured,Weather.Condition,Total.Passengers,Total.Injured,Percent.Injured,Percent.Died,Passenger.Load,Airplane.Type
21149,1989-03-31,"GREAT HARBOR, Bahamas",Bahamas,Destroyed,Cessna,421B,0.0,Reciprocating,Unknown,1.0,0.0,0.0,1.0,Visual Meteorological Conditions,2.0,1.0,50.0,50.0,Small,Private
67210,2013-09-02,"Hampshire, IL",United States,Substantial,SCHWEIZER,SGS 2 33A,0.0,,Instructional,0.0,0.0,0.0,1.0,Visual Meteorological Conditions,1.0,0.0,0.0,0.0,Small,Private
67790,2014-03-15,"Wrightwood, CA",United States,Substantial,DG FLUGZEUGBAU GMBH,DG 1000S,0.0,,Personal,0.0,1.0,0.0,1.0,Visual Meteorological Conditions,2.0,1.0,50.0,0.0,Small,Private
68251,2014-07-04,"Nephi, UT",United States,Substantial,CENTRAIR,101A,0.0,,Personal,0.0,1.0,0.0,0.0,Visual Meteorological Conditions,1.0,1.0,100.0,0.0,Small,Private
69487,2015-06-14,"Pell City, AL",United States,Substantial,WEATHERLY,201B,0.0,Reciprocating,Aerial Application,0.0,0.0,0.0,1.0,Visual Meteorological Conditions,1.0,0.0,0.0,0.0,Small,Private
72160,2017-04-22,"Park City, UT",United States,Substantial,BALONY KUBICEK SPOL SRO,BB70Z,0.0,,Business,0.0,0.0,0.0,9.0,Visual Meteorological Conditions,9.0,0.0,0.0,0.0,Small,Private
72596,2017-07-21,"Deatsville, AL",United States,Substantial,LET,L23,0.0,,Instructional,0.0,1.0,0.0,0.0,Visual Meteorological Conditions,1.0,1.0,100.0,0.0,Small,Private
73491,2018-03-28,"Cave Creek, AZ",United States,Destroyed,ULTRAMAGIC SA,N 300,0.0,,Business,0.0,0.0,0.0,13.0,Visual Meteorological Conditions,13.0,0.0,0.0,0.0,Mid-Size,Private
73682,2018-05-13,"Tunbridge, VT",United States,Substantial,SCHWEIZER,SGS123,0.0,,Personal,0.0,0.0,0.0,1.0,Visual Meteorological Conditions,1.0,0.0,0.0,0.0,Small,Private
77888,2021-05-16,"Hampshire, IL",United States,Substantial,I.C.A.-BRASOV (ROMANIA),IS-28B2,0.0,,Instructional,0.0,1.0,1.0,0.0,Visual Meteorological Conditions,2.0,2.0,100.0,0.0,Small,Private


In [16]:
# Replace Number.of.Engines = 0 with Number.of.Engines = Unknown where Engine.Type exists.
#Drop Unknown or null values for number of Engines
replace_indx = list(df[(df['Number.of.Engines']==0) 
                    & ((df['Engine.Type']!='NONE') 
                       & (df['Engine.Type'].notna()) 
                       & (df['Engine.Type'] != 'Unknown'))].index)

df.loc[replace_indx, 'Number.of.Engines'] = 'Unknown'
df.dropna(subset=['Number.of.Engines'],axis=0,inplace=True)
df = df[df['Number.of.Engines'] != 'Unknown']

In [17]:
# Clean Engine Type
df['Engine.Type'] = df.apply(lambda row: 'None' if row['Number.of.Engines']==0 else row['Engine.Type'], axis = 1)
df['Engine.Type'].fillna('UNK', inplace = True)
df['Engine.Type'].replace({'UNK':'Unknown'}, inplace = True)
df['Engine.Type'].value_counts(dropna=False)

Reciprocating      61261
Turbo Shaft         3501
Turbo Prop          3249
Turbo Fan           2369
Unknown             2353
None                1159
Turbo Jet            682
Geared Turbofan       11
Electric               8
LR                     2
Hybrid Rocket          1
Name: Engine.Type, dtype: int64

In [18]:
# Aircraft Damage
df['Aircraft.damage'].fillna('Unknown', inplace = True)
df['Aircraft.damage'].value_counts(dropna = False)

Substantial    55477
Destroyed      14888
Minor           2175
Unknown         2056
Name: Aircraft.damage, dtype: int64

In [19]:
#Transform the Event Date into three separate Date columns that specify day, month and year of the incident.
#Dropping 'Event.Date' once the columns are created:

df['Event.Date'] = pd.to_datetime(df['Event.Date'])
df['Event.Day'] = df['Event.Date'].map(lambda x: x.day)
df['Event.Month'] = df['Event.Date'].map(lambda x: x.month)
df['Event.Year'] = df['Event.Date'].map(lambda x: x.year)
df.drop(['Event.Date'], axis=1, inplace=True)

In [20]:
#Grouping all the Countries outside of the US as "Foreign Country"
df.loc[df['Country'] != 'United States', 'Country'] = 'Foreign Country'

In [21]:
#Filling the Null Values with "UNKNOWN" the the Location Column
df['Location'] = df['Location'].fillna('UNKNOWN')

# Adjusting the Column so it shows code of the corresponding State, if it's Unknown or outside of the US, label accordingly
df['Location'] = df['Location'].apply(lambda location: location.split(', ')[-1] if \
                                      len(location.split(', ')) > 1 and len(location.split(', ')[-1]) == 2 else \
                                      'Unknown/Foreign Location')

In [22]:
#Cleaning the 'Make' Column
characters_to_remove = ['(', ')', ',', '.', '%', '?']
df['Make'] = df['Make'].str.title().fillna('UNKNOWN')
df['Make'] = df['Make'].map(lambda x: ''.join(char for char in x if char not in characters_to_remove))
top_50_makes = df['Make'].value_counts().index[:50]
df = df[df['Make'].isin(top_50_makes)]

In [23]:
#Replacing NaN values for 'Unknown in 'Purpose of flight':
df['Purpose.of.flight'].fillna('Unknown', inplace=True)

In [24]:
#Cleaning the "Model" column

df['Model'] = df['Model'].str.title().fillna('UNKNOWN')

## Visualize KPIs

## Plot features

In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 66240 entries, 0 to 80300
Data columns (total 22 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Location                66240 non-null  object 
 1   Country                 66240 non-null  object 
 2   Aircraft.damage         66240 non-null  object 
 3   Make                    66240 non-null  object 
 4   Model                   66240 non-null  object 
 5   Number.of.Engines       66240 non-null  object 
 6   Engine.Type             66240 non-null  object 
 7   Purpose.of.flight       66240 non-null  object 
 8   Total.Fatal.Injuries    66240 non-null  float64
 9   Total.Serious.Injuries  66240 non-null  float64
 10  Total.Minor.Injuries    66240 non-null  float64
 11  Total.Uninjured         66240 non-null  float64
 12  Weather.Condition       66240 non-null  object 
 13  Total.Passengers        66240 non-null  float64
 14  Total.Injured           66240 non-null

In [26]:
popular_makes = list(df['Make'].value_counts()[:50].index)

make_percentages = pd.DataFrame(columns = ['Destroyed_Percent', 'Substantial_Percent', 'Minor_Percent', 'Num_Incidents'])

for make in popular_makes:
    make_info = df[df['Make'] == make]['Aircraft.damage'].value_counts(normalize=True) * 100

    if 'Minor' not in make_info.index:
        make_info['Minor'] = 0

    num_incidents = df['Make'].value_counts()[make]

    make_percentages.loc[make] =[make_info['Destroyed'], make_info['Substantial'], make_info['Minor'], num_incidents]

make_percentages

Unnamed: 0,Destroyed_Percent,Substantial_Percent,Minor_Percent,Num_Incidents
Cessna,18.404015,79.542581,1.310194,26103.0
Piper,22.488505,75.560819,1.302773,14354.0
Beech,28.714623,67.118711,3.007075,5088.0
Bell,24.337748,71.978477,1.779801,2416.0
Boeing,5.131129,33.124287,25.940707,1754.0
Mooney,27.434312,70.092736,1.777434,1294.0
Grumman,25.500435,72.236728,1.479547,1149.0
Bellanca,22.363281,76.660156,0.683594,1024.0
Robinson,18.452381,80.654762,0.297619,1008.0
Hughes,19.570136,77.941176,1.131222,884.0


In [32]:
df.groupby(['Make','Aircraft.damage'])['Percent.Injured'].mean().xs("Destroyed", level='Aircraft.damage').sort_values()[:10]

Make
Helio                64.015152
Grumman              64.732651
Hiller               66.949153
Ayres                68.253968
Grumman-Schweizer    68.750000
Enstrom              72.093023
Schweizer            73.809524
Mcdonnell Douglas    75.233909
Air Tractor          75.883838
Hughes               80.385356
Name: Percent.Injured, dtype: float64

In [34]:
df.groupby(['Make','Aircraft.damage'])['Percent.Injured'].mean().xs("Substantial", level='Aircraft.damage').sort_values()[:10]

Make
Airbus Industrie      1.251576
Swearingen           10.257713
Helio                10.283688
Embraer              14.666266
Airbus               14.955280
Mitsubishi           15.534826
Boeing               16.157806
Maule                17.330097
Douglas              17.965427
Mcdonnell Douglas    17.994197
Name: Percent.Injured, dtype: float64

In [40]:
df.groupby(['Make','Aircraft.damage'])['Percent.Died'].mean().xs("Minor", level='Aircraft.damage').sort_values()[:10]

Make
Aero Commander                 0.0
Stinson                        0.0
Sikorsky                       0.0
Schweizer                      0.0
Rockwell                       0.0
Robinson Helicopter Company    0.0
Robinson Helicopter            0.0
Robinson                       0.0
Lake                           0.0
Helio                          0.0
Name: Percent.Died, dtype: float64

In [36]:
df.columns

Index(['Location', 'Country', 'Aircraft.damage', 'Make', 'Model',
       'Number.of.Engines', 'Engine.Type', 'Purpose.of.flight',
       'Total.Fatal.Injuries', 'Total.Serious.Injuries',
       'Total.Minor.Injuries', 'Total.Uninjured', 'Weather.Condition',
       'Total.Passengers', 'Total.Injured', 'Percent.Injured', 'Percent.Died',
       'Passenger.Load', 'Airplane.Type', 'Event.Day', 'Event.Month',
       'Event.Year'],
      dtype='object')

In [29]:
df['Airplane.Type'] = df['Total.Passengers'].apply(lambda x: 'Private' if x < 50 else 'Commercial')

In [48]:
#Showing only accidents were the damage left the plane Destroyed, then grouping Creating index of Commercial/Private and then dividing per makes.
# Finally showing the mean of people that died.

df.loc[(df['Aircraft.damage'] == 'Destroyed') & (df['Airplane.Type'] == 'Commercial')].groupby(['Make','Model'])['Percent.Died','Percent.Injured'].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Percent.Died,Percent.Injured
Make,Model,Unnamed: 2_level_1,Unnamed: 3_level_1
Airbus,A310,99.346405,100.0
Airbus,A310-304,0.0,17.333333
Airbus,A320,100.0,100.0
Airbus,A321,100.0,100.0
Airbus,A330,100.0,100.0
Airbus Industrie,A300B4-605R,100.0,100.0
Airbus Industrie,A320,90.625,100.0
Airbus Industrie,A320-211,100.0,100.0
Beech,95-A55,8.139535,100.0
Beech,A100,0.628931,1.257862


In [41]:
df.loc[df['Aircraft.damage'] == 'Substantial'].groupby(['Airplane.Type','Make'])['Percent.Died'].mean()

Airplane.Type  Make                       
Commercial     Aero Commander                  0.000000
               Aerospatiale                    0.000000
               Airbus                          0.000000
               Airbus Industrie                0.000000
               Boeing                          0.443730
               Cessna                          0.000000
               De Havilland                    0.000000
               Douglas                         0.095785
               Embraer                         0.000000
               Mcdonnell Douglas               0.049076
Private        Aero Commander                  6.289911
               Aeronca                         4.127580
               Aerospatiale                    5.281636
               Air Tractor                     6.138614
               Air Tractor Inc                11.229947
               Airbus                          4.761905
               Airbus Industrie                0.000000
     

Notes:

I think the Make percentage column should be used once we narrowed down to less airplanes.
How are we measuring the capacity of the plane


In [50]:
df.to_csv('Airplanes_clean.csv')