## Final Project Submission

Please fill out:
* Student name: 
* Student pace: self paced / part time / full time
* Scheduled project review date/time: 
* Instructor name: 
* Blog post URL:


In [1]:
# Your code here - remember to use markdown cells for comments as well!

In [34]:
import pandas as pd
import warnings
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv('data/Aviation_Data.csv')
df.head()

Unnamed: 0,Event.Id,Investigation.Type,Accident.Number,Event.Date,Location,Country,Latitude,Longitude,Airport.Code,Airport.Name,...,Purpose.of.flight,Air.carrier,Total.Fatal.Injuries,Total.Serious.Injuries,Total.Minor.Injuries,Total.Uninjured,Weather.Condition,Broad.phase.of.flight,Report.Status,Publication.Date
0,20001218X45444,Accident,SEA87LA080,1948-10-24,"MOOSE CREEK, ID",United States,,,,,...,Personal,,2.0,0.0,0.0,0.0,UNK,Cruise,Probable Cause,
1,20001218X45447,Accident,LAX94LA336,1962-07-19,"BRIDGEPORT, CA",United States,,,,,...,Personal,,4.0,0.0,0.0,0.0,UNK,Unknown,Probable Cause,19-09-1996
2,20061025X01555,Accident,NYC07LA005,1974-08-30,"Saltville, VA",United States,36.9222,-81.8781,,,...,Personal,,3.0,,,,IMC,Cruise,Probable Cause,26-02-2007
3,20001218X45448,Accident,LAX96LA321,1977-06-19,"EUREKA, CA",United States,,,,,...,Personal,,2.0,0.0,0.0,0.0,IMC,Cruise,Probable Cause,12-09-2000
4,20041105X01764,Accident,CHI79FA064,1979-08-02,"Canton, OH",United States,,,,,...,Personal,,1.0,2.0,,0.0,VMC,Approach,Probable Cause,16-04-1980


In [4]:
df.columns

Index(['Event.Id', 'Investigation.Type', 'Accident.Number', 'Event.Date',
       'Location', 'Country', 'Latitude', 'Longitude', 'Airport.Code',
       'Airport.Name', 'Injury.Severity', 'Aircraft.damage',
       'Aircraft.Category', 'Registration.Number', 'Make', 'Model',
       'Amateur.Built', 'Number.of.Engines', 'Engine.Type', 'FAR.Description',
       'Schedule', 'Purpose.of.flight', 'Air.carrier', 'Total.Fatal.Injuries',
       'Total.Serious.Injuries', 'Total.Minor.Injuries', 'Total.Uninjured',
       'Weather.Condition', 'Broad.phase.of.flight', 'Report.Status',
       'Publication.Date'],
      dtype='object')

In [5]:
# Event.Id #drop, uninformative
# Investigation.Type #drop, uninformative
# Accident.Number 
# Event.Date #split into month and year
# Location #keep to view specifics of US incidents
# Country #keep but change to US and non-US
# Injury.Severity  #drop b/c repeats info from the later injury categories 
# Aircraft.damage #keep
# Registration.Number #drop, uninformative
# Make #keep, maybe use to engineer a new feature using LLM 
# Model #keep, maybe use to engineer a new feature using LLM 
# Amateur.Built #keep: few nan (1.5k) vals, most are not amateur built (80k)
# Number.of.Engines #keep: few NAN values (7.5k), most are have 1 (70k)
# Engine.Type #keep, few unknown (2k) and nan vals (8.5k), most are reciprocating (70k)
# Purpose.of.flight #keep: few unknown (6.8k) and nan (7.6k) vals, most are personal (50k)
# Total.Fatal.Injuries #keep
# Total.Serious.Injuries #keep
# Total.Minor.Injuries #keep
# Total.Uninjured #keep
# Weather.Condition #keep, mostly split between low and high visibility 
# Report.Status #DROP, about 70k are probable cause or nan 
# Publication.Date #DROP, less important than event date; also, skewed towards 2020

In [6]:
#drop columns with more than 20% nans 
nan_cols = list(df.loc[:,df.isna().sum()/len(df) > .2].columns) 

#drop uninformative columns
nan_cols.extend(['Event.Id', 'Investigation.Type', 'Accident.Number', 'Injury.Severity', 'Registration.Number', 'Report.Status', 'Publication.Date'])
nan_cols

['Latitude',
 'Longitude',
 'Airport.Code',
 'Airport.Name',
 'Aircraft.Category',
 'FAR.Description',
 'Schedule',
 'Air.carrier',
 'Broad.phase.of.flight',
 'Event.Id',
 'Investigation.Type',
 'Accident.Number',
 'Injury.Severity',
 'Registration.Number',
 'Report.Status',
 'Publication.Date']

In [7]:
# get a list of columns to keep
non_nan_cols = [x for x in df.columns if x not in nan_cols]
non_nan_cols

['Event.Date',
 'Location',
 'Country',
 'Aircraft.damage',
 'Make',
 'Model',
 'Amateur.Built',
 'Number.of.Engines',
 'Engine.Type',
 'Purpose.of.flight',
 'Total.Fatal.Injuries',
 'Total.Serious.Injuries',
 'Total.Minor.Injuries',
 'Total.Uninjured',
 'Weather.Condition']

In [8]:
# remove columns from df
df = df[non_nan_cols].copy()
df.head()

Unnamed: 0,Event.Date,Location,Country,Aircraft.damage,Make,Model,Amateur.Built,Number.of.Engines,Engine.Type,Purpose.of.flight,Total.Fatal.Injuries,Total.Serious.Injuries,Total.Minor.Injuries,Total.Uninjured,Weather.Condition
0,1948-10-24,"MOOSE CREEK, ID",United States,Destroyed,Stinson,108-3,No,1.0,Reciprocating,Personal,2.0,0.0,0.0,0.0,UNK
1,1962-07-19,"BRIDGEPORT, CA",United States,Destroyed,Piper,PA24-180,No,1.0,Reciprocating,Personal,4.0,0.0,0.0,0.0,UNK
2,1974-08-30,"Saltville, VA",United States,Destroyed,Cessna,172M,No,1.0,Reciprocating,Personal,3.0,,,,IMC
3,1977-06-19,"EUREKA, CA",United States,Destroyed,Rockwell,112,No,1.0,Reciprocating,Personal,2.0,0.0,0.0,0.0,IMC
4,1979-08-02,"Canton, OH",United States,Destroyed,Cessna,501,No,,,Personal,1.0,2.0,,0.0,VMC


## Engineered Features

In [9]:
# Check if any of the injury columns have true values. If any do, fill nans with 0s. 
# If all columns are nans, assume the data was not logged and keep them as nans. 
injury_cols = ['Total.Fatal.Injuries', 'Total.Serious.Injuries', 'Total.Minor.Injuries', 'Total.Uninjured']
injury_data_exists = df[injury_cols].apply(lambda col: any(col), axis = 1)

for col in injury_cols:
    df.loc[injury_data_exists & df[col].isna(), col] = 0

In [10]:
#Make note of dividing by zero situation in pecert.Injured and Percent.Died / Number of Passengers

df['Total.Passengers'] = df['Total.Fatal.Injuries'] + df['Total.Serious.Injuries'] + df['Total.Minor.Injuries']+ df['Total.Uninjured']
df['Total.Injured'] = df['Total.Fatal.Injuries'] + df['Total.Serious.Injuries'] + df['Total.Minor.Injuries']
df['Percent.Injured'] = (df['Total.Injured'] / df['Total.Passengers']) * 100
df['Percent.Uninjured'] = (df['Total.Uninjured'] / df['Total.Passengers']) * 100
df['Percent.Died'] = (df['Total.Fatal.Injuries'] / df['Total.Passengers']) * 100
df.loc[df['Total.Passengers'] == 0, ['Percent.Injured','Percent.Died']] = 0

## We need to decide how are we measuring the size of the plane, by "Passenger load" or type of plane, having that one being just "Private" and Commercial, or if you want to just keep both

In [11]:
#Aircraft Capacity
print(df['Total.Passengers'].apply(lambda x: 'Small' if x < 10 else \
                                   ('Mid-Size' if x < 40 else 'Large')).value_counts(dropna=False))
print("----------------------------")
df['Passenger.Load'] = df['Total.Passengers'].apply(lambda x: 'Small' if x < 10 else ('Mid-Size' if x < 40 else 'Large'))
df.head()

Small       86808
Large        2433
Mid-Size     1107
Name: Total.Passengers, dtype: int64
----------------------------


Unnamed: 0,Event.Date,Location,Country,Aircraft.damage,Make,Model,Amateur.Built,Number.of.Engines,Engine.Type,Purpose.of.flight,...,Total.Serious.Injuries,Total.Minor.Injuries,Total.Uninjured,Weather.Condition,Total.Passengers,Total.Injured,Percent.Injured,Percent.Uninjured,Percent.Died,Passenger.Load
0,1948-10-24,"MOOSE CREEK, ID",United States,Destroyed,Stinson,108-3,No,1.0,Reciprocating,Personal,...,0.0,0.0,0.0,UNK,2.0,2.0,100.0,0.0,100.0,Small
1,1962-07-19,"BRIDGEPORT, CA",United States,Destroyed,Piper,PA24-180,No,1.0,Reciprocating,Personal,...,0.0,0.0,0.0,UNK,4.0,4.0,100.0,0.0,100.0,Small
2,1974-08-30,"Saltville, VA",United States,Destroyed,Cessna,172M,No,1.0,Reciprocating,Personal,...,0.0,0.0,0.0,IMC,3.0,3.0,100.0,0.0,100.0,Small
3,1977-06-19,"EUREKA, CA",United States,Destroyed,Rockwell,112,No,1.0,Reciprocating,Personal,...,0.0,0.0,0.0,IMC,2.0,2.0,100.0,0.0,100.0,Small
4,1979-08-02,"Canton, OH",United States,Destroyed,Cessna,501,No,,,Personal,...,2.0,0.0,0.0,VMC,3.0,3.0,100.0,0.0,33.333333,Small


In [12]:
df['Airplane.Type'] = df['Total.Passengers'].apply(lambda x: 'Private' if x < 50 else 'Commercial')

## Cleaning

In [13]:
# Remove rows where Amateur.Built is Yes or NaN, then remove Amateur.Built column
df = df.drop(df.loc[(df['Amateur.Built']=='Yes') |( df['Amateur.Built'].isna())].index)
df.reset_index(drop = True, inplace = True)
df.drop(columns = 'Amateur.Built', inplace = True)
df.head()

Unnamed: 0,Event.Date,Location,Country,Aircraft.damage,Make,Model,Number.of.Engines,Engine.Type,Purpose.of.flight,Total.Fatal.Injuries,...,Total.Minor.Injuries,Total.Uninjured,Weather.Condition,Total.Passengers,Total.Injured,Percent.Injured,Percent.Uninjured,Percent.Died,Passenger.Load,Airplane.Type
0,1948-10-24,"MOOSE CREEK, ID",United States,Destroyed,Stinson,108-3,1.0,Reciprocating,Personal,2.0,...,0.0,0.0,UNK,2.0,2.0,100.0,0.0,100.0,Small,Private
1,1962-07-19,"BRIDGEPORT, CA",United States,Destroyed,Piper,PA24-180,1.0,Reciprocating,Personal,4.0,...,0.0,0.0,UNK,4.0,4.0,100.0,0.0,100.0,Small,Private
2,1974-08-30,"Saltville, VA",United States,Destroyed,Cessna,172M,1.0,Reciprocating,Personal,3.0,...,0.0,0.0,IMC,3.0,3.0,100.0,0.0,100.0,Small,Private
3,1977-06-19,"EUREKA, CA",United States,Destroyed,Rockwell,112,1.0,Reciprocating,Personal,2.0,...,0.0,0.0,IMC,2.0,2.0,100.0,0.0,100.0,Small,Private
4,1979-08-02,"Canton, OH",United States,Destroyed,Cessna,501,,,Personal,1.0,...,0.0,0.0,VMC,3.0,3.0,100.0,0.0,33.333333,Small,Private


In [14]:
# Clean Weather Condition
df['Weather.Condition'].replace({'Unk':'Unknown','UNK':'Unknown','VMC':'Visual Meteorological Conditions',\
                                 'IMC':'Instrumental Meteorological Conditions'}, inplace = True)
df['Weather.Condition'].fillna('Unknown', inplace = True)
df['Weather.Condition'].value_counts(dropna=False)

Visual Meteorological Conditions          69084
Instrumental Meteorological Conditions     5831
Unknown                                    5397
Name: Weather.Condition, dtype: int64

In [15]:
# Check if Number.of.Engines = 0 corresponds to no Engine.Type
df[(df['Number.of.Engines']==0) & ((df['Engine.Type']!='NONE') & (df['Engine.Type'].notna())\
                                   & (df['Engine.Type'] != 'Unknown'))]

Unnamed: 0,Event.Date,Location,Country,Aircraft.damage,Make,Model,Number.of.Engines,Engine.Type,Purpose.of.flight,Total.Fatal.Injuries,...,Total.Minor.Injuries,Total.Uninjured,Weather.Condition,Total.Passengers,Total.Injured,Percent.Injured,Percent.Uninjured,Percent.Died,Passenger.Load,Airplane.Type
21149,1989-03-31,"GREAT HARBOR, Bahamas",Bahamas,Destroyed,Cessna,421B,0.0,Reciprocating,Unknown,1.0,...,0.0,1.0,Visual Meteorological Conditions,2.0,1.0,50.0,50.0,50.0,Small,Private
67210,2013-09-02,"Hampshire, IL",United States,Substantial,SCHWEIZER,SGS 2 33A,0.0,,Instructional,0.0,...,0.0,1.0,Visual Meteorological Conditions,1.0,0.0,0.0,100.0,0.0,Small,Private
67790,2014-03-15,"Wrightwood, CA",United States,Substantial,DG FLUGZEUGBAU GMBH,DG 1000S,0.0,,Personal,0.0,...,0.0,1.0,Visual Meteorological Conditions,2.0,1.0,50.0,50.0,0.0,Small,Private
68251,2014-07-04,"Nephi, UT",United States,Substantial,CENTRAIR,101A,0.0,,Personal,0.0,...,0.0,0.0,Visual Meteorological Conditions,1.0,1.0,100.0,0.0,0.0,Small,Private
69487,2015-06-14,"Pell City, AL",United States,Substantial,WEATHERLY,201B,0.0,Reciprocating,Aerial Application,0.0,...,0.0,1.0,Visual Meteorological Conditions,1.0,0.0,0.0,100.0,0.0,Small,Private
72160,2017-04-22,"Park City, UT",United States,Substantial,BALONY KUBICEK SPOL SRO,BB70Z,0.0,,Business,0.0,...,0.0,9.0,Visual Meteorological Conditions,9.0,0.0,0.0,100.0,0.0,Small,Private
72596,2017-07-21,"Deatsville, AL",United States,Substantial,LET,L23,0.0,,Instructional,0.0,...,0.0,0.0,Visual Meteorological Conditions,1.0,1.0,100.0,0.0,0.0,Small,Private
73491,2018-03-28,"Cave Creek, AZ",United States,Destroyed,ULTRAMAGIC SA,N 300,0.0,,Business,0.0,...,0.0,13.0,Visual Meteorological Conditions,13.0,0.0,0.0,100.0,0.0,Mid-Size,Private
73682,2018-05-13,"Tunbridge, VT",United States,Substantial,SCHWEIZER,SGS123,0.0,,Personal,0.0,...,0.0,1.0,Visual Meteorological Conditions,1.0,0.0,0.0,100.0,0.0,Small,Private
77888,2021-05-16,"Hampshire, IL",United States,Substantial,I.C.A.-BRASOV (ROMANIA),IS-28B2,0.0,,Instructional,0.0,...,1.0,0.0,Visual Meteorological Conditions,2.0,2.0,100.0,0.0,0.0,Small,Private


In [16]:
# Replace Number.of.Engines = 0 with Number.of.Engines = Unknown where Engine.Type exists.
#Drop Unknown or null values for number of Engines
replace_indx = list(df[(df['Number.of.Engines']==0) 
                    & ((df['Engine.Type']!='NONE') 
                       & (df['Engine.Type'].notna()) 
                       & (df['Engine.Type'] != 'Unknown'))].index)

df.loc[replace_indx, 'Number.of.Engines'] = 'Unknown'
df.dropna(subset=['Number.of.Engines'],axis=0,inplace=True)
df = df[df['Number.of.Engines'] != 'Unknown']

In [17]:
# Clean Engine Type
df['Engine.Type'] = df.apply(lambda row: 'None' if row['Number.of.Engines']==0 else row['Engine.Type'], axis = 1)
df['Engine.Type'].fillna('UNK', inplace = True)
df['Engine.Type'].replace({'UNK':'Unknown'}, inplace = True)
df['Engine.Type'].value_counts(dropna=False)

Reciprocating      61261
Turbo Shaft         3501
Turbo Prop          3249
Turbo Fan           2369
Unknown             2353
None                1159
Turbo Jet            682
Geared Turbofan       11
Electric               8
LR                     2
Hybrid Rocket          1
Name: Engine.Type, dtype: int64

In [18]:
# Aircraft Damage
df['Aircraft.damage'].fillna('Unknown', inplace = True)
df['Aircraft.damage'].value_counts(dropna = False)

Substantial    55477
Destroyed      14888
Minor           2175
Unknown         2056
Name: Aircraft.damage, dtype: int64

In [19]:
#Transform the Event Date into three separate Date columns that specify day, month and year of the incident.
#Dropping 'Event.Date' once the columns are created:

df['Event.Date'] = pd.to_datetime(df['Event.Date'])
df['Event.Day'] = df['Event.Date'].map(lambda x: x.day)
df['Event.Month'] = df['Event.Date'].map(lambda x: x.month)
df['Event.Year'] = df['Event.Date'].map(lambda x: x.year)
df.drop(['Event.Date'], axis=1, inplace=True)

In [20]:
#Grouping all the Countries outside of the US as "Foreign Country"
df.loc[df['Country'] != 'United States', 'Country'] = 'Foreign Country'

In [21]:
#Filling the Null Values with "UNKNOWN" the the Location Column
df['Location'] = df['Location'].fillna('UNKNOWN')

# Adjusting the Column so it shows code of the corresponding State, if it's Unknown or outside of the US, label accordingly
df['Location'] = df['Location'].apply(lambda location: location.split(', ')[-1] if \
                                      len(location.split(', ')) > 1 and len(location.split(', ')[-1]) == 2 else \
                                      'Unknown/Foreign Location')

In [22]:
#Cleaning the 'Make' Column
characters_to_remove = ['(', ')', ',', '.', '%', '?','-']
df['Make'] = df['Make'].str.title().fillna('UNKNOWN')
df['Make'] = df['Make'].map(lambda x: ''.join(char for char in x if char not in characters_to_remove))
names_var = {'Boeing': 'Boeing', 'Airbus':'Airbus','Douglas':'Mcdonnel Douglas', \
             'Air Tractor':'Air Tractor','Embraer':'Embraer','Bombardier':'Bombardier'}
for key, value in names_var.items():
    df.loc[df['Make'].str.contains(key), 'Make'] = value
top_50_makes = df['Make'].value_counts().index[:50]
df = df[df['Make'].isin(top_50_makes)]

In [23]:
#Replacing NaN values for 'Unknown in 'Purpose of flight':
df['Purpose.of.flight'].fillna('Unknown', inplace=True)

In [24]:
#Cleaning the "Model" column

df['Model'] = df['Model'].str.title().fillna('UNKNOWN')

In [25]:
#Cleaning the "Make" column

df['Make'].unique()

array(['Stinson', 'Piper', 'Cessna', 'Rockwell', 'Mcdonnel Douglas',
       'North American', 'Beech', 'Bellanca', 'Enstrom', 'Bell',
       'Grumman', 'Maule', 'Air Tractor', 'Aerospatiale', 'Mooney',
       'Boeing', 'Lockheed', 'Embraer', 'Hughes', 'Swearingen',
       'De Havilland', 'Sikorsky', 'Aeronca', 'Mitsubishi', 'Taylorcraft',
       'Robinson', 'Schweizer', 'Hiller', 'Helio', 'Luscombe', 'Champion',
       'Aero Commander', 'Ryan', 'Ayres', 'Fairchild', 'Lake', 'Learjet',
       'Grumman American', 'Waco', 'Ercoupe Eng & Research Corp',
       'GrummanSchweizer', 'Airbus', 'Socata', 'Cirrus Design Corp',
       'Aviat', 'Eurocopter', 'Bombardier', 'Cirrus',
       'Robinson Helicopter Company', 'Robinson Helicopter'], dtype=object)

## Visualize KPIs

## Plot features

In [26]:
df_priv = df[df['Airplane.Type'] == 'Private'].reset_index(drop = True)
df_comm = df[df['Airplane.Type'] == 'Commercial'].reset_index(drop = True)

In [27]:
pop_makes_priv = df_priv['Make'].value_counts()
pop_makes_priv = list(pop_makes_priv[pop_makes_priv > 100].index)

pop_makes_comm = df_comm['Make'].value_counts()
pop_makes_comm = list(pop_makes_comm[pop_makes_comm > 100].index)
pop_makes_comm

['Boeing', 'Mcdonnel Douglas', 'Airbus']

In [28]:
## Damage Index for Private Planes

In [29]:
pop_means_priv_df = df_priv.groupby('Make').mean(numeric_only=True)

di_df_priv = pd.DataFrame(columns = ['Damage_Index', 'Num_Incidents', 'Uninjured_P', 'Injured_P', 'Died_P'], )

for make in pop_makes_priv:
    make_info = df_priv[df_priv['Make'] == make]['Aircraft.damage'].value_counts(normalize=True)

    if 'Minor' not in make_info.index:
        make_info['Minor'] = 0

    damage_index = (make_info['Destroyed'] * 4) + (make_info['Substantial'] * 2) + (make_info['Minor'] * 1)
    
    num_incidents = df_priv['Make'].value_counts()[make]
    
  

In [30]:
pop_means_priv_df.loc['Boeing'].reindex(['Percent.Uninjured', 'Percent.Injured', 'Percent.Died'])

Percent.Uninjured    75.015416
Percent.Injured      20.805772
Percent.Died          6.407597
Name: Boeing, dtype: float64

In [31]:
df_priv.columns

Index(['Location', 'Country', 'Aircraft.damage', 'Make', 'Model',
       'Number.of.Engines', 'Engine.Type', 'Purpose.of.flight',
       'Total.Fatal.Injuries', 'Total.Serious.Injuries',
       'Total.Minor.Injuries', 'Total.Uninjured', 'Weather.Condition',
       'Total.Passengers', 'Total.Injured', 'Percent.Injured',
       'Percent.Uninjured', 'Percent.Died', 'Passenger.Load', 'Airplane.Type',
       'Event.Day', 'Event.Month', 'Event.Year'],
      dtype='object')

In [32]:
pop_means_priv_df = df_priv.groupby('Make').mean(numeric_only=True)

di_df_priv = pd.DataFrame(columns = ['Damage_Index', 'Num_Incidents', 'Uninjured_P', 'Injured_P', 'Died_P'], )

for make in pop_makes_priv:
    make_info = df_priv[df_priv['Make'] == make]['Aircraft.damage'].value_counts(normalize=True)

    if 'Minor' not in make_info.index:
        make_info['Minor'] = 0

    damage_index = (make_info['Destroyed'] * 4) + (make_info['Substantial'] * 2) + (make_info['Minor'] * 1)
    
    num_incidents = df_priv['Make'].value_counts()[make]
    
    make_stats = list(pop_means_priv_df.loc[make].reindex(['Percent.Uninjured', 'Percent.Injured', 'Percent.Died']))
    
    di_df_priv.loc[make] =[damage_index, num_incidents] + make_stats

di_df_priv.sort_values(by='Damage_Index')

Unnamed: 0,Damage_Index,Num_Incidents,Uninjured_P,Injured_P,Died_P
Airbus,0.940171,117.0,61.674464,18.016278,7.336182
Embraer,1.54,150.0,75.3906,23.296899,14.297424
Boeing,1.575972,849.0,75.015416,20.805772,6.407597
Fairchild,1.978571,140.0,69.202705,30.577314,10.779937
Swearingen,1.99375,160.0,79.431314,20.568686,11.395833
De Havilland,2.099217,383.0,68.173331,31.743571,14.931907
Robinson Helicopter,2.105263,228.0,58.662281,41.337719,13.925439
Mcdonnel Douglas,2.106095,443.0,65.358702,33.468342,14.102879
Waco,2.107143,140.0,72.619048,27.380952,4.047619
Robinson Helicopter Company,2.13089,191.0,59.035088,40.750436,12.958115


In [35]:
threshold = 1000
penalty_rate = 0.5
di_df_priv['Penalty'] = di_df_priv['Num_Incidents'].apply(lambda x: 1 + penalty_rate * np.log(threshold/x) if x < threshold else 1)
di_df_priv['Adj_Damage_Index'] = di_df_priv['Damage_Index'] * di_df_priv['Penalty'] 
di_df_priv.sort_values(by = 'Adj_Damage_Index')

Unnamed: 0,Damage_Index,Num_Incidents,Uninjured_P,Injured_P,Died_P,Penalty,Adj_Damage_Index
Boeing,1.575972,849.0,75.015416,20.805772,6.407597,1.081848,1.704962
Airbus,0.940171,117.0,61.674464,18.016278,7.336182,2.072791,1.948778
Cessna,2.340602,26092.0,64.340746,35.616887,14.190254,1.0,2.340602
Robinson,2.354167,1008.0,57.860226,41.972553,18.715278,1.0,2.354167
Piper,2.424181,14350.0,58.79808,41.167466,18.578451,1.0,2.424181
Bell,2.430877,2416.0,52.809656,46.995019,14.626752,1.0,2.430877
Bellanca,2.43457,1024.0,55.102539,44.897461,18.953451,1.0,2.43457
Grumman,2.481707,1148.0,66.202872,33.797128,9.503444,1.0,2.481707
Hughes,2.355606,883.0,57.900605,42.051718,10.349188,1.062215,2.50216
Air Tractor,2.338747,862.0,59.416087,40.583913,14.365816,1.07425,2.512399


In [38]:
849+1839

2688

In [None]:
## Damage Index for Commercial Planes

In [36]:
pop_means_comm_df = df_comm.groupby('Make').mean(numeric_only=True)

di_df_comm = pd.DataFrame(columns = ['Damage_Index', 'Num_Incidents', 'Uninjured_P', 'Injured_P', 'Died_P'], )

for make in pop_makes_comm:
    make_info = df_comm[df_comm['Make'] == make]['Aircraft.damage'].value_counts(normalize=True)

    if 'Minor' not in make_info.index:
        make_info['Minor'] = 0

    damage_index = (make_info['Destroyed'] * 4) + (make_info['Substantial'] * 2) + (make_info['Minor'] * 1)
    
    num_incidents = df['Make'].value_counts()[make]
    
    make_stats = list(pop_means_comm_df.loc[make].reindex(['Percent.Uninjured', 'Percent.Injured', 'Percent.Died']))
    
    di_df_comm.loc[make] =[damage_index, num_incidents] + make_stats

di_df_comm.sort_values(by='Damage_Index')

Unnamed: 0,Damage_Index,Num_Incidents,Uninjured_P,Injured_P,Died_P
Airbus,0.737864,323.0,94.739431,5.260569,3.355007
Boeing,0.79899,1839.0,95.875352,4.124648,2.073749
Mcdonnel Douglas,0.938596,785.0,94.988345,5.011655,2.177668


In [37]:
threshold = 1000
penalty_rate = 0.5
di_df_comm['Penalty'] = di_df_comm['Num_Incidents'].apply(lambda x: 1 + penalty_rate * np.log(threshold/x) if x < threshold else 1)
di_df_comm['Adj_Damage_Index'] = di_df_comm['Damage_Index'] * di_df_comm['Penalty'] 
di_df_comm.sort_values(by = 'Adj_Damage_Index')

Unnamed: 0,Damage_Index,Num_Incidents,Uninjured_P,Injured_P,Died_P,Penalty,Adj_Damage_Index
Boeing,0.79899,1839.0,95.875352,4.124648,2.073749,1.0,0.79899
Mcdonnel Douglas,0.938596,785.0,94.988345,5.011655,2.177668,1.121036,1.0522
Airbus,0.737864,323.0,94.739431,5.260569,3.355007,1.565051,1.154795


In [None]:
df.info()

In [None]:
popular_makes = list(df['Make'].value_counts()[:50].index)

make_percentages = pd.DataFrame(columns = ['Destroyed_Percent', 'Substantial_Percent', 'Minor_Percent', 'Num_Incidents'])

for make in popular_makes:
    make_info = df[df['Make'] == make]['Aircraft.damage'].value_counts(normalize=True) * 100

    if 'Minor' not in make_info.index:
        make_info['Minor'] = 0

    num_incidents = df['Make'].value_counts()[make]

    make_percentages.loc[make] =[make_info['Destroyed'], make_info['Substantial'], make_info['Minor'], num_incidents]

make_percentages

In [None]:
df['Airplane.Type'] = df['Total.Passengers'].apply(lambda x: 'Private' if x < 50 else 'Commercial')

In [None]:
#Showing only accidents were the damage left the plane Destroyed, then grouping Creating index of Commercial/Private and then dividing per makes.
# Finally showing the mean of people that died.

df.loc[(df['Aircraft.damage'] == 'Destroyed') & (df['Airplane.Type'] == 'Commercial')].groupby(['Make','Model'])['Percent.Died','Percent.Injured'].mean()

In [None]:
df.loc[df['Aircraft.damage'] == 'Substantial'].groupby(['Airplane.Type','Make'])['Percent.Died'].mean()

Notes:

I think the Make percentage column should be used once we narrowed down to less airplanes.
How are we measuring the capacity of the plane


In [None]:
df.to_csv('Airplanes_clean.csv')

## Choosing the best planes for commercial purposes

In [None]:
des_best_comm = df.loc[(df['Aircraft.damage'] == 'Destroyed') & (df['Airplane.Type'] == 'Commercial')]

In [None]:
subs_best_comm = df.loc[(df['Aircraft.damage'] == 'Substantial') & (df['Airplane.Type'] == 'Commercial') & (df['Percent.Died'] == 0)]

In [None]:
minor_best_comm = df.loc[(df['Aircraft.damage'] == 'Minor') & (df['Airplane.Type'] == 'Commercial') & (df['Percent.Died'] == 0) & (df['Percent.Injured'] == 0)]

## Choosing the best planes for Private Purposes

## Necesito ver si hay marcas de aviones que sean solo privadas o solo commerciales

In [None]:
len(df['Make'].value_counts())

In [None]:
des_best_priv = df.loc[(df['Aircraft.damage'] == 'Destroyed') & (df['Airplane.Type'] == 'Private') & (df['Event.Year'] > 1989) & (df['Total.Passengers'] != 0)]
subs_best_priv = df.loc[(df['Aircraft.damage'] == 'Substantial') & (df['Airplane.Type'] == 'Private') & (df['Percent.Died'] == 0)& (df['Event.Year'] > 1989) & (df['Total.Passengers'] != 0)]
minor_best_priv = df.loc[(df['Aircraft.damage'] == 'Minor') & (df['Airplane.Type'] == 'Private') & (df['Percent.Died'] == 0) & (df['Percent.Injured'] == 0) & (df['Event.Year'] > 1989) & (df['Total.Passengers'] != 0)]

In [None]:
#Amount of incidents in which the airplanes fullfilled the requirements to be a "good" private plane

((minor_best_comm['Make'].value_counts() + subs_best_comm['Make'].value_counts()+ des_best_comm['Make'].value_counts()) /df['Make'].value_counts()).sort_values(ascending=False)

In [None]:
df['Make'].value_counts()

In [None]:
df['Event.Year'].value_counts().sort_index()[:15].sum()

In [None]:
subs_best_comm