# Parse probability of birdstrike by airport

### import data and packages

In [1]:
import pandas as pd

In [2]:
# read dat file (as csv) into memory as dataframe

# df = pd.read_csv('airports.dat', index_col=0, names=['1', 'name', 'city', 'country', 'IATA', 'ICAO', 'lat', 'lon', 'alt', 'tz', 'dst', 'dbtz', 'type', 'source'])
# df = df.reset_index()
# df.drop(['1'], axis=1, inplace=True)

# I found a better datasource - see below

In [3]:
# import CSV file to memory as pandas dataframe

df = pd.read_csv('airports.csv')

In [4]:
# preview dataframe to ensure data came thru ok

df.head()

Unnamed: 0,id,ident,type,name,latitude_deg,longitude_deg,elevation_ft,continent,iso_country,iso_region,municipality,scheduled_service,gps_code,iata_code,local_code,home_link,wikipedia_link,keywords
0,6523,00A,heliport,Total Rf Heliport,40.070801,-74.933601,11.0,,US,US-PA,Bensalem,no,00A,,00A,,,
1,323361,00AA,small_airport,Aero B Ranch Airport,38.704022,-101.473911,3435.0,,US,US-KS,Leoti,no,00AA,,00AA,,,
2,6524,00AK,small_airport,Lowell Field,59.947733,-151.692524,450.0,,US,US-AK,Anchor Point,no,00AK,,00AK,,,
3,6525,00AL,small_airport,Epps Airpark,34.864799,-86.770302,820.0,,US,US-AL,Harvest,no,00AL,,00AL,,,
4,6526,00AR,closed,Newport Hospital & Clinic Heliport,35.6087,-91.254898,237.0,,US,US-AR,Newport,no,,,,,,00AR


In [5]:
# read list of airports that exist in flights table from text

airport_list = tailno_list = open("airports.txt").read().splitlines()

In [6]:
# instantiate collector dataframe

airports = pd.DataFrame(columns=df.columns)

### filter dataset

In [7]:
# iterate through target airport codes, adding valid rows to collector dataframe

for airport in airport_list:
    airports = pd.concat([airports, df[df.iata_code == airport]], ignore_index=True)

In [8]:
airports

Unnamed: 0,id,ident,type,name,latitude_deg,longitude_deg,elevation_ft,continent,iso_country,iso_region,municipality,scheduled_service,gps_code,iata_code,local_code,home_link,wikipedia_link,keywords
0,3679,KMFR,medium_airport,Rogue Valley International Medford Airport,42.374199,-122.873001,1335.0,,US,US-OR,Medford,yes,KMFR,MFR,MFR,,https://en.wikipedia.org/wiki/Rogue_Valley_Int...,
1,3416,KBLI,medium_airport,Bellingham International Airport,48.792801,-122.538002,170.0,,US,US-WA,Bellingham,yes,KBLI,BLI,BLI,,https://en.wikipedia.org/wiki/Bellingham_Inter...,
2,3843,KRDM,medium_airport,Roberts Field,44.254101,-121.150001,3080.0,,US,US-OR,Redmond,yes,KRDM,RDM,RDM,,https://en.wikipedia.org/wiki/Roberts_Field,
3,3872,KSCK,medium_airport,Stockton Metropolitan Airport,37.894199,-121.237999,33.0,,US,US-CA,Stockton,no,KSCK,SCK,SCK,http://www.sjgov.org/airport/,https://en.wikipedia.org/wiki/Stockton_Metropo...,
4,3946,KVLD,medium_airport,Valdosta Regional Airport,30.782499,-83.276703,203.0,,US,US-GA,Valdosta,yes,KVLD,VLD,VLD,,https://en.wikipedia.org/wiki/Valdosta_Regiona...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
370,3899,KSRQ,medium_airport,Sarasota Bradenton International Airport,27.395399,-82.554398,30.0,,US,US-FL,Sarasota/Bradenton,yes,KSRQ,SRQ,SRQ,,https://en.wikipedia.org/wiki/Sarasota-Bradent...,
371,3948,KVPS,medium_airport,Destin-Fort Walton Beach Airport,30.483200,-86.525398,87.0,,US,US-FL,Valparaiso,yes,KVPS,VPS,VPS,http://www.flyvps.com/,https://en.wikipedia.org/wiki/Destin%E2%80%93F...,Eglin AFB
372,3547,KFWA,medium_airport,Fort Wayne International Airport,40.978500,-85.195099,814.0,,US,US-IN,Fort Wayne,yes,KFWA,FWA,FWA,https://fwairport.com/,https://en.wikipedia.org/wiki/Fort_Wayne_Inter...,Baer Field
373,3847,KRIC,large_airport,Richmond International Airport,37.505199,-77.319702,167.0,,US,US-VA,Richmond,yes,KRIC,RIC,RIC,,https://en.wikipedia.org/wiki/Richmond_Interna...,


### hub/not hub

In [9]:
# instantiate a list of hub airports

hubs = ['ANC', 'PHX', 'BUR', 'LAX', 'OAK', 'SAN', 'SFO', 'SJC', 'DEN', 'FLL', 'MCO', 'TPA', 'SFB', 'ATL', 'HNL', 'OGG', 'BOI', 'MDW', 'ORD', 'IND', 'CVG', 'SDF', 'BWI', 'BOS', 'DTW', 'MSP', 'STL', 'LAS', 'ACY', 'EWR', 'TTN', 'JFK', 'LGA', 'CLT', 'GSO', 'RDU', 'CLE', 'LUK', 'DAY', 'PDX', 'PHL', 'PIT', 'CHS', 'CAE', 'MEM', 'BNA', 'DFW', 'DAL', 'IAH', 'SLC', 'ORF', 'IAD', 'DCA', 'SEA']

In [10]:
# instantiate new column for hub, fill with 0

airports['hub'] = 0

In [11]:
# change 'type' to 'hub' if IATA code matches our list of hubs

airports.loc[airports.iata_code.isin(hubs), 'hub'] = 1

In [12]:
# verify above change

airports.hub.value_counts()

0    322
1     53
Name: hub, dtype: int64

### region

In [13]:
# convert iso_region to region (N - North East W - West M - Mid West S - South O - Other)

states = {
        'US-AK': 'O',
        'US-AL': 'S',
        'US-AR': 'S',
        'US-AS': 'O',
        'US-AZ': 'W',
        'US-CA': 'W',
        'US-CO': 'W',
        'US-CT': 'N',
        'US-DC': 'N',
        'US-DE': 'N',
        'US-FL': 'S',
        'US-GA': 'S',
        'US-GU': 'O',
        'US-HI': 'O',
        'US-IA': 'M',
        'US-ID': 'W',
        'US-IL': 'M',
        'US-IN': 'M',
        'US-KS': 'M',
        'US-KY': 'S',
        'US-LA': 'S',
        'US-MA': 'N',
        'US-MD': 'N',
        'US-ME': 'N',
        'US-MI': 'W',
        'US-MN': 'M',
        'US-MO': 'M',
        'US-MP': 'O',
        'US-MS': 'S',
        'US-MT': 'W',
        'US-NA': 'O',
        'US-NC': 'S',
        'US-ND': 'M',
        'US-NE': 'W',
        'US-NH': 'N',
        'US-NJ': 'N',
        'US-NM': 'W',
        'US-NV': 'W',
        'US-NY': 'N',
        'US-OH': 'M',
        'US-OK': 'S',
        'US-OR': 'W',
        'US-PA': 'N',
        'US-PR': 'O',
        'US-RI': 'N',
        'US-SC': 'S',
        'US-SD': 'M',
        'US-TN': 'S',
        'US-TX': 'S',
        'US-UT': 'W',
        'US-VA': 'S',
        'US-VI': 'O',
        'US-VT': 'N',
        'US-WA': 'W',
        'US-WI': 'M',
        'US-WV': 'S',
        'US-WY': 'W',
        'PR-U-A': 'O',
        'MP-U-A': 'O',
        'VI-U-A': 'O',
        'AS-WT': 'O',
        'GU-U-A': 'O',}

airports['iso_region'].replace(states, inplace=True)

In [14]:
# verify values

airports.iso_region.value_counts()

S    113
W    109
M     69
N     47
O     37
Name: iso_region, dtype: int64

In [15]:
# rename column to drop iso prefix

airports.rename({'iso_region':'region'}, axis=1, inplace=True)

In [16]:
# drop irrelevant columns

airports.drop(['id', 'ident', 'name', 'latitude_deg', 'longitude_deg', 'elevation_ft', 'continent', 'iso_country', 'municipality', 'scheduled_service', 'gps_code', 'local_code', 'wikipedia_link', 'keywords', 'hub', 'home_link'], axis=1, inplace=True)

In [17]:
airports

Unnamed: 0,type,region,iata_code
0,medium_airport,W,MFR
1,medium_airport,W,BLI
2,medium_airport,W,RDM
3,medium_airport,W,SCK
4,medium_airport,S,VLD
...,...,...,...
370,medium_airport,S,SRQ
371,medium_airport,S,VPS
372,medium_airport,M,FWA
373,large_airport,S,RIC


### calculate number of birdstrikes and number of flights per airport

In [18]:
# import flights_by_IATA.csv, containing number of flights in dataset per airport

flights = pd.read_csv('flights_by_IATA.csv', index_col=0)

In [19]:
# reverse column order for readability

flights = flights[flights.columns[::-1]]

In [20]:
flights

Unnamed: 0,IATA,numflights
0,ABE,12027
1,ABI,4291
2,ABQ,54363
3,ABR,1492
4,ABY,2020
...,...,...
371,XWA,418
372,YAK,1442
373,YKM,2514
374,YNG,2


In [21]:
# import birdstrike.csv, containing all birdstrikes between 2018-2019

strikes = pd.read_csv('birdstrike.csv', index_col=0)

In [22]:
strikes

Unnamed: 0,FLT,OPID,INCIDENT_DATE,TIME,TIME_OF_DAY,AIRPORT_ID,LATITUDE,LONGITUDE,FAAREGION,COUNT
0,2502,QXE,2019-12-31,12:30,Day,KSJC,37.36186,-121.92901,AWP,1
1,508,HAL,2019-12-31,17:51,Day,PHNL,21.31869,-157.92241,AWP,1
2,,UNK,2019-12-31,16:16,,KDFW,32.89595,-97.03720,ASW,1
3,,USCG,2019-12-31,16:55,Day,KPIE,27.91076,-82.68744,ASO,2
4,,UNK,2019-12-31,09:00,,KLGA,40.77724,-73.87261,AEA,1
...,...,...,...,...,...,...,...,...,...,...
33597,,UNK,2018-01-01,,,KMLI,41.44853,-90.50754,AGL,1
33598,,UNK,2018-01-01,,,KLIT,34.72940,-92.22425,ASW,1
33599,,UNK,2018-01-01,,,KSRQ,27.39533,-82.55411,ASO,1
33600,,UNK,2018-01-01,,,KDFW,32.89595,-97.03720,ASW,1


In [23]:
# calculate number of strikes per airport

strikes_byap = strikes.groupby('AIRPORT_ID').sum().reset_index()

In [24]:
# clear first digit from AIRPORT_ID to match flights DataFrame

strikes_byap['AIRPORT_ID'] = strikes_byap['AIRPORT_ID'].str[-3:]

In [25]:
# merge dataframes

birdstrike = flights.merge(strikes_byap, how='left', left_on='IATA', right_on='AIRPORT_ID')

In [26]:
birdstrike

Unnamed: 0,IATA,numflights,AIRPORT_ID,LATITUDE,LONGITUDE,COUNT
0,ABE,12027,ABE,2805.012840,-5205.387600,87.0
1,ABE,12027,ABE,303.898900,-809.190000,5.0
2,ABI,4291,ABI,259.290560,-797.455200,8.0
3,ABQ,54363,ABQ,1857.131660,-5650.287070,69.0
4,ABR,1492,ABR,772.634020,-1673.171110,21.0
...,...,...,...,...,...,...
376,XWA,418,XWA,96.521728,-207.502278,2.0
377,YAK,1442,,,,
378,YKM,2514,YKM,139.704510,-361.632180,3.0
379,YNG,2,YNG,123.782220,-242.037300,3.0


In [27]:
# Fill NaN birdstrike counts with value 0

birdstrike['COUNT'] = birdstrike['COUNT'].fillna(0)

In [28]:
# verify fill NaN with 0

birdstrike.COUNT.isna().value_counts()

False    381
Name: COUNT, dtype: int64

In [29]:
# drop irrelevant columns

birdstrike.drop(['AIRPORT_ID', 'LATITUDE', 'LONGITUDE'], axis=1, inplace=True)

In [30]:
# rename COUNT column for readability

birdstrike.rename({'COUNT' : 'numstrikes'}, axis=1, inplace=True)

In [31]:
birdstrike

Unnamed: 0,IATA,numflights,numstrikes
0,ABE,12027,87.0
1,ABE,12027,5.0
2,ABI,4291,8.0
3,ABQ,54363,69.0
4,ABR,1492,21.0
...,...,...,...
376,XWA,418,2.0
377,YAK,1442,0.0
378,YKM,2514,3.0
379,YNG,2,3.0


### merge numflights / numstrikes with airport dataframe

In [32]:
# merge the two dataframes using IATA_code

airports = birdstrike.merge(airports, how='left', left_on='IATA', right_on='iata_code')

In [33]:
# drop duplicate column

airports.drop(['iata_code'], axis=1, inplace=True)

In [34]:
airports

Unnamed: 0,IATA,numflights,numstrikes,type,region
0,ABE,12027,87.0,medium_airport,N
1,ABE,12027,5.0,medium_airport,N
2,ABI,4291,8.0,medium_airport,S
3,ABQ,54363,69.0,medium_airport,W
4,ABR,1492,21.0,medium_airport,M
...,...,...,...,...,...
376,XWA,418,2.0,medium_airport,M
377,YAK,1442,0.0,medium_airport,O
378,YKM,2514,3.0,medium_airport,W
379,YNG,2,3.0,medium_airport,M


### calculate probability of birdstrike / percentage of total flights

In [46]:
# initialize new columns, calculate

total_flights = airports.numflights.sum()

airports['strikeprob'] = airports.numstrikes / airports.numstrikes.sum()
airports['percentflights'] = airports.numflights / total_flights

In [47]:
airports

Unnamed: 0,IATA,numflights,numstrikes,type,region,strikeprob,percentflights
0,ABE,12027,87.0,medium_airport,N,0.001977,7.520689e-04
1,ABE,12027,5.0,medium_airport,N,0.000114,7.520689e-04
2,ABI,4291,8.0,medium_airport,S,0.000182,2.683236e-04
3,ABQ,54363,69.0,medium_airport,W,0.001568,3.399412e-03
4,ABR,1492,21.0,medium_airport,M,0.000477,9.329732e-05
...,...,...,...,...,...,...,...
376,XWA,418,2.0,medium_airport,M,0.000045,2.613826e-05
377,YAK,1442,0.0,medium_airport,O,0.000000,9.017073e-05
378,YKM,2514,3.0,medium_airport,W,0.000068,1.572047e-04
379,YNG,2,3.0,medium_airport,M,0.000068,1.250634e-07


In [51]:
airports_guide = '''
IATA: Airport IATA code
numflights: Count of entries from flights table, grouped by IATA
numstrikes: Count of entries from birdstrike.csv, grouped by IATA
type: Type of airport
region: 
    N - North East
    W - West 
    M - Mid West 
    S - South 
    O - Other
strikeprob: airports.numstrikes / airports.numstrikes.sum()
percentflights: airports.numflights / airports.numflights.sum()
'''

In [52]:
# export airports as csv, airports_guide as txt

airports.to_csv('airports_strikes.csv')


text = open('airports_strikes_description.txt', 'w')
text.write(airports_guide)
text.close()