In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import regex as re
pd.set_option('display.max_columns', None)

Data pre-processing:
* Download data from FAA ASRS Database (5000 rows at a time)
* Needed to convert xls files to csv, python can not read xls file (corrupted?)
* Skip the first row when reading csv (useless header), and third row (blank)
* Remove extra columns from 2006 to 2019 (called 'unnamed' before each other them --> last 5 cols)
* A lot of columns are null with entirely NaN values --> remove them
* 'Anomaly' labels incursion/excursion
* Only 1 'Narrative' column in df1, 'Narrative' and 'Narrative.1' in df2
* Only 1 ACN per Narrative
* Combined narratives in the 2006 - 2019 dataframe, should not matter for clustering (confirm with Dr. Gil)
* Extracted important columns to new dataframe and concatenated all data together (time, date, airport, state, narrative, synopsis, ...)
* Created new columns to label if an incident is an incursion or excursion (or both if both are 1)
* Jan 2006 Data is duplicated --> drop duplicate ACNs

In [47]:
# Reading in data from Jan 2000 to Jan 2006
df1 = pd.read_csv('data/ASRS_Jan2000_Jan2006.csv', skiprows = [0,2])
# Now Jan 2006 to Dec 2019
df2 = pd.read_csv('data/ASRS_Jan2006_Dec2019.csv', skiprows = [0,2])

  df2 = pd.read_csv('data/ASRS_Jan2006_Dec2019.csv', skiprows = [0,2])


In [48]:
print(df1.shape, df2.shape)

(4735, 126) (4586, 131)


In [49]:
df1.columns[-1]

'Unnamed: 125'

In [50]:
# Finding unnamed columns in df2
def find_unnamed(df):
    unnamed_cols = []
    for col in df.columns:
        if 'Unnamed' in col:
            unnamed_cols.append(col)
    return unnamed_cols

# Dropping unnamed columns
df1 = df1.drop(columns = find_unnamed(df1), axis = 1)
df2 = df2.drop(columns = find_unnamed(df2), axis = 1)

In [51]:
print(df1.shape, df2.shape)

(4735, 125) (4586, 125)


In [52]:
# Drop all columns with only null values
df1, df2 = df1.dropna(axis = 1, how = 'all'), df2.dropna(axis = 1, how = 'all')

In [53]:
print(df1.shape, df2.shape)

(4735, 75) (4586, 83)


In [54]:
# Grab important columns from each dataframe
df1_final = df1[['ACN', 'Date', 'Local Time Of Day', 'Locale Reference', 'State Reference', 'Anomaly', 'Narrative', 'Synopsis']]
df2_final = df2[['ACN', 'Date', 'Local Time Of Day', 'Locale Reference', 'State Reference', 'Anomaly', 'Narrative', 'Narrative.1', 'Synopsis']]

# Combining the Narrative columns from df2
df2_final['Narrative.1'] = df2_final['Narrative.1'].fillna('')
# Combining narratives together
df2_final['Narrative_comb'] = df2_final['Narrative'] + ' ' + df2_final['Narrative.1']
df2_final = df2_final.drop(['Narrative', 'Narrative.1'], axis = 1)
# Renaming column to keep consistent when concatenating
df2_final = df2_final.rename(columns = {'Narrative_comb': 'Narrative'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2_final['Narrative.1'] = df2_final['Narrative.1'].fillna('')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2_final['Narrative_comb'] = df2_final['Narrative'] + ' ' + df2_final['Narrative.1']


In [55]:
# Concatenating all of our data together
asrs_df = pd.concat([df1_final, df2_final]).drop_duplicates(subset = ['ACN']).reset_index(drop = True)

In [56]:
# Check for incursion in anomaly
def incursion_check(desc):
    if 'incursion' in desc.lower():
        return 1
    else:
        return 0
    
# Check for excursion    
def excursion_check(desc):
    if 'excursion' in desc.lower():
        return 1
    else:
        return 0
    
asrs_df['incursion'] = asrs_df['Anomaly'].apply(incursion_check)
asrs_df['excursion'] = asrs_df['Anomaly'].apply(excursion_check)

In [57]:
print(len(asrs_df[asrs_df['incursion'] == 1]), len(asrs_df[asrs_df['excursion'] == 1]))
print(len(asrs_df[(asrs_df.incursion == 1) & (asrs_df.excursion == 1)]))

6915 2466
78


In [58]:
asrs_df

Unnamed: 0,ACN,Date,Local Time Of Day,Locale Reference,State Reference,Anomaly,Narrative,Synopsis,incursion,excursion
0,459107,200001,0601-1200,DTW.Airport,MI,"Conflict Ground Conflict, Critical; Deviation ...","ON TAXI OUT TO DTW RWY 3L, MISSED TURN ONTO TX...",AN A320 OVERSHOOTS THE TXWY FOR RWY 3L (TXWY M...,1,0
1,459230,200001,0601-1200,MCO.Airport,FL,Deviation / Discrepancy - Procedural FAR; Grou...,TAXIED OUT OF RAMP AREA BEFORE TALKING TO GND....,FLC OF A B737 TAXIED WITHOUT CLRNC DUE TO FOLL...,1,0
2,459389,200001,0601-1200,LFPG.Airport,FO,Ground Excursion Taxiway,"CAPT WAS FLYING, I WAS IN THE R SEAT. AFTER LN...",CAPT OF AN MD11 RAN OFF TXWY DURING TAXI IN RE...,0,1
3,459407,200001,0001-0600,SRB.Airport,TN,Aircraft Equipment Problem Critical; Deviation...,"ILS RWY 4 (SRB), CAPT FLYING. WX: 900 FT BROKE...",FA20 ENCOUNTERS SLICK RWY AT SRB RESULTING IN ...,0,1
4,459425,200001,1201-1800,ABE.Airport,PA,"Conflict Ground Conflict, Critical; Deviation ...",WE WERE CLRED FOR THE VISUAL APCH RWY 31 ABOUT...,A JS31 CREW LNDG ON RWY 31 AT ABE ARE INSTRUCT...,1,0
...,...,...,...,...,...,...,...,...,...,...
9298,1715980,201912,0001-0600,SFO.Tower,CA,"Conflict Ground Conflict, Critical; Ground Inc...",I told Aircraft X to hold short of RWY 28L. Th...,SFO Tower Controller reported a runway incursi...,1,0
9299,1716265,201912,0601-1200,ZZZ.Airport,US,"Conflict Ground Conflict, Critical; Deviation ...","Normal training environment, student [in] left...",PA44 student reported experiencing a runway ex...,0,1
9300,1722152,201907,,,,"Conflict Ground Conflict, Critical; Deviation ...","During final for Runway XX at ZZZ, my instruct...",Pilot reported that an aircraft not on CTAF ap...,1,0
9301,1724990,201912,,SFO.Airport,CA,Deviation / Discrepancy - Procedural Clearance...,I was assigned to taxi Aircraft X to Gate XX. ...,Technician reported that while taxiing an airc...,1,0


In [59]:
asrs_df.to_pickle('./data/trimmed_asrs.pkl')