In [2]:
import re
import gensim
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer

%matplotlib inline
np.set_printoptions(suppress=True)

2023-04-18 04:27:11.158047: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Data Preprocessing

In [3]:
df1 = pd.read_csv('ASRS_Jan2000_Jan2006.csv', skiprows=[0,2])
df2 = pd.read_csv('ASRS_Jan2006_Dec2019.csv', skiprows=[0,2])
print('df1 originial shape:', df1.shape)
print('df2 originial shape:', df2.shape)

df1 originial shape: (4735, 126)
df2 originial shape: (4586, 131)


  df2 = pd.read_csv('ASRS_Jan2006_Dec2019.csv', skiprows=[0,2])


In [4]:
# There are more columns in one dataset than the other, find additional columns
more_cols_1 = list(set(df2.columns) - set(df1.columns))

# Check whether these columns can be dropped
df2[more_cols_1].isnull().sum()

# Found that additional columns are unnamed and mostly NULL, decide to drop them

Unnamed: 126    4584
Unnamed: 130    4585
Unnamed: 127    4586
Unnamed: 129    4586
Unnamed: 128    4585
dtype: int64

In [5]:
# Found another unnamed column in the dataset, check if null
print(df1.shape[0], '\tNulls:', df1['Unnamed: 125'].isnull().sum())
print(df2.shape[0], '\tNulls:', df2['Unnamed: 125'].isnull().sum())

4735 	Nulls: 4733
4586 	Nulls: 4583


In [6]:
df1.drop(['Unnamed: 125'], axis=1, inplace=True) # drop column named: <Unnamed: 125>
df1.dropna(axis=1, how='all', inplace=True) # drop columns that are all NAs
print('df1 cleaned shape:', df1.shape)

df1 cleaned shape: (4735, 75)


In [7]:
remove_cols = more_cols_1 + ['Unnamed: 125']
df2.drop(remove_cols, axis=1, inplace=True) # drop 6 unnamed columns
df2.dropna(axis=1, how='all', inplace=True) # drop columns that are all NAs
print('df2 cleaned shape:', df2.shape)

df2 cleaned shape: (4586, 83)


In [8]:
df1.head()

Unnamed: 0,ACN,Date,Local Time Of Day,Locale Reference,State Reference,Relative Position.Angle.Radial,Relative Position.Distance.Nautical Miles,Altitude.AGL.Single Value,Altitude.MSL.Single Value,Flight Conditions,...,ASRS Report Number.Accession Number.1,Anomaly,Miss Distance,Detector,Result,Contributing Factors / Situations,Primary Problem,Narrative,Callback,Synopsis
0,459107,200001,0601-1200,DTW.Airport,MI,,0.0,0.0,,VMC,...,,"Conflict Ground Conflict, Critical; Deviation ...",Horizontal 100; Vertical 0,Person Flight Crew,Flight Crew Became Reoriented; Flight Crew Rej...,Airport; Human Factors,Human Factors,"ON TAXI OUT TO DTW RWY 3L, MISSED TURN ONTO TX...",,AN A320 OVERSHOOTS THE TXWY FOR RWY 3L (TXWY M...
1,459230,200001,0601-1200,MCO.Airport,FL,,0.0,0.0,,VMC,...,459374.0,Deviation / Discrepancy - Procedural FAR; Grou...,,Person Air Traffic Control,General None Reported / Taken,Human Factors,Human Factors,TAXIED OUT OF RAMP AREA BEFORE TALKING TO GND....,,FLC OF A B737 TAXIED WITHOUT CLRNC DUE TO FOLL...
2,459389,200001,0601-1200,LFPG.Airport,FO,,0.0,0.0,,VMC,...,459260.0,Ground Excursion Taxiway,,Person Flight Crew,General None Reported / Taken; General Mainten...,Human Factors,Human Factors,"CAPT WAS FLYING, I WAS IN THE R SEAT. AFTER LN...",,CAPT OF AN MD11 RAN OFF TXWY DURING TAXI IN RE...
3,459407,200001,0001-0600,SRB.Airport,TN,,0.0,0.0,,Marginal,...,459406.0,Aircraft Equipment Problem Critical; Deviation...,,Person Flight Crew,General Maintenance Action; General None Repor...,Aircraft; Weather; Airport; Human Factors,Weather,"ILS RWY 4 (SRB), CAPT FLYING. WX: 900 FT BROKE...",,FA20 ENCOUNTERS SLICK RWY AT SRB RESULTING IN ...
4,459425,200001,1201-1800,ABE.Airport,PA,,0.0,0.0,,VMC,...,,"Conflict Ground Conflict, Critical; Deviation ...",Horizontal 1000,Person Air Traffic Control; Person Flight Crew,Air Traffic Control Issued New Clearance; Flig...,Airport; Human Factors,Human Factors,WE WERE CLRED FOR THE VISUAL APCH RWY 31 ABOUT...,,A JS31 CREW LNDG ON RWY 31 AT ABE ARE INSTRUCT...


In [9]:
df2.head()

Unnamed: 0,ACN,Date,Local Time Of Day,Locale Reference,State Reference,Relative Position.Angle.Radial,Relative Position.Distance.Nautical Miles,Altitude.AGL.Single Value,Altitude.MSL.Single Value,Flight Conditions,...,Detector,When Detected,Result,Contributing Factors / Situations,Primary Problem,Narrative,Callback,Narrative.1,Callback.1,Synopsis
0,683097,200601,1201-1800,FTK.Airport,KY,,,,3000.0,VMC,...,Person Flight Crew,,General None Reported / Taken,Human Factors,Human Factors,I HAD ENDORSED A PVT PLT STUDENT FOR A SOLO XC...,,,,INSTRUCTOR PLT ENDORSES STUDENT PLT FOR XCOUNT...
1,683301,200601,1201-1800,BUR.Airport,CA,,,0.0,,VMC,...,Person Air Traffic Control; Person Flight Crew,,General None Reported / Taken,Human Factors; Airport,Human Factors,"AFTER LNDG ON RWY 8 AT BUR, I WAS INSTRUCTED B...",,,,LR45 FLT CREW TAXIING AT BUR HAS A RWY INCURSI...
2,683377,200601,1201-1800,PBI.Airport,FL,,0.0,0.0,,VMC,...,Person Air Traffic Control,,General None Reported / Taken,Human Factors,Human Factors,I WAS WORKING THE GND CTL POS AT WEST PALM BEA...,,,,PBI GND CTLR WITNESSED ACFT LAND ON TXWY L IN ...
3,683394,200601,1801-2400,ZZZZ.Airport,FO,,,0.0,,VMC,...,Automation Aircraft Other Automation; Person F...,,Aircraft Aircraft Damaged; Flight Crew Landed ...,Aircraft; Human Factors,Aircraft,"ACFT DURING THE TKOF ROLL, A WT ON WHEELS ANNU...",,,,CL65 FLT CREW CONTACTS FOD DURING TKOF. UPON L...
4,683395,200601,0001-0600,ZZZ.Airport,US,,,0.0,,,...,Person Flight Crew,,Aircraft Aircraft Damaged; General None Report...,Environment - Non Weather Related; Aircraft; A...,Human Factors,I LANDED A KING AIR 100 AND SHORTLY AFTER LNDG...,,,,B100 PLT LANDS ON ICY RWY AND LOSES DIRECTIONA...


In [10]:
# find columns difference, check they can dropped
df1_cols = df1.columns
df2_cols = df2.columns

common_cols = df1_cols.intersection(df2_cols)
diff_cols = df1_cols.difference(df2_cols)
diff_cols2 = df2_cols.difference(df1_cols)

diff_cols, diff_cols2

(Index(['Cabin Activity', 'Cabin Lighting',
        'Maintenance Status.Released For Service.1',
        'Maintenance Status.Required / Correct Doc On Board.1',
        'Number Of Seats.Number.1', 'Passengers On Board.Number.1'],
       dtype='object'),
 Index(['Aircraft Zone', 'Cabin Activity.1', 'Callback.1',
        'Communication Breakdown', 'Communication Breakdown.1',
        'Crew Size Flight Attendant.Number Of Crew', 'Human Factors.1',
        'Location In Aircraft', 'Location In Aircraft.1',
        'Maintenance Status.Records Complete', 'Narrative.1',
        'Were Passengers Involved In Event', 'When Detected',
        'Work Environment Factor'],
       dtype='object'))

In [11]:
df1[diff_cols].isnull().sum()

Cabin Activity                                          4734
Cabin Lighting                                          4733
Maintenance Status.Released For Service.1               4734
Maintenance Status.Required / Correct Doc On Board.1    4734
Number Of Seats.Number.1                                4725
Passengers On Board.Number.1                            4733
dtype: int64

In [12]:
df2[diff_cols2].isnull().sum()

Aircraft Zone                                4584
Cabin Activity.1                             4585
Callback.1                                   4579
Communication Breakdown                      3325
Communication Breakdown.1                    4285
Crew Size Flight Attendant.Number Of Crew    4584
Human Factors.1                              3921
Location In Aircraft                         1708
Location In Aircraft.1                       3828
Maintenance Status.Records Complete          4584
Narrative.1                                  3731
Were Passengers Involved In Event            4185
When Detected                                1312
Work Environment Factor                      4552
dtype: int64

In [13]:
# Concat Narrative and Narrative.1 column in df2
df2['Narrative.1'] = df2['Narrative.1'].fillna('')
df2['Narrative_comb'] = df2['Narrative'] + ' ' + df2['Narrative.1']

Found that there are instances when event is incursion, excursion, or both.

In [14]:
df1_trim = df1[['ACN', 'Date', 'Local Time Of Day', 'Locale Reference', 'State Reference', 'Anomaly', 'Narrative', 'Synopsis']]
df2_trim = df2[['ACN', 'Date', 'Local Time Of Day', 'Locale Reference', 'State Reference', 'Anomaly', 'Narrative_comb', 'Synopsis']]
df2_trim.rename(columns={'Narrative_comb': 'Narrative'}, inplace=True)
df3 = pd.concat([df1_trim, df2_trim])
print('df3 original shape:', df3.shape)

df3 original shape: (9321, 8)


In [15]:
# Noticed that there are duplicates in the data, so drop duplicates when concatenating the two trimmed dfs such that data from 2000-2005 is combined with data from 2006-2019
df3 = pd.concat([df1_trim, df2_trim]).drop_duplicates(subset=['ACN'])
print('numbers of duplicate rows:', df1_trim.shape[0]+df2_trim.shape[0])
print('df3 original shape:', df3.shape)

numbers of duplicate rows: 9321
df3 original shape: (9303, 8)


In [16]:
# Make sure Narrative column has correct datatype
df3.dtypes

ACN                   int64
Date                  int64
Local Time Of Day    object
Locale Reference     object
State Reference      object
Anomaly              object
Narrative            object
Synopsis             object
dtype: object

In [17]:
df3.Narrative.iloc[0]

'ON TAXI OUT TO DTW RWY 3L, MISSED TURN ONTO TXWY M FROM TXWY F, BUT STOPPED SEVERAL FT PAST TXWY M ON TXWY F WELL CLR OF RWY 3L. A B757 WAS ON TKOF ROLL. WAS NOT SURE WHAT WE WERE GOING TO DO, SO ABORTED TKOF.'

In [18]:
# replace abreviations
abbr_dict = {"TXWY": "TAXIWAY", 
             "RWY": "RUNWAY", 
             "TKOF": "TAKEOFF", 
             "ACFT": "AIRCRAFT",
             "TWR": "TOWER",
             "CAPT": "CAPTAIN",
             "CLRED": "CLEARED",
             "LNDG": "LANDING",
             "VFR": "VISUAL FLIGHT RULE",
             "IFR": "INSTRUMENT FLIGHT RULE",
             "NOTAM": "NOTICE TO AIRMEN"}
df3['Narrative'] = df3['Narrative'].replace(abbr_dict, regex=True)

In [19]:
# Label the rows based on anomaly column
def incursion_check(desc):
    if 'incursion' in desc.lower():
        return 1
    else:
        return 0
    
def excursion_check(desc):
    if 'excursion' in desc.lower():
        return 1
    else:
        return 0
    
df3['incursion'] = df3['Anomaly'].apply(incursion_check)
df3['excursion'] = df3['Anomaly'].apply(excursion_check)

In [20]:
print('new df3 shape:', df3.shape)
df3.head(2)

new df3 shape: (9303, 10)


Unnamed: 0,ACN,Date,Local Time Of Day,Locale Reference,State Reference,Anomaly,Narrative,Synopsis,incursion,excursion
0,459107,200001,0601-1200,DTW.Airport,MI,"Conflict Ground Conflict, Critical; Deviation ...","ON TAXI OUT TO DTW RUNWAY 3L, MISSED TURN ONTO...",AN A320 OVERSHOOTS THE TXWY FOR RWY 3L (TXWY M...,1,0
1,459230,200001,0601-1200,MCO.Airport,FL,Deviation / Discrepancy - Procedural FAR; Grou...,TAXIED OUT OF RAMP AREA BEFORE TALKING TO GND....,FLC OF A B737 TAXIED WITHOUT CLRNC DUE TO FOLL...,1,0


In [21]:
print('number of incursion data:', df3[(df3['incursion']==1) & (df3['excursion']==0)].shape[0])
print('number of excursion data:', df3[(df3['incursion']==0) & (df3['excursion']==1)].shape[0])
print('number of both data:', df3[(df3['incursion']==1) & (df3['excursion']==1)].shape[0])

number of incursion data: 6837
number of excursion data: 2388
number of both data: 78


In [22]:
# Extract incursions
df_incur = df3[['ACN', 'Narrative']].loc[df3['incursion'] == 1]
# Extract excursions
df_excur = df3[['ACN', 'Narrative']].loc[df3['excursion'] == 1]
# Extract both
df_both = pd.merge(df_incur, df_excur, how='inner')

print(df_incur.shape, df_excur.shape, df_both.shape)

(6915, 2) (2466, 2) (78, 2)


In [23]:
incur_docs = df_incur.Narrative.tolist()
incur_docs = [doc.lower() for doc in incur_docs]

In [24]:
excur_docs = df_excur.Narrative.tolist()
excur_docs = [doc.lower() for doc in excur_docs]

In [25]:
both_docs = df_both.Narrative.tolist()
both_docs = [doc.lower() for doc in both_docs]

In [26]:
incur_docs[0:3]

['on taxi out to dtw runway 3l, missed turn onto taxiway m from taxiway f, but stopped several ft past taxiway m on taxiway f well clr of runway 3l. a b757 was on takeoff roll. was not sure what we were going to do, so aborted takeoff.',
 'taxied out of ramp area before talking to gnd. we were third of 3 aircraft and we thought we were cleared to runway 17, when in fact we were not acknowledged by gnd. we cleared it up immediately after gnd asked who the third b737 was on the taxiway. no probs resulted.',
 "we were cleared for the visual apch runway 31 about 7 sm out. we heard coms on tower freq with ga tfc on what we thought was on downwind for same runway. we were entering the tfc pattern on a l base. fo was looking for the tfc that looked very close on tcasii, but never saw it. we were cleared to land runway 31 on final. runway 31 intersects runway 6 at 3200 ft, and taxiway a at approx 3000 ft. on landing roll and about 100 ft short of taxiway a, tower instructed us to 'exit at a.' 

------

# Embedding, Dimension Reduction

Main steps: https://towardsdatascience.com/let-us-extract-some-topics-from-text-data-part-iv-bertopic-46ddf3c91622

types of pretrained model for embedding: https://www.sbert.net/docs/pretrained_models.html

## Incursion

In [None]:
# base model
incur_umap_model1 = UMAP(n_neighbors=20, 
                         n_components=15, 
                         min_dist=0.0, 
                         metric='cosine', 
                         random_state=42)

incur_cv1 = CountVectorizer(ngram_range=(1, 2), stop_words="english")

incur_model1 = BERTopic(language="english", 
                        umap_model=incur_umap_model1, 
                        vectorizer_model=incur_cv1,
                       )

incur_topics1, incur_probs1 = incur_model1.fit_transform(incur_docs)

In [None]:
incur_model1.topic_labels_

RESULT: {-1: '-1_runway_aircraft_taxiway_short',
 0: '0_runway_aircraft_taxiway_taxi',
 1: '1_cherokee_runway_cleared_17l'}

In [None]:
incur_model1.get_topic_info()

In [None]:
# fine tuned model
incur_umap_model2 = UMAP(n_neighbors=5, 
                         n_components=10, 
                         min_dist=0.0, 
                         metric='cosine', 
                         random_state=42)

incur_cv2 = CountVectorizer(ngram_range=(1, 3), stop_words="english", max_df=0.9)

incur_model2 = BERTopic(language="english", 
                        umap_model=incur_umap_model2, 
                        embedding_model=SentenceTransformer("all-mpnet-base-v2"),
                        vectorizer_model=incur_cv2,
                        nr_topics=25,
                       )

incur_topics2, incur_probs2 = incur_model2.fit_transform(incur_docs)

In [None]:
incur_model2.topic_labels_

RESULT:
{-1: '-1_ground_airport_clearance_control',
 0: '0_xing_acr_gnd ctlr_cleared takeoff',
 1: '1_ground_clearance_controller_control',
 2: '2_hdg_spd_turn taxiway_roll',
 3: '3_line runway_short line runway_hold short lines_short lines',
 4: '4_notice_notice airmen_airmen_notice airmens',
 5: '5_sight_ils_wx_visual flight rule',
 6: '6_signage_ground_taxiway taxiway_diagram',
 7: '7_approach_airport_pilot_traffic',
 8: '8_rptr_arpt diagram_diagram_chart',
 9: '9_pattern_unicom_ctaf_announced',
 10: '10_runway incursion_22_runway 22_echo',
 11: '11_pos hold_cleared pos_hold runway_taxi pos',
 12: '12_snow_brake_brakes_ice',
 13: '13_fuel_power_engine_ground',
 14: '14_vehicle_vehicles_truck_airport',
 15: '15_push_pushback_tug_driver',
 16: '16_threshold_29_land runway_displaced',
 17: '17_ils_critical_ils critical_critical area',
 18: '18_gar_vehicle_cessna_ft agl',
 19: '19_lighting_green_ctrline_green ctrline',
 20: '20_apch ctl_cleared land_visual apch_landing clrnc',
 21: '21_mil_security_arpt ops_guard',
 22: '22_instrument flight rule_instrument flight_instrument_36',
 23: '23_sleep_nap_fatigue_hours'}

In [None]:
incur_model2.get_topic_info()

## Excursion

In [None]:
# base model
excur_umap_model1 = UMAP(n_neighbors=10, 
                         n_components=7, 
                         min_dist=0.0, 
                         metric='cosine', 
                         random_state=42)

excur_cv1 = CountVectorizer(ngram_range=(1, 2), stop_words="english")

excur_model1 = BERTopic(language="english", 
                umap_model=excur_umap_model1,
                vectorizer_model=excur_cv1)

excur_topics1, excur_probs1 = excur_model1.fit_transform(excur_docs)

In [None]:
excur_model1.topic_labels_

RESULT: {-1: '-1_aircraft_runway_landing_damage',
 0: '0_runway_aircraft_taxiway_landing',
 1: '1_threshold_displaced_runway_displaced threshold'}

In [None]:
excur_model1.get_topic_info()

In [None]:
# fine tuned model
excur_umap_model2 = UMAP(n_neighbors=10, 
                         n_components=7, 
                         min_dist=0.0, 
                         metric='cosine', 
                         random_state=42)

excur_cv2 = CountVectorizer(ngram_range=(1, 3), stop_words="english", max_df=0.9)

excur_model2 = BERTopic(language="english", 
                    umap_model=excur_umap_model2, 
                    embedding_model=SentenceTransformer("all-mpnet-base-v2"),
                    vectorizer_model=excur_cv2)

excur_topics2, excur_probs2 = excur_model2.fit_transform(excur_docs)

In [None]:
excur_model2.topic_labels_

RESULT: {-1: '-1_brakes_nose_flight_student',
 0: '0_ramp_line_gnd_taxiing',
 1: '1_brake_brakes_student_rudder',
 2: '2_crosswind_winds_approach_gust',
 3: '3_nose_main gear_nose gear_collapsed',
 4: '4_apch_arpt_flight_rule',
 5: '5_kts_degs_xwind_ctl',
 6: '6_tailwheel_rudder_tail_brake',
 7: '7_snow_ice_student_braking',
 8: '8_braking_approach_end_captain',
 9: '9_snow_ramp_braking_ice',
 10: '10_tire_takeoff_tires_normal',
 11: '11_plt_takeoff_pwr_eng',
 12: '12_radio_traffic_ctaf_pattern',
 13: '13_student_solo_instructor_flight',
 14: '14_soft_field_turf_soft field',
 15: '15_steering_nosewheel_nosewheel steering_ctl',
 16: '16_apch_spd_pwr_tfc',
 17: '17_veered_right wing tip_wing tip_rollout',
 18: '18_threshold_displaced_displaced threshold_markings',
 19: '19_student_control_student pilot_pilot',
 20: '20_ground_looped_ground looped_loop',
 21: '21_student_engine_instructor_throttles',
 22: '22_sign_spd_degree_plt',
 23: '23_winds_gusting_gust_162',
 24: '24_fuel_tank_engine_tanks',
 25: '25_thrust_braking_kts_apch',
 26: '26_hyd_pump_qrh_accumulator',
 27: '27_captain_braking_brakes_zone',
 28: '28_mud_pavement_older taxiway_older'}

In [None]:
excur_model2.get_topic_info()

## Both

In [None]:
both_umap_model1 = UMAP(n_neighbors=5, 
                        n_components=3, 
                        min_dist=0.0, 
                        metric='cosine', 
                        random_state=42)

both_cv1 = CountVectorizer(ngram_range=(1, 3), stop_words="english", max_df=0.9)

both_model1 = BERTopic(language="english", 
                        umap_model=both_umap_model1, 
                        embedding_model=SentenceTransformer("all-mpnet-base-v2"),
                        vectorizer_model=both_cv1,
                        nr_topics=4,
                       )

both_topics1, both_probs1 = both_model1.fit_transform(both_docs)

In [None]:
both_model1.topic_labels_

RESULT: {-1: '-1_crew_ctlr_taxiing_pushback',
 0: '0_centerline_rudder_nose_conditions',
 1: '1_radio_19_announced_rule',
 2: '2_yellow_yellow line_past_chart'}

In [None]:
both_model1.get_topic_info()

# Visualization

## Incursion

In [None]:
incur_model1.visualize_topics()

In [None]:
incur_model1.visualize_barchart()

In [None]:
incur_model2.visualize_topics()

In [None]:
incur_model2.visualize_barchart()

## Excursion

In [None]:
excur_model1.visualize_topics()

In [None]:
excur_model1.visualize_barchart()

In [None]:
excur_model2.visualize_topics()

In [None]:
excur_model2.visualize_barchart()

## Both

In [None]:
# both_model1.visualize_topics() # no plot due to too little data

In [None]:
both_model1.visualize_barchart()