In [19]:
import pandas as pd
import numpy as np
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.util import ngrams
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.model_selection import GridSearchCV

## Read in Data

In [20]:
# Data is downloaded as xls and we had to manually save the files as csv files as there were BOF errors while trying to read in with read_excel()
# Skip the first (column name category) and third row (blank line)
df_1 = pd.read_csv('ASRS_Jan2000_Jan2006.csv', skiprows=[0,2])
df_2 = pd.read_csv('ASRS_Jan2006_Dec2019.csv', skiprows=[0,2])

# Confirm successful data reading
print(df_1.shape, df_2.shape)

(4735, 126) (4586, 131)


  df_2 = pd.read_csv('ASRS_Jan2006_Dec2019.csv', skiprows=[0,2])


## Data Cleaning

In [21]:
# There are more columns in one dataset than the other, find additional columns
more_cols_1 = list(set(df_2.columns) - set(df_1.columns))

# Check whether these columns can be dropped
df_2[more_cols_1].isnull().sum()

Unnamed: 128    4585
Unnamed: 127    4586
Unnamed: 130    4585
Unnamed: 129    4586
Unnamed: 126    4584
dtype: int64

In [22]:
# Found that additional columns are unnamed and mostly NULL, decide to drop them
df_2.drop(more_cols_1, axis=1, inplace=True)

print(df_1.shape, df_2.shape)

(4735, 126) (4586, 126)


In [23]:
# Drop any other columns that have completely NULL values as they would not be useful
df_1.dropna(axis=1, how='all', inplace=True)
df_2.dropna(axis=1, how='all', inplace=True)

print(df_1.shape, df_2.shape)
print(df_1.columns, df_2.columns)

(4735, 76) (4586, 84)
Index(['ACN', 'Date', 'Local Time Of Day', 'Locale Reference',
       'State Reference', 'Relative Position.Angle.Radial',
       'Relative Position.Distance.Nautical Miles',
       'Altitude.AGL.Single Value', 'Altitude.MSL.Single Value',
       'Flight Conditions', 'Weather Elements / Visibility', 'Light',
       'Ceiling', 'RVR.Single Value', 'ATC / Advisory', 'Aircraft Operator',
       'Make Model Name', 'Crew Size', 'Operating Under FAR Part',
       'Flight Plan', 'Mission', 'Nav In Use', 'Flight Phase', 'Route In Use',
       'Airspace', 'Maintenance Status.Maintenance Deferred',
       'Maintenance Status.Released For Service',
       'Maintenance Status.Required / Correct Doc On Board',
       'Maintenance Status.Maintenance Type',
       'Maintenance Status.Maintenance Items Involved', 'Cabin Lighting',
       'Number Of Seats.Number', 'Passengers On Board.Number',
       'Aircraft Component', 'Manufacturer', 'Aircraft Reference', 'Problem',
       'ATC

In [24]:
# Found another unnamed column in the dataset, check if null
print(df_1.shape[0], '\tNulls:', df_1['Unnamed: 125'].isnull().sum())
print(df_2.shape[0], '\tNulls:', df_2['Unnamed: 125'].isnull().sum())

4735 	Nulls: 4733
4586 	Nulls: 4583


In [25]:
# Since Unnamed column was mostly NULL, decide to drop columns
df_1.drop('Unnamed: 125', axis=1, inplace=True)
df_2.drop('Unnamed: 125', axis=1, inplace=True)

print(df_1.shape, df_2.shape)

(4735, 75) (4586, 83)


In [26]:
# Two datasets have different number of columns, check if they can be dropped
more_cols_2 = list(set(df_2.columns) - set(df_1.columns))
df_2[more_cols_2].isnull().sum()

Location In Aircraft.1                       3828
Crew Size Flight Attendant.Number Of Crew    4584
Communication Breakdown.1                    4285
Were Passengers Involved In Event            4185
Human Factors.1                              3921
Aircraft Zone                                4584
Callback.1                                   4579
When Detected                                1312
Work Environment Factor                      4552
Maintenance Status.Records Complete          4584
Narrative.1                                  3731
Cabin Activity.1                             4585
Communication Breakdown                      3325
Location In Aircraft                         1708
dtype: int64

Since some of these columns seem important by column names, decided to keep in the dataset. Found that there is an additional column named 'Narrative.1' in df_2.

In [27]:
# df_1 has one Narrative column while df_2 has two columns
for cols in df_1.columns:
    if 'Narrative' in cols:
        print(cols)
for cols in df_2.columns:
    if 'Narrative' in cols:
        print(cols)

Narrative
Narrative
Narrative.1


Found that there are instances when event is incursion, excursion, or both.

In [28]:
# Display the number of records belonging to each anomaly type, found that there are instances where event is classified as both incursion and excusion
incur_idx = []
excur_idx = []
both_idx = []
for index, row in df_1.iterrows():
    # Event was classified as both incursion and excursion
    if 'Incursion' in row['Anomaly'] and 'Excursion' in row['Anomaly']:
        both_idx.append(index)
    # Event was classified as incursion only
    elif 'Incursion' in row['Anomaly']:
        incur_idx.append(index)
    # Event was classified as excursion only
    elif 'Excursion' in row['Anomaly']:
        excur_idx.append(index)
    
print(f'Number of incursion events: {len(incur_idx) + len(both_idx)}')
print(f'Number of excursion events: {len(excur_idx) + len(both_idx)}')
print(f'Number of events where it is both incursion and excursion: {len(both_idx)}')

# Confirm results add up to number of rows and all events fall into these three categories
print(f'Total indices counted: {len(incur_idx)+ len(excur_idx) + len(both_idx)}, Rows in df_1: {df_1.shape[0]}')

Number of incursion events: 3988
Number of excursion events: 781
Number of events where it is both incursion and excursion: 34
Total indices counted: 4735, Rows in df_1: 4735


Extract important columns and combine the two Narrative columns from df_2.

In [29]:
# Extract important columns from df_1
df_1_trim = df_1[['ACN', 'Date', 'Local Time Of Day', 'Locale Reference', 'State Reference', 'Anomaly', 'Narrative', 'Synopsis']]

# Combine the two Narrative columns in df_2
df_2_tmp = df_2[['ACN', 'Date', 'Local Time Of Day', 'Locale Reference', 'State Reference', 'Anomaly', 'Narrative', 'Narrative.1', 'Synopsis']]
df_2_tmp['Narrative.1'].fillna('', inplace=True)
# Add in an extra space between Narrative and Narrative.1 when it gets combined
df_2_tmp['Narrative_comb'] = df_2_tmp['Narrative'] + ' ' + df_2_tmp['Narrative.1']
# Extract important columns from df_2_tmp
df_2_trim = df_2_tmp[['ACN', 'Date', 'Local Time Of Day', 'Locale Reference', 'State Reference', 'Anomaly', 'Narrative_comb', 'Synopsis']]
df_2_trim.rename(columns={'Narrative_comb': 'Narrative'}, inplace=True)

# Check data has been correctly extracted
print(df_1_trim.shape, df_1.shape[0], df_2_trim.shape, df_2.shape[0])

(4735, 8) 4735 (4586, 8) 4586


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2_tmp['Narrative.1'].fillna('', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2_tmp['Narrative_comb'] = df_2_tmp['Narrative'] + ' ' + df_2_tmp['Narrative.1']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2_trim.rename(columns={'Narrative_comb': 'Narrative'}, inplace=True)


In [30]:
# Noticed that there are duplicates in the data, so drop duplicates when concatenating the two trimmed dfs such that data from 2000-2005 is combined with data from 2006-2019
df_master = pd.concat([df_1_trim, df_2_trim]).drop_duplicates(subset=['ACN'])

print(df_master.shape, df_1_trim.shape[0]+df_2_trim.shape[0])

(9303, 8) 9321


In [31]:
# Label the rows based on the anomaly type by adding two additional columns into the dataframe
def incursion_check(text):
    """
    Check if event is incursion
    """
    if 'incursion' in text.lower():
        return 1
    else:
        return 0

def excursion_check(text):
    """
    Check if event is excursion
    """
    if 'excursion' in text.lower():
        return 1
    else:
        return 0
    
df_master['Incursion'] = df_master['Anomaly'].apply(incursion_check)
df_master['Excursion'] = df_master['Anomaly'].apply(excursion_check)
df_master.head(5)

Unnamed: 0,ACN,Date,Local Time Of Day,Locale Reference,State Reference,Anomaly,Narrative,Synopsis,Incursion,Excursion
0,459107,200001,0601-1200,DTW.Airport,MI,"Conflict Ground Conflict, Critical; Deviation ...","ON TAXI OUT TO DTW RWY 3L, MISSED TURN ONTO TX...",AN A320 OVERSHOOTS THE TXWY FOR RWY 3L (TXWY M...,1,0
1,459230,200001,0601-1200,MCO.Airport,FL,Deviation / Discrepancy - Procedural FAR; Grou...,TAXIED OUT OF RAMP AREA BEFORE TALKING TO GND....,FLC OF A B737 TAXIED WITHOUT CLRNC DUE TO FOLL...,1,0
2,459389,200001,0601-1200,LFPG.Airport,FO,Ground Excursion Taxiway,"CAPT WAS FLYING, I WAS IN THE R SEAT. AFTER LN...",CAPT OF AN MD11 RAN OFF TXWY DURING TAXI IN RE...,0,1
3,459407,200001,0001-0600,SRB.Airport,TN,Aircraft Equipment Problem Critical; Deviation...,"ILS RWY 4 (SRB), CAPT FLYING. WX: 900 FT BROKE...",FA20 ENCOUNTERS SLICK RWY AT SRB RESULTING IN ...,0,1
4,459425,200001,1201-1800,ABE.Airport,PA,"Conflict Ground Conflict, Critical; Deviation ...",WE WERE CLRED FOR THE VISUAL APCH RWY 31 ABOUT...,A JS31 CREW LNDG ON RWY 31 AT ABE ARE INSTRUCT...,1,0


In [32]:
# Extract incursions
df_incur = df_master[['ACN', 'Narrative']].loc[df_master['Incursion'] == 1]
# Extract excursions
df_excur = df_master[['ACN', 'Narrative']].loc[df_master['Excursion'] == 1]
# Extract both
df_both = pd.merge(df_incur, df_excur, how='inner')

print(df_incur.shape, df_excur.shape, df_both.shape)

(6915, 2) (2466, 2) (78, 2)


## Data Preprocessing with ngrams and Topic Modeling with LDA

In [33]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /Users/vivianlin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vivianlin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/vivianlin/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [34]:
# Data preprocessing functions
def lowercase(text):
    """
    Returns lowercase text
    """
    return text.lower()

def rm_punctuation(text):
    """
    Returns text with punctuations removed
    """
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
    return text

def rm_stopwords(text):
    """
    Returns text without stopwords after tokenizing text
    """
    input = word_tokenize(text)
    return [word for word in input if word not in stopwords.words('english')]

def extract_ngrams(text, num):
    """
    Returns ngrams joined by underscore after tokenizing text
    """
    n_grams = ngrams(word_tokenize(text), num)
    res = []
    for grams in n_grams:
        res.append('_'.join(grams))
    return ' '.join(res)

def preprocess_2gram(text):
    """
    Data preprocessing with bigram
    """
    return extract_ngrams(' '.join(rm_stopwords(rm_punctuation(lowercase(text)))), 2)

def preproc_wstp_2gram(text):
    """
    Data preprocessing with bigram without stopword removal
    """
    return extract_ngrams(' '.join(word_tokenize(rm_punctuation(lowercase(text)))), 2)

def preprocess_3gram(text):
    """
    Data preprocessing with trigram
    """
    return extract_ngrams(' '.join(rm_stopwords(rm_punctuation(lowercase(text)))), 3)

def preprocess_4gram(text):
    """
    Data preprocessing with 4-gram
    """
    return extract_ngrams(' '.join(rm_stopwords(rm_punctuation(lowercase(text)))), 4)

def preprocess_5gram(text):
    """
    Data preprocessing with 5-gram
    """
    return extract_ngrams(' '.join(rm_stopwords(rm_punctuation(lowercase(text)))), 5)

In [35]:
# Running LDA model
def optimize_lda(dataset, col_name):
    """
    Vectorize and run LDA with GridSearch to find optimal number of topics
    """
    # Vectorize
    count_vect = CountVectorizer(max_df = 0.8, min_df = 2, stop_words = 'english')
    doc_term_matrix = count_vect.fit_transform(dataset[col_name].values.astype('U'))

    # Initize model
    lda = LDA(random_state = 0)

    # Using GridSearch to optimize LDA
    num_topics = {'n_components': [x for x in range(2, 11)]}
    model = GridSearchCV(lda, param_grid = num_topics)

    # Do the Grid Search
    model.fit(doc_term_matrix)

    # Best Model
    best_lda_model = model.best_estimator_

    # Model Parameters
    print("Best Model's Params: ", model.best_params_)

    # Log Likelihood Score
    print("Best Log Likelihood Score: ", model.best_score_)

    # Perplexity
    print("Model Perplexity: ", best_lda_model.perplexity(doc_term_matrix))
    
    return count_vect, doc_term_matrix, best_lda_model

def label_dom_topics(dataset, model, matrix, gram_name):
    """
    Label each record in the dataset based on topic it is most closely associated with
    """
    topics = model.transform(matrix)
    # Adding topic tags to our dataframe
    labels = []
    for score in topics:
        labels.append(np.where(score == max(score))[0][0])
    dataset[f'Topic_{gram_name}'] = labels
    return dataset

def top_keywords(vector, model, n):
    """
    Extract the top n keywords (ngrams) from each topic
    """
    res = {}
    for i,topic in enumerate(model.components_):
        res[i] = [vector.get_feature_names_out()[i] for i in topic.argsort()[-n:]]
    return res

### Topic Modeling for Incursion

#### 1a. Running bigram with LDA

In [36]:
# Run preprocess on incursion
df_incur['Narrative_bigram'] = df_incur['Narrative'].apply(preprocess_2gram)
df_incur.head(5)

Unnamed: 0,ACN,Narrative,Narrative_bigram
0,459107,"ON TAXI OUT TO DTW RWY 3L, MISSED TURN ONTO TX...",taxi_dtw dtw_rwy rwy_3l 3l_missed missed_turn ...
1,459230,TAXIED OUT OF RAMP AREA BEFORE TALKING TO GND....,taxied_ramp ramp_area area_talking talking_gnd...
4,459425,WE WERE CLRED FOR THE VISUAL APCH RWY 31 ABOUT...,clred_visual visual_apch apch_rwy rwy_31 31_7 ...
6,459561,PUSHBACK INTO UNKNOWN MOVEMENT AREA AT RSW. CO...,pushback_unknown unknown_movement movement_are...
7,459588,"ON JAN/XA/00 AT SAN MARCOS, TX, I WAS ENGAGED ...",janxa00_san san_marcos marcos_tx tx_engaged en...


In [37]:
# Incursion
count_vec, incur_vec, incur_model = optimize_lda(df_incur, 'Narrative_bigram')

# Label based on topics
df_incur = label_dom_topics(df_incur, incur_model, incur_vec, 'bigram')
print(df_incur.shape, df_incur.columns)

Best Model's Params:  {'n_components': 2}
Best Log Likelihood Score:  -1566737.9589938603
Model Perplexity:  31608.925532990164
(6915, 4) Index(['ACN', 'Narrative', 'Narrative_bigram', 'Topic_bigram'], dtype='object')


In [40]:
# Number of records in each topic
df_incur['Topic_bigram'].value_counts()

0    4528
1    2387
Name: Topic_bigram, dtype: int64

In [None]:
# Get top 10 keywords from each topic
incur_top10_2g = top_keywords(count_vec, incur_model, 10)
# Get top 20 keywords from each topic
incur_top20_2g = top_keywords(count_vec, incur_model, 20)

In [None]:
print(f"Top 10 keywords for Topic #0: {incur_top10_2g[0]}")
print(f"Next 10 keywords for Topic #0: {list(set(incur_top20_2g[0]) - set(incur_top10_2g[0]))}")
print(f"Top 10 keywords for Topic #1: {incur_top10_2g[1]}")
print(f"Next 10 keywords for Topic #1: {list(set(incur_top20_2g[1]) - set(incur_top10_2g[1]))}")

Top 10 keywords for Topic #0: ['pos_hold', 'gnd_ctlr', 'end_rwy', 'onto_rwy', 'cross_rwy', 'taxi_rwy', 'gnd_ctl', 'short_line', 'short_rwy', 'hold_short']
Next 10 keywords for Topic #0: ['active_rwy', 'clred_us', 'hold_line', 'onto_txwy', 'read_back', 'told_us', 'clred_tkof', 'txwy_b', 'txwy_c', 'taxi_instructions']
Top 10 keywords for Topic #1: ['end_runway', 'taxi_instructions', 'read_back', 'cross_runway', 'first_officer', 'short_runway', 'short_line', 'ground_control', 'aircraft_x', 'hold_short']
Next 10 keywords for Topic #1: ['cleared_takeoff', 'takeoff_clearance', 'local_control', 'taxi_runway', 'holding_short', 'told_us', 'onto_runway', 'ground_controller', 'go_around', 'clear_runway']


#### 1b. Running bigram without stopword removal with LDA

In [None]:
# Run preprocess on incursion
df_incur['Narrative_bigram_wstp'] = df_incur['Narrative'].apply(preproc_wstp_2gram)
df_incur.head(5)

Unnamed: 0,ACN,Narrative,Narrative_bigram,Topic_bigram,Narrative_bigram_wstp
0,459107,"ON TAXI OUT TO DTW RWY 3L, MISSED TURN ONTO TX...",taxi_dtw dtw_rwy rwy_3l 3l_missed missed_turn ...,0,on_taxi taxi_out out_to to_dtw dtw_rwy rwy_3l ...
1,459230,TAXIED OUT OF RAMP AREA BEFORE TALKING TO GND....,taxied_ramp ramp_area area_talking talking_gnd...,0,taxied_out out_of of_ramp ramp_area area_befor...
4,459425,WE WERE CLRED FOR THE VISUAL APCH RWY 31 ABOUT...,clred_visual visual_apch apch_rwy rwy_31 31_7 ...,0,we_were were_clred clred_for for_the the_visua...
6,459561,PUSHBACK INTO UNKNOWN MOVEMENT AREA AT RSW. CO...,pushback_unknown unknown_movement movement_are...,0,pushback_into into_unknown unknown_movement mo...
7,459588,"ON JAN/XA/00 AT SAN MARCOS, TX, I WAS ENGAGED ...",janxa00_san san_marcos marcos_tx tx_engaged en...,0,on_janxa00 janxa00_at at_san san_marcos marcos...


In [None]:
# Incursion
count_vec, incur_vec, incur_model = optimize_lda(df_incur, 'Narrative_bigram_wstp')

# Label based on topics
df_incur = label_dom_topics(df_incur, incur_model, incur_vec, 'bigram_wstp')
print(df_incur.shape, df_incur.columns)

Best Model's Params:  {'n_components': 2}
Best Log Likelihood Score:  -3608736.6425678497
Model Perplexity:  18628.51908834689
(6915, 6) Index(['ACN', 'Narrative', 'Narrative_bigram', 'Topic_bigram',
       'Narrative_bigram_wstp', 'Topic_bigram_wstp'],
      dtype='object')


In [None]:
# Get top 10 keywords from each topic
incur_top10_2g_wstp = top_keywords(count_vec, incur_model, 10)
# Get top 20 keywords from each topic
incur_top20_2g_wstp = top_keywords(count_vec, incur_model, 20)

In [None]:
print(f"Top 10 keywords for Topic #0: {incur_top10_2g_wstp[0]}")
print(f"Next 10 keywords for Topic #0: {list(set(incur_top20_2g_wstp[0]) - set(incur_top10_2g_wstp[0]))}")
print(f"Top 10 keywords for Topic #1: {incur_top10_2g_wstp[1]}")
print(f"Next 10 keywords for Topic #1: {list(set(incur_top20_2g_wstp[1]) - set(incur_top10_2g_wstp[1]))}")

Top 10 keywords for Topic #0: ['the_twr', 'of_rwy', 'to_rwy', 'on_the', 'to_the', 'of_the', 'i_was', 'we_were', 'hold_short', 'the_rwy']
Next 10 keywords for Topic #0: ['the_acft', 'on_txwy', 'the_hold', 'on_rwy', 'in_the', 'at_the', 'did_not', 'to_taxi', 'short_of', 'i_had']
Top 10 keywords for Topic #1: ['aircraft_x', 'in_the', 'the_aircraft', 'hold_short', 'i_was', 'we_were', 'to_the', 'on_the', 'of_the', 'the_runway']
Next 10 keywords for Topic #1: ['the_hold', 'at_the', 'did_not', 'to_taxi', 'short_of', 'it_was', 'and_i', 'i_had', 'the_tower', 'and_the']


#### 2. Running trigram with LDA

In [41]:
# Run preprocess on incursion
df_incur['Narrative_trigram'] = df_incur['Narrative'].apply(preprocess_3gram)
df_incur.head(5)

Unnamed: 0,ACN,Narrative,Narrative_bigram,Topic_bigram,Narrative_trigram
0,459107,"ON TAXI OUT TO DTW RWY 3L, MISSED TURN ONTO TX...",taxi_dtw dtw_rwy rwy_3l 3l_missed missed_turn ...,0,taxi_dtw_rwy dtw_rwy_3l rwy_3l_missed 3l_misse...
1,459230,TAXIED OUT OF RAMP AREA BEFORE TALKING TO GND....,taxied_ramp ramp_area area_talking talking_gnd...,0,taxied_ramp_area ramp_area_talking area_talkin...
4,459425,WE WERE CLRED FOR THE VISUAL APCH RWY 31 ABOUT...,clred_visual visual_apch apch_rwy rwy_31 31_7 ...,0,clred_visual_apch visual_apch_rwy apch_rwy_31 ...
6,459561,PUSHBACK INTO UNKNOWN MOVEMENT AREA AT RSW. CO...,pushback_unknown unknown_movement movement_are...,0,pushback_unknown_movement unknown_movement_are...
7,459588,"ON JAN/XA/00 AT SAN MARCOS, TX, I WAS ENGAGED ...",janxa00_san san_marcos marcos_tx tx_engaged en...,0,janxa00_san_marcos san_marcos_tx marcos_tx_eng...


In [42]:
# Incursion
count_vec, incur_vec, incur_model = optimize_lda(df_incur, 'Narrative_trigram')
df_incur = label_dom_topics(df_incur, incur_model, incur_vec, 'trigram')
print(df_incur.shape, df_incur.columns)

Best Model's Params:  {'n_components': 2}
Best Log Likelihood Score:  -549787.5652144667
Model Perplexity:  37707.13980892424
(6915, 6) Index(['ACN', 'Narrative', 'Narrative_bigram', 'Topic_bigram',
       'Narrative_trigram', 'Topic_trigram'],
      dtype='object')


In [43]:
# Number of records in each topic
df_incur['Topic_trigram'].value_counts()

0    3486
1    3429
Name: Topic_trigram, dtype: int64

In [None]:
# Get top 10 keywords from each topic
incur_top10_3g = top_keywords(count_vec, incur_model, 10)
# Get top 20 keywords from each topic
incur_top20_3g = top_keywords(count_vec, incur_model, 20)

In [None]:
print(f"Top 10 keywords for Topic #0: {incur_top10_3g[0]}")
print(f"Next 10 keywords for Topic #0: {list(set(incur_top20_3g[0]) - set(incur_top10_3g[0]))}")
print(f"Top 10 keywords for Topic #1: {incur_top10_3g[1]}")
print(f"Next 10 keywords for Topic #1: {list(set(incur_top20_3g[1]) - set(incur_top10_3g[1]))}")

Top 10 keywords for Topic #0: ['past_hold_short', 'short_line_rwy', 'rptr_revealed_following', 'conversation_rptr_revealed', 'revealed_following_info', 'crossed_hold_short', 'hold_short_lines', 'hold_short_runway', 'hold_short_rwy', 'hold_short_line']
Next 10 keywords for Topic #0: ['instructed_hold_short', 'callback_conversation_rptr', 'short_line_runway', 'supplemental_info_acn', 'hold_short_instructions', 'told_hold_short', 'contained_additional_information', 'holding_short_runway', 'us_hold_short', 'following_info_rptr']
Top 10 keywords for Topic #1: ['turn_onto_txwy', 'clred_land_rwy', 'hold_short_txwy', 'apch_end_rwy', 'holding_short_rwy', 'pos_hold_rwy', 'clred_cross_rwy', 'hold_short_line', 'supplemental_info_acn', 'hold_short_rwy']
Next 10 keywords for Topic #1: ['callback_conversation_rptr', 'revealed_following_info', 'conversation_rptr_revealed', 'told_hold_short', 'twr_told_us', 'rptr_revealed_following', 'twr_clred_us', 'us_hold_short', '180_deg_turn', 'clred_taxi_rwy']


#### 3. Running 4gram with LDA

In [44]:
# Run preprocess on incursion
df_incur['Narrative_4gram'] = df_incur['Narrative'].apply(preprocess_4gram)
df_incur.head(5)

Unnamed: 0,ACN,Narrative,Narrative_bigram,Topic_bigram,Narrative_trigram,Topic_trigram,Narrative_4gram
0,459107,"ON TAXI OUT TO DTW RWY 3L, MISSED TURN ONTO TX...",taxi_dtw dtw_rwy rwy_3l 3l_missed missed_turn ...,0,taxi_dtw_rwy dtw_rwy_3l rwy_3l_missed 3l_misse...,1,taxi_dtw_rwy_3l dtw_rwy_3l_missed rwy_3l_misse...
1,459230,TAXIED OUT OF RAMP AREA BEFORE TALKING TO GND....,taxied_ramp ramp_area area_talking talking_gnd...,0,taxied_ramp_area ramp_area_talking area_talkin...,1,taxied_ramp_area_talking ramp_area_talking_gnd...
4,459425,WE WERE CLRED FOR THE VISUAL APCH RWY 31 ABOUT...,clred_visual visual_apch apch_rwy rwy_31 31_7 ...,0,clred_visual_apch visual_apch_rwy apch_rwy_31 ...,1,clred_visual_apch_rwy visual_apch_rwy_31 apch_...
6,459561,PUSHBACK INTO UNKNOWN MOVEMENT AREA AT RSW. CO...,pushback_unknown unknown_movement movement_are...,0,pushback_unknown_movement unknown_movement_are...,0,pushback_unknown_movement_area unknown_movemen...
7,459588,"ON JAN/XA/00 AT SAN MARCOS, TX, I WAS ENGAGED ...",janxa00_san san_marcos marcos_tx tx_engaged en...,0,janxa00_san_marcos san_marcos_tx marcos_tx_eng...,1,janxa00_san_marcos_tx san_marcos_tx_engaged ma...


In [45]:
# Incursion
count_vec, incur_vec, incur_model = optimize_lda(df_incur, 'Narrative_4gram')
df_incur = label_dom_topics(df_incur, incur_model, incur_vec, '4gram')
print(df_incur.shape, df_incur.columns)

Best Model's Params:  {'n_components': 2}
Best Log Likelihood Score:  -143733.08466663957
Model Perplexity:  16475.939353242462
(6915, 8) Index(['ACN', 'Narrative', 'Narrative_bigram', 'Topic_bigram',
       'Narrative_trigram', 'Topic_trigram', 'Narrative_4gram', 'Topic_4gram'],
      dtype='object')


In [46]:
df_incur['Topic_4gram'].value_counts()

0    3605
1    3310
Name: Topic_4gram, dtype: int64

In [None]:
# Get top 10 keywords from each topic
incur_top10_4g = top_keywords(count_vec, incur_model, 10)
# Get top 20 keywords from each topic
incur_top20_4g = top_keywords(count_vec, incur_model, 20)

In [None]:
print(f"Top 10 keywords for Topic #0: {incur_top10_4g[0]}")
print(f"Next 10 keywords for Topic #0: {list(set(incur_top20_4g[0]) - set(incur_top10_4g[0]))}")
print(f"Top 10 keywords for Topic #1: {incur_top10_4g[1]}")
print(f"Next 10 keywords for Topic #1: {list(set(incur_top20_4g[1]) - set(incur_top10_4g[1]))}")

Top 10 keywords for Topic #0: ['see_hold_short_line', 'across_hold_short_line', 'taxi_hold_short_rwy', 'hold_short_rwy_25r', 'hold_short_line_runway', 'report_narrative_contained_additional', 'hold_short_line_rwy', 'narrative_contained_additional_information', 'past_hold_short_line', 'crossed_hold_short_line']
Next 10 keywords for Topic #0: ['passed_hold_short_line', 'txwy_hold_short_rwy', 'make_180_deg_turn', 'rwy_hold_short_line', 'xing_hold_short_line', 'cleared_aircraft_x_takeoff', 'gnd_ctl_clred_us', 'clred_us_taxi_rwy', 'taxi_pos_hold_rwy', 'read_back_hold_short']
Top 10 keywords for Topic #1: ['us_hold_short_rwy', 'told_hold_short_rwy', 'following_info_rptr_stated', 'told_us_hold_short', 'crossed_hold_short_line', 'hold_short_line_rwy', 'revealed_following_info_rptr', 'callback_conversation_rptr_revealed', 'conversation_rptr_revealed_following', 'rptr_revealed_following_info']
Next 10 keywords for Topic #1: ['past_hold_short_line', 'hold_short_rwy_4l', 'twr_clred_us_tkof', 'txwy

#### 4. Run 5gram with LDA

In [47]:
# Run preprocess on incursion
df_incur['Narrative_5gram'] = df_incur['Narrative'].apply(preprocess_5gram)
df_incur.head(5)

Unnamed: 0,ACN,Narrative,Narrative_bigram,Topic_bigram,Narrative_trigram,Topic_trigram,Narrative_4gram,Topic_4gram,Narrative_5gram
0,459107,"ON TAXI OUT TO DTW RWY 3L, MISSED TURN ONTO TX...",taxi_dtw dtw_rwy rwy_3l 3l_missed missed_turn ...,0,taxi_dtw_rwy dtw_rwy_3l rwy_3l_missed 3l_misse...,1,taxi_dtw_rwy_3l dtw_rwy_3l_missed rwy_3l_misse...,0,taxi_dtw_rwy_3l_missed dtw_rwy_3l_missed_turn ...
1,459230,TAXIED OUT OF RAMP AREA BEFORE TALKING TO GND....,taxied_ramp ramp_area area_talking talking_gnd...,0,taxied_ramp_area ramp_area_talking area_talkin...,1,taxied_ramp_area_talking ramp_area_talking_gnd...,0,taxied_ramp_area_talking_gnd ramp_area_talking...
4,459425,WE WERE CLRED FOR THE VISUAL APCH RWY 31 ABOUT...,clred_visual visual_apch apch_rwy rwy_31 31_7 ...,0,clred_visual_apch visual_apch_rwy apch_rwy_31 ...,1,clred_visual_apch_rwy visual_apch_rwy_31 apch_...,1,clred_visual_apch_rwy_31 visual_apch_rwy_31_7 ...
6,459561,PUSHBACK INTO UNKNOWN MOVEMENT AREA AT RSW. CO...,pushback_unknown unknown_movement movement_are...,0,pushback_unknown_movement unknown_movement_are...,0,pushback_unknown_movement_area unknown_movemen...,0,pushback_unknown_movement_area_rsw unknown_mov...
7,459588,"ON JAN/XA/00 AT SAN MARCOS, TX, I WAS ENGAGED ...",janxa00_san san_marcos marcos_tx tx_engaged en...,0,janxa00_san_marcos san_marcos_tx marcos_tx_eng...,1,janxa00_san_marcos_tx san_marcos_tx_engaged ma...,1,janxa00_san_marcos_tx_engaged san_marcos_tx_en...


In [48]:
# Incursion
count_vec, incur_vec, incur_model = optimize_lda(df_incur, 'Narrative_5gram')
df_incur = label_dom_topics(df_incur, incur_model, incur_vec, '5gram')
print(df_incur.shape, df_incur.columns)

Best Model's Params:  {'n_components': 2}
Best Log Likelihood Score:  -37510.28752852576
Model Perplexity:  5716.401285670505
(6915, 10) Index(['ACN', 'Narrative', 'Narrative_bigram', 'Topic_bigram',
       'Narrative_trigram', 'Topic_trigram', 'Narrative_4gram', 'Topic_4gram',
       'Narrative_5gram', 'Topic_5gram'],
      dtype='object')


In [49]:
df_incur['Topic_5gram'].value_counts()

0    4801
1    2114
Name: Topic_5gram, dtype: int64

In [None]:
# Get top 10 keywords from each topic
incur_top10_5g = top_keywords(count_vec, incur_model, 10)
# Get top 20 keywords from each topic
incur_top20_5g = top_keywords(count_vec, incur_model, 20)

In [None]:
print(f"Top 10 keywords for Topic #0: {incur_top10_5g[0]}")
print(f"Next 10 keywords for Topic #0: {list(set(incur_top20_5g[0]) - set(incur_top10_5g[0]))}")
print(f"Top 10 keywords for Topic #1: {incur_top10_4g[1]}")
print(f"Next 10 keywords for Topic #1: {list(set(incur_top20_5g[1]) - set(incur_top10_5g[1]))}")

Top 10 keywords for Topic #0: ['txwy_b_hold_short_rwy', 'crossed_hold_short_line_rwy', 'txwy_c_hold_short_rwy', 'past_hold_short_line_rwy', 'txwy_k_hold_short_rwy', 'read_back_hold_short_instructions', 'txwy_f_hold_short_rwy', 'gnd_ctl_clred_us_taxi', 'taxied_past_hold_short_line', 'twr_clred_us_cross_rwy']
Next 10 keywords for Topic #0: ['rwy_12l_hold_short_rwy', 'cross_rwy_19l_hold_short', '180_deg_turn_hold_short', 'gave_us_phone_number_call', 'rwy_19l_hold_short_rwy', 'txwy_p_hold_short_rwy', 'rwy_24_hold_short_rwy', 'crossed_hold_short_line_runway', '19l_hold_short_rwy_19r', 'txwy_b_hold_short_txwy']
Top 10 keywords for Topic #1: ['us_hold_short_rwy', 'told_hold_short_rwy', 'following_info_rptr_stated', 'told_us_hold_short', 'crossed_hold_short_line', 'hold_short_line_rwy', 'revealed_following_info_rptr', 'callback_conversation_rptr_revealed', 'conversation_rptr_revealed_following', 'rptr_revealed_following_info']
Next 10 keywords for Topic #1: ['clred_us_pos_hold_rwy', 'twr_told_

### Topic Modeling for Excursion
Repeat all steps in incursion for excursion

#### 1. Running bigram with LDA

In [50]:
# Run preprocess on excursion
df_excur['Narrative_bigram'] = df_excur['Narrative'].apply(preprocess_2gram)
df_excur.head(5)

Unnamed: 0,ACN,Narrative,Narrative_bigram
2,459389,"CAPT WAS FLYING, I WAS IN THE R SEAT. AFTER LN...",capt_flying flying_r r_seat seat_lndg lndg_cdg...
3,459407,"ILS RWY 4 (SRB), CAPT FLYING. WX: 900 FT BROKE...",ils_rwy rwy_4 4_srb srb_capt capt_flying flyin...
5,459476,WE HAD JUST LANDED AND CLRED RWY 12R AT MSP. I...,landed_clred clred_rwy rwy_12r 12r_msp msp_con...
8,459603,LNDG INCIDENT IN TAILDRAGGER TYPE ACFT (1966 C...,lndg_incident incident_taildragger taildragger...
10,459623,"AFTER DARK ON JAN/XA/00, WE TAXIED OUR LEAR 31...",dark_janxa00 janxa00_taxied taxied_lear lear_3...


In [51]:
# Excursion
count_vec, excur_vec, excur_model = optimize_lda(df_excur, 'Narrative_bigram')

# Label based on topics
df_excur = label_dom_topics(df_excur, excur_model, excur_vec, 'bigram')
print(df_excur.shape, df_excur.columns)

Best Model's Params:  {'n_components': 2}
Best Log Likelihood Score:  -442539.87978962547
Model Perplexity:  22831.71658704897
(2466, 4) Index(['ACN', 'Narrative', 'Narrative_bigram', 'Topic_bigram'], dtype='object')


In [52]:
df_excur['Topic_bigram'].value_counts()

0    1342
1    1124
Name: Topic_bigram, dtype: int64

In [None]:
# Get top 10 keywords from each topic
excur_top10_2g = top_keywords(count_vec, excur_model, 10)
# Get top 20 keywords from each topic
excur_top20_2g = top_keywords(count_vec, excur_model, 20)

In [None]:
print(f"Top 10 keywords for Topic #0: {excur_top10_2g[0]}")
print(f"Next 10 keywords for Topic #0: {list(set(excur_top20_2g[0]) - set(excur_top10_2g[0]))}")
print(f"Top 10 keywords for Topic #1: {excur_top10_2g[1]}")
print(f"Next 10 keywords for Topic #1: {list(set(excur_top20_2g[1]) - set(excur_top10_2g[1]))}")

Top 10 keywords for Topic #0: ['directional_control', 'first_officer', 'left_side', 'right_main', 'main_gear', 'braking_action', 'right_rudder', 'nose_wheel', 'side_runway', 'end_runway']
Next 10 keywords for Topic #0: ['control_aircraft', 'nose_gear', 'right_side', 'left_rudder', 'back_onto', 'landing_gear', 'left_main', 'came_stop', 'damage_aircraft', 'go_around']
Top 10 keywords for Topic #1: ['l_main', 'lndg_rwy', 'nose_gear', 'damage_acft', 'r_main', 'braking_action', 'main_gear', 'lndg_gear', 'side_rwy', 'end_rwy']
Next 10 keywords for Topic #1: ['conversation_rptr', 'ft_rwy', 'revealed_following', 'supplemental_info', 'r_rudder', 'info_acn', 'l_side', 'r_side', 'rptr_revealed', 'following_info']


#### 2. Running trigram with LDA

In [53]:
# Run preprocess on excursion
df_excur['Narrative_trigram'] = df_excur['Narrative'].apply(preprocess_3gram)
df_excur.head(5)

Unnamed: 0,ACN,Narrative,Narrative_bigram,Topic_bigram,Narrative_trigram
2,459389,"CAPT WAS FLYING, I WAS IN THE R SEAT. AFTER LN...",capt_flying flying_r r_seat seat_lndg lndg_cdg...,1,capt_flying_r flying_r_seat r_seat_lndg seat_l...
3,459407,"ILS RWY 4 (SRB), CAPT FLYING. WX: 900 FT BROKE...",ils_rwy rwy_4 4_srb srb_capt capt_flying flyin...,1,ils_rwy_4 rwy_4_srb 4_srb_capt srb_capt_flying...
5,459476,WE HAD JUST LANDED AND CLRED RWY 12R AT MSP. I...,landed_clred clred_rwy rwy_12r 12r_msp msp_con...,1,landed_clred_rwy clred_rwy_12r rwy_12r_msp 12r...
8,459603,LNDG INCIDENT IN TAILDRAGGER TYPE ACFT (1966 C...,lndg_incident incident_taildragger taildragger...,1,lndg_incident_taildragger incident_taildragger...
10,459623,"AFTER DARK ON JAN/XA/00, WE TAXIED OUR LEAR 31...",dark_janxa00 janxa00_taxied taxied_lear lear_3...,1,dark_janxa00_taxied janxa00_taxied_lear taxied...


In [54]:
# Excursion
count_vec, excur_vec, excur_model = optimize_lda(df_excur, 'Narrative_trigram')

# Label based on topics
df_excur = label_dom_topics(df_excur, excur_model, excur_vec, 'trigram')
print(df_excur.shape, df_excur.columns)

Best Model's Params:  {'n_components': 2}
Best Log Likelihood Score:  -98178.96360423027
Model Perplexity:  15081.755529159636
(2466, 6) Index(['ACN', 'Narrative', 'Narrative_bigram', 'Topic_bigram',
       'Narrative_trigram', 'Topic_trigram'],
      dtype='object')


In [55]:
df_excur['Topic_trigram'].value_counts()

0    1250
1    1216
Name: Topic_trigram, dtype: int64

In [None]:
# Get top 10 keywords from each topic
excur_top10_3g = top_keywords(count_vec, excur_model, 10)
# Get top 20 keywords from each topic
excur_top20_3g = top_keywords(count_vec, excur_model, 20)

In [None]:
print(f"Top 10 keywords for Topic #0: {excur_top10_3g[0]}")
print(f"Next 10 keywords for Topic #0: {list(set(excur_top20_3g[0]) - set(excur_top10_3g[0]))}")
print(f"Top 10 keywords for Topic #1: {excur_top10_3g[1]}")
print(f"Next 10 keywords for Topic #1: {list(set(excur_top20_3g[1]) - set(excur_top10_3g[1]))}")

Top 10 keywords for Topic #0: ['main_lndg_gear', 'nose_wheel_steering', 'main_landing_gear', 'braking_action_good', '180_deg_turn', 'supplemental_info_acn', 'right_side_runway', 'left_main_gear', 'back_onto_runway', 'left_side_runway']
Next 10 keywords for Topic #0: ['narrative_contained_additional', 'right_main_gear', 'approach_end_runway', 'report_narrative_contained', 'aircraft_came_rest', 'full_right_rudder', 'came_complete_stop', 'r_side_rwy', 'contained_additional_information', 'hold_short_line']
Top 10 keywords for Topic #1: ['left_side_runway', 'info_rptr_stated', 'right_main_gear', 'supplemental_info_acn', 'l_side_rwy', 'following_info_rptr', 'callback_conversation_rptr', 'conversation_rptr_revealed', 'rptr_revealed_following', 'revealed_following_info']
Next 10 keywords for Topic #1: ['main_lndg_gear', 'acft_came_stop', 'right_side_runway', 'l_main_gear', 'hold_short_line', 'r_side_rwy', 'r_main_gear', 'full_r_rudder', 'aircraft_came_stop', 'rwy_edge_light']


#### 3. Running 4gram with LDA

In [56]:
# Run preprocess on excursion
df_excur['Narrative_4gram'] = df_excur['Narrative'].apply(preprocess_4gram)
df_excur.head(5)

Unnamed: 0,ACN,Narrative,Narrative_bigram,Topic_bigram,Narrative_trigram,Topic_trigram,Narrative_4gram
2,459389,"CAPT WAS FLYING, I WAS IN THE R SEAT. AFTER LN...",capt_flying flying_r r_seat seat_lndg lndg_cdg...,1,capt_flying_r flying_r_seat r_seat_lndg seat_l...,0,capt_flying_r_seat flying_r_seat_lndg r_seat_l...
3,459407,"ILS RWY 4 (SRB), CAPT FLYING. WX: 900 FT BROKE...",ils_rwy rwy_4 4_srb srb_capt capt_flying flyin...,1,ils_rwy_4 rwy_4_srb 4_srb_capt srb_capt_flying...,0,ils_rwy_4_srb rwy_4_srb_capt 4_srb_capt_flying...
5,459476,WE HAD JUST LANDED AND CLRED RWY 12R AT MSP. I...,landed_clred clred_rwy rwy_12r 12r_msp msp_con...,1,landed_clred_rwy clred_rwy_12r rwy_12r_msp 12r...,1,landed_clred_rwy_12r clred_rwy_12r_msp rwy_12r...
8,459603,LNDG INCIDENT IN TAILDRAGGER TYPE ACFT (1966 C...,lndg_incident incident_taildragger taildragger...,1,lndg_incident_taildragger incident_taildragger...,0,lndg_incident_taildragger_type incident_taildr...
10,459623,"AFTER DARK ON JAN/XA/00, WE TAXIED OUR LEAR 31...",dark_janxa00 janxa00_taxied taxied_lear lear_3...,1,dark_janxa00_taxied janxa00_taxied_lear taxied...,0,dark_janxa00_taxied_lear janxa00_taxied_lear_3...


In [57]:
# Excursion
count_vec, excur_vec, excur_model = optimize_lda(df_excur, 'Narrative_4gram')

# Label based on topics
df_excur = label_dom_topics(df_excur, excur_model, excur_vec, '4gram')
print(df_excur.shape, df_excur.columns)

Best Model's Params:  {'n_components': 2}
Best Log Likelihood Score:  -15900.766770105587
Model Perplexity:  3383.0160690311027
(2466, 8) Index(['ACN', 'Narrative', 'Narrative_bigram', 'Topic_bigram',
       'Narrative_trigram', 'Topic_trigram', 'Narrative_4gram', 'Topic_4gram'],
      dtype='object')


In [58]:
df_excur['Topic_4gram'].value_counts()

0    1491
1     975
Name: Topic_4gram, dtype: int64

In [None]:
# Get top 10 keywords from each topic
excur_top10_4g = top_keywords(count_vec, excur_model, 10)
# Get top 20 keywords from each topic
excur_top20_4g = top_keywords(count_vec, excur_model, 20)

In [None]:
print(f"Top 10 keywords for Topic #0: {excur_top10_4g[0]}")
print(f"Next 10 keywords for Topic #0: {list(set(excur_top20_4g[0]) - set(excur_top10_4g[0]))}")
print(f"Top 10 keywords for Topic #1: {excur_top10_4g[1]}")
print(f"Next 10 keywords for Topic #1: {list(set(excur_top20_4g[1]) - set(excur_top10_4g[1]))}")

Top 10 keywords for Topic #0: ['full_power_go_around', 'bring_aircraft_complete_stop', 'departed_r_side_rwy', 'exited_right_side_runway', 'applied_full_left_rudder', 'kts_gusting_20_kts', 'left_main_landing_gear', 'aircraft_back_onto_runway', 'full_stop_taxi_back', 'right_main_landing_gear']
Next 10 keywords for Topic #0: ['acft_departed_r_side', 'degs_20_kts_gusting', 'aircraft_departed_runway_left', 'maintain_directional_control_aircraft', 'taxi_back_onto_runway', 'aircraft_exited_right_side', 'applied_full_l_rudder', 'twr_clred_us_tkof', 'left_full_right_rudder', 'right_rudder_right_brake']
Top 10 keywords for Topic #1: ['l_main_lndg_gear', 'r_main_lndg_gear', 'revealed_following_info_reporter', 'report_narrative_contained_additional', 'narrative_contained_additional_information', 'following_info_rptr_stated', 'revealed_following_info_rptr', 'callback_conversation_rptr_revealed', 'rptr_revealed_following_info', 'conversation_rptr_revealed_following']
Next 10 keywords for Topic #1: [

#### Run 5gram with LDA

In [59]:
# Run preprocess on excursion
df_excur['Narrative_5gram'] = df_excur['Narrative'].apply(preprocess_5gram)
df_excur.head(5)

Unnamed: 0,ACN,Narrative,Narrative_bigram,Topic_bigram,Narrative_trigram,Topic_trigram,Narrative_4gram,Topic_4gram,Narrative_5gram
2,459389,"CAPT WAS FLYING, I WAS IN THE R SEAT. AFTER LN...",capt_flying flying_r r_seat seat_lndg lndg_cdg...,1,capt_flying_r flying_r_seat r_seat_lndg seat_l...,0,capt_flying_r_seat flying_r_seat_lndg r_seat_l...,0,capt_flying_r_seat_lndg flying_r_seat_lndg_cdg...
3,459407,"ILS RWY 4 (SRB), CAPT FLYING. WX: 900 FT BROKE...",ils_rwy rwy_4 4_srb srb_capt capt_flying flyin...,1,ils_rwy_4 rwy_4_srb 4_srb_capt srb_capt_flying...,0,ils_rwy_4_srb rwy_4_srb_capt 4_srb_capt_flying...,0,ils_rwy_4_srb_capt rwy_4_srb_capt_flying 4_srb...
5,459476,WE HAD JUST LANDED AND CLRED RWY 12R AT MSP. I...,landed_clred clred_rwy rwy_12r 12r_msp msp_con...,1,landed_clred_rwy clred_rwy_12r rwy_12r_msp 12r...,1,landed_clred_rwy_12r clred_rwy_12r_msp rwy_12r...,1,landed_clred_rwy_12r_msp clred_rwy_12r_msp_con...
8,459603,LNDG INCIDENT IN TAILDRAGGER TYPE ACFT (1966 C...,lndg_incident incident_taildragger taildragger...,1,lndg_incident_taildragger incident_taildragger...,0,lndg_incident_taildragger_type incident_taildr...,1,lndg_incident_taildragger_type_acft incident_t...
10,459623,"AFTER DARK ON JAN/XA/00, WE TAXIED OUR LEAR 31...",dark_janxa00 janxa00_taxied taxied_lear lear_3...,1,dark_janxa00_taxied janxa00_taxied_lear taxied...,0,dark_janxa00_taxied_lear janxa00_taxied_lear_3...,0,dark_janxa00_taxied_lear_31 janxa00_taxied_lea...


In [60]:
# Excursion
count_vec, excur_vec, excur_model = optimize_lda(df_excur, 'Narrative_5gram')

# Label based on topics
df_excur = label_dom_topics(df_excur, excur_model, excur_vec, '5gram')
print(df_excur.shape, df_excur.columns)

Best Model's Params:  {'n_components': 2}
Best Log Likelihood Score:  -3359.4695406483706
Model Perplexity:  554.2617198158927
(2466, 10) Index(['ACN', 'Narrative', 'Narrative_bigram', 'Topic_bigram',
       'Narrative_trigram', 'Topic_trigram', 'Narrative_4gram', 'Topic_4gram',
       'Narrative_5gram', 'Topic_5gram'],
      dtype='object')


In [61]:
df_excur['Topic_5gram'].value_counts()

0    2248
1     218
Name: Topic_5gram, dtype: int64

In [None]:
# Get top 10 keywords from each topic
excur_top10_5g = top_keywords(count_vec, excur_model, 10)
# Get top 20 keywords from each topic
excur_top20_5g = top_keywords(count_vec, excur_model, 20)

In [None]:
print(f"Top 10 keywords for Topic #0: {excur_top10_5g[0]}")
print(f"Next 10 keywords for Topic #0: {list(set(excur_top20_5g[0]) - set(excur_top10_5g[0]))}")
print(f"Top 10 keywords for Topic #1: {excur_top10_5g[1]}")
print(f"Next 10 keywords for Topic #1: {list(set(excur_top20_5g[1]) - set(excur_top10_5g[1]))}")

Top 10 keywords for Topic #0: ['following_info_rptr_stated_acft', 'revealed_following_info_rptr_said', 'aircraft_exited_right_side_runway', 'following_info_rptr_stated_airplane', 'acft_departed_r_side_rwy', 'report_narrative_contained_additional_information', 'revealed_following_info_rptr_stated', 'rptr_revealed_following_info_rptr', 'callback_conversation_rptr_revealed_following', 'conversation_rptr_revealed_following_info']
Next 10 keywords for Topic #0: ['following_info_rptr_stated_nose', 'aircraft_departed_left_side_runway', 'started_veer_left_applied_right', 'left_departed_left_side_runway', 'revealed_following_info_rptr_indicated', '10_kts_gusting_20_kts', 'applying_full_power_aircraft_started', 'twr_clred_us_tkof_rwy', 'veer_left_applied_right_rudder', 'rwy_callback_conversation_rptr_revealed']
Top 10 keywords for Topic #1: ['20_kts_gusting_30_kts', '20_kts_gusting_25_kts', 'degs_20_kts_gusting_25', 'acft_departed_l_side_rwy', 'l_applied_full_r_rudder', 'left_main_gear_collapsed

### Topic Model for instances where both incursion and excursion occurred
Repeat all steps again

#### 1. Run bigram with LDA

In [62]:
# Run preprocess on excursion
df_both['Narrative_bigram'] = df_both['Narrative'].apply(preprocess_2gram)
df_both.head(5)

Unnamed: 0,ACN,Narrative,Narrative_bigram
0,473633,"LNDG ON RWY 20 RIC, WINDS LIGHT AND DOWN THE R...",lndg_rwy rwy_20 20_ric ric_winds winds_light l...
1,481176,"ACFT Z, AN A320, HAD JUST EXITED RWY 23 AT TXW...",acft_z z_a320 a320_exited exited_rwy rwy_23 23...
2,483919,CAMPAIGN STOP AT ERI. B727 PARKED ON GA RAMP. ...,campaign_stop stop_eri eri_b727 b727_parked pa...
3,492690,"AFTER NORMAL LNDG ON RWY 31 AT BFI, CAPT BEGAN...",normal_lndg lndg_rwy rwy_31 31_bfi bfi_capt ca...
4,495620,"AFTER LNDG ON RWY 33 IN BUR, WE WERE CLRED TO ...",lndg_rwy rwy_33 33_bur bur_clred clred_cross c...


In [63]:
# Both
count_vec, both_vec, both_model = optimize_lda(df_both, 'Narrative_bigram')

# Label based on topics
df_both = label_dom_topics(df_both, both_model, both_vec, 'bigram')
print(df_both.shape, df_both.columns)

Best Model's Params:  {'n_components': 2}
Best Log Likelihood Score:  -3565.412521998158
Model Perplexity:  770.652971452012
(78, 4) Index(['ACN', 'Narrative', 'Narrative_bigram', 'Topic_bigram'], dtype='object')


In [64]:
df_both['Topic_bigram'].value_counts()

1    47
0    31
Name: Topic_bigram, dtype: int64

In [None]:
# Get top 10 keywords from each topic
both_top10_2g = top_keywords(count_vec, both_model, 10)
# Get top 20 keywords from each topic
both_top20_2g = top_keywords(count_vec, both_model, 20)

In [None]:
print(f"Top 10 keywords for Topic #0: {both_top10_2g[0]}")
print(f"Next 10 keywords for Topic #0: {list(set(both_top20_2g[0]) - set(both_top10_2g[0]))}")
print(f"Top 10 keywords for Topic #1: {both_top10_2g[1]}")
print(f"Next 10 keywords for Topic #1: {list(set(both_top20_2g[1]) - set(both_top10_2g[1]))}")

Top 10 keywords for Topic #0: ['taxiway_c', 'left_rudder', 'thrust_reversers', 'ramp_control', 'directional_control', 'nose_wheel', 'ground_control', 'runway_28r', 'aircraft_x', 'first_officer']
Next 10 keywords for Topic #0: ['side_runway', 'onto_taxiway', 'landing_runway', 'taxi_light', 'airport_operations', 'active_runway', 'main_gear', 'aircraft_began', 'damage_aircraft', 'go_around']
Top 10 keywords for Topic #1: ['left_brake', 'txwy_c', 'onto_txwy', 'taxi_back', 'twr_ctlr', 'runway_xx', 'lndg_rwy', 'acft_x', 'short_line', 'hold_short']
Next 10 keywords for Topic #1: ['runway_xxl', 'base_rwy', 'taxied_back', 'onto_rwy', 'rwy_21', 'rwy_17', 'short_final', 'taxi_rwy', 'called_back', 'taxi_instructions']


#### 2. Run trigram with LDA

In [65]:
# Run preprocess on excursion
df_both['Narrative_trigram'] = df_both['Narrative'].apply(preprocess_3gram)
df_both.head(5)

Unnamed: 0,ACN,Narrative,Narrative_bigram,Topic_bigram,Narrative_trigram
0,473633,"LNDG ON RWY 20 RIC, WINDS LIGHT AND DOWN THE R...",lndg_rwy rwy_20 20_ric ric_winds winds_light l...,1,lndg_rwy_20 rwy_20_ric 20_ric_winds ric_winds_...
1,481176,"ACFT Z, AN A320, HAD JUST EXITED RWY 23 AT TXW...",acft_z z_a320 a320_exited exited_rwy rwy_23 23...,1,acft_z_a320 z_a320_exited a320_exited_rwy exit...
2,483919,CAMPAIGN STOP AT ERI. B727 PARKED ON GA RAMP. ...,campaign_stop stop_eri eri_b727 b727_parked pa...,1,campaign_stop_eri stop_eri_b727 eri_b727_parke...
3,492690,"AFTER NORMAL LNDG ON RWY 31 AT BFI, CAPT BEGAN...",normal_lndg lndg_rwy rwy_31 31_bfi bfi_capt ca...,1,normal_lndg_rwy lndg_rwy_31 rwy_31_bfi 31_bfi_...
4,495620,"AFTER LNDG ON RWY 33 IN BUR, WE WERE CLRED TO ...",lndg_rwy rwy_33 33_bur bur_clred clred_cross c...,1,lndg_rwy_33 rwy_33_bur 33_bur_clred bur_clred_...


In [66]:
# Both
count_vec, both_vec, both_model = optimize_lda(df_both, 'Narrative_trigram')

# Label based on topics
df_both = label_dom_topics(df_both, both_model, both_vec, 'trigram')
print(df_both.shape, df_both.columns)

Best Model's Params:  {'n_components': 2}
Best Log Likelihood Score:  -236.73343939780074
Model Perplexity:  92.99212630553704
(78, 6) Index(['ACN', 'Narrative', 'Narrative_bigram', 'Topic_bigram',
       'Narrative_trigram', 'Topic_trigram'],
      dtype='object')


In [67]:
df_both['Topic_trigram'].value_counts()

0    51
1    27
Name: Topic_trigram, dtype: int64

In [None]:
# Get top 10 keywords from each topic
both_top10_3g = top_keywords(count_vec, both_model, 10)
# Get top 20 keywords from each topic
both_top20_3g = top_keywords(count_vec, both_model, 20)

In [None]:
print(f"Top 10 keywords for Topic #0: {both_top10_3g[0]}")
print(f"Next 10 keywords for Topic #0: {list(set(both_top20_3g[0]) - set(both_top10_3g[0]))}")
print(f"Top 10 keywords for Topic #1: {both_top10_3g[1]}")
print(f"Next 10 keywords for Topic #1: {list(set(both_top20_3g[1]) - set(both_top10_3g[1]))}")

Top 10 keywords for Topic #0: ['crossed_hold_short', 'short_line_applied', 'applied_brakes_stop', 'brakes_stop_plane', 'supplemental_info_acn', 'came_full_stop', 'taxi_back_ramp', 'runway_hold_short', 'hold_short_runway', 'hold_short_line']
Next 10 keywords for Topic #0: ['right_main_gear', 'short_line_runway', 'first_officer_made', 'best_course_action', 'airplane_complete_stop', 'brought_airplane_complete', 'stop_hold_short', 'applied_full_left', 'pulled_hold_short', 'left_main_gear']
Top 10 keywords for Topic #1: ['r_base_rwy', 'upon_exiting_rwy', 'revealed_following_info', 'rptr_revealed_following', 'conversation_rptr_revealed', 'callback_conversation_rptr', 'onto_txwy_c', 'turned_onto_txwy', 'hold_short_rwy', 'ifr_flt_plan']
Next 10 keywords for Topic #1: ['gnd_ctl_told', 'filed_ifr_flt', 'edge_paved_runway', 'turning_base_final', 'turn_onto_txwy', 'commercial_chart_pages', 'approx_200_ft', 'applied_full_brakes', 'holding_short_rwy', 'taxi_lights_damage']


#### 3. Run 4gram with LDA

In [68]:
# Run preprocess on excursion
df_both['Narrative_4gram'] = df_both['Narrative'].apply(preprocess_4gram)
df_both.head(5)

Unnamed: 0,ACN,Narrative,Narrative_bigram,Topic_bigram,Narrative_trigram,Topic_trigram,Narrative_4gram
0,473633,"LNDG ON RWY 20 RIC, WINDS LIGHT AND DOWN THE R...",lndg_rwy rwy_20 20_ric ric_winds winds_light l...,1,lndg_rwy_20 rwy_20_ric 20_ric_winds ric_winds_...,1,lndg_rwy_20_ric rwy_20_ric_winds 20_ric_winds_...
1,481176,"ACFT Z, AN A320, HAD JUST EXITED RWY 23 AT TXW...",acft_z z_a320 a320_exited exited_rwy rwy_23 23...,1,acft_z_a320 z_a320_exited a320_exited_rwy exit...,0,acft_z_a320_exited z_a320_exited_rwy a320_exit...
2,483919,CAMPAIGN STOP AT ERI. B727 PARKED ON GA RAMP. ...,campaign_stop stop_eri eri_b727 b727_parked pa...,1,campaign_stop_eri stop_eri_b727 eri_b727_parke...,0,campaign_stop_eri_b727 stop_eri_b727_parked er...
3,492690,"AFTER NORMAL LNDG ON RWY 31 AT BFI, CAPT BEGAN...",normal_lndg lndg_rwy rwy_31 31_bfi bfi_capt ca...,1,normal_lndg_rwy lndg_rwy_31 rwy_31_bfi 31_bfi_...,0,normal_lndg_rwy_31 lndg_rwy_31_bfi rwy_31_bfi_...
4,495620,"AFTER LNDG ON RWY 33 IN BUR, WE WERE CLRED TO ...",lndg_rwy rwy_33 33_bur bur_clred clred_cross c...,1,lndg_rwy_33 rwy_33_bur 33_bur_clred bur_clred_...,1,lndg_rwy_33_bur rwy_33_bur_clred 33_bur_clred_...


In [69]:
# Both
count_vec, both_vec, both_model = optimize_lda(df_both, 'Narrative_4gram')

# Label based on topics
df_both = label_dom_topics(df_both, both_model, both_vec, '4gram')
print(df_both.shape, df_both.columns)

Best Model's Params:  {'n_components': 2}
Best Log Likelihood Score:  -25.5282500858868
Model Perplexity:  18.123104365466453
(78, 8) Index(['ACN', 'Narrative', 'Narrative_bigram', 'Topic_bigram',
       'Narrative_trigram', 'Topic_trigram', 'Narrative_4gram', 'Topic_4gram'],
      dtype='object')


In [70]:
df_both['Topic_4gram'].value_counts()

0    71
1     7
Name: Topic_4gram, dtype: int64

In [None]:
# Get top 10 keywords from each topic
both_top10_4g = top_keywords(count_vec, both_model, 10)
# Get top 20 keywords from each topic
both_top20_4g = top_keywords(count_vec, both_model, 20)

In [None]:
print(f"Top 10 keywords for Topic #0: {both_top10_4g[0]}")
print(f"Next 10 keywords for Topic #0: {list(set(both_top20_4g[0]) - set(both_top10_4g[0]))}")
print(f"Top 10 keywords for Topic #1: {both_top10_4g[1]}")
print(f"Next 10 keywords for Topic #1: {list(set(both_top20_4g[1]) - set(both_top10_4g[1]))}")

Top 10 keywords for Topic #0: ['conversation_rptr_revealed_following', 'brought_airplane_complete_stop', 'filed_ifr_flt_plan', 'pulled_hold_short_line', 'hold_short_line_runway', 'crossed_hold_short_line', 'stop_hold_short_line', 'hold_short_line_applied', 'applied_brakes_stop_plane', 'runway_hold_short_line']
Next 10 keywords for Topic #0: ['rptr_revealed_following_info', 'callback_conversation_rptr_revealed']
Top 10 keywords for Topic #1: ['runway_hold_short_line', 'stop_hold_short_line', 'crossed_hold_short_line', 'hold_short_line_runway', 'pulled_hold_short_line', 'filed_ifr_flt_plan', 'brought_airplane_complete_stop', 'conversation_rptr_revealed_following', 'callback_conversation_rptr_revealed', 'rptr_revealed_following_info']
Next 10 keywords for Topic #1: ['applied_brakes_stop_plane', 'hold_short_line_applied']


#### 4. Run 5gram with LDA

In [71]:
# Run preprocess on excursion
df_both['Narrative_5gram'] = df_both['Narrative'].apply(preprocess_5gram)
df_both.head(5)

Unnamed: 0,ACN,Narrative,Narrative_bigram,Topic_bigram,Narrative_trigram,Topic_trigram,Narrative_4gram,Topic_4gram,Narrative_5gram
0,473633,"LNDG ON RWY 20 RIC, WINDS LIGHT AND DOWN THE R...",lndg_rwy rwy_20 20_ric ric_winds winds_light l...,1,lndg_rwy_20 rwy_20_ric 20_ric_winds ric_winds_...,1,lndg_rwy_20_ric rwy_20_ric_winds 20_ric_winds_...,1,lndg_rwy_20_ric_winds rwy_20_ric_winds_light 2...
1,481176,"ACFT Z, AN A320, HAD JUST EXITED RWY 23 AT TXW...",acft_z z_a320 a320_exited exited_rwy rwy_23 23...,1,acft_z_a320 z_a320_exited a320_exited_rwy exit...,0,acft_z_a320_exited z_a320_exited_rwy a320_exit...,0,acft_z_a320_exited_rwy z_a320_exited_rwy_23 a3...
2,483919,CAMPAIGN STOP AT ERI. B727 PARKED ON GA RAMP. ...,campaign_stop stop_eri eri_b727 b727_parked pa...,1,campaign_stop_eri stop_eri_b727 eri_b727_parke...,0,campaign_stop_eri_b727 stop_eri_b727_parked er...,0,campaign_stop_eri_b727_parked stop_eri_b727_pa...
3,492690,"AFTER NORMAL LNDG ON RWY 31 AT BFI, CAPT BEGAN...",normal_lndg lndg_rwy rwy_31 31_bfi bfi_capt ca...,1,normal_lndg_rwy lndg_rwy_31 rwy_31_bfi 31_bfi_...,0,normal_lndg_rwy_31 lndg_rwy_31_bfi rwy_31_bfi_...,0,normal_lndg_rwy_31_bfi lndg_rwy_31_bfi_capt rw...
4,495620,"AFTER LNDG ON RWY 33 IN BUR, WE WERE CLRED TO ...",lndg_rwy rwy_33 33_bur bur_clred clred_cross c...,1,lndg_rwy_33 rwy_33_bur 33_bur_clred bur_clred_...,1,lndg_rwy_33_bur rwy_33_bur_clred 33_bur_clred_...,0,lndg_rwy_33_bur_clred rwy_33_bur_clred_cross 3...


In [72]:
# Both
count_vec, both_vec, both_model = optimize_lda(df_both, 'Narrative_5gram')

# Label based on topics
df_both = label_dom_topics(df_both, both_model, both_vec, '5gram')
print(df_both.shape, df_both.columns)

Best Model's Params:  {'n_components': 2}
Best Log Likelihood Score:  -2.1749312553644735
Model Perplexity:  3.7297393626807835
(78, 10) Index(['ACN', 'Narrative', 'Narrative_bigram', 'Topic_bigram',
       'Narrative_trigram', 'Topic_trigram', 'Narrative_4gram', 'Topic_4gram',
       'Narrative_5gram', 'Topic_5gram'],
      dtype='object')


In [73]:
df_both['Topic_5gram'].value_counts()

0    78
Name: Topic_5gram, dtype: int64

In [None]:
# Get top 10 keywords from each topic
both_top10_5g = top_keywords(count_vec, both_model, 10)
# Get top 20 keywords from each topic
both_top20_5g = top_keywords(count_vec, both_model, 20)

In [None]:
print(f"Top 10 keywords for Topic #0: {both_top10_5g[0]}")
print(f"Next 10 keywords for Topic #0: {list(set(both_top20_5g[0]) - set(both_top10_5g[0]))}")
print(f"Top 10 keywords for Topic #1: {both_top10_5g[1]}")
print(f"Next 10 keywords for Topic #1: {list(set(both_top20_5g[1]) - set(both_top10_5g[1]))}")

Top 10 keywords for Topic #0: ['callback_conversation_rptr_revealed_following', 'conversation_rptr_revealed_following_info']
Next 10 keywords for Topic #0: []
Top 10 keywords for Topic #1: ['conversation_rptr_revealed_following_info', 'callback_conversation_rptr_revealed_following']
Next 10 keywords for Topic #1: []
