In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.model_selection import GridSearchCV

In [4]:
asrs_df = pd.read_pickle('./data/trimmed_asrs.pkl')
asrs_df

Unnamed: 0,ACN,Date,Local Time Of Day,Locale Reference,State Reference,Anomaly,Narrative,Synopsis,incursion,excursion
0,459107,200001,0601-1200,DTW.Airport,MI,"Conflict Ground Conflict, Critical; Deviation ...","ON TAXI OUT TO DTW RWY 3L, MISSED TURN ONTO TX...",AN A320 OVERSHOOTS THE TXWY FOR RWY 3L (TXWY M...,1,0
1,459230,200001,0601-1200,MCO.Airport,FL,Deviation / Discrepancy - Procedural FAR; Grou...,TAXIED OUT OF RAMP AREA BEFORE TALKING TO GND....,FLC OF A B737 TAXIED WITHOUT CLRNC DUE TO FOLL...,1,0
2,459389,200001,0601-1200,LFPG.Airport,FO,Ground Excursion Taxiway,"CAPT WAS FLYING, I WAS IN THE R SEAT. AFTER LN...",CAPT OF AN MD11 RAN OFF TXWY DURING TAXI IN RE...,0,1
3,459407,200001,0001-0600,SRB.Airport,TN,Aircraft Equipment Problem Critical; Deviation...,"ILS RWY 4 (SRB), CAPT FLYING. WX: 900 FT BROKE...",FA20 ENCOUNTERS SLICK RWY AT SRB RESULTING IN ...,0,1
4,459425,200001,1201-1800,ABE.Airport,PA,"Conflict Ground Conflict, Critical; Deviation ...",WE WERE CLRED FOR THE VISUAL APCH RWY 31 ABOUT...,A JS31 CREW LNDG ON RWY 31 AT ABE ARE INSTRUCT...,1,0
...,...,...,...,...,...,...,...,...,...,...
9298,1715980,201912,0001-0600,SFO.Tower,CA,"Conflict Ground Conflict, Critical; Ground Inc...",I told Aircraft X to hold short of RWY 28L. Th...,SFO Tower Controller reported a runway incursi...,1,0
9299,1716265,201912,0601-1200,ZZZ.Airport,US,"Conflict Ground Conflict, Critical; Deviation ...","Normal training environment, student [in] left...",PA44 student reported experiencing a runway ex...,0,1
9300,1722152,201907,,,,"Conflict Ground Conflict, Critical; Deviation ...","During final for Runway XX at ZZZ, my instruct...",Pilot reported that an aircraft not on CTAF ap...,1,0
9301,1724990,201912,,SFO.Airport,CA,Deviation / Discrepancy - Procedural Clearance...,I was assigned to taxi Aircraft X to Gate XX. ...,Technician reported that while taxiing an airc...,1,0


# NLP Pre-processing
* remove stopwords
* remove punctuation
* tokenize
* normalize

In [5]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /home/abrun/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/abrun/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/abrun/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/abrun/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

Text cleaning

In [6]:
# Returns text with punctuations removed
def rm_punctuation(text):
    text = text.lower()
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
    return text

# Returns text without stopwords after tokenizing text
def rm_stopwords(text):
    input_tokens = word_tokenize(text)
    return [word for word in input_tokens if word not in stopwords.words('english')]

# Returns text after normalizing by lemmatizer after tokenizing text
def lemmatize(text):
    lemmatizer = WordNetLemmatizer()
    input_tokens = word_tokenize(text)
    res = []
    for word in input_tokens:
        res.append(lemmatizer.lemmatize(word))
    return ' '.join(res)

# Pipelines the entire process
def preprocess(text):
    # return lemmatize(' '.join(rm_stopwords(rm_punctuation(text))))
    return ' '.join(rm_stopwords(rm_punctuation(text)))

In [7]:
asrs_df['narrative_clean'] = asrs_df['Narrative'].apply(preprocess)
asrs_df

Unnamed: 0,ACN,Date,Local Time Of Day,Locale Reference,State Reference,Anomaly,Narrative,Synopsis,incursion,excursion,narrative_clean
0,459107,200001,0601-1200,DTW.Airport,MI,"Conflict Ground Conflict, Critical; Deviation ...","ON TAXI OUT TO DTW RWY 3L, MISSED TURN ONTO TX...",AN A320 OVERSHOOTS THE TXWY FOR RWY 3L (TXWY M...,1,0,taxi dtw rwy 3l missed turn onto txwy txwy f s...
1,459230,200001,0601-1200,MCO.Airport,FL,Deviation / Discrepancy - Procedural FAR; Grou...,TAXIED OUT OF RAMP AREA BEFORE TALKING TO GND....,FLC OF A B737 TAXIED WITHOUT CLRNC DUE TO FOLL...,1,0,taxied ramp area talking gnd third 3 acft thou...
2,459389,200001,0601-1200,LFPG.Airport,FO,Ground Excursion Taxiway,"CAPT WAS FLYING, I WAS IN THE R SEAT. AFTER LN...",CAPT OF AN MD11 RAN OFF TXWY DURING TAXI IN RE...,0,1,capt flying r seat lndg cdg fo rwy 27 turned h...
3,459407,200001,0001-0600,SRB.Airport,TN,Aircraft Equipment Problem Critical; Deviation...,"ILS RWY 4 (SRB), CAPT FLYING. WX: 900 FT BROKE...",FA20 ENCOUNTERS SLICK RWY AT SRB RESULTING IN ...,0,1,ils rwy 4 srb capt flying wx 900 ft broken 150...
4,459425,200001,1201-1800,ABE.Airport,PA,"Conflict Ground Conflict, Critical; Deviation ...",WE WERE CLRED FOR THE VISUAL APCH RWY 31 ABOUT...,A JS31 CREW LNDG ON RWY 31 AT ABE ARE INSTRUCT...,1,0,clred visual apch rwy 31 7 sm heard coms twr f...
...,...,...,...,...,...,...,...,...,...,...,...
9298,1715980,201912,0001-0600,SFO.Tower,CA,"Conflict Ground Conflict, Critical; Ground Inc...",I told Aircraft X to hold short of RWY 28L. Th...,SFO Tower Controller reported a runway incursi...,1,0,told aircraft x hold short rwy 28l pilot read ...
9299,1716265,201912,0601-1200,ZZZ.Airport,US,"Conflict Ground Conflict, Critical; Deviation ...","Normal training environment, student [in] left...",PA44 student reported experiencing a runway ex...,0,1,normal training environment student left seat ...
9300,1722152,201907,,,,"Conflict Ground Conflict, Critical; Deviation ...","During final for Runway XX at ZZZ, my instruct...",Pilot reported that an aircraft not on CTAF ap...,1,0,final runway xx zzz instructor noticed piper s...
9301,1724990,201912,,SFO.Airport,CA,Deviation / Discrepancy - Procedural Clearance...,I was assigned to taxi Aircraft X to Gate XX. ...,Technician reported that while taxiing an airc...,1,0,assigned taxi aircraft x gate xx engine starts...


* A lot of the words within the original narratives are stemmed already
* 'narrative_clean' column is what we will be using for our LDA analysis

In [8]:
asrs_df.to_pickle('./data/cleaned_asrs.pkl')

In [64]:
asrs_df = pd.read_pickle('./data/cleaned_asrs.pkl')

# LDA

Function to identify and remove corpus specific stopwords

In [65]:
asrs_df

Unnamed: 0,ACN,Date,Local Time Of Day,Locale Reference,State Reference,Anomaly,Narrative,Synopsis,incursion,excursion,narrative_clean
0,459107,200001,0601-1200,DTW.Airport,MI,"Conflict Ground Conflict, Critical; Deviation ...","ON TAXI OUT TO DTW RWY 3L, MISSED TURN ONTO TX...",AN A320 OVERSHOOTS THE TXWY FOR RWY 3L (TXWY M...,1,0,taxi dtw rwy 3l missed turn onto txwy txwy f s...
1,459230,200001,0601-1200,MCO.Airport,FL,Deviation / Discrepancy - Procedural FAR; Grou...,TAXIED OUT OF RAMP AREA BEFORE TALKING TO GND....,FLC OF A B737 TAXIED WITHOUT CLRNC DUE TO FOLL...,1,0,taxied ramp area talking gnd third 3 acft thou...
2,459389,200001,0601-1200,LFPG.Airport,FO,Ground Excursion Taxiway,"CAPT WAS FLYING, I WAS IN THE R SEAT. AFTER LN...",CAPT OF AN MD11 RAN OFF TXWY DURING TAXI IN RE...,0,1,capt flying r seat lndg cdg fo rwy 27 turned h...
3,459407,200001,0001-0600,SRB.Airport,TN,Aircraft Equipment Problem Critical; Deviation...,"ILS RWY 4 (SRB), CAPT FLYING. WX: 900 FT BROKE...",FA20 ENCOUNTERS SLICK RWY AT SRB RESULTING IN ...,0,1,ils rwy 4 srb capt flying wx 900 ft broken 150...
4,459425,200001,1201-1800,ABE.Airport,PA,"Conflict Ground Conflict, Critical; Deviation ...",WE WERE CLRED FOR THE VISUAL APCH RWY 31 ABOUT...,A JS31 CREW LNDG ON RWY 31 AT ABE ARE INSTRUCT...,1,0,clred visual apch rwy 31 7 sm heard coms twr f...
...,...,...,...,...,...,...,...,...,...,...,...
9298,1715980,201912,0001-0600,SFO.Tower,CA,"Conflict Ground Conflict, Critical; Ground Inc...",I told Aircraft X to hold short of RWY 28L. Th...,SFO Tower Controller reported a runway incursi...,1,0,told aircraft x hold short rwy 28l pilot read ...
9299,1716265,201912,0601-1200,ZZZ.Airport,US,"Conflict Ground Conflict, Critical; Deviation ...","Normal training environment, student [in] left...",PA44 student reported experiencing a runway ex...,0,1,normal training environment student left seat ...
9300,1722152,201907,,,,"Conflict Ground Conflict, Critical; Deviation ...","During final for Runway XX at ZZZ, my instruct...",Pilot reported that an aircraft not on CTAF ap...,1,0,final runway xx zzz instructor noticed piper s...
9301,1724990,201912,,SFO.Airport,CA,Deviation / Discrepancy - Procedural Clearance...,I was assigned to taxi Aircraft X to Gate XX. ...,Technician reported that while taxiing an airc...,1,0,assigned taxi aircraft x gate xx engine starts...


In [66]:
# Corpus for some jargon seen in the model so far
jargon_dict = {'txwy': 'taxiway', 'rwy': 'runway', 'twr': 'tower', 'lndg': 'landing', 'apch': 'approach',
              'l': 'left', 'r': 'right', 'acft': 'aircraft', 'gnd': 'ground', 'ft': 'feet'}

def rmv_jargon(text):
    split_text = text.split()
    for idx, word in enumerate(split_text):
        if word in list(jargon_dict.keys()):
            split_text[idx] = jargon_dict[word]
    final_text = ' '.join(split_text)
    return final_text
            
asrs_df['narrative_clean_v2'] = asrs_df['narrative_clean'].apply(rmv_jargon)

In [67]:
# Identifying stopwords in the corpus
def identify_corpus_stopwords(df, column, threshhold):
    
    # List of words to remove
    removals = []
    # Create a list of words per entry
    narratives = df[column].values
    # List of dictionaries for each entry
    word_tally_list = []
    for nar in narratives:
        word_dict = {}
        for word in nar.split():
            # Getting word counts
            word_dict[word] = word_dict.get(word, 0) + 1
        word_tally_list.append(word_dict)
        
    # Loop through list and remove words appearing in percentage or more of dictionaries
    word_counts = {}
    for entry in word_tally_list:
        for word, count in entry.items():
            word_counts[word] = word_counts.get(word, 0) + 1
            
    # Identify words above the threshhold
    for word, count in word_counts.items():
        if count / len(word_tally_list) >= threshhold:
            removals.append(word)
    return removals

# Removing stopwords from the narrative column
def remove_corpus_stopwords(text, stopwords):
    for word in stopwords:
        text = text.replace(word, '')
    return text.strip()
            
# Function calls
removal_words = identify_corpus_stopwords(asrs_df, 'narrative_clean_v2', .80)
asrs_df['narrative_clean_v3'] = asrs_df['narrative_clean_v2'].apply(remove_corpus_stopwords, stopwords = removal_words)

Splitting our combined dataframe into incursion, excursion, and combined cases

In [68]:
incur_df = asrs_df[asrs_df['incursion'] == 1]
excur_df = asrs_df[asrs_df['excursion'] == 1]
incur_excur_df = asrs_df[(asrs_df['incursion'] == 1) & (asrs_df['excursion'] == 1)]

Running LDA for incursions

In [69]:
def optimize_lda(dataset):
    
    # Vectorize
    count_vect = CountVectorizer(max_df = 0.8, min_df = 2, stop_words = 'english')
    doc_term_matrix = count_vect.fit_transform(dataset['narrative_clean_v3'].values.astype('U'))

    # Initize model
    lda = LDA(random_state = 0)

    # Using GridSearch to optimize LDA
    num_topics = {'n_components': [x for x in range(2, 11)]}
    model = GridSearchCV(lda, param_grid = num_topics)

    # Do the Grid Search
    model.fit(doc_term_matrix)

    # Best Model
    best_lda_model = model.best_estimator_

    # Model Parameters
    print("Best Model's Params: ", model.best_params_)

    # Log Likelihood Score
    # print("Best Log Likelihood Score: ", model.best_score_)

    # Perplexity
    print("Model Perplexity: ", best_lda_model.perplexity(doc_term_matrix))
    
    return count_vect, doc_term_matrix, best_lda_model

LDA for incursion cases

In [70]:
# Incursion
incur_cv, incur_vec, incur_model = optimize_lda(incur_df)
topics = incur_model.transform(incur_vec)
# Adding topic tags to our dataframe
incur_topics = []
for score in topics:
    incur_topics.append(np.where(score == max(score))[0][0])
incur_df['Topic'] = incur_topics

Best Model's Params:  {'n_components': 2}
Model Perplexity:  1287.5043528581384


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  incur_df['Topic'] = incur_topics


LDA for excursions

In [71]:
# Excursion
excur_cv, excur_vec, excur_model = optimize_lda(excur_df)
topics = excur_model.transform(excur_vec)
# Adding topic tags to our dataframe
excur_topics = []
for score in topics:
    excur_topics.append(np.where(score == max(score))[0][0])
excur_df['Topic'] = excur_topics

Best Model's Params:  {'n_components': 2}
Model Perplexity:  1515.2680925592351


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  excur_df['Topic'] = excur_topics


LDA for both cases

In [72]:
# Both
incur_excur_cv, incur_excur_vec, incur_excur_model = optimize_lda(incur_excur_df)
topics = incur_excur_model.transform(incur_excur_vec)
# Adding topic tags to our dataframe
incur_excur_topics = []
for score in topics:
    incur_excur_topics.append(np.where(score == max(score))[0][0])
incur_excur_df['Topic'] = incur_excur_topics

Best Model's Params:  {'n_components': 2}
Model Perplexity:  743.969992066698


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  incur_excur_df['Topic'] = incur_excur_topics


In [73]:
# Group the df and print the most common words for each topic
def group_df(df):
    
    # Find the X most common words from the narrative clean column
    def x_most_common_words(x, df):
        words = ' '.join(df['narrative_clean_v2']).split()
        word_counts = Counter(words)
        return word_counts.most_common(x)
    
    topics = df.groupby('Topic')
    print('---------------------------------')
    for topic, class_df in topics:
        top_x_words = x_most_common_words(10, class_df)
        print('Topic #' + str(topic))
        print([word for word, counts in top_x_words])
    print('---------------------------------')

In [74]:
print('Incursion')
group_df(incur_df)
print('Excursion')
group_df(excur_df)
print('Incursion/Excursion')
group_df(incur_excur_df)

Incursion
---------------------------------
Topic #0
['runway', 'aircraft', 'taxiway', 'taxi', 'short', 'tower', 'hold', 'ground', 'us', 'clearance']
Topic #1
['runway', 'taxiway', 'hold', 'tower', 'aircraft', 'short', 'taxi', 'ground', 'us', 'clred']
---------------------------------
Excursion
---------------------------------
Topic #0
['runway', 'aircraft', 'taxiway', 'taxi', 'right', 'left', 'turn', 'landing', 'us', 'ramp']
Topic #1
['runway', 'aircraft', 'left', 'right', 'landing', 'plane', 'gear', 'airplane', 'feet', 'back']
---------------------------------
Incursion/Excursion
---------------------------------
Topic #0
['runway', 'taxiway', 'taxi', 'aircraft', 'landing', 'ramp', 'left', 'line', 'back', 'tower']
Topic #1
['aircraft', 'runway', 'tower', 'taxiway', 'right', 'left', 'landing', 'taxi', 'ground', 'plane']
---------------------------------


Printing components

In [77]:
def ntopwlst(model, features, ntopwords):
    '''create a list of the top topic words'''
    output = []
    for topic_idx, topic in enumerate(model.components_): # compose output message with top words
        output.append(str(topic_idx))
        output += [features[i] for i in topic.argsort()[:-ntopwords - 1:-1]] # [start (0 if omitted): end : slicing increment]
    return output

In [87]:
# Incursion
incur_feats = incur_cv.get_feature_names()
topwds = ntopwlst(incur_model, incur_feats, 10)
print(topwds[0], '\n', topwds[1:11], '\n', topwds[11], '\n', topwds[12:])

0 
 ['aircraft', 'taxiway', 'taxi', 'tower', 'short', 'ground', 'hold', 'clearance', 'airport', 'cleared'] 
 1 
 ['taxiway', 'hold', 'tower', 'short', 'taxi', 'aircraft', 'ground', 'clred', 'clrnc', 'tkof']


In [88]:
# Excursion
excur_feats = excur_cv.get_feature_names()
topwds = ntopwlst(excur_model, excur_feats, 10)
print(topwds[0], '\n', topwds[1:11], '\n', topwds[11], '\n', topwds[12:])

0 
 ['taxiway', 'aircraft', 'taxi', 'turn', 'right', 'ramp', 'area', 'tower', 'lights', 'left'] 
 1 
 ['aircraft', 'left', 'right', 'landing', 'plane', 'gear', 'airplane', 'damage', 'feet', 'brake']


In [89]:
# Incursion/Excursion
incur_excur_feats = incur_excur_cv.get_feature_names()
topwds = ntopwlst(incur_excur_model, incur_excur_feats, 10)
print(topwds[0], '\n', topwds[1:11], '\n', topwds[11], '\n', topwds[12:])

0 
 ['taxiway', 'taxi', 'ramp', 'landing', 'line', 'aircraft', 'left', 'hold', 'short', 'plane'] 
 1 
 ['aircraft', 'tower', 'right', 'taxiway', 'left', 'landing', 'ground', 'taxi', 'control', 'time']
