In [79]:
import xml.etree.ElementTree as ET
import json
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans, DBSCAN
from pprint import pprint
from gensim import corpora
from gensim.models import lsimodel, ldamodel, doc2vec

# Input locations: the XML export, and the filename prefix of the narrative JSON files
# (a per-file suffix such as "000.json" is appended to narr_dir when they are opened).
file_dir = "./data/AviationData.xml"
narr_dir = "./data/NarrativeData_"

# Parse the XML once; `tree` is iterated by the cells below.
tree = ET.parse(file_dir)
root = tree.getroot()

In [2]:
### tally how many elements of each tag the XML tree contains
tag_count_dict = {}
for node in tree.iter():
    # .get() supplies 0 the first time a tag is seen
    tag_count_dict[node.tag] = tag_count_dict.get(node.tag, 0) + 1
# number of ROW elements in the XML
all_count = tag_count_dict["{http://www.ntsb.gov}ROW"]
pprint(tag_count_dict)

{'{http://www.ntsb.gov}DATA': 1,
 '{http://www.ntsb.gov}ROW': 77257,
 '{http://www.ntsb.gov}ROWS': 1}


In [3]:
### count events, narratives and probable causes across the JSON files
# file stems: "000", then 499, 999, ... (step 500, below 71000), then 999999
narr_name = [str(stem) + ".json" for stem in ["000", *range(499, 71000, 500), 999999]]

report_count = narr_count = cause_count = missing_count = 0

for name in narr_name:
    with open(narr_dir + name) as f:
        narr_data = json.load(f)["data"]   # a list of events
    # the file is fully parsed at this point, so the tally runs after it is closed
    for event in narr_data:
        has_id = event["EventId"] != ""
        report_count += int(has_id)
        has_narr = event["narrative"] != ""
        narr_count += int(has_narr)
        has_cause = event["probable_cause"] != ""
        cause_count += int(has_cause)
        # "missing" = neither a narrative nor a probable cause
        missing_count += int(not has_narr and not has_cause)

# short summary for JSON files
print("Count for -\nEvent:\t%d\nNarr:\t%d\nCause:\t%d\nMiss:\t%d" %
      (report_count, narr_count, cause_count, missing_count))

Count for -
Event:	76133
Narr:	75905
Cause:	49789
Miss:	227


In [4]:
# merge items_d into the record stored under eventId
def add_dict(d, eventId, items_d):
    '''Merge ``items_d`` into ``d[eventId]``, creating the record if needed.

    d       - the dictionary of all events (modified in place)
    eventId - the id (str) of the event to be updated
    items_d - items (dict) to be added to eventId

    A key that already exists with a different value is left untouched and a
    message is printed; every other key is (re)assigned. Returns None.
    '''
    type_checks = ((d, dict), (items_d, dict), (eventId, str))
    assert all(type(obj) == expected for obj, expected in type_checks), "input type error"

    record = d.setdefault(eventId, dict())
    for field, new_value in items_d.items():
        conflicting = field in record and record[field] != new_value
        if conflicting:
            # keep the value that was stored first and report the clash
            print("weird thing happens to %s" % eventId)
            continue
        record[field] = new_value

In [5]:
### walk every ROW element of the XML and index its attributes
accident_dict = {}  # AccidentNumber -> attribute mapping of that ROW
eid_anum_dict = {}  # EventId -> list of AccidentNumbers sharing that event
for node in tree.iter(tag="{http://www.ntsb.gov}ROW"):
    row = node.attrib
    anum = row["AccidentNumber"]
    eid = row["EventId"]
    accident_dict[anum] = row
    # several accident numbers may belong to one event, so collect them in a list
    eid_anum_dict.setdefault(eid, list()).append(anum)

In [289]:
# one row per AccidentNumber, one column per XML attribute
df_accident = pd.DataFrame.from_dict(accident_dict, orient='index')
# two pairs of accident numbers that share an EventId, i.e. EventId is not unique
shared_event_rows = ["WPR15LA253A", "WPR15LA253B", "WPR15FA243A", "WPR15FA243B"]
df_accident.loc[shared_event_rows, :]

Unnamed: 0,EventId,InvestigationType,AccidentNumber,EventDate,Location,Country,Latitude,Longitude,AirportCode,AirportName,...,PurposeOfFlight,AirCarrier,TotalFatalInjuries,TotalSeriousInjuries,TotalMinorInjuries,TotalUninjured,WeatherCondition,BroadPhaseOfFlight,ReportStatus,PublicationDate
WPR15LA253A,20150831X30510,Accident,WPR15LA253A,08/28/2015,"Las Vegas, NV",United States,36.099444,-115.1625,LAS,McCarran International,...,Business,,,,,5.0,VMC,STANDING,Preliminary,09/03/2015
WPR15LA253B,20150831X30510,Accident,WPR15LA253B,08/28/2015,"Las Vegas, NV",United States,36.099444,-115.1625,LAS,McCarran International,...,Business,,,,,5.0,VMC,STANDING,Preliminary,09/03/2015
WPR15FA243A,20150816X60452,Accident,WPR15FA243A,08/16/2015,"San Diego, CA",United States,32.578611,-116.957778,SDM,BROWN FIELD MUNI,...,Personal,,5.0,,,,VMC,APPROACH,Preliminary,08/26/2015
WPR15FA243B,20150816X60452,Accident,WPR15FA243B,08/16/2015,"San Diego, CA",United States,32.578611,-116.957778,SDM,BROWN FIELD MUNI,...,Other Work Use,,5.0,,,,VMC,APPROACH,Preliminary,08/26/2015


In [7]:
### load every narrative JSON file and index its events by EventId
narr_dict = {}
for name in narr_name:
    with open(narr_dir + name) as f:
        narr_data = json.load(f)["data"]   # a list of events
    for event in narr_data:
        # Per the original author's note, a duplicate-EventId check was run here
        # and found none, so a plain assignment is used (a repeated id would
        # overwrite the earlier record).
        eid = event["EventId"]
        narr_dict[eid] = event

In [8]:
import logging
import os
import tempfile

# system temp directory; later cells save the dictionary and corpus files under it
TEMP_FOLDER = tempfile.gettempdir()
folder_notice = 'Folder "{}" will be used to save temporary dictionary and corpus.'
print(folder_notice.format(TEMP_FOLDER))

# emit INFO-level log records with a timestamp prefix
logging.basicConfig(level=logging.INFO, format='%(asctime)s : %(levelname)s : %(message)s')

Folder "/var/folders/3c/y1mdkx6x2ms87ngd9f9m8jyw0000gn/T" will be used to save temporary dictionary and corpus.


In [9]:
import string
import nltk
from collections import Counter
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

def preprocess(d=None, item="both", rm_stop=True, rm_digit=True,
               rm_punc=True, lower=True, stemming=True, freq_threshold=1):
    '''Tokenise and normalise the text of every event in ``d``.

    d              - dict mapping an event id to a record (dict) that may hold the
                     keys "narrative" and "probable_cause"; when omitted, the
                     module-level ``narr_dict`` is used
    item           - which text to use: "narr", "cause" or "both"
    rm_stop        - drop NLTK English stop words
    rm_digit       - delete digit characters
    rm_punc        - delete punctuation characters
    lower          - lower-case the text before tokenising
    stemming       - apply the Porter stemmer to every kept token
    freq_threshold - keep only tokens whose corpus-wide count is strictly
                     greater than this value

    Returns ``(documents, eventID, frequency)``: the token list of each event,
    the event ids in the same order, and a Counter of token counts taken
    before the frequency filter was applied.
    '''
    if d is None:
        # resolved at call time so the function can be defined before the data is loaded
        d = narr_dict
    assert isinstance(d, dict) and item in ["narr", "cause", "both"], "Input Error"
    documents = list()
    eventID = list()
    frequency = Counter()

    # which record fields supply the text
    fields_by_item = {
        "narr": ("narrative",),
        "cause": ("probable_cause",),
        "both": ("narrative", "probable_cause"),
    }
    fields = fields_by_item[item]

    # characters to delete; the translation table is built once, not per document
    rm_str = (string.digits if rm_digit else "") + (string.punctuation if rm_punc else "")
    delete_table = str.maketrans('', '', rm_str)

    # stop words and stemmer are only needed (and only looked up) when enabled
    stoplist = set(stopwords.words('english')) if rm_stop else set()
    stem = PorterStemmer().stem if stemming else None

    # iterate over the dictionary that was passed in, not the global one
    for key, value in d.items():
        # join with a space so the last word of one field is not fused with the
        # first word of the next
        sent = " ".join(value.get(field, "") or "" for field in fields)
        sent = sent.translate(delete_table)
        if lower:
            sent = sent.lower()
        # stop words are matched against the un-stemmed token
        text = [word for word in sent.split() if word not in stoplist]
        if stem is not None:
            text = [stem(word) for word in text]
        documents.append(text)
        eventID.append(key)
        frequency.update(text)

    # remove infrequent words
    documents = [[word for word in doc if frequency[word] > freq_threshold] for doc in documents]

    return documents, eventID, frequency

In [40]:
### preprocess only probable_cause docs
docs_cause, eventID, freq_cause = preprocess(item="cause")

### build the token <-> integer id mapping used by the bag-of-words vectors
dict_cause = corpora.Dictionary(docs_cause)
dict_cause.save(os.path.join(TEMP_FOLDER, 'aviation_cause.dict'))  # store the dictionary, for future reference

### convert every document into a sparse bag-of-words vector
corpus_cause = [dict_cause.doc2bow(doc) for doc in docs_cause]
corpora.MmCorpus.serialize(os.path.join(TEMP_FOLDER, 'aviation_cause.mm'), corpus_cause) # store to disk


### Latent Semantic Analysis
# extract 10 LSI topics; use the default one-pass algorithm
lsi_cause = lsimodel.LsiModel(corpus=corpus_cause, id2word=dict_cause, num_topics=10)


### Latent Dirichlet Allocation
# extract 10 LDA topics in a single pass, updating the model after every chunk
# (chunksize is left at its default; the training log reports 2,000-document chunks).
# The seed is fixed so that re-running the cell reproduces the same topics.
lda_cause = ldamodel.LdaModel(corpus=corpus_cause, id2word=dict_cause, num_topics=10,
                              update_every=1, passes=1, random_state=0)

2019-03-10 18:33:59,415 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2019-03-10 18:33:59,536 : INFO : adding document #10000 to Dictionary(2918 unique tokens: ['assess', 'clearanc', 'commun', 'contribut', 'crew']...)
2019-03-10 18:33:59,651 : INFO : adding document #20000 to Dictionary(3629 unique tokens: ['assess', 'clearanc', 'commun', 'contribut', 'crew']...)
2019-03-10 18:33:59,774 : INFO : adding document #30000 to Dictionary(4023 unique tokens: ['assess', 'clearanc', 'commun', 'contribut', 'crew']...)
2019-03-10 18:33:59,895 : INFO : adding document #40000 to Dictionary(4289 unique tokens: ['assess', 'clearanc', 'commun', 'contribut', 'crew']...)
2019-03-10 18:34:00,011 : INFO : adding document #50000 to Dictionary(4444 unique tokens: ['assess', 'clearanc', 'commun', 'contribut', 'crew']...)
2019-03-10 18:34:00,124 : INFO : adding document #60000 to Dictionary(4554 unique tokens: ['assess', 'clearanc', 'commun', 'contribut', 'crew']...)
2019-03-10 18:34:00,258 :

2019-03-10 18:34:02,687 : INFO : topic #2(84.623): 0.819*"land" + 0.234*"gear" + -0.216*"fuel" + -0.170*"pilot" + -0.138*"condit" + -0.117*"flight" + 0.093*"result" + -0.087*"accid" + -0.087*"inadequ" + 0.084*"main"
2019-03-10 18:34:02,688 : INFO : topic #3(80.763): -0.555*"failur" + 0.314*"pilot" + -0.313*"maintain" + -0.268*"control" + 0.256*"factor" + 0.189*"condit" + 0.189*"land" + -0.172*"engin" + 0.158*"inadequ" + 0.143*"improp"
2019-03-10 18:34:02,689 : INFO : topic #4(71.638): -0.573*"fuel" + 0.294*"factor" + -0.266*"result" + 0.262*"condit" + 0.219*"terrain" + 0.208*"loss" + 0.196*"reason" + -0.191*"pilot" + 0.180*"undetermin" + 0.171*"engin"
2019-03-10 18:34:02,691 : INFO : preparing a new chunk of documents
2019-03-10 18:34:02,773 : INFO : using 100 extra samples and 2 power iterations
2019-03-10 18:34:02,774 : INFO : 1st phase: constructing (4649, 110) action matrix
2019-03-10 18:34:02,859 : INFO : orthonormalizing (4649, 110) action matrix
2019-03-10 18:34:02,943 : INFO : 

2019-03-10 18:34:05,016 : INFO : merging changes from 2000 documents into a model of 76133 documents
2019-03-10 18:34:05,021 : INFO : topic #8 (0.100): 0.054*"wind" + 0.046*"pilot" + 0.041*"condit" + 0.033*"factor" + 0.030*"failur" + 0.028*"accid" + 0.027*"compens" + 0.025*"inadequ" + 0.020*"contribut" + 0.019*"gust"
2019-03-10 18:34:05,022 : INFO : topic #5 (0.100): 0.038*"pilot" + 0.032*"inadequ" + 0.032*"failur" + 0.031*"fuel" + 0.029*"engin" + 0.029*"due" + 0.027*"factor" + 0.025*"preflight" + 0.022*"result" + 0.022*"airplan"
2019-03-10 18:34:05,023 : INFO : topic #4 (0.100): 0.056*"pilot" + 0.036*"flight" + 0.033*"land" + 0.033*"failur" + 0.029*"student" + 0.028*"control" + 0.026*"maintain" + 0.023*"factor" + 0.023*"instructor" + 0.019*"airplan"
2019-03-10 18:34:05,025 : INFO : topic #9 (0.100): 0.060*"land" + 0.046*"result" + 0.041*"failur" + 0.039*"pilot" + 0.033*"loss" + 0.028*"control" + 0.023*"power" + 0.022*"engin" + 0.022*"fuel" + 0.022*"improp"
2019-03-10 18:34:05,026 : IN

2019-03-10 18:34:07,500 : INFO : topic #7 (0.100): 0.036*"takeoff" + 0.032*"failur" + 0.027*"pilot" + 0.027*"result" + 0.025*"runway" + 0.020*"perform" + 0.018*"tail" + 0.016*"abort" + 0.016*"procedur" + 0.015*"rotor"
2019-03-10 18:34:07,501 : INFO : topic #2 (0.100): 0.081*"engin" + 0.073*"loss" + 0.071*"power" + 0.043*"reason" + 0.043*"undetermin" + 0.036*"land" + 0.030*"factor" + 0.028*"terrain" + 0.027*"forc" + 0.027*"failur"
2019-03-10 18:34:07,502 : INFO : topic #0 (0.100): 0.077*"pilot" + 0.045*"failur" + 0.039*"maintain" + 0.032*"result" + 0.029*"factor" + 0.025*"airspe" + 0.024*"flight" + 0.023*"control" + 0.023*"accid" + 0.021*"stall"
2019-03-10 18:34:07,502 : INFO : topic #5 (0.100): 0.050*"inadequ" + 0.041*"preflight" + 0.033*"pilot" + 0.031*"inspect" + 0.030*"mainten" + 0.029*"failur" + 0.028*"due" + 0.025*"airplan" + 0.021*"result" + 0.020*"fuel"
2019-03-10 18:34:07,503 : INFO : topic diff=0.286173, rho=0.353553
2019-03-10 18:34:07,504 : INFO : PROGRESS: pass 0, at docume

2019-03-10 18:34:10,219 : INFO : topic #1 (0.100): 0.134*"fuel" + 0.055*"pilot" + 0.043*"result" + 0.040*"engin" + 0.036*"power" + 0.036*"due" + 0.033*"loss" + 0.030*"exhaust" + 0.025*"inadequ" + 0.024*"starvat"
2019-03-10 18:34:10,220 : INFO : topic #2 (0.100): 0.094*"engin" + 0.080*"loss" + 0.079*"power" + 0.048*"reason" + 0.048*"undetermin" + 0.036*"land" + 0.030*"factor" + 0.030*"forc" + 0.029*"terrain" + 0.025*"due"
2019-03-10 18:34:10,220 : INFO : topic #0 (0.100): 0.078*"pilot" + 0.042*"failur" + 0.037*"maintain" + 0.032*"result" + 0.028*"factor" + 0.025*"flight" + 0.024*"airspe" + 0.023*"accid" + 0.022*"stall" + 0.021*"inadvert"
2019-03-10 18:34:10,221 : INFO : topic diff=0.235732, rho=0.267261
2019-03-10 18:34:10,222 : INFO : PROGRESS: pass 0, at document #30000/76133
2019-03-10 18:34:10,570 : INFO : merging changes from 2000 documents into a model of 76133 documents
2019-03-10 18:34:10,574 : INFO : topic #1 (0.100): 0.132*"fuel" + 0.055*"pilot" + 0.043*"result" + 0.041*"engin

2019-03-10 18:34:12,829 : INFO : topic #6 (0.100): 0.078*"pilot" + 0.051*"factor" + 0.043*"failur" + 0.032*"terrain" + 0.028*"maintain" + 0.027*"runway" + 0.027*"land" + 0.027*"accid" + 0.025*"clearanc" + 0.022*"condit"
2019-03-10 18:34:12,830 : INFO : topic #4 (0.100): 0.060*"flight" + 0.045*"pilot" + 0.044*"student" + 0.041*"instructor" + 0.031*"failur" + 0.030*"inadequ" + 0.028*"action" + 0.024*"remedi" + 0.020*"supervis" + 0.019*"delay"
2019-03-10 18:34:12,831 : INFO : topic diff=0.209265, rho=0.223607
2019-03-10 18:34:12,832 : INFO : PROGRESS: pass 0, at document #42000/76133
2019-03-10 18:34:13,173 : INFO : merging changes from 2000 documents into a model of 76133 documents
2019-03-10 18:34:13,179 : INFO : topic #6 (0.100): 0.079*"pilot" + 0.050*"factor" + 0.044*"failur" + 0.032*"terrain" + 0.029*"runway" + 0.028*"maintain" + 0.028*"land" + 0.026*"accid" + 0.025*"clearanc" + 0.020*"condit"
2019-03-10 18:34:13,179 : INFO : topic #1 (0.100): 0.134*"fuel" + 0.055*"pilot" + 0.044*"re

2019-03-10 18:34:14,848 : INFO : topic #7 (0.100): 0.050*"takeoff" + 0.031*"failur" + 0.029*"result" + 0.021*"procedur" + 0.020*"rotor" + 0.019*"tail" + 0.018*"abort" + 0.018*"perform" + 0.017*"flap" + 0.017*"pilot"
2019-03-10 18:34:14,848 : INFO : topic diff=0.149161, rho=0.196116
2019-03-10 18:34:14,850 : INFO : PROGRESS: pass 0, at document #54000/76133
2019-03-10 18:34:15,180 : INFO : merging changes from 2000 documents into a model of 76133 documents
2019-03-10 18:34:15,184 : INFO : topic #3 (0.100): 0.131*"control" + 0.101*"pilot" + 0.092*"failur" + 0.089*"maintain" + 0.071*"direct" + 0.054*"land" + 0.041*"airplan" + 0.029*"aircraft" + 0.025*"takeoff" + 0.025*"roll"
2019-03-10 18:34:15,185 : INFO : topic #5 (0.100): 0.065*"inadequ" + 0.059*"mainten" + 0.038*"inspect" + 0.037*"preflight" + 0.036*"personnel" + 0.023*"due" + 0.023*"airplan" + 0.023*"failur" + 0.018*"pilot" + 0.017*"cabl"
2019-03-10 18:34:15,186 : INFO : topic #8 (0.100): 0.112*"wind" + 0.099*"condit" + 0.070*"pilot"

2019-03-10 18:34:17,332 : INFO : topic diff=0.138481, rho=0.176777
2019-03-10 18:34:17,333 : INFO : PROGRESS: pass 0, at document #66000/76133
2019-03-10 18:34:17,660 : INFO : merging changes from 2000 documents into a model of 76133 documents
2019-03-10 18:34:17,665 : INFO : topic #8 (0.100): 0.113*"wind" + 0.101*"condit" + 0.069*"pilot" + 0.069*"factor" + 0.064*"inadequ" + 0.060*"compens" + 0.052*"crosswind" + 0.037*"accid" + 0.032*"gust" + 0.030*"gusti"
2019-03-10 18:34:17,666 : INFO : topic #2 (0.100): 0.098*"engin" + 0.080*"loss" + 0.080*"power" + 0.049*"reason" + 0.048*"undetermin" + 0.035*"land" + 0.033*"terrain" + 0.032*"forc" + 0.031*"factor" + 0.030*"due"
2019-03-10 18:34:17,667 : INFO : topic #5 (0.100): 0.065*"inadequ" + 0.058*"mainten" + 0.038*"inspect" + 0.037*"preflight" + 0.034*"personnel" + 0.022*"due" + 0.022*"failur" + 0.022*"airplan" + 0.019*"instal" + 0.016*"exceed"
2019-03-10 18:34:17,669 : INFO : topic #7 (0.100): 0.052*"takeoff" + 0.031*"failur" + 0.028*"result"

2019-03-10 18:34:19,347 : INFO : PROGRESS: pass 0, at document #76133/76133
2019-03-10 18:34:19,372 : INFO : merging changes from 133 documents into a model of 76133 documents
2019-03-10 18:34:19,378 : INFO : topic #8 (0.100): 0.117*"wind" + 0.105*"condit" + 0.069*"factor" + 0.065*"pilot" + 0.062*"inadequ" + 0.057*"compens" + 0.052*"crosswind" + 0.036*"accid" + 0.028*"gusti" + 0.026*"gust"
2019-03-10 18:34:19,379 : INFO : topic #4 (0.100): 0.071*"flight" + 0.050*"student" + 0.040*"instructor" + 0.037*"pilot" + 0.035*"inadequ" + 0.031*"action" + 0.027*"failur" + 0.023*"remedi" + 0.023*"supervis" + 0.021*"delay"
2019-03-10 18:34:19,380 : INFO : topic #1 (0.100): 0.135*"fuel" + 0.056*"pilot" + 0.045*"result" + 0.044*"engin" + 0.040*"power" + 0.038*"due" + 0.038*"loss" + 0.036*"exhaust" + 0.031*"inadequ" + 0.023*"improp"
2019-03-10 18:34:19,381 : INFO : topic #5 (0.100): 0.065*"inadequ" + 0.062*"mainten" + 0.035*"inspect" + 0.035*"personnel" + 0.033*"preflight" + 0.020*"due" + 0.020*"desig

In [41]:
# print the 6 most contributing words (both positively and negatively) for each of the ten topics
lsi_cause.print_topics(10, num_words=6)

2019-03-10 18:34:19,395 : INFO : topic #0(388.554): 0.547*"pilot" + 0.377*"failur" + 0.259*"factor" + 0.255*"land" + 0.233*"result" + 0.217*"maintain"
2019-03-10 18:34:19,396 : INFO : topic #1(205.284): -0.452*"engin" + -0.385*"loss" + -0.370*"power" + -0.364*"fuel" + 0.248*"pilot" + 0.234*"maintain"
2019-03-10 18:34:19,397 : INFO : topic #2(161.944): -0.815*"land" + 0.230*"fuel" + -0.216*"gear" + 0.170*"pilot" + 0.137*"condit" + 0.131*"flight"
2019-03-10 18:34:19,399 : INFO : topic #3(157.583): -0.570*"failur" + 0.323*"pilot" + -0.315*"maintain" + -0.260*"control" + 0.249*"factor" + 0.186*"condit"
2019-03-10 18:34:19,400 : INFO : topic #4(140.004): -0.582*"fuel" + 0.278*"factor" + 0.270*"condit" + -0.244*"result" + 0.220*"loss" + 0.200*"terrain"
2019-03-10 18:34:19,401 : INFO : topic #5(130.365): -0.588*"control" + 0.449*"failur" + -0.252*"loss" + -0.224*"result" + 0.215*"factor" + -0.207*"direct"
2019-03-10 18:34:19,402 : INFO : topic #6(123.525): -0.496*"pilot" + 0.493*"flight" + 0.

[(0,
  '0.547*"pilot" + 0.377*"failur" + 0.259*"factor" + 0.255*"land" + 0.233*"result" + 0.217*"maintain"'),
 (1,
  '-0.452*"engin" + -0.385*"loss" + -0.370*"power" + -0.364*"fuel" + 0.248*"pilot" + 0.234*"maintain"'),
 (2,
  '-0.815*"land" + 0.230*"fuel" + -0.216*"gear" + 0.170*"pilot" + 0.137*"condit" + 0.131*"flight"'),
 (3,
  '-0.570*"failur" + 0.323*"pilot" + -0.315*"maintain" + -0.260*"control" + 0.249*"factor" + 0.186*"condit"'),
 (4,
  '-0.582*"fuel" + 0.278*"factor" + 0.270*"condit" + -0.244*"result" + 0.220*"loss" + 0.200*"terrain"'),
 (5,
  '-0.588*"control" + 0.449*"failur" + -0.252*"loss" + -0.224*"result" + 0.215*"factor" + -0.207*"direct"'),
 (6,
  '-0.496*"pilot" + 0.493*"flight" + 0.296*"condit" + 0.272*"result" + 0.193*"inadequ" + -0.175*"power"'),
 (7,
  '0.566*"result" + -0.366*"fuel" + -0.329*"control" + -0.218*"direct" + -0.205*"factor" + 0.183*"airspe"'),
 (8,
  '0.486*"flight" + -0.318*"result" + -0.267*"factor" + -0.257*"wind" + -0.251*"condit" + -0.196*"maint

0 - pilot's failure + a little land

1 - no engine loss + pilot's failure to maintain

2 - fuel but no land & gear

3 - not pilot's failure

4 - not fuel

5 - not control related

6 - flight condition (no pilot)

7 - results from other stuff (maybe airspeed)

8 - flight

9 - no terrain, no maintain

In [39]:
# for the first five cause documents, show the LSI topic weights next to the original text
for i in range(5):
    topic_weights = lsi_cause[corpus_cause[i]]
    pprint(topic_weights)
    cause_text = narr_dict[eventID[i]]["probable_cause"]
    print(cause_text)
    print("="*115)

[(0, 2.2134468057214542),
 (1, 0.8190752235190043),
 (2, 0.47576476167897175),
 (3, -0.7858322481996173),
 (4, -0.08831176774027856),
 (5, 1.1563023192273383),
 (6, 0.15392489135467155),
 (7, -0.2957487192706866),
 (8, -0.056537401229919904),
 (9, 0.39272119078359746)]
the pilot's failure to maintain clearance with the trees during a long-line operation.  Contributing factors were the Forest Service's inadequate communication between crews, failure to properly assess the safety of the intended drop zone, reduced visibility to the right side of the helicopter, and the trees.
[(0, 1.757388784218556),
 (1, 0.3988580512600002),
 (2, 0.2078773135521299),
 (3, -0.714342122147523),
 (4, -0.4980312492357833),
 (5, 0.3513008316546844),
 (6, 0.034228713900177225),
 (7, 1.4639629228453683),
 (8, -0.7464338845473747),
 (9, -0.7107830242959575)]
The pilot's failure to maintain adequate airspeed which resulted in a stall and subsequent in-flight collision with terrain. 
[(0, 0.4782396648436818),
 (1

In [21]:
# re-train the 10-topic LDA model on the cause corpus; the seed is fixed so the topics are repeatable
lda_cause = ldamodel.LdaModel(corpus=corpus_cause, id2word=dict_cause, num_topics=10,
                              update_every=1, passes=1, random_state=0)
# print the highest-probability words for each of the ten topics
lda_cause.print_topics(10)

2019-03-10 18:05:49,765 : INFO : topic #0 (0.100): 0.104*"engin" + 0.103*"loss" + 0.102*"power" + 0.058*"reason" + 0.055*"undetermin" + 0.048*"land" + 0.043*"forc" + 0.037*"terrain" + 0.035*"due" + 0.033*"factor"
2019-03-10 18:05:49,766 : INFO : topic #1 (0.100): 0.074*"pilot" + 0.047*"improp" + 0.042*"flight" + 0.034*"student" + 0.033*"land" + 0.030*"lack" + 0.028*"factor" + 0.026*"inadequ" + 0.026*"accid" + 0.026*"instructor"
2019-03-10 18:05:49,767 : INFO : topic #2 (0.100): 0.174*"land" + 0.074*"gear" + 0.067*"result" + 0.053*"pilot" + 0.033*"hard" + 0.033*"nose" + 0.029*"failur" + 0.028*"flare" + 0.027*"main" + 0.025*"improp"
2019-03-10 18:05:49,768 : INFO : topic #3 (0.100): 0.048*"failur" + 0.031*"mainten" + 0.025*"engin" + 0.023*"due" + 0.022*"result" + 0.017*"personnel" + 0.017*"improp" + 0.015*"separ" + 0.015*"oil" + 0.014*"inadequ"
2019-03-10 18:05:49,769 : INFO : topic #4 (0.100): 0.039*"brake" + 0.037*"failur" + 0.018*"oper" + 0.017*"system" + 0.016*"crew" + 0.015*"determi

[(0,
  '0.104*"engin" + 0.103*"loss" + 0.102*"power" + 0.058*"reason" + 0.055*"undetermin" + 0.048*"land" + 0.043*"forc" + 0.037*"terrain" + 0.035*"due" + 0.033*"factor"'),
 (1,
  '0.074*"pilot" + 0.047*"improp" + 0.042*"flight" + 0.034*"student" + 0.033*"land" + 0.030*"lack" + 0.028*"factor" + 0.026*"inadequ" + 0.026*"accid" + 0.026*"instructor"'),
 (2,
  '0.174*"land" + 0.074*"gear" + 0.067*"result" + 0.053*"pilot" + 0.033*"hard" + 0.033*"nose" + 0.029*"failur" + 0.028*"flare" + 0.027*"main" + 0.025*"improp"'),
 (3,
  '0.048*"failur" + 0.031*"mainten" + 0.025*"engin" + 0.023*"due" + 0.022*"result" + 0.017*"personnel" + 0.017*"improp" + 0.015*"separ" + 0.015*"oil" + 0.014*"inadequ"'),
 (4,
  '0.039*"brake" + 0.037*"failur" + 0.018*"oper" + 0.017*"system" + 0.016*"crew" + 0.015*"determin" + 0.015*"runway" + 0.014*"could" + 0.014*"right" + 0.014*"normal"'),
 (5,
  '0.086*"pilot" + 0.081*"failur" + 0.068*"maintain" + 0.033*"result" + 0.033*"airspe" + 0.031*"altitud" + 0.031*"clearanc" + 

In [142]:
# Tag each tokenised cause document with its position in docs_cause so that its
# learned vector can be looked up by that integer. TaggedDocument is reached
# through the already-imported doc2vec module (the bare name is never imported).
docs_cause_tag = [doc2vec.TaggedDocument(doc, [i]) for i, doc in enumerate(docs_cause)]
d2v_cause = doc2vec.Doc2Vec(documents=docs_cause_tag, vector_size=32, window=3, min_count=1, workers=4)

# stack the document vectors into one array and cluster them into 10 groups
docvec_cause = np.array([d2v_cause.docvecs[i] for i in range(d2v_cause.docvecs.count)])
kmeans_cause = KMeans(n_clusters=10, random_state=0).fit(docvec_cause)
# for every cluster centre, show the vocabulary words whose vectors are most similar to it
for i in range(10):
    pprint(d2v_cause.wv.most_similar(positive=[kmeans_cause.cluster_centers_[i]]))
    print("="*60)

2019-03-11 16:03:23,067 : INFO : collecting all words and their counts
2019-03-11 16:03:23,068 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2019-03-11 16:03:23,099 : INFO : PROGRESS: at example #10000, processed 97378 words (3226601/s), 2918 word types, 10000 tags
2019-03-11 16:03:23,134 : INFO : PROGRESS: at example #20000, processed 193264 words (2844139/s), 3629 word types, 20000 tags
2019-03-11 16:03:23,175 : INFO : PROGRESS: at example #30000, processed 288906 words (2360877/s), 4023 word types, 30000 tags
2019-03-11 16:03:23,219 : INFO : PROGRESS: at example #40000, processed 383376 words (2257079/s), 4289 word types, 40000 tags
2019-03-11 16:03:23,252 : INFO : PROGRESS: at example #50000, processed 478978 words (2969497/s), 4444 word types, 50000 tags
2019-03-11 16:03:23,283 : INFO : PROGRESS: at example #60000, processed 574622 words (3173475/s), 4554 word types, 60000 tags
2019-03-11 16:03:23,314 : INFO : PROGRESS: at example #70000, processe

[('altitudeclear', 0.7047466039657593),
 ('lowaltitud', 0.6986209750175476),
 ('maneuv', 0.6961405277252197),
 ('mountainhilli', 0.6720404028892517),
 ('quickstop', 0.6639651656150818),
 ('manuev', 0.6601356267929077),
 ('altitud', 0.6485608220100403),
 ('obstacl', 0.645787239074707),
 ('photograph', 0.6301299333572388),
 ('altitudedist', 0.6270813941955566)]
[('septemb', 0.9916329383850098),
 ('chairman', 0.9884711503982544),
 ('myocardi', 0.9876563549041748),
 ('dissent', 0.9868927597999573),
 ('symptom', 0.9838351607322693),
 ('channel', 0.9834141135215759),
 ('massiv', 0.9833166003227234),
 ('pratt', 0.9824110269546509),
 ('brass', 0.9823546409606934),
 ('whitney', 0.9813043475151062)]
[('perfrom', 0.43039458990097046),
 ('inrang', 0.3875208795070648),
 ('recordkeep', 0.3279991149902344),
 ('student', 0.3086649179458618),
 ('tot', 0.28656673431396484),
 ('sic', 0.2739953398704529),
 ('command', 0.25942033529281616),
 ('pilot', 0.2432379573583603),
 ('gusti', 0.23366084694862366),
 

In [113]:
# rebuild the matrix of cause document vectors from the current Doc2Vec model
n_docs = d2v_cause.docvecs.count
docvec_cause = np.array([d2v_cause.docvecs[doc_idx] for doc_idx in range(n_docs)])

# cluster into 10 groups, then list the words most similar to each cluster centre
kmeans_cause = KMeans(n_clusters=10, random_state=0)
kmeans_cause.fit(docvec_cause)
for i in range(10):
    centre = kmeans_cause.cluster_centers_[i]
    nearest_words = d2v_cause.wv.most_similar(positive=[centre])
    pprint(nearest_words)
    print("="*60)

In [None]:
from nltk.cluster.kmeans import KMeansClusterer
from nltk.cluster import util

# Alternative clustering of the cause document vectors with NLTK's k-means,
# configured for 10 clusters, cosine distance and a single repeat.
kclusterer = KMeansClusterer(10, distance=util.cosine_distance, repeats=1)
# NOTE(review): presumably one cluster label per row of docvec_cause - confirm against the NLTK docs
assigned_clusters = kclusterer.cluster(docvec_cause, assign_clusters=True)

In [None]:
# how many times the token "aprx" was counted in the cause corpus (count taken before the frequency filter)
freq_cause["aprx"]

In [194]:
# the word vector the Doc2Vec model learned for the token "aprx"
d2v_cause.wv["aprx"]

array([ 0.00820441, -0.16543093, -0.19626006,  0.02976332,  0.01168264,
        0.01019207, -0.09975437,  0.02857459,  0.07900606,  0.08484948,
       -0.02887544, -0.03541361,  0.14646387,  0.07425206, -0.02884125,
       -0.19182128,  0.05875633,  0.19956933,  0.02336899, -0.07725609,
       -0.17262805,  0.08036457,  0.01805482,  0.00065945,  0.02575268,
       -0.09116887,  0.02671609,  0.02895975,  0.01625756, -0.00432351,
       -0.07310022,  0.0255926 ], dtype=float32)

In [179]:
# the (token, count) pair 5000 places from the end of the most-common-first list,
# i.e. a look at how rare the tail of the cause vocabulary is
freq_cause.most_common()[-5000]

('southwest', 4)

In [181]:
# Tag each tokenised cause document with its position in docs_cause. TaggedDocument is
# reached through the already-imported doc2vec module (the bare name is never imported).
docs_cause_tag = [doc2vec.TaggedDocument(doc, [i]) for i, doc in enumerate(docs_cause)]
# larger vectors and window than the first attempt; tokens seen fewer than 2 times are ignored
d2v_cause = doc2vec.Doc2Vec(documents=docs_cause_tag, vector_size=128,
                            window=5, min_count=2, workers=4, epochs=5)

# stack the document vectors into one array and cluster them into 5 groups
docvec_cause = np.array([d2v_cause.docvecs[i] for i in range(d2v_cause.docvecs.count)])
kmeans_cause = KMeans(n_clusters=5, random_state=0).fit(docvec_cause)
# for every cluster centre, show the vocabulary words whose vectors are most similar to it
for i in range(5):
    pprint(d2v_cause.wv.most_similar(positive=[kmeans_cause.cluster_centers_[i]]))
    print("="*60)

In [225]:
# Tag each tokenised cause document with its position in docs_cause. TaggedDocument is
# reached through the already-imported doc2vec module (the bare name is never imported).
docs_cause_tag = [doc2vec.TaggedDocument(doc, [i]) for i, doc in enumerate(docs_cause)]
d2v_cause = doc2vec.Doc2Vec(documents=docs_cause_tag, vector_size=128,
                            window=5, min_count=2, workers=4, epochs=5)

# stack the document vectors into one array and cluster them into 5 groups
docvec_cause = np.array([d2v_cause.docvecs[i] for i in range(d2v_cause.docvecs.count)])
kmeans_cause = KMeans(n_clusters=5, random_state=0).fit(docvec_cause)

# sanity check: words most similar to the vector of the first cause document
pprint(d2v_cause.wv.most_similar(positive=[d2v_cause.docvecs[0]]))

# for every cluster centre, show the vocabulary words whose vectors are most similar to it
for i in range(5):
    pprint(d2v_cause.wv.most_similar(positive=[kmeans_cause.cluster_centers_[i]]))
    print("="*60)

2019-03-11 16:50:45,220 : INFO : collecting all words and their counts
2019-03-11 16:50:45,221 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2019-03-11 16:50:45,252 : INFO : PROGRESS: at example #10000, processed 97378 words (3176140/s), 2918 word types, 10000 tags
2019-03-11 16:50:45,281 : INFO : PROGRESS: at example #20000, processed 193264 words (3336671/s), 3629 word types, 20000 tags
2019-03-11 16:50:45,317 : INFO : PROGRESS: at example #30000, processed 288906 words (2733862/s), 4023 word types, 30000 tags
2019-03-11 16:50:45,349 : INFO : PROGRESS: at example #40000, processed 383376 words (3010692/s), 4289 word types, 40000 tags
2019-03-11 16:50:45,383 : INFO : PROGRESS: at example #50000, processed 478978 words (2892206/s), 4444 word types, 50000 tags
2019-03-11 16:50:45,417 : INFO : PROGRESS: at example #60000, processed 574622 words (2866037/s), 4554 word types, 60000 tags
2019-03-11 16:50:45,452 : INFO : PROGRESS: at example #70000, processe

[('children', 0.8899456262588501),
 ('arpt', 0.8887323141098022),
 ('id', 0.8854444026947021),
 ('crowd', 0.8837155699729919),
 ('leve', 0.8832873106002808),
 ('distort', 0.882290244102478),
 ('van', 0.8816373348236084),
 ('rwi', 0.8794955015182495),
 ('encroach', 0.8790273070335388),
 ('barrier', 0.8784595727920532)]
[('leveloff', 0.7625458240509033),
 ('thangar', 0.75089430809021),
 ('grumman', 0.742224931716919),
 ('maintin', 0.7404541373252869),
 ('behalf', 0.7387354373931885),
 ('centrifug', 0.7363240718841553),
 ('thew', 0.7356348037719727),
 ('outofgroundeffect', 0.7219597101211548),
 ('apprehens', 0.7200928330421448),
 ('weightshiftcontrol', 0.7183264493942261)]
[('chairman', 0.9934841990470886),
 ('captur', 0.9919688105583191),
 ('aprx', 0.990843653678894),
 ('septemb', 0.9871060252189636),
 ('mi', 0.9867507219314575),
 ('urin', 0.98557448387146),
 ('pratt', 0.9843652248382568),
 ('tantalum', 0.9833601117134094),
 ('pst', 0.9830745458602905),
 ('civil', 0.9824439883232117)]
[(

In [268]:
# density-based clustering of the cause vectors under cosine distance (eps=0.3, min_samples=20)
DBSCAN_cause = DBSCAN(metric="cosine", eps=0.3, min_samples=20)
DBSCAN_cause.fit(docvec_cause)
# distinct labels that were assigned
cluster_labels = np.unique(DBSCAN_cause.labels_)
print(cluster_labels)

[-1  0]


In [269]:
# same clustering with a stricter density requirement (eps=0.3, min_samples=50)
DBSCAN_cause = DBSCAN(metric="cosine", eps=0.3, min_samples=50)
DBSCAN_cause.fit(docvec_cause)
# distinct labels that were assigned
cluster_labels = np.unique(DBSCAN_cause.labels_)
print(cluster_labels)

[-1  0  1]


In [270]:
# same clustering with a wider neighbourhood (eps=0.35, min_samples=20)
DBSCAN_cause = DBSCAN(metric="cosine", eps=0.35, min_samples=20)
DBSCAN_cause.fit(docvec_cause)
# distinct labels that were assigned
cluster_labels = np.unique(DBSCAN_cause.labels_)
print(cluster_labels)

[-1  0]


In [271]:
# wider neighbourhood combined with the stricter density requirement (eps=0.35, min_samples=50)
DBSCAN_cause = DBSCAN(metric="cosine", eps=0.35, min_samples=50)
DBSCAN_cause.fit(docvec_cause)
# distinct labels that were assigned
cluster_labels = np.unique(DBSCAN_cause.labels_)
print(cluster_labels)

[-1  0]


In [280]:
# NOTE(review): docs_narr is not defined in any cell visible here - presumably it comes
# from a preprocess(...) call on the narrative text; confirm and add that cell above.
# Tag each tokenised narrative document with its position in docs_narr. TaggedDocument is
# reached through the already-imported doc2vec module (the bare name is never imported).
docs_narr_tag = [doc2vec.TaggedDocument(doc, [i]) for i, doc in enumerate(docs_narr)]
d2v_narr = doc2vec.Doc2Vec(documents=docs_narr_tag, vector_size=128,
                            window=5, min_count=3, workers=4, epochs=5)

# stack the document vectors into one array and cluster them into 5 groups
docvec_narr = np.array([d2v_narr.docvecs[i] for i in range(d2v_narr.docvecs.count)])
kmeans_narr = KMeans(n_clusters=5, random_state=0).fit(docvec_narr)

# sanity check: words most similar to the vector of the first narrative document
pprint(d2v_narr.wv.most_similar(positive=[d2v_narr.docvecs[0]]))

# for every cluster centre, show the vocabulary words whose vectors are most similar to it
for i in range(5):
    pprint(d2v_narr.wv.most_similar(positive=[kmeans_narr.cluster_centers_[i]]))
    print("="*60)

2019-03-11 19:03:45,845 : INFO : collecting all words and their counts
2019-03-11 19:03:45,846 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2019-03-11 19:03:46,001 : INFO : PROGRESS: at example #10000, processed 823694 words (5337453/s), 12063 word types, 10000 tags
2019-03-11 19:03:46,141 : INFO : PROGRESS: at example #20000, processed 1649519 words (5925249/s), 15498 word types, 20000 tags
2019-03-11 19:03:46,273 : INFO : PROGRESS: at example #30000, processed 2467175 words (6259597/s), 17576 word types, 30000 tags
2019-03-11 19:03:46,403 : INFO : PROGRESS: at example #40000, processed 3275419 words (6230085/s), 19065 word types, 40000 tags
2019-03-11 19:03:46,536 : INFO : PROGRESS: at example #50000, processed 4089702 words (6177711/s), 20162 word types, 50000 tags
2019-03-11 19:03:46,665 : INFO : PROGRESS: at example #60000, processed 4907466 words (6333948/s), 20932 word types, 60000 tags
2019-03-11 19:03:46,797 : INFO : PROGRESS: at example #700

[('roundup', 0.5685921311378479),
 ('blivet', 0.5548129081726074),
 ('rotorengin', 0.5541119575500488),
 ('fruitless', 0.5497933626174927),
 ('hh', 0.5313423871994019),
 ('lineth', 0.5305643081665039),
 ('raptor', 0.529140055179596),
 ('bambi', 0.5254401564598083),
 ('rig’', 0.5249746441841125),
 ('reemphas', 0.5235704183578491)]
[('hampshiregu', 0.921766996383667),
 ('lymphoma', 0.9168108105659485),
 ('roadaldershot', 0.9097431898117065),
 ('emi', 0.8983251452445984),
 ('carabobo', 0.896308958530426),
 ('httpwwwtsbgcca', 0.8947284817695618),
 ('esd', 0.8934133052825928),
 ('mpa', 0.8934093117713928),
 ('avantair', 0.8931686878204346),
 ('subscrib', 0.8888248205184937)]
[('hesh', 0.9470241069793701),
 ('njl', 0.9327125549316406),
 ('mggt', 0.9290626049041748),
 ('kphl', 0.9237028360366821),
 ('guarani', 0.9226034283638),
 ('nicolo', 0.9216816425323486),
 ('silvio', 0.9215511083602905),
 ('chem', 0.9207011461257935),
 ('ofp', 0.9203321933746338),
 ('degpd', 0.9178388118743896)]
[('ndjam

In [288]:
# Look up the entry 35,000 places from the end of the frequency ranking.
# Presumably freq_narr is a collections.Counter, so the result is a
# (token, count) pair — confirm against preprocess().
freq_narr.most_common()[-35000]

('polar', 4)

In [294]:
# List the column labels of the accident table.
pprint(df_accident.columns)

Index(['EventId', 'InvestigationType', 'AccidentNumber', 'EventDate',
       'Location', 'Country', 'Latitude', 'Longitude', 'AirportCode',
       'AirportName', 'InjurySeverity', 'AircraftDamage', 'AircraftCategory',
       'RegistrationNumber', 'Make', 'Model', 'AmateurBuilt',
       'NumberOfEngines', 'EngineType', 'FARDescription', 'Schedule',
       'PurposeOfFlight', 'AirCarrier', 'TotalFatalInjuries',
       'TotalSeriousInjuries', 'TotalMinorInjuries', 'TotalUninjured',
       'WeatherCondition', 'BroadPhaseOfFlight', 'ReportStatus',
       'PublicationDate'],
      dtype='object')


In [296]:
# Display the InvestigationType column of the accident table.
df_accident["InvestigationType"]

ANC00FA018     Accident
ANC00FA024     Accident
ANC00FA052     Accident
ANC00FA056     Accident
ANC00FA076     Accident
ANC00FA081     Accident
ANC00FA082     Accident
ANC00FA093     Accident
ANC00FA110     Accident
ANC00FA128     Accident
ANC00GA071     Accident
ANC00GA121     Accident
ANC00IA010     Incident
ANC00IA034     Incident
ANC00IA063     Incident
ANC00IA083     Incident
ANC00IA088     Incident
ANC00LA001     Accident
ANC00LA002     Accident
ANC00LA004     Accident
ANC00LA005     Accident
ANC00LA006     Accident
ANC00LA008     Accident
ANC00LA009     Accident
ANC00LA011     Accident
ANC00LA012     Accident
ANC00LA013     Accident
ANC00LA014     Accident
ANC00LA015     Accident
ANC00LA016     Accident
                 ...   
WPR15LA235     Accident
WPR15LA237     Accident
WPR15LA239     Accident
WPR15LA240     Accident
WPR15LA242     Accident
WPR15LA248     Accident
WPR15LA249     Accident
WPR15LA251     Accident
WPR15LA253A    Accident
WPR15LA253B    Accident
WPR15TA002     A

In [133]:
# Words whose vectors are most similar to the document vector stored under
# index 0 of d2v_cause (presumably the Doc2Vec model trained on the
# probable-cause text — confirm where d2v_cause is built).
pprint(d2v_cause.wv.most_similar(positive=[d2v_cause.docvecs[0]]))

[('navaid', 0.732263445854187),
 ('companymanag', 0.730559766292572),
 ('picker', 0.7280705571174622),
 ('rebound', 0.7269872426986694),
 ('construct', 0.7238534688949585),
 ('ord', 0.7129491567611694),
 ('ondemand', 0.7099337577819824),
 ('mark', 0.7086915969848633),
 ('harmon', 0.707922101020813),
 ('backho', 0.7016193866729736)]


In [130]:
# For each of the first ten k-means cluster centres, list the words whose
# vectors are closest to that centre, with a rule after every list.
for cause_cluster in range(10):
    cause_center = kmeans_cause.cluster_centers_[cause_cluster]
    pprint(d2v_cause.wv.most_similar(positive=[cause_center]))
    print("=" * 60)



[('clevi', 0.9583625793457031),
 ('nut', 0.956359326839447),
 ('retain', 0.9481732845306396),
 ('gener', 0.9457680583000183),
 ('lug', 0.9457526803016663),
 ('crankcas', 0.9446278810501099),
 ('ore', 0.9433245062828064),
 ('adapt', 0.9411141872406006),
 ('screw', 0.9408109188079834),
 ('hardwar', 0.9397497773170471)]
[('crash', 0.7829271554946899),
 ('ditch', 0.7758039236068726),
 ('build', 0.7612055540084839),
 ('embank', 0.7609546780586243),
 ('onground', 0.7578732967376709),
 ('drainag', 0.7572314739227295),
 ('noseov', 0.7528395652770996),
 ('rollout', 0.7515468597412109),
 ('roadway', 0.7515188455581665),
 ('site', 0.7484804391860962)]
[('statement', 0.9908180236816406),
 ('sumwalt', 0.9858838319778442),
 ('frontal', 0.9851624965667725),
 ('poison', 0.985145628452301),
 ('septemb', 0.984489917755127),
 ('magnitud', 0.9829285144805908),
 ('concur', 0.9828419089317322),
 ('airmen', 0.982098400592804),
 ('pax', 0.9818456172943115),
 ('latest', 0.9802587032318115)]
[('nonmechan', 0.68

In [119]:
# Inspect the raw coordinates of the first k-means cluster centre.
kmeans_cause.cluster_centers_[0] # center 0

array([ 0.01957477, -0.02018932, -0.01784088, -0.02140116, -0.00232202,
       -0.02387729,  0.02633667,  0.02707147,  0.02683154,  0.02939575,
        0.02229942, -0.01895504,  0.00428111,  0.0081991 ,  0.00876583,
        0.02561462, -0.01175715,  0.03616961,  0.00886879, -0.02797291,
       -0.00174379, -0.03989746, -0.03059485, -0.01304858, -0.01840545,
       -0.04705837,  0.03074634, -0.01593476,  0.00688586, -0.02523971,
       -0.03364031,  0.01251351,  0.00814115, -0.04400116,  0.02839603,
       -0.01933283, -0.00418838, -0.05001875, -0.04672788,  0.00390902,
        0.01101736,  0.0328737 , -0.01524266,  0.05820297, -0.0489227 ,
        0.01762442, -0.03065389,  0.03617837,  0.02416482, -0.04013348,
       -0.03095809,  0.066081  ,  0.03011162, -0.02872144, -0.00332343,
        0.01273588, -0.0240224 , -0.03504089,  0.01284178, -0.01255567,
        0.03250001,  0.03597254,  0.02067798, -0.00440285], dtype=float32)

In [120]:
# Show the object that holds the model's word vectors.
d2v_cause.wv

<gensim.models.keyedvectors.Word2VecKeyedVectors at 0x1a6122af28>

In [42]:
### preprocess the "narr" text of every event
# NOTE(review): `preprocess`, `os` and `TEMP_FOLDER` come from other cells;
# they must have been run before this one.
docs_narr, eventID, freq_narr = preprocess(item="narr")

### map every token to an integer id
dict_narr = corpora.Dictionary(docs_narr)
dict_narr.save(os.path.join(TEMP_FOLDER, 'aviation_narr.dict'))  # store the dictionary, for future reference

### bag-of-words corpus: one doc2bow vector per document
corpus_narr = [dict_narr.doc2bow(doc) for doc in docs_narr]
corpora.MmCorpus.serialize(os.path.join(TEMP_FOLDER, 'aviation_narr.mm'), corpus_narr) # store to disk


### Latent Semantic Analysis
# extract 10 LSI topics; use the default one-pass algorithm
lsi_narr = lsimodel.LsiModel(corpus=corpus_narr, id2word=dict_narr, num_topics=10)


### Latent Dirichlet Allocation
# extract 10 LDA topics in a single pass, updating the model after every chunk
# (gensim's default chunk is 2,000 documents, as the training log reports).
# random_state pins the random initialisation so the topics and their
# numbering are the same on every rerun.
lda_narr = ldamodel.LdaModel(corpus=corpus_narr, id2word=dict_narr, num_topics=10, update_every=1, passes=1, random_state=0)

2019-03-10 18:38:56,765 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2019-03-10 18:38:57,355 : INFO : adding document #10000 to Dictionary(12063 unique tokens: ['aar', 'accid', 'action', 'addit', 'aircraft']...)
2019-03-10 18:38:57,937 : INFO : adding document #20000 to Dictionary(15498 unique tokens: ['aar', 'accid', 'action', 'addit', 'aircraft']...)
2019-03-10 18:38:58,511 : INFO : adding document #30000 to Dictionary(17576 unique tokens: ['aar', 'accid', 'action', 'addit', 'aircraft']...)
2019-03-10 18:38:59,089 : INFO : adding document #40000 to Dictionary(19065 unique tokens: ['aar', 'accid', 'action', 'addit', 'aircraft']...)
2019-03-10 18:38:59,661 : INFO : adding document #50000 to Dictionary(20162 unique tokens: ['aar', 'accid', 'action', 'addit', 'aircraft']...)
2019-03-10 18:39:00,239 : INFO : adding document #60000 to Dictionary(20932 unique tokens: ['aar', 'accid', 'action', 'addit', 'aircraft']...)
2019-03-10 18:39:00,817 : INFO : adding document #70000

2019-03-10 18:39:11,306 : INFO : topic #2(338.125): -0.705*"fuel" + -0.412*"engin" + -0.277*"tank" + 0.217*"airplan" + 0.212*"runway" + -0.150*"power" + 0.106*"pilot" + -0.091*"gallon" + 0.068*"feet" + 0.067*"wind"
2019-03-10 18:39:11,307 : INFO : topic #3(280.442): -0.489*"land" + -0.345*"gear" + 0.306*"flight" + -0.228*"airplan" + -0.225*"runway" + 0.192*"pilot" + 0.188*"helicopt" + -0.147*"left" + 0.134*"accid" + -0.120*"right"
2019-03-10 18:39:11,308 : INFO : topic #4(250.916): -0.541*"airplan" + 0.444*"pilot" + 0.364*"land" + 0.263*"helicopt" + 0.217*"gear" + -0.169*"acft" + 0.157*"flight" + -0.156*"investig" + -0.130*"plt" + -0.111*"travel"
2019-03-10 18:39:11,311 : INFO : preparing a new chunk of documents
2019-03-10 18:39:11,592 : INFO : using 100 extra samples and 2 power iterations
2019-03-10 18:39:11,595 : INFO : 1st phase: constructing (21580, 110) action matrix
2019-03-10 18:39:11,713 : INFO : orthonormalizing (21580, 110) action matrix
2019-03-10 18:39:12,111 : INFO : 2nd

2019-03-10 18:39:16,541 : INFO : topic #2 (0.100): 0.026*"flight" + 0.025*"land" + 0.022*"airplan" + 0.017*"investig" + 0.016*"gear" + 0.014*"pilot" + 0.011*"accid" + 0.010*"data" + 0.010*"aircraft" + 0.010*"prepar"
2019-03-10 18:39:16,542 : INFO : topic #8 (0.100): 0.026*"investig" + 0.025*"engin" + 0.022*"pilot" + 0.021*"airplan" + 0.016*"accid" + 0.016*"aircraft" + 0.013*"fuel" + 0.011*"travel" + 0.010*"sourc" + 0.010*"data"
2019-03-10 18:39:16,544 : INFO : topic #7 (0.100): 0.044*"fuel" + 0.020*"pilot" + 0.020*"investig" + 0.018*"engin" + 0.017*"accid" + 0.017*"tank" + 0.015*"aircraft" + 0.014*"land" + 0.012*"travel" + 0.012*"use"
2019-03-10 18:39:16,545 : INFO : topic #5 (0.100): 0.022*"investig" + 0.017*"accid" + 0.013*"flight" + 0.012*"airplan" + 0.012*"travel" + 0.012*"use" + 0.009*"aircraft" + 0.009*"engin" + 0.009*"pilot" + 0.008*"support"
2019-03-10 18:39:16,546 : INFO : topic #3 (0.100): 0.038*"airplan" + 0.024*"pilot" + 0.023*"runway" + 0.021*"investig" + 0.016*"accid" + 0

2019-03-10 18:39:23,054 : INFO : topic #2 (0.100): 0.041*"gear" + 0.039*"land" + 0.029*"flight" + 0.017*"airplan" + 0.013*"pilot" + 0.013*"investig" + 0.011*"instructor" + 0.009*"posit" + 0.009*"aircraft" + 0.008*"extend"
2019-03-10 18:39:23,055 : INFO : topic #9 (0.100): 0.047*"investig" + 0.038*"aircraft" + 0.030*"land" + 0.026*"accid" + 0.024*"pilot" + 0.024*"use" + 0.024*"travel" + 0.024*"sourc" + 0.024*"may" + 0.024*"ntsb"
2019-03-10 18:39:23,056 : INFO : topic #1 (0.100): 0.050*"acft" + 0.039*"plt" + 0.029*"investig" + 0.019*"ft" + 0.019*"travel" + 0.016*"flt" + 0.013*"rwi" + 0.013*"use" + 0.012*"accid" + 0.011*"aircraft"
2019-03-10 18:39:23,057 : INFO : topic diff=0.349872, rho=0.353553
2019-03-10 18:39:23,058 : INFO : PROGRESS: pass 0, at document #18000/76133
2019-03-10 18:39:24,042 : INFO : merging changes from 2000 documents into a model of 76133 documents
2019-03-10 18:39:24,059 : INFO : topic #3 (0.100): 0.054*"airplan" + 0.042*"runway" + 0.031*"pilot" + 0.016*"investig" +

2019-03-10 18:39:30,430 : INFO : topic #3 (0.100): 0.056*"airplan" + 0.045*"runway" + 0.032*"pilot" + 0.015*"investig" + 0.014*"left" + 0.013*"aircraft" + 0.013*"accid" + 0.012*"land" + 0.012*"feet" + 0.012*"takeoff"
2019-03-10 18:39:30,432 : INFO : topic #8 (0.100): 0.028*"pilot" + 0.024*"investig" + 0.024*"airplan" + 0.020*"accid" + 0.015*"travel" + 0.015*"flight" + 0.014*"aircraft" + 0.012*"feet" + 0.011*"data" + 0.010*"engin"
2019-03-10 18:39:30,433 : INFO : topic diff=0.273236, rho=0.267261
2019-03-10 18:39:30,434 : INFO : PROGRESS: pass 0, at document #30000/76133
2019-03-10 18:39:31,366 : INFO : merging changes from 2000 documents into a model of 76133 documents
2019-03-10 18:39:31,383 : INFO : topic #4 (0.100): 0.035*"pilot" + 0.032*"airplan" + 0.026*"investig" + 0.021*"land" + 0.020*"accid" + 0.019*"left" + 0.018*"right" + 0.017*"aircraft" + 0.014*"report" + 0.014*"use"
2019-03-10 18:39:31,384 : INFO : topic #7 (0.100): 0.071*"fuel" + 0.046*"engin" + 0.026*"tank" + 0.024*"powe

2019-03-10 18:39:37,423 : INFO : topic #3 (0.100): 0.053*"airplan" + 0.047*"runway" + 0.033*"pilot" + 0.015*"left" + 0.014*"investig" + 0.013*"aircraft" + 0.012*"takeoff" + 0.012*"accid" + 0.012*"feet" + 0.012*"report"
2019-03-10 18:39:37,424 : INFO : topic diff=0.228188, rho=0.223607
2019-03-10 18:39:37,425 : INFO : PROGRESS: pass 0, at document #42000/76133
2019-03-10 18:39:38,303 : INFO : merging changes from 2000 documents into a model of 76133 documents
2019-03-10 18:39:38,320 : INFO : topic #6 (0.100): 0.039*"flight" + 0.020*"pilot" + 0.020*"time" + 0.018*"passeng" + 0.017*"airplan" + 0.017*"airport" + 0.013*"sourc" + 0.013*"author" + 0.013*"oper" + 0.012*"injur"
2019-03-10 18:39:38,322 : INFO : topic #9 (0.100): 0.053*"investig" + 0.045*"aircraft" + 0.031*"land" + 0.029*"accid" + 0.029*"may" + 0.028*"sourc" + 0.027*"use" + 0.027*"provid" + 0.027*"prepar" + 0.027*"ntsb"
2019-03-10 18:39:38,323 : INFO : topic #2 (0.100): 0.043*"gear" + 0.042*"land" + 0.023*"flight" + 0.013*"airpla

2019-03-10 18:39:43,739 : INFO : merging changes from 2000 documents into a model of 76133 documents
2019-03-10 18:39:43,755 : INFO : topic #9 (0.100): 0.055*"investig" + 0.046*"aircraft" + 0.032*"land" + 0.030*"accid" + 0.029*"may" + 0.028*"sourc" + 0.028*"use" + 0.028*"provid" + 0.028*"prepar" + 0.027*"ntsb"
2019-03-10 18:39:43,757 : INFO : topic #5 (0.100): 0.041*"helicopt" + 0.017*"rotor" + 0.011*"blade" + 0.011*"tail" + 0.011*"investig" + 0.010*"control" + 0.010*"accid" + 0.009*"flight" + 0.009*"separ" + 0.008*"main"
2019-03-10 18:39:43,758 : INFO : topic #4 (0.100): 0.038*"airplan" + 0.036*"pilot" + 0.025*"investig" + 0.023*"land" + 0.020*"accid" + 0.019*"left" + 0.018*"right" + 0.015*"aircraft" + 0.015*"oper" + 0.015*"report"
2019-03-10 18:39:43,759 : INFO : topic #0 (0.100): 0.039*"engin" + 0.020*"investig" + 0.018*"reveal" + 0.014*"examin" + 0.013*"oil" + 0.011*"land" + 0.011*"aircraft" + 0.010*"sourc" + 0.010*"use" + 0.010*"travel"
2019-03-10 18:39:43,761 : INFO : topic #7 (0

2019-03-10 18:39:50,492 : INFO : topic #0 (0.100): 0.040*"engin" + 0.020*"investig" + 0.019*"reveal" + 0.015*"examin" + 0.014*"oil" + 0.012*"land" + 0.011*"aircraft" + 0.010*"sourc" + 0.009*"use" + 0.009*"travel"
2019-03-10 18:39:50,493 : INFO : topic #5 (0.100): 0.046*"helicopt" + 0.019*"rotor" + 0.012*"blade" + 0.011*"tail" + 0.011*"control" + 0.010*"investig" + 0.010*"accid" + 0.009*"main" + 0.009*"separ" + 0.008*"flight"
2019-03-10 18:39:50,495 : INFO : topic #1 (0.100): 0.068*"acft" + 0.052*"plt" + 0.040*"investig" + 0.026*"travel" + 0.024*"ft" + 0.021*"rwi" + 0.018*"flt" + 0.015*"eng" + 0.015*"use" + 0.013*"sourc"
2019-03-10 18:39:50,496 : INFO : topic #7 (0.100): 0.066*"fuel" + 0.050*"engin" + 0.026*"power" + 0.026*"pilot" + 0.024*"tank" + 0.020*"airplan" + 0.017*"investig" + 0.015*"accid" + 0.015*"land" + 0.011*"aircraft"
2019-03-10 18:39:50,497 : INFO : topic diff=0.155169, rho=0.174078
2019-03-10 18:39:50,498 : INFO : PROGRESS: pass 0, at document #68000/76133
2019-03-10 18:3

2019-03-10 18:39:55,075 : INFO : topic #9 (0.100): 0.057*"investig" + 0.047*"aircraft" + 0.034*"land" + 0.031*"may" + 0.031*"accid" + 0.029*"sourc" + 0.029*"use" + 0.029*"provid" + 0.029*"ntsb" + 0.029*"prepar"
2019-03-10 18:39:55,076 : INFO : topic #4 (0.100): 0.042*"airplan" + 0.036*"pilot" + 0.025*"investig" + 0.024*"land" + 0.020*"accid" + 0.020*"left" + 0.019*"right" + 0.015*"oper" + 0.015*"report" + 0.014*"aircraft"
2019-03-10 18:39:55,077 : INFO : topic diff=0.117186, rho=0.160128


In [49]:
# Print the raw narrative of the first five events, following the order of
# eventID, with a rule after each one.
for narr_pos in range(5):
    narr_event = narr_dict[eventID[narr_pos]]
    pprint(narr_event["narrative"])
    print("=" * 80)

('NTSB investigators may not have traveled in support of this investigation '
 'and used data provided by various sources to prepare this public aircraft '
 'accident report.The accident occurred while the helicopter was supporting '
 'firefighting efforts with long-line operations. Two days prior to the '
 'accident, the division group supervisor (DIVS) anchored a colored reflective '
 'panel used for indicating landing and drop zones at the accident location.  '
 'The DIVS stated that no site assessment was performed at the time of the '
 'panel placement because the placement was not intended to be the indicator '
 'of the drop zone for blivet deliveries.  The terrain in the area consisted '
 'of steep slopes and trees varying in height from 75 to 200 feet.  One day '
 'prior to the accident, the location of the panel was not changed from the '
 'previous day and remained as placed by the DIVS.  The accident helicopter, '
 'equipped with a 150-foot-long line, then made the blivet dr

In [38]:
# Topic mixture of the first document in `corpus`.
# NOTE(review): the cell that assigns `lda` in this part of the notebook comes
# after this one — confirm `lda` and `corpus` exist on a fresh top-to-bottom run.
lda.get_document_topics(corpus[0])

[(1, 0.19169757),
 (3, 0.25452796),
 (5, 0.033713732),
 (7, 0.15853497),
 (8, 0.19672607),
 (9, 0.16350488)]

In [39]:
from gensim.models import ldamodel

# extract 20 LDA topics in a single pass, updating the model after every chunk
# (2,000 documents by default, as the training log reports).
# random_state fixes the random initialisation so topic ids stay the same
# across reruns.
lda = ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=20, update_every=1, passes=1, random_state=0)

2019-03-10 01:25:44,399 : INFO : using symmetric alpha at 0.05
2019-03-10 01:25:44,400 : INFO : using symmetric eta at 0.05
2019-03-10 01:25:44,405 : INFO : using serial LDA version on this node
2019-03-10 01:25:44,457 : INFO : running online (single-pass) LDA training, 20 topics, 1 passes over the supplied corpus of 76133 documents, updating model once every 2000 documents, evaluating perplexity every 20000 documents, iterating 50x with a convergence threshold of 0.001000
2019-03-10 01:25:44,459 : INFO : PROGRESS: pass 0, at document #2000/76133
2019-03-10 01:25:45,895 : INFO : merging changes from 2000 documents into a model of 76133 documents
2019-03-10 01:25:45,950 : INFO : topic #5 (0.050): 0.031*"airplan" + 0.019*"land" + 0.019*"investig" + 0.016*"pilot" + 0.013*"accid" + 0.011*"flight" + 0.011*"fuel" + 0.010*"use" + 0.010*"support" + 0.009*"prepar"
2019-03-10 01:25:45,953 : INFO : topic #9 (0.050): 0.022*"pilot" + 0.019*"accid" + 0.019*"airplan" + 0.018*"investig" + 0.017*"aircr

2019-03-10 01:25:52,541 : INFO : topic diff=0.532716, rho=0.408248
2019-03-10 01:25:52,543 : INFO : PROGRESS: pass 0, at document #14000/76133
2019-03-10 01:25:53,641 : INFO : merging changes from 2000 documents into a model of 76133 documents
2019-03-10 01:25:53,683 : INFO : topic #6 (0.050): 0.047*"pilot" + 0.044*"land" + 0.026*"investig" + 0.026*"aircraft" + 0.024*"airplan" + 0.020*"runway" + 0.020*"accid" + 0.014*"provid" + 0.014*"use" + 0.014*"ntsb"
2019-03-10 01:25:53,684 : INFO : topic #7 (0.050): 0.028*"pilot" + 0.021*"investig" + 0.017*"flight" + 0.016*"accid" + 0.014*"weather" + 0.013*"travel" + 0.011*"condit" + 0.011*"aircraft" + 0.010*"approach" + 0.009*"data"
2019-03-10 01:25:53,685 : INFO : topic #2 (0.050): 0.037*"helicopt" + 0.032*"pilot" + 0.031*"student" + 0.025*"flight" + 0.022*"control" + 0.020*"investig" + 0.018*"aircraft" + 0.016*"instructor" + 0.015*"accid" + 0.014*"rotor"
2019-03-10 01:25:53,686 : INFO : topic #16 (0.050): 0.090*"fuel" + 0.030*"engin" + 0.030*"t

2019-03-10 01:26:01,038 : INFO : PROGRESS: pass 0, at document #26000/76133
2019-03-10 01:26:02,126 : INFO : merging changes from 2000 documents into a model of 76133 documents
2019-03-10 01:26:02,170 : INFO : topic #9 (0.050): 0.042*"airplan" + 0.026*"pilot" + 0.022*"runway" + 0.021*"feet" + 0.016*"takeoff" + 0.012*"accid" + 0.010*"aircraft" + 0.010*"control" + 0.009*"climb" + 0.009*"engin"
2019-03-10 01:26:02,171 : INFO : topic #19 (0.050): 0.059*"investig" + 0.038*"travel" + 0.033*"aircraft" + 0.022*"accid" + 0.022*"use" + 0.020*"variou" + 0.020*"prepar" + 0.020*"support" + 0.020*"sourc" + 0.020*"ntsb"
2019-03-10 01:26:02,172 : INFO : topic #2 (0.050): 0.049*"helicopt" + 0.034*"student" + 0.033*"pilot" + 0.024*"flight" + 0.023*"control" + 0.020*"investig" + 0.019*"rotor" + 0.018*"instructor" + 0.017*"aircraft" + 0.015*"accid"
2019-03-10 01:26:02,174 : INFO : topic #11 (0.050): 0.086*"acft" + 0.059*"plt" + 0.051*"rwi" + 0.033*"investig" + 0.033*"ft" + 0.023*"sourc" + 0.020*"use" + 0.

2019-03-10 01:26:08,613 : INFO : topic #15 (0.050): 0.040*"gear" + 0.030*"land" + 0.017*"oil" + 0.015*"investig" + 0.014*"reveal" + 0.012*"failur" + 0.011*"engin" + 0.010*"main" + 0.010*"examin" + 0.010*"system"
2019-03-10 01:26:08,614 : INFO : topic #6 (0.050): 0.049*"land" + 0.047*"pilot" + 0.031*"aircraft" + 0.030*"investig" + 0.025*"runway" + 0.021*"accid" + 0.019*"airplan" + 0.017*"use" + 0.017*"provid" + 0.016*"may"
2019-03-10 01:26:08,615 : INFO : topic #8 (0.050): 0.068*"wind" + 0.031*"pilot" + 0.029*"knot" + 0.026*"gust" + 0.024*"accid" + 0.024*"investig" + 0.022*"aircraft" + 0.020*"report" + 0.018*"degre" + 0.016*"condit"
2019-03-10 01:26:08,616 : INFO : topic #11 (0.050): 0.088*"acft" + 0.060*"plt" + 0.045*"rwi" + 0.039*"investig" + 0.032*"ft" + 0.024*"sourc" + 0.023*"use" + 0.021*"may" + 0.020*"travel" + 0.020*"data"
2019-03-10 01:26:08,618 : INFO : topic diff=0.290299, rho=0.229416
2019-03-10 01:26:10,164 : INFO : -6.636 per-word bound, 99.5 perplexity estimate based on a 

2019-03-10 01:26:16,275 : INFO : topic #14 (0.050): 0.040*"pilot" + 0.036*"airplan" + 0.025*"accid" + 0.018*"investig" + 0.015*"flight" + 0.015*"wit" + 0.013*"aircraft" + 0.012*"impact" + 0.010*"travel" + 0.010*"failur"
2019-03-10 01:26:16,276 : INFO : topic #15 (0.050): 0.044*"gear" + 0.032*"land" + 0.018*"oil" + 0.015*"investig" + 0.013*"reveal" + 0.012*"failur" + 0.011*"engin" + 0.010*"examin" + 0.010*"main" + 0.009*"system"
2019-03-10 01:26:16,277 : INFO : topic #19 (0.050): 0.064*"investig" + 0.042*"travel" + 0.036*"aircraft" + 0.023*"accid" + 0.023*"without" + 0.022*"work" + 0.022*"either" + 0.022*"amount" + 0.022*"conduct" + 0.021*"obtain"
2019-03-10 01:26:16,279 : INFO : topic diff=0.243777, rho=0.200000
2019-03-10 01:26:16,280 : INFO : PROGRESS: pass 0, at document #52000/76133
2019-03-10 01:26:17,220 : INFO : merging changes from 2000 documents into a model of 76133 documents
2019-03-10 01:26:17,263 : INFO : topic #1 (0.050): 0.076*"engin" + 0.039*"power" + 0.025*"investig" +

2019-03-10 01:26:24,260 : INFO : topic #6 (0.050): 0.050*"land" + 0.047*"pilot" + 0.033*"aircraft" + 0.032*"investig" + 0.026*"runway" + 0.022*"accid" + 0.018*"use" + 0.018*"provid" + 0.018*"airplan" + 0.017*"may"
2019-03-10 01:26:24,261 : INFO : topic #17 (0.050): 0.034*"park" + 0.033*"taxi" + 0.018*"taxiway" + 0.017*"ramp" + 0.016*"vehicl" + 0.014*"oper" + 0.013*"truck" + 0.012*"area" + 0.010*"clearanc" + 0.009*"load"
2019-03-10 01:26:24,263 : INFO : topic diff=0.207489, rho=0.179605
2019-03-10 01:26:24,264 : INFO : PROGRESS: pass 0, at document #64000/76133
2019-03-10 01:26:25,271 : INFO : merging changes from 2000 documents into a model of 76133 documents
2019-03-10 01:26:25,317 : INFO : topic #19 (0.050): 0.065*"investig" + 0.043*"travel" + 0.037*"aircraft" + 0.024*"without" + 0.024*"work" + 0.024*"either" + 0.024*"conduct" + 0.023*"amount" + 0.023*"obtain" + 0.023*"accid"
2019-03-10 01:26:25,318 : INFO : topic #9 (0.050): 0.047*"airplan" + 0.026*"pilot" + 0.025*"runway" + 0.023*"

2019-03-10 01:26:30,597 : INFO : topic diff=0.184687, rho=0.164399
2019-03-10 01:26:30,598 : INFO : PROGRESS: pass 0, at document #76000/76133
2019-03-10 01:26:31,535 : INFO : merging changes from 2000 documents into a model of 76133 documents
2019-03-10 01:26:31,578 : INFO : topic #13 (0.050): 0.051*"helicopt" + 0.026*"sourc" + 0.025*"rotor" + 0.023*"blade" + 0.022*"author" + 0.022*"foreign" + 0.018*"turbin" + 0.017*"drive" + 0.013*"oper" + 0.013*"tail"
2019-03-10 01:26:31,579 : INFO : topic #12 (0.050): 0.061*"ice" + 0.048*"carburetor" + 0.028*"heat" + 0.027*"power" + 0.023*"cfi" + 0.020*"condit" + 0.018*"temperatur" + 0.017*"flight" + 0.016*"degre" + 0.014*"instructor"
2019-03-10 01:26:31,580 : INFO : topic #11 (0.050): 0.081*"acft" + 0.053*"plt" + 0.052*"investig" + 0.035*"rwi" + 0.028*"sourc" + 0.028*"use" + 0.026*"may" + 0.026*"travel" + 0.026*"ntsb" + 0.025*"variou"
2019-03-10 01:26:31,581 : INFO : topic #4 (0.050): 0.047*"flight" + 0.025*"time" + 0.022*"airplan" + 0.020*"airpor

In [40]:
# Show up to 20 topics of `lda`, each as a weighted list of its top words.
lda.print_topics(20)

2019-03-10 01:26:31,857 : INFO : topic #0 (0.050): 0.034*"flight" + 0.026*"crew" + 0.024*"passeng" + 0.022*"seat" + 0.016*"captain" + 0.013*"incid" + 0.013*"door" + 0.012*"aircraft" + 0.012*"investig" + 0.011*"turbul"
2019-03-10 01:26:31,859 : INFO : topic #1 (0.050): 0.078*"engin" + 0.039*"power" + 0.025*"investig" + 0.022*"loss" + 0.022*"pilot" + 0.020*"land" + 0.018*"accid" + 0.018*"aircraft" + 0.014*"airplan" + 0.013*"forc"
2019-03-10 01:26:31,861 : INFO : topic #2 (0.050): 0.055*"helicopt" + 0.043*"student" + 0.030*"flight" + 0.029*"pilot" + 0.027*"instructor" + 0.023*"control" + 0.023*"rotor" + 0.018*"investig" + 0.015*"tail" + 0.015*"accid"
2019-03-10 01:26:31,863 : INFO : topic #3 (0.050): 0.052*"airplan" + 0.041*"pilot" + 0.030*"land" + 0.025*"left" + 0.024*"investig" + 0.023*"runway" + 0.022*"right" + 0.018*"accid" + 0.015*"control" + 0.015*"aircraft"
2019-03-10 01:26:31,865 : INFO : topic #4 (0.050): 0.048*"flight" + 0.024*"time" + 0.021*"airplan" + 0.019*"airport" + 0.016*"

[(0,
  '0.034*"flight" + 0.026*"crew" + 0.024*"passeng" + 0.022*"seat" + 0.016*"captain" + 0.013*"incid" + 0.013*"door" + 0.012*"aircraft" + 0.012*"investig" + 0.011*"turbul"'),
 (1,
  '0.078*"engin" + 0.039*"power" + 0.025*"investig" + 0.022*"loss" + 0.022*"pilot" + 0.020*"land" + 0.018*"accid" + 0.018*"aircraft" + 0.014*"airplan" + 0.013*"forc"'),
 (2,
  '0.055*"helicopt" + 0.043*"student" + 0.030*"flight" + 0.029*"pilot" + 0.027*"instructor" + 0.023*"control" + 0.023*"rotor" + 0.018*"investig" + 0.015*"tail" + 0.015*"accid"'),
 (3,
  '0.052*"airplan" + 0.041*"pilot" + 0.030*"land" + 0.025*"left" + 0.024*"investig" + 0.023*"runway" + 0.022*"right" + 0.018*"accid" + 0.015*"control" + 0.015*"aircraft"'),
 (4,
  '0.048*"flight" + 0.024*"time" + 0.021*"airplan" + 0.019*"airport" + 0.016*"pilot" + 0.014*"passeng" + 0.013*"oper" + 0.012*"visual" + 0.012*"condit" + 0.012*"injur"'),
 (5,
  '0.016*"control" + 0.015*"wing" + 0.015*"airplan" + 0.014*"separ" + 0.013*"inspect" + 0.012*"propel" + 

In [41]:
# Value stored for the token "left" in `frequency`
# (presumably a token -> occurrence-count mapping — confirm where it is built).
frequency["left"]

47754

In [43]:
# Pretty-print the first eleven event records held in narr_dict.
for shown_count, event_record in enumerate(narr_dict.values(), start=1):
    pprint(event_record)
    if shown_count > 10:
        break

{'EventId': '20070804X01107',
 'narrative': 'NTSB investigators may not have traveled in support of this '
              'investigation and used data provided by various sources to '
              'prepare this public aircraft accident report.The accident '
              'occurred while the helicopter was supporting firefighting '
              'efforts with long-line operations. Two days prior to the '
              'accident, the division group supervisor (DIVS) anchored a '
              'colored reflective panel used for indicating landing and drop '
              'zones at the accident location.  The DIVS stated that no site '
              'assessment was performed at the time of the panel placement '
              'because the placement was not intended to be the indicator of '
              'the drop zone for blivet deliveries.  The terrain in the area '
              'consisted of steep slopes and trees varying in height from 75 '
              'to 200 feet.  One day prior to 

In [12]:
# Disabled experiment: the TF-IDF similarity code below is wrapped in a string
# literal, so it never runs; the cell only echoes the string.
# NOTE(review): num_features=5 looks like a placeholder — confirm it matches
# the dictionary size before re-enabling this code.
'''
from gensim import models

tfidf = models.TfidfModel(corpus)

vec = corpus[0]

from gensim import similarities
index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=5)

sims = index[tfidf[vec]]

list(enumerate(sims))[0:10]
'''

'\nfrom gensim import models\n\ntfidf = models.TfidfModel(corpus)\n\nvec = corpus[0]\n\nfrom gensim import similarities\nindex = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=5)\n\nsims = index[tfidf[vec]]\n\nlist(enumerate(sims))[0:10]\n'

In [13]:
#narr_dict['20070804X01107']["narrative"].translate(str.maketrans('', '', string.punctuation)).lower()

|ROWS  |Explanation |Type(should be) |Example| 
|------|----------- |-----|-------|
|EventDate | X |str|"MM/DD/YYYY"|
|Location  | X |str|"City, State"|
|Country| X | str | "United States" |
|Latitude| X |double| "-23.094356" |
|Longitude| X |double| "-95.123456" |
|AirportCode| - |str| "IWS" |
|AirportName| same as above|str|"WEST HOUSTON"|
|InjurySeverity| - |str|"Non-Fatal"|
|AircraftDamage| - |str|"Substantial"|
|AircraftCategory| - |str|"Airplane"|
|RegistrationNumber| - |str|"V452CS"|
|Make| - |str|"CESSNA"|
|Model| - |str|"T240"|
|AmateurBuilt| - |str|"No"|
|NumberOfEngines| - |int?|"2"|
|EngineType| - |str|"Turbo Jet"|
|Model| - |str|"T240"|
|Model| - |str|"T240"|

