In [1]:
import os
import string
import numpy as np
import pandas as pd
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix

The Stanford tagger runs on Java, so the `JAVAHOME` environment variable must point to the Java executable on your machine.

In [2]:
# Tell NLTK where the Java executable lives. setdefault() keeps a JAVAHOME that is
# already configured on this machine and only falls back to the path below when the
# variable is unset, so the notebook does not clobber a working local setup.
os.environ.setdefault(
    'JAVAHOME',
    "C:/Program Files (x86)/Common Files/Oracle/Java/javapath/java.exe",
)

Sources for the model:

In [3]:
# Reference links: the NLTK wrapper module first, then the Stanford NER project page.
print(
    'http://www.nltk.org/api/nltk.tag.html#module-nltk.tag.stanford',
    'https://nlp.stanford.edu/software/CRF-NER.html',
    sep='\n',
)

http://www.nltk.org/api/nltk.tag.html#module-nltk.tag.stanford
https://nlp.stanford.edu/software/CRF-NER.html


There are three different pre-trained models to choose from. NLTK provides a wrapper module for running the Stanford model: just give it the paths to the model file and to the Stanford NER jar.

In [4]:
# Three model files are listed; exactly one assignment should be left uncommented.
model = 'english.all.3class.distsim.crf.ser.gz'
#model = 'english.conll.4class.distsim.crf.ser.gz'
#model = 'english.muc.7class.distsim.crf.ser.gz'

# Paths handed to the tagger: the serialized model, then the Stanford NER jar.
model_path = './LADOTD Data/stanford/' + model
jar_path = './LADOTD Data/stanford/stanford-ner.jar'

ner = StanfordNERTagger(model_path, jar_path, encoding='utf-8')

---

Read in the tweets to be processed

In [6]:
# Load the road-closure reports; the cell's last expression displays (rows, columns).
road_closures_csv = './LADOTD Data/road_closures.csv'
corpus = pd.read_csv(road_closures_csv)
corpus.shape

(17229, 2)

Clean the tweets by stripping all punctuation characters

In [7]:
# Characters to strip: ASCII punctuation plus the en dash, which string.punctuation lacks.
punct = set(string.punctuation + '–')


def clean(document):
    """Return ``document`` with every character that appears in ``punct`` removed.

    Characters are deleted, not replaced, so words separated only by punctuation
    end up joined together (``'a/b'`` becomes ``'ab'``).
    """
    kept_chars = (char for char in document if char not in punct)
    return ''.join(kept_chars)
# Store the punctuation-free text in a new column, leaving the raw 'report' column untouched.
corpus['report_clean'] = corpus['report'].map(clean)

Iterate over a sample of three tweets and keep the tokens tagged as LOCATION or ORGANIZATION. This is just a demonstration.

In [8]:
# Demo on the first three cleaned reports only.
sample_tokens = [word_tokenize(document) for document in corpus['report_clean'].values[:3]]

# tag_sents() tags the whole batch in one call instead of one ner.tag() call per report.
sample_tagged = ner.tag_sents(sample_tokens)

# Filter on the label (second element of each (word, label) pair). Testing
# `'LOCATION' in tag` against the whole tuple would also keep a word that is
# literally "LOCATION" or "ORGANIZATION", whatever label it received.
location_tags = [
    [tag for tag in tagged if tag[1] in ('ORGANIZATION', 'LOCATION')]
    for tagged in sample_tagged
]

In [9]:
# Pair the first three raw reports with the tags extracted for them.
# assign() adds the column on a new frame, which avoids the SettingWithCopyWarning
# that adding a column to the head() slice can raise.
test_df = corpus[['report']].head(3).assign(tags=location_tags)
test_df

Unnamed: 0,report,tags
0,I-10 Westbound Bridge Joint Repair over City P...,"[(City, LOCATION), (Park, LOCATION), (LakeDalr..."
1,"LA 611-1 (River Road), Jefferson Parish, S.P. ...","[(LA, LOCATION)]"
2,ROAD CLOSURE: Swan Lake Road northbound at I-2...,"[(ROAD, LOCATION), (CLOSURE, LOCATION), (Swan,..."


In [10]:
# Print one sample report followed by the tags extracted from it.
i = 0
sample_row = test_df.iloc[i]
print(sample_row['report'])
print(*sample_row['tags'])

I-10 Westbound Bridge Joint Repair over City Park Lake/Dalrymple Dr., East Baton Rouge Parish
('City', 'LOCATION') ('Park', 'LOCATION') ('LakeDalrymple', 'LOCATION') ('Dr', 'LOCATION') ('East', 'LOCATION') ('Baton', 'LOCATION') ('Rouge', 'LOCATION') ('Parish', 'LOCATION')


---

Iterate over every tweet and extract location tags. This takes a LONG time

In [10]:
# Tokenize every cleaned report up front so the tagger can process them as one batch.
tokenized_reports = [word_tokenize(document) for document in corpus['report_clean'].values]

# A single tag_sents() call replaces one ner.tag() call per report; it returns one
# list of (word, label) pairs per input report, in the same order.
tagged_reports = ner.tag_sents(tokenized_reports)

# Filter on the label (second element of each pair). Testing `'LOCATION' in tag`
# against the whole tuple would also keep a word that is literally "LOCATION" or
# "ORGANIZATION", whatever label it received.
location_tags = [
    [tag for tag in tagged if tag[1] in ('ORGANIZATION', 'LOCATION')]
    for tagged in tagged_reports
]

Add location tags to original dataframe

In [112]:
# Attach the extracted tags as a new column; location_tags is expected to hold one
# list per row of corpus, in row order.
corpus['locations'] = location_tags

Clean up the tags by keeping only the tagged words and dropping the entity labels

In [109]:
def detag(tags):
    """Return a list holding the first element of each entry in ``tags``."""
    words = []
    for tag in tags:
        words.append(tag[0])
    return words
# Keep only the words from each row's tag list; the labelled pairs stay in 'locations'.
corpus['locations_clean'] = corpus['locations'].map(detag)

Save the tagged data to a new CSV file

In [111]:
# Write the tagged frame next to the input data, with a header row and without the index column.
tagged_csv_path = './LADOTD Data/road_closures_tagged.csv'
corpus.to_csv(tagged_csv_path, index=False, header=True)