# Location Extraction and spaCy Word Vectorization

In [22]:
import pandas as pd
import re
import spacy
import string


In [24]:
# Load the cleaned historic tweets and keep only the five columns used below
twitter_closures = pd.read_csv(
    "../data/Cleaned_Tweets/cleaned_historic_official_07242019.csv"
)[['text', 'type', 'username', 'tweet', 'road_closure']]

# Report the (rows, columns) of the trimmed frame
print(twitter_closures.shape)

# Preview the first rows
twitter_closures.head()

(43016, 5)


Unnamed: 0,text,type,username,tweet,road_closure
0,Douglas County Crash - EB I-20 b4 Post Rd exi...,official,GDOTATL,Douglas County Crash - EB I-20 b4 Post Rd exi...,0
1,With rain on the way - we have an important me...,official,GDOTATL,With rain on the way - we have an important me...,0
2,Much needed rain is headed our way please b...,official,GDOTATL,Much needed rain is headed our way please b...,0
3,Happy Thanksgiving Please drive safe and pati...,official,GDOTATL,Happy Thanksgiving Please drive safe and pati...,0
4,Have a safe Thanksgiving Put away your cell p...,official,GDOTATL,Have a safe Thanksgiving Put away your cell p...,0


## spaCy Preprocessing

In [25]:
# Add two empty string columns: 'modified_text' will receive the rewritten
# tweet text and 'location' the places extracted from it.
for new_column in ('modified_text', 'location'):
    twitter_closures[new_column] = ''

# Preview the frame with the new columns
twitter_closures.head(2)

Unnamed: 0,text,type,username,tweet,road_closure,modified_text,location
0,Douglas County Crash - EB I-20 b4 Post Rd exi...,official,GDOTATL,Douglas County Crash - EB I-20 b4 Post Rd exi...,0,,
1,With rain on the way - we have an important me...,official,GDOTATL,With rain on the way - we have an important me...,0,,


In [26]:
# Abbreviation -> expansion map applied to tweet text before entity extraction.
# Keys are used as regular expressions (spacy_cleaner calls
# Series.replace(..., regex=True)), so word-boundary anchors are allowed.
# Every replacement keeps the same leading/trailing whitespace as the pattern
# it replaces; otherwise the expanded word is glued to its neighbour
# (e.g. "SR 50" -> "State Road50").
format_dict = {
    # Road types
    "hwy": "Highway ",
    "Blvd": "Boulevard",
    r" st\b": " street",        # whole word only: must not fire inside " state", " stop", ...
    "CR ": "County Road ",      # was "Country Road" (typo)
    "SR ": "State Road ",
    "I-": "Interstate ",
    # Directions of travel
    "EB ": "Eastbound ",
    "WB ": "Westbound ",
    "SB ": "Southbound ",
    "NB ": "Northbound ",
    " on ": " at ",
    # Compass abbreviations
    " E ": " East ",
    " W ": " West ",
    " S ": " South ",
    " N ": " North ",
    r"\bmi ": "mile ",          # whole word only: must not fire inside "Miami "
    # Normalise prepositions to "at"
    "between ": "at ",
    "Between ": "at ",
    "In ": "in ",
    " in ": " at ",
}

In [27]:
def spacy_cleaner(df, col, word_dict):
    """Rewrite a text column into the title-cased form used for entity extraction.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text to rewrite.
    col : str
        Name of the column in ``df`` to read.
    word_dict : dict
        Maps regular-expression patterns to their replacement strings.

    Returns
    -------
    pandas.Series
        The column with every pattern substituted, prefixed with "At " and
        converted to title case. ``df`` itself is not modified.
    """
    # Substitute each pattern in word_dict; keys are treated as regexes
    expanded = df[col].replace(to_replace=word_dict, regex=True)

    # Prefix every tweet with "At "
    prefixed = "At " + expanded

    return prefixed.str.title()

In [28]:
# Apply the cleaner to the raw tweet text and store the result in 'modified_text'
twitter_closures['modified_text'] = spacy_cleaner(
    df=twitter_closures,
    col='text',
    word_dict=format_dict,
)

In [29]:
# List the distinct accounts present in the data
twitter_closures.loc[:, 'username'].unique()

array(['GDOTATL', 'SCDOTMidlands', 'SCDOTPeeDee', 'SCDOTLowCountry',
       'SCDOTPiedmont', '511statewideva', 'fl511_panhandl', '511Georgia',
       'fl511_state', 'fl511_northeast', 'fl511_southeast',
       'fl511_southwest', 'fl511_tampabay', 'fl511_central',
       '511centralva', '511hamptonroads', '511northernva',
       'NCDOT_Westmtn', 'NCDOT_Triangle', 'NCDOT_Piedmont',
       'NCDOT_Charlotte', 'NCDOT_Asheville', 'NCDOT_Scoast',
       'NCDOT_Ncoast'], dtype=object)

In [30]:
# Test sample: the first 100 road-closure tweets from the fl511_northeast account.
# .copy() makes test_df an independent frame, so the column writes performed on
# it later do not raise SettingWithCopyWarning or depend on whether pandas
# handed back a view of twitter_closures.
test_df = (
    twitter_closures
    .loc[(twitter_closures['road_closure'] == 1) & (twitter_closures['username'] == 'fl511_northeast')]
    .head(100)
    .copy()
)

In [31]:
def get_loc(df, text_column, location_column):
    """Extract place names from a text column with spaCy.

    Each row's text is run through the ``en_core_web_sm`` pipeline and the
    text of every entity labelled ``GPE``, ``FAC`` or ``LOC`` is collected
    into a set.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to process. Its ``location_column`` is overwritten in place.
    text_column : str
        Name of the column holding the text to analyse.
    location_column : str
        Name of an existing column that receives the results.

    Returns
    -------
    pandas.Series
        ``df[location_column]`` after the update: a set of place strings for
        rows where at least one place was found; rows with no place entity
        keep whatever value the column already held.
    """
    # Load the model once; loading it inside the row loop repeats an expensive,
    # loop-invariant step for every tweet.
    nlp = spacy.load("en_core_web_sm")

    # Entity labels treated as places
    place_labels = {'GPE', 'FAC', 'LOC'}

    # Start from the current column values so rows without places are untouched
    extracted = df[location_column].tolist()

    for i, text in enumerate(df[text_column]):
        # create document from the text and collect its place entities
        locations = {ent.text for ent in nlp(text).ents if ent.label_ in place_labels}
        if locations:
            extracted[i] = locations

    # Write the column back in one assignment. The previous per-row chained
    # assignment (df[col].iloc[i] = ...) triggers SettingWithCopyWarning and has
    # no effect on df when pandas Copy-on-Write is enabled.
    df[location_column] = pd.Series(extracted, index=df.index, dtype=object)

    return df[location_column]

In [32]:
# Extract locations for the sample frame from its cleaned text
test = get_loc(
    df=test_df,
    text_column='modified_text',
    location_column='location',
)

In [35]:
# Store the extracted locations on the sample frame, then preview the first five rows
test_df['location'] = test
test_df.head(n=5)

Unnamed: 0,text,type,username,tweet,road_closure,modified_text,location
19213,NEW Crash in Duval on I-295 E north beyond Ph...,official,fl511_northeast,NEW Crash in Duval on I-295 E north beyond Ph...,1,At New Crash At Duval At Interstate 295 East ...,"{New Crash At Duval, Right Lane, Interstate 2..."
19214,CLEARED Traffic congestion in Duval on I-95 n...,official,fl511_northeast,CLEARED Traffic congestion in Duval on I-95 n...,1,At Cleared Traffic Congestion At Duval At Int...,"{Interstate 95 North, Interstate 10}"
19215,CLEARED Traffic congestion in Duval on I-295 ...,official,fl511_northeast,CLEARED Traffic congestion in Duval on I-295 ...,1,At Cleared Traffic Congestion At Duval At Int...,"{Exit 5, Interstate 295 West North, San Jose}"
19216,UPDATE Traffic congestion in Duval on I-95 no...,official,fl511_northeast,UPDATE Traffic congestion in Duval on I-95 no...,1,At Update Traffic Congestion At Duval At Inte...,"{Interstate 95 North, Interstate 10}"
19217,CLEARED Traffic congestion in Duval on I-295 ...,official,fl511_northeast,CLEARED Traffic congestion in Duval on I-295 ...,1,At Cleared Traffic Congestion At Duval At Int...,"{Exit 56, Interstate 295 East South, Exit 53}"
