# Location Extraction and Spacy Word Vectorization

In [5]:
import pandas as pd
import re
import spacy
import string


In [6]:
# Read in csv with Tweets
twitter_closures = pd.read_csv("../data/Cleaned_Tweets/cleaned_historic_official_07292019.csv")

twitter_closures = twitter_closures[['text', 'type', 'username', 'tweet', 'road_closure', 'state']]

# Print DF shape
print(twitter_closures.shape)

# Show head 
twitter_closures.head()

(24054, 6)


Unnamed: 0,text,type,username,tweet,road_closure,state
0,The PIOH for the SR 138 I-20 is going on now u...,official,GDOTATL,The PIOH for the SR 138 I-20 is going on now u...,0,Georgia
1,We appreciate all the hard work our crews have...,official,GDOTATL,We appreciate all the hard work our crews have...,0,Georgia
2,Need info on re-entry Check here Also follow...,official,GDOTATL,Need info on re-entry Check here Also follow...,0,Georgia
3,Do you need some updates You can find ALL of ...,official,GDOTATL,Do you need some updates You can find ALL of ...,0,Georgia
4,Did you know that the 511Georgia website and ...,official,GDOTATL,Did you know that the 511Georgia website and ...,0,Georgia


## SpaCy Preprocessing

In [7]:
# Create new columns to transfer modified tweet text. Five versions of tweets will be created.
twitter_closures['modified_text'] = ''
twitter_closures['location'] = ''

# Show modified DF
twitter_closures.head(2)

Unnamed: 0,text,type,username,tweet,road_closure,state,modified_text,location
0,The PIOH for the SR 138 I-20 is going on now u...,official,GDOTATL,The PIOH for the SR 138 I-20 is going on now u...,0,Georgia,,
1,We appreciate all the hard work our crews have...,official,GDOTATL,We appreciate all the hard work our crews have...,0,Georgia,,


In [8]:
format_dict = {"hwy": "Highway ",
            "Blvd": "Boulevard",
            " st": "street",
           "CR ": "Country Road ",
           "SR ": "State Road",
           "I-": "Interstate ",
           "EB ": "Eastbound ",
           "WB ": "Westbound ",
           "SB ": "Southbound",
           "NB ": "Northbound",
           " on ": " at ",
           " E ": " East ",
           " W ": " West ",
           " S ": " South",
           " N ": " North",
           "mi ": "mile ",
           "between ": "at ",
           "Between ": "at ",
           "In ": "in ",
           " in ": " at "}

In [9]:
def spacy_cleaner(df, col, word_dict):
    modified_text = "At " + df[col].replace(word_dict, regex=True)
    modified_text = modified_text.str.title()
    return modified_text

In [10]:
# run the text cleaning function and test results
twitter_closures['modified_text'] = spacy_cleaner(twitter_closures, 'text', format_dict)

In [11]:
twitter_closures['username'].unique()

array(['GDOTATL', 'SCDOTMidlands', 'SCDOTPeeDee', 'SCDOTLowCountry',
       'SCDOTPiedmont', '511statewideva', 'fl511_panhandl', '511Georgia',
       'fl511_state', 'fl511_northeast', 'fl511_southeast',
       'fl511_southwest', 'fl511_tampabay', 'fl511_central',
       '511centralva', '511hamptonroads', '511northernva',
       'NCDOT_Westmtn', 'NCDOT_Triangle', 'NCDOT_Piedmont',
       'NCDOT_Charlotte', 'NCDOT_Asheville', 'NCDOT_Scoast',
       'NCDOT_Ncoast'], dtype=object)

In [12]:
test_df = twitter_closures[(twitter_closures['road_closure'] == 1) & (twitter_closures['username'] == 'fl511_northeast')]

In [13]:
def get_loc(df, text_column, location_column):
    
    # Use Spacy to extract location names from `text` column
    for i in range(len(df)):
        
        #instantiate spacy model
        nlp = spacy.load("en_core_web_sm")
        
        # create documewnt from modified text column
        doc = nlp(df[text_column].iloc[i])
        
        locations = set()
        
        # loop through every entity in the doc
        for ent in doc.ents:
            
            # find entities labelled as places
            if (ent.label_=='GPE') or (ent.label_=='FAC') or (ent.label_ == 'LOC'):
                
                # put locations in a set
                locations.add(ent.text)
                df[location_column].iloc[i] = locations
                
    return df[location_column]

In [14]:
test = get_loc(test_df, 'modified_text', 'location')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [15]:
test_df['location'] = test
test_df.head()

Unnamed: 0,text,type,username,tweet,road_closure,state,modified_text,location
8976,CLEARED Traffic congestion in Duval on I-295 ...,official,fl511_northeast,CLEARED Traffic congestion in Duval on I-295 ...,1,Florida,At Cleared Traffic Congestion At Duval At Int...,"{Interstate 295 West North, Exit 5 San Jose, E..."
8977,CLEARED Traffic congestion in Duval on I-295 ...,official,fl511_northeast,CLEARED Traffic congestion in Duval on I-295 ...,1,Florida,At Cleared Traffic Congestion At Duval At Int...,"{Interstate 295 East South, Exit 53}"
8978,UPDATE Disabled vehicle in Duval on I-95 sout...,official,fl511_northeast,UPDATE Disabled vehicle in Duval on I-95 sout...,1,Florida,At Update Disabled Vehicle At Duval At Inters...,{Interstate 95 South Ramp From Exit 344 Butler}
8979,CLEARED Traffic congestion in Duval on I-295 ...,official,fl511_northeast,CLEARED Traffic congestion in Duval on I-295 ...,1,Florida,At Cleared Traffic Congestion At Duval At Int...,"{Exit 56, Interstate 295 East North}"
8980,CLEARED Disabled vehicle in Duval on I-95 nor...,official,fl511_northeast,CLEARED Disabled vehicle in Duval on I-95 nor...,1,Florida,At Cleared Disabled Vehicle At Duval At Inter...,{Interstate 95 North Ramp To Exit 353B Union}


In [16]:
test_df.to_csv('./spacy_extracted_tweets.csv')