# Location Extraction and Spacy Word Vectorization

In [1]:
import pandas as pd
import numpy as np
import re
import spacy
import string
import datetime

In [2]:
# Read in csv with Tweets
twitter_closures = pd.read_csv("../data/Cleaned_Tweets/cleaned_historic_official_07302019.csv")
exits = pd.read_csv("../data/interstate_exits.csv")

twitter_closures = twitter_closures[['date', 'text', 'type', 'username', 'tweet', 'state', 'road_closure']]

# Print DF shape
print(twitter_closures.shape)

# Show head 
twitter_closures.head()

(24054, 7)


Unnamed: 0,date,text,type,username,tweet,state,road_closure
0,2016-10-11 16:39:51+00:00,the pioh for the sr 138 i-20 is going on now u...,official,GDOTATL,The PIOH for the SR 138 I-20 is going on now u...,Georgia,0
1,2016-10-10 19:10:23+00:00,we appreciate all the hard work our crews have...,official,GDOTATL,We appreciate all the hard work our crews have...,Georgia,0
2,2016-10-09 17:30:22+00:00,need info on re-entry check here also follow...,official,GDOTATL,Need info on re-entry Check here Also follow...,Georgia,0
3,2016-10-09 15:45:11+00:00,do you need some updates you can find all of ...,official,GDOTATL,Do you need some updates You can find ALL of ...,Georgia,0
4,2016-10-09 14:00:20+00:00,did you know that the 511georgia website and ...,official,GDOTATL,Did you know that the 511Georgia website and ...,Georgia,0


## SpaCy Preprocessing

In [3]:
# Create new columns to transfer modified tweet text. Five versions of tweets will be created.
twitter_closures['modified_text'] = ''
twitter_closures['location'] = ''

# Show modified DF
twitter_closures.head(2)

Unnamed: 0,date,text,type,username,tweet,state,road_closure,modified_text,location
0,2016-10-11 16:39:51+00:00,the pioh for the sr 138 i-20 is going on now u...,official,GDOTATL,The PIOH for the SR 138 I-20 is going on now u...,Georgia,0,,
1,2016-10-10 19:10:23+00:00,we appreciate all the hard work our crews have...,official,GDOTATL,We appreciate all the hard work our crews have...,Georgia,0,,


In [4]:
format_dict = {"hwy": "highway ",
            "blvd": "boulevard",
            " st": "street",
           "CR ": "County Road ",
           "SR ": "State Road",
           "I-": "Interstate ",
           "EB ": "Eastbound ",
           "WB ": "Westbound ",
           "SB ": "Southbound",
           "NB ": "Northbound",
           " on ": " at ",
           " E ": " East ",
           " W ": " West ",
           " S ": " South",
           " N ": " North",
           "mi ": "mile ",
           "between ": "at ",
           "Between ": "at ",
           " In ": " in",
           " in ": " at "}

In [5]:
def spacy_cleaner(df, col, word_dict):
    modified_text = "At " + df[col].replace(word_dict, regex=True)
    modified_text = modified_text.str.title()
    return modified_text

In [6]:
# run the text cleaning function and test results
twitter_closures['modified_text'] = spacy_cleaner(twitter_closures, 'tweet', format_dict)

In [7]:
twitter_closures['username'].unique()

array(['GDOTATL', 'SCDOTMidlands', 'SCDOTPeeDee', 'SCDOTLowCountry',
       'SCDOTPiedmont', '511statewideva', 'fl511_panhandl', '511Georgia',
       'fl511_state', 'fl511_northeast', 'fl511_southeast',
       'fl511_southwest', 'fl511_tampabay', 'fl511_central',
       '511centralva', '511hamptonroads', '511northernva',
       'NCDOT_Westmtn', 'NCDOT_Triangle', 'NCDOT_Piedmont',
       'NCDOT_Charlotte', 'NCDOT_Asheville', 'NCDOT_Scoast',
       'NCDOT_Ncoast'], dtype=object)

In [8]:
# convert date column to datetime
twitter_closures['date'] = pd.to_datetime(twitter_closures['date'])

In [9]:
# for ease of use, only take tweets that happened on october 4, the day Hurricane Matthew hit Jacksonville
twitter_closures = twitter_closures[(twitter_closures['date'] > '2016-10-6') & (twitter_closures['date'] < '2016-10-9')]

In [10]:
# only use tweets that contain road closures are from 'fl511 northeast'
test_df = twitter_closures[(twitter_closures['road_closure'] == 1) & (twitter_closures['username'] == 'fl511_northeast')]
test_df.shape

(267, 9)

In [11]:
def get_loc(df, text_column, location_column):
    
    # Use Spacy to extract location names from `text` column
    for i in range(len(df)):
        
        #instantiate spacy model
        nlp = spacy.load("en_core_web_sm")
        
        # create documewnt from modified text column
        doc = nlp(df[text_column].iloc[i])
        
        locations = set()
        
        # loop through every entity in the doc
        for ent in doc.ents:
            
            # find entities labelled as places
            if (ent.label_=='GPE') or (ent.label_=='FAC') or (ent.label_ == 'LOC'):
                
                # put locations in a set
                locations.add(ent.text)
                df[location_column].iloc[i] = locations
                
    return df[location_column]

In [12]:
test = get_loc(test_df, 'modified_text', 'location')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [13]:
test_df['location'] = test
test_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,date,text,type,username,tweet,state,road_closure,modified_text,location
13806,2016-10-08 23:33:38+00:00,new disabled vehicle in duval on sr-202 but...,official,fl511_northeast,NEW Disabled vehicle in Duval on SR-202 But...,Florida,1,At New Disabled Vehicle At Duval At Sr-202 ...,
13807,2016-10-08 23:18:05+00:00,new disabled vehicle in duval on i-295 w nort...,official,fl511_northeast,NEW Disabled vehicle in Duval on I-295 W nort...,Florida,1,At New Disabled Vehicle At Duval At Interstat...,{San Jose Right Lane Blocked}
13808,2016-10-08 21:58:06+00:00,new unconfirmed disabled vehicle in duval on ...,official,fl511_northeast,NEW Unconfirmed disabled vehicle in Duval on ...,Florida,1,At New Unconfirmed Disabled Vehicle At Duval ...,{Interstate 95}
13809,2016-10-08 21:33:05+00:00,update disabled vehicle in duval on i-295 w n...,official,fl511_northeast,UPDATE Disabled vehicle in Duval on I-295 W n...,Florida,1,At Update Disabled Vehicle At Duval At Inters...,
13810,2016-10-08 21:28:40+00:00,new disabled vehicle in duval on i-295 w nort...,official,fl511_northeast,NEW Disabled vehicle in Duval on I-295 W nort...,Florida,1,At New Disabled Vehicle At Duval At Interstat...,


In [14]:
test_df.to_csv("../data/Loc_Extracted/tweet_locations_sample_07302019.csv", index = False)

In [15]:
test_df = pd.read_csv("../data/Loc_Extracted/tweet_locations_sample_07302019.csv")

In [16]:
# function to extract interstate, exit number, and direction
def exit_extractor (df, col, i_df):
    
    # instantiate lists for exit data
    exits = []
    interstates = []
    lat = []
    long = []
    direction = []
    
    # loop through text column
    for item in df[col]:
        
        # look for "interstate" in text
        if 'Interstate' in item:
            
            # use regex to extract interstate and number from text
            i_string = re.search(r'Interstate (\S+)', item)
            interstates.append(i_string.group(0))
            
            # use regex to extract direction following "interstate"
            d_string = re.search("(i-\d*|Interstate \d*) (South|North|East|West)*", item)
            d_string = d_string.group(0)
            d_string =  re.search("South|North|East|West", d_string)
            
            # try to extract the direction from the regex object
            # append null if an error is thrown
            try:
                d_string = d_string.group(0)
                
            except AttributeError: 
                d_string = np.nan
                
            # append direction to list    
            direction.append(d_string)
                             
            # find "exit" in text
            if 'Exit' in item:
                
                # use regex to extract interstate and number from text
                e_string = re.search(r'Exit (\S+)', item)
                exits.append(e_string.group(0))
            
            # add "none" when no exit is found   
            else:
                exits.append("None")
                
        # add "none" to exits and interstates if no interstate is found
        else:
            interstates.append("None")
            exits.append("None")
            direction.append("None")
            
    # create a new dataframe from the interstate and exit lists
    new_df = pd.DataFrame(data = interstates, columns = ['interstate'])
    new_df['exits'] = exits
    new_df['direction'] = direction
    
    # return new dataframe
    return new_df

In [17]:
# function to extract longitude and latitude, if available
def loc_extractor(new_df, i_df):
    
    lat = []
    long = []
    
    # loop through the new dataframe
    for index, row in new_df.iterrows():
        
        # find rows that have both an interstate and exit extracted
        if (row['interstate'] != "None") and (row['exits'] != "None") and row['direction'] != "None":
            
            # attempt to add lat and long based on exit and interstate strings
            try:    
                mask = (i_df['interstate'].str.contains(row['interstate'])) & (i_df['exits'].str.contains(row['exits']))
                
                # add lat and long to list
                lat.append(i_df[mask].iloc[0]['lat'])
                long.append(i_df[mask].iloc[0]['long'])
            
            # if an error occurs, append null to lat and long
            # print index where error occured
            except:
                print(f"No exit found at {index}")

                lat.append(np.nan)
                long.append(np.nan)
        # if no exit is found, add null values to lat and long
        else:
            lat.append(np.nan)
            long.append(np.nan)
            
    # add lat and long to new dataframe
    new_df['lat'] = lat
    new_df['long'] = long
    
    return new_df

In [18]:
e_df = exit_extractor(test_df, 'modified_text', exits)
e_df['direction'].value_counts()

West     98
None     68
East     36
North    33
South    32
Name: direction, dtype: int64

In [19]:
e_df = loc_extractor(exit_extractor(test_df, 'modified_text', exits), exits)

No exit found at 148
No exit found at 154
No exit found at 206
No exit found at 216
No exit found at 237
No exit found at 248
No exit found at 250


In [20]:
final_df = pd.concat([test_df, e_df], axis = 1)

In [21]:
final_df.dropna()

Unnamed: 0,date,text,type,username,tweet,state,road_closure,modified_text,location,interstate,exits,direction,lat,long
6,2016-10-08 20:38:21+00:00,new disabled vehicle in duval on i-295 e nort...,official,fl511_northeast,NEW Disabled vehicle in Duval on I-295 E nort...,Florida,1,At New Disabled Vehicle At Duval At Interstat...,{'Exit 60'},Interstate 295,Exit 60,East,30.16893,-81.53623
13,2016-10-08 19:08:32+00:00,new disabled vehicle in duval on i-295 w sout...,official,fl511_northeast,NEW Disabled vehicle in Duval on I-295 W sout...,Florida,1,At New Disabled Vehicle At Duval At Interstat...,{'Exit 22 Commonwealth'},Interstate 295,Exit 22,West,30.33213,-81.76245
25,2016-10-08 15:53:25+00:00,new disabled vehicle in duval on i-295 e nort...,official,fl511_northeast,NEW Disabled vehicle in Duval on I-295 E nort...,Florida,1,At New Disabled Vehicle At Duval At Interstat...,{'Exit 51 Beach Blvd'},Interstate 295,Exit 51,East,30.29054,-81.52207
27,2016-10-08 15:43:35+00:00,new disabled vehicle in duval on i-10 east ra...,official,fl511_northeast,NEW Disabled vehicle in Duval on I-10 east ra...,Florida,1,At New Disabled Vehicle At Duval At Interstat...,{'Exit 356 Interstate 295 West'},Interstate 10,Exit 356,East,30.315179999999998,-81.77517
38,2016-10-08 13:48:03+00:00,new disabled vehicle in duval on i-95 north r...,official,fl511_northeast,NEW Disabled vehicle in Duval on I-95 north r...,Florida,1,At New Disabled Vehicle At Duval At Interstat...,{'Exit 366'},Interstate 95,Exit 366,North,30.51338,-81.63441
45,2016-10-08 11:28:02+00:00,update disabled truck in duval on i-10 east r...,official,fl511_northeast,UPDATE Disabled truck in Duval on I-10 east r...,Florida,1,At Update Disabled Truck At Duval At Intersta...,"{'Exit 360 Mcduff', 'Interstate 10 East Ramp'}",Interstate 10,Exit 360,East,30.320859999999996,-81.70904
46,2016-10-08 11:23:02+00:00,new unconfirmed disabled vehicle in duval on ...,official,fl511_northeast,NEW Unconfirmed disabled vehicle in Duval on ...,Florida,1,At New Unconfirmed Disabled Vehicle At Duval ...,{'Exit 360 Mcduff'},Interstate 10,Exit 360,East,30.320859999999996,-81.70904
52,2016-10-08 06:38:01+00:00,update object on roadway in duval on i-95 nor...,official,fl511_northeast,UPDATE Object on roadway in Duval on I-95 nor...,Florida,1,At Update Object At Roadway At Duval At Inter...,{'Exit 341 Baymeadows 2 Right Lanes Blocked'},Interstate 95,Exit 341,North,30.21727,-81.56876
54,2016-10-08 05:18:27+00:00,new object on roadway in duval on i-95 north ...,official,fl511_northeast,NEW Object on roadway in Duval on I-95 north ...,Florida,1,At New Object At Roadway At Duval At Intersta...,{'Exit 341 Baymeadows 2 Right Lanes Blocked'},Interstate 95,Exit 341,North,30.21727,-81.56876
69,2016-10-08 03:33:06+00:00,update object on roadway in duval on i-295 w ...,official,fl511_northeast,UPDATE Object on roadway in Duval on I-295 W ...,Florida,1,At Update Object At Roadway At Duval At Inter...,"{'Exit 12', 'West North'}",Interstate 295,Exit 12,West,30.19137,-81.71509


In [22]:
final_df.to_csv("../data/Loc_Extracted/tweet_locations_sample_07312019.csv", index = False)