# Location Extraction and Spacy Word Vectorization

In [20]:
import pandas as pd
import numpy as np
import re
import spacy
import string
import datetime

In [30]:
# Read in csv with Tweets
twitter_closures = pd.read_csv("../data/Cleaned_Tweets/cleaned_historic_official_07302019.csv")
exits = pd.read_csv("../data/interstate_exits.csv")

twitter_closures = twitter_closures[['date', 'text', 'type', 'username', 'tweet', 'state', 'road_closure']]

# Print DF shape
print(twitter_closures.shape)

# Show head 
twitter_closures.head()

(24054, 7)


Unnamed: 0,date,text,type,username,tweet,state,road_closure
0,2016-10-11 16:39:51+00:00,the pioh for the sr 138 i-20 is going on now u...,official,GDOTATL,The PIOH for the SR 138 I-20 is going on now u...,Georgia,0
1,2016-10-10 19:10:23+00:00,we appreciate all the hard work our crews have...,official,GDOTATL,We appreciate all the hard work our crews have...,Georgia,0
2,2016-10-09 17:30:22+00:00,need info on re-entry check here also follow...,official,GDOTATL,Need info on re-entry Check here Also follow...,Georgia,0
3,2016-10-09 15:45:11+00:00,do you need some updates you can find all of ...,official,GDOTATL,Do you need some updates You can find ALL of ...,Georgia,0
4,2016-10-09 14:00:20+00:00,did you know that the 511georgia website and ...,official,GDOTATL,Did you know that the 511Georgia website and ...,Georgia,0


## SpaCy Preprocessing

In [31]:
# Create new columns to transfer modified tweet text. Five versions of tweets will be created.
twitter_closures['modified_text'] = ''
twitter_closures['location'] = ''

# Show modified DF
twitter_closures.head(2)

Unnamed: 0,date,text,type,username,tweet,state,road_closure,modified_text,location
0,2016-10-11 16:39:51+00:00,the pioh for the sr 138 i-20 is going on now u...,official,GDOTATL,The PIOH for the SR 138 I-20 is going on now u...,Georgia,0,,
1,2016-10-10 19:10:23+00:00,we appreciate all the hard work our crews have...,official,GDOTATL,We appreciate all the hard work our crews have...,Georgia,0,,


In [39]:
format_dict = {"hwy": "highway ",
            "blvd": "boulevard",
            " st": "street",
           "CR ": "County Road ",
           "SR ": "State Road",
           "I-": "Interstate ",
           "EB ": "Eastbound ",
           "WB ": "Westbound ",
           "SB ": "Southbound",
           "NB ": "Northbound",
           " on ": " at ",
           " E ": " East ",
           " W ": " West ",
           " S ": " South",
           " N ": " North",
           "mi ": "mile ",
           "between ": "at ",
           "Between ": "at ",
           " In ": " in",
           " in ": " at "}

In [40]:
def spacy_cleaner(df, col, word_dict):
    modified_text = "At " + df[col].replace(word_dict, regex=True)
    modified_text = modified_text.str.title()
    return modified_text

In [41]:
# run the text cleaning function and test results
twitter_closures['modified_text'] = spacy_cleaner(twitter_closures, 'tweet', format_dict)

In [42]:
twitter_closures['username'].unique()

array(['GDOTATL', 'SCDOTMidlands', 'SCDOTPeeDee', 'SCDOTLowCountry',
       'SCDOTPiedmont', '511statewideva', 'fl511_panhandl', '511Georgia',
       'fl511_state', 'fl511_northeast', 'fl511_southeast',
       'fl511_southwest', 'fl511_tampabay', 'fl511_central',
       '511centralva', '511hamptonroads', '511northernva',
       'NCDOT_Westmtn', 'NCDOT_Triangle', 'NCDOT_Piedmont',
       'NCDOT_Charlotte', 'NCDOT_Asheville', 'NCDOT_Scoast',
       'NCDOT_Ncoast'], dtype=object)

In [43]:
twitter_closures['date'] = pd.to_datetime(twitter_closures['date'])

In [65]:
twitter_closures[(twitter_closures['date'] > '2016-10-4') & (twitter_closures['date'] < '2016-10-5')]

Unnamed: 0,date,text,type,username,tweet,state,road_closure,modified_text,location
433,2016-10-04 23:59:18+00:00,"cleared construction i-77 nb, btwn 1 mi s of...",official,SCDOTMidlands,"CLEARED Construction I-77 NB, btwn 1 mi S of...",South Carolina,0,"At Cleared Construction Interstate 77 Nb, Bt...",
434,2016-10-04 23:54:17+00:00,"construction i-26 wb btwn exit119 & exit116,...",official,SCDOTMidlands,"Construction I-26 WB btwn Exit119 & Exit116,...",South Carolina,0,At Construction Interstate 26 Westbound Btwn...,
435,2016-10-04 23:49:17+00:00,construction i-26 eb btwn exit119 & 3 mi w o...,official,SCDOTMidlands,Construction I-26 EB btwn Exit119 & 3 mi W o...,South Carolina,0,At Construction Interstate 26 Eastbound Btwn...,
436,2016-10-04 23:46:46+00:00,"construction i-20 wb btwn exit61 & exit58, r...",official,SCDOTMidlands,"Construction I-20 WB btwn Exit61 & Exit58, r...",South Carolina,0,At Construction Interstate 20 Westbound Btwn...,
437,2016-10-04 23:41:45+00:00,construction i-20 eb btwn exit58 & 1 mi w of...,official,SCDOTMidlands,Construction I-20 EB btwn Exit58 & 1 mi W of...,South Carolina,0,At Construction Interstate 20 Eastbound Btwn...,
438,2016-10-04 23:31:43+00:00,traf congestion i-26 wb btwn exit72 & exit66...,official,SCDOTMidlands,Traf congestion I-26 WB btwn Exit72 & Exit66...,South Carolina,0,At Traf Congestion Interstate 26 Westbound B...,
439,2016-10-04 23:16:42+00:00,"rd maint ops i-26 eb 2 mi w of exit91, rht l...",official,SCDOTMidlands,"Rd maint ops I-26 EB 2 mi W of Exit91, rht l...",South Carolina,0,At Rd Maint Ops Interstate 26 Eastbound 2 Mi...,
440,2016-10-04 22:44:19+00:00,construction i-77 nb btwn 1 mi s of exit2 & ...,official,SCDOTMidlands,Construction I-77 NB btwn 1 mi S of Exit2 & ...,South Carolina,0,At Construction Interstate 77 Northbound Btwn...,
441,2016-10-04 22:24:18+00:00,"cleared accident i-20 eb, at exit65 | 6 24p",official,SCDOTMidlands,"CLEARED Accident I-20 EB, at Exit65 | 6 24P",South Carolina,0,"At Cleared Accident Interstate 20 Eb, At Exi...",
442,2016-10-04 22:16:47+00:00,"update accident i-20 eb at exit65, no lns c...",official,SCDOTMidlands,"Update Accident I-20 EB at Exit65, no lns c...",South Carolina,0,At Update Accident Interstate 20 Eastbound ...,


In [44]:
test_df = twitter_closures[(twitter_closures['road_closure'] == 1) & (twitter_closures['username'] == 'fl511_northeast')].head(100)

In [45]:
test_df.head()

Unnamed: 0,date,text,type,username,tweet,state,road_closure,modified_text,location
13238,2016-10-13 23:03:56+00:00,update disabled vehicle in duval on i-95 sout...,official,fl511_northeast,UPDATE Disabled vehicle in Duval on I-95 sout...,Florida,1,At Update Disabled Vehicle At Duval At Inters...,
13239,2016-10-13 22:53:46+00:00,update traffic congestion in duval on i-295 e...,official,fl511_northeast,UPDATE Traffic congestion in Duval on I-295 E...,Florida,1,At Update Traffic Congestion At Duval At Inte...,
13240,2016-10-13 22:40:26+00:00,new disabled vehicle in duval on i-95 south r...,official,fl511_northeast,NEW Disabled vehicle in Duval on I-95 south r...,Florida,1,At New Disabled Vehicle At Duval At Interstat...,
13241,2016-10-13 22:40:25+00:00,new disabled vehicle in duval on i-295 w sout...,official,fl511_northeast,NEW Disabled vehicle in Duval on I-295 W sout...,Florida,1,At New Disabled Vehicle At Duval At Interstat...,
13242,2016-10-13 22:38:47+00:00,update traffic congestion in duval on i-295 w...,official,fl511_northeast,UPDATE Traffic congestion in Duval on I-295 W...,Florida,1,At Update Traffic Congestion At Duval At Inte...,


In [46]:
def get_loc(df, text_column, location_column):
    
    # Use Spacy to extract location names from `text` column
    for i in range(len(df)):
        
        #instantiate spacy model
        nlp = spacy.load("en_core_web_sm")
        
        # create documewnt from modified text column
        doc = nlp(df[text_column].iloc[i])
        
        locations = set()
        
        # loop through every entity in the doc
        for ent in doc.ents:
            
            # find entities labelled as places
            if (ent.label_=='GPE') or (ent.label_=='FAC') or (ent.label_ == 'LOC'):
                
                # put locations in a set
                locations.add(ent.text)
                df[location_column].iloc[i] = locations
                
    return df[location_column]

In [47]:
test = get_loc(test_df, 'modified_text', 'location')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [48]:
test_df['location'] = test
test_df.head()

Unnamed: 0,date,text,type,username,tweet,state,road_closure,modified_text,location
13238,2016-10-13 23:03:56+00:00,update disabled vehicle in duval on i-95 sout...,official,fl511_northeast,UPDATE Disabled vehicle in Duval on I-95 sout...,Florida,1,At Update Disabled Vehicle At Duval At Inters...,{Interstate 95 South Ramp From Exit 344 Butler}
13239,2016-10-13 22:53:46+00:00,update traffic congestion in duval on i-295 e...,official,fl511_northeast,UPDATE Traffic congestion in Duval on I-295 E...,Florida,1,At Update Traffic Congestion At Duval At Inte...,"{Interstate, East North, Exit 56}"
13240,2016-10-13 22:40:26+00:00,new disabled vehicle in duval on i-95 south r...,official,fl511_northeast,NEW Disabled vehicle in Duval on I-95 south r...,Florida,1,At New Disabled Vehicle At Duval At Interstat...,{Interstate 95 South Ramp From Exit 344 Butler}
13241,2016-10-13 22:40:25+00:00,new disabled vehicle in duval on i-295 w sout...,official,fl511_northeast,NEW Disabled vehicle in Duval on I-295 W sout...,Florida,1,At New Disabled Vehicle At Duval At Interstat...,"{Interstate 295 West South, Blanding Blvd}"
13242,2016-10-13 22:38:47+00:00,update traffic congestion in duval on i-295 w...,official,fl511_northeast,UPDATE Traffic congestion in Duval on I-295 W...,Florida,1,At Update Traffic Congestion At Duval At Inte...,"{Exit 3, Exit 5 San Jose, Interstate 295 West ..."


In [49]:
test_df.to_csv("../data/Loc_Extracted/tweet_locations_sample_07302019.csv", index = False)

In [50]:
test_df = pd.read_csv("../data/Loc_Extracted/tweet_locations_sample_07302019.csv")

In [51]:
def exit_extractor (df, col, i_df):
    
    exits = []
    interstates = []
    lat = []
    long = []
    direction = []
    
    for item in df[col]:
        if 'Interstate' in item:
            
            i_string = re.search(r'Interstate (\S+)', item)
            interstates.append(i_string.group(0))
            
            if 'Exit' in item:
                e_string = re.search(r'Exit (\S+)', item)
                exits.append(e_string.group(0))
            else:
                exits.append("None")
        else:
            interstates.append("None")
            exits.append("None")
    
    new_df = pd.DataFrame(data = interstates, columns = ['interstate'])
    new_df['exits'] = exits
    
    for index, row in new_df.iterrows():
        
        if (row['interstate'] != "None") and (row['exits'] != "None"):
            
            mask = (i_df['interstate'].str.contains(row['interstate'])) & (i_df['exits'].str.contains(row['exits']))
            
            try:
                print(i_df[mask]['exits'].iloc[0] + ' ' + i_df[mask]['interstate'].iloc[0] + ' ' + i_df[mask]['dir'].iloc[0])
                
                lat.append(i_df[mask].iloc[0]['lat'])
                long.append(i_df[mask].iloc[0]['long'])
                
            except:
                print(f"No exit found at {index}")
                
                lat.append(np.nan)
                long.append(np.nan)

        else:
            print("No exit found")
            lat.append(np.nan)
            long.append(np.nan)
            
    
    new_df['lat'] = lat
    new_df['long'] = long
    
    return new_df

In [52]:
exits[(exits['interstate'] == 'Interstate 295') & (exits['exits'] == "Exit 12")].iloc[0]['lat']

'30.19137'

In [53]:
e_df = exit_extractor(test_df, 'modified_text', exits)

Exit 344 Interstate 95 N
Exit 56 Interstate 295 S
Exit 344 Interstate 95 N
No exit found
Exit 3 Interstate 295 S
Exit 53 Interstate 295 S
Exit 353B Interstate 95 N
Exit 347 Interstate 95 N
Exit 12 Interstate 295 N
No exit found
Exit 52 Interstate 295 S
No exit found
No exit found
No exit found
Exit 25 Interstate 295 N
Exit 53 Interstate 295 S
Exit 25 Interstate 295 N
Exit 337 Interstate 95 N
Exit 53 Interstate 295 S
No exit found
Exit 337 Interstate 95 N
No exit found
No exit found
No exit found
Exit 53 Interstate 295 S
No exit found
No exit found
Exit 360 Interstate 10 E
Exit 61B Interstate 295 S
Exit 360 Interstate 95 N
Exit 360 Interstate 10 E
Exit 355 Interstate 95 N
Exit 61B Interstate 295 S
Exit 3 Interstate 295 S
No exit found
No exit found at 35
Exit 58 Interstate 295 S
Exit 346A Interstate 95 S
Exit 3 Interstate 295 S
Exit 356A Interstate 95 N
Exit 51 Interstate 295 S
Exit 344 Interstate 95 N
No exit found
Exit 354A Interstate 95 N
No exit found at 44
Exit 52 Interstate 295 S


In [54]:
final_df = pd.concat([test_df, e_df], axis = 1)

In [58]:
final_df.dropna()

Unnamed: 0,date,text,type,username,tweet,state,road_closure,modified_text,location,interstate,exits,lat,long
0,2016-10-13 23:03:56+00:00,update disabled vehicle in duval on i-95 sout...,official,fl511_northeast,UPDATE Disabled vehicle in Duval on I-95 sout...,Florida,1,At Update Disabled Vehicle At Duval At Inters...,{'Interstate 95 South Ramp From Exit 344 Butler'},Interstate 95,Exit 344,30.24393,-81.58858
1,2016-10-13 22:53:46+00:00,update traffic congestion in duval on i-295 e...,official,fl511_northeast,UPDATE Traffic congestion in Duval on I-295 E...,Florida,1,At Update Traffic Congestion At Duval At Inte...,"{'Interstate', 'East North', 'Exit 56'}",Interstate 295,Exit 56,30.21982,-81.51444
2,2016-10-13 22:40:26+00:00,new disabled vehicle in duval on i-95 south r...,official,fl511_northeast,NEW Disabled vehicle in Duval on I-95 south r...,Florida,1,At New Disabled Vehicle At Duval At Interstat...,{'Interstate 95 South Ramp From Exit 344 Butler'},Interstate 95,Exit 344,30.24393,-81.58858
4,2016-10-13 22:38:47+00:00,update traffic congestion in duval on i-295 w...,official,fl511_northeast,UPDATE Traffic congestion in Duval on I-295 W...,Florida,1,At Update Traffic Congestion At Duval At Inte...,"{'Exit 3', 'Exit 5 San Jose', 'Interstate 295 ...",Interstate 295,Exit 3,30.16897,-81.59703
5,2016-10-13 22:38:36+00:00,update traffic congestion in duval on i-295 e...,official,fl511_northeast,UPDATE Traffic congestion in Duval on I-295 E...,Florida,1,At Update Traffic Congestion At Duval At Inte...,"{'Interstate 295 East South', 'Exit 53'}",Interstate 295,Exit 53,30.257,-81.51787
6,2016-10-13 22:34:06+00:00,new disabled vehicle in duval on i-95 north r...,official,fl511_northeast,NEW Disabled vehicle in Duval on I-95 north r...,Florida,1,At New Disabled Vehicle At Duval At Interstat...,{'Interstate 95 North Ramp To Exit 353B Union'},Interstate 95,Exit 353B,30.33387,-81.67056
7,2016-10-13 22:33:41+00:00,update traffic congestion in duval on i-95 no...,official,fl511_northeast,UPDATE Traffic congestion in Duval on I-95 no...,Florida,1,At Update Traffic Congestion At Duval At Inte...,"{'Fuller', 'Interstate 95 North From Exit 347 ...",Interstate 95,Exit 347,30.288710000000002,-81.62745
8,2016-10-13 22:28:49+00:00,new unconfirmed disabled vehicle in duval on ...,official,fl511_northeast,NEW Unconfirmed disabled vehicle in Duval on ...,Florida,1,At New Unconfirmed Disabled Vehicle At Duval ...,"{'Exit 12 Blanding Blvd', 'Interstate 295 West...",Interstate 295,Exit 12,30.19137,-81.71509
10,2016-10-13 22:14:08+00:00,new disabled vehicle in duval on i-295 e sout...,official,fl511_northeast,NEW Disabled vehicle in Duval on I-295 E sout...,Florida,1,At New Disabled Vehicle At Duval At Interstat...,"{'Exit 52 University Of North Florida', 'Inter...",Interstate 295,Exit 52,30.27157,-81.52147
14,2016-10-13 21:29:47+00:00,update traffic congestion in duval on i-295 w...,official,fl511_northeast,UPDATE Traffic congestion in Duval on I-295 W...,Florida,1,At Update Traffic Congestion At Duval At Inte...,"{'Exit 25', 'Pritchard Rd', 'Interstate 295 We...",Interstate 295,Exit 25,30.36906,-81.76069


In [None]:
final_df[final_df['date'] = '2016-10-10']