# Location Extraction and Spacy Word Vectorization

In [2]:
import pandas as pd
import numpy as np
import re
import spacy
import string
import datetime

In [3]:
# Read in csv with Tweets
twitter_closures = pd.read_csv("../data/Cleaned_Tweets/cleaned_historic_official_07302019.csv")
exits = pd.read_csv("../data/interstate_exits.csv")

twitter_closures = twitter_closures[['date', 'text', 'type', 'username', 'tweet', 'state', 'road_closure']]

# Print DF shape
print(twitter_closures.shape)

# Show head 
twitter_closures.head()

(24054, 7)


Unnamed: 0,date,text,type,username,tweet,state,road_closure
0,2016-10-11 16:39:51+00:00,the pioh for the sr 138 i-20 is going on now u...,official,GDOTATL,The PIOH for the SR 138 I-20 is going on now u...,Georgia,0
1,2016-10-10 19:10:23+00:00,we appreciate all the hard work our crews have...,official,GDOTATL,We appreciate all the hard work our crews have...,Georgia,0
2,2016-10-09 17:30:22+00:00,need info on re-entry check here also follow...,official,GDOTATL,Need info on re-entry Check here Also follow...,Georgia,0
3,2016-10-09 15:45:11+00:00,do you need some updates you can find all of ...,official,GDOTATL,Do you need some updates You can find ALL of ...,Georgia,0
4,2016-10-09 14:00:20+00:00,did you know that the 511georgia website and ...,official,GDOTATL,Did you know that the 511Georgia website and ...,Georgia,0


## SpaCy Preprocessing

In [4]:
# Create new columns to transfer modified tweet text. Five versions of tweets will be created.
twitter_closures['modified_text'] = ''
twitter_closures['location'] = ''

# Show modified DF
twitter_closures.head(2)

Unnamed: 0,date,text,type,username,tweet,state,road_closure,modified_text,location
0,2016-10-11 16:39:51+00:00,the pioh for the sr 138 i-20 is going on now u...,official,GDOTATL,The PIOH for the SR 138 I-20 is going on now u...,Georgia,0,,
1,2016-10-10 19:10:23+00:00,we appreciate all the hard work our crews have...,official,GDOTATL,We appreciate all the hard work our crews have...,Georgia,0,,


In [5]:
format_dict = {"hwy": "highway ",
            "blvd": "boulevard",
            " st": "street",
           "CR ": "County Road ",
           "SR ": "State Road",
           "I-": "Interstate ",
           "EB ": "Eastbound ",
           "WB ": "Westbound ",
           "SB ": "Southbound",
           "NB ": "Northbound",
           " on ": " at ",
           " E ": " East ",
           " W ": " West ",
           " S ": " South",
           " N ": " North",
           "mi ": "mile ",
           "between ": "at ",
           "Between ": "at ",
           " In ": " in",
           " in ": " at "}

In [6]:
def spacy_cleaner(df, col, word_dict):
    modified_text = "At " + df[col].replace(word_dict, regex=True)
    modified_text = modified_text.str.title()
    return modified_text

In [7]:
# run the text cleaning function and test results
twitter_closures['modified_text'] = spacy_cleaner(twitter_closures, 'tweet', format_dict)

In [8]:
twitter_closures['username'].unique()

array(['GDOTATL', 'SCDOTMidlands', 'SCDOTPeeDee', 'SCDOTLowCountry',
       'SCDOTPiedmont', '511statewideva', 'fl511_panhandl', '511Georgia',
       'fl511_state', 'fl511_northeast', 'fl511_southeast',
       'fl511_southwest', 'fl511_tampabay', 'fl511_central',
       '511centralva', '511hamptonroads', '511northernva',
       'NCDOT_Westmtn', 'NCDOT_Triangle', 'NCDOT_Piedmont',
       'NCDOT_Charlotte', 'NCDOT_Asheville', 'NCDOT_Scoast',
       'NCDOT_Ncoast'], dtype=object)

In [9]:
twitter_closures['date'] = pd.to_datetime(twitter_closures['date'])

In [10]:
twitter_closures = twitter_closures[(twitter_closures['date'] > '2016-10-4') & (twitter_closures['date'] < '2016-10-5')]

In [11]:
test_df = twitter_closures[(twitter_closures['road_closure'] == 1) & (twitter_closures['username'] == 'fl511_northeast')]
test_df.shape

(167, 9)

In [12]:
def get_loc(df, text_column, location_column):
    
    # Use Spacy to extract location names from `text` column
    for i in range(len(df)):
        
        #instantiate spacy model
        nlp = spacy.load("en_core_web_sm")
        
        # create documewnt from modified text column
        doc = nlp(df[text_column].iloc[i])
        
        locations = set()
        
        # loop through every entity in the doc
        for ent in doc.ents:
            
            # find entities labelled as places
            if (ent.label_=='GPE') or (ent.label_=='FAC') or (ent.label_ == 'LOC'):
                
                # put locations in a set
                locations.add(ent.text)
                df[location_column].iloc[i] = locations
                
    return df[location_column]

In [13]:
test = get_loc(test_df, 'modified_text', 'location')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [14]:
test_df['location'] = test
test_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,date,text,type,username,tweet,state,road_closure,modified_text,location
14259,2016-10-04 23:58:11+00:00,new crash in duval on i-95 south before 8th s...,official,fl511_northeast,NEW Crash in Duval on I-95 south before 8th S...,Florida,1,At New Crash At Duval At Interstate 95 South ...,"{Interstate 95 South, New Crash At Duval}"
14260,2016-10-04 22:43:11+00:00,update traffic congestion in duval on i-95 so...,official,fl511_northeast,UPDATE Traffic congestion in Duval on I-95 so...,Florida,1,At Update Traffic Congestion At Duval At Inte...,{Interstate 95 South From Exit 337 Interstate ...
14261,2016-10-04 22:32:41+00:00,update traffic congestion in duval on i-295 w...,official,fl511_northeast,UPDATE Traffic congestion in Duval on I-295 W...,Florida,1,At Update Traffic Congestion At Duval At Inte...,"{Interstate 10, Exit 21, Normandy Blvd, Inters..."
14262,2016-10-04 22:28:37+00:00,update traffic congestion in duval on i-295 e...,official,fl511_northeast,UPDATE Traffic congestion in Duval on I-295 E...,Florida,1,At Update Traffic Congestion At Duval At Inte...,"{Exit 56, Exit 53, Interstate 295 East South}"
14263,2016-10-04 22:22:46+00:00,update traffic congestion in duval on i-95 no...,official,fl511_northeast,UPDATE Traffic congestion in Duval on I-95 no...,Florida,1,At Update Traffic Congestion At Duval At Inte...,"{Interstate 10, Interstate 95 North From Exit ..."


In [15]:
test_df.to_csv("../data/Loc_Extracted/tweet_locations_sample_07302019.csv", index = False)

In [16]:
test_df = pd.read_csv("../data/Loc_Extracted/tweet_locations_sample_07302019.csv")

In [17]:
def exit_extractor (df, col, i_df):
    
    # instantiate lists for exit data
    exits = []
    interstates = []
    lat = []
    long = []
    direction = []
    
    # loop through text column
    for item in df[col]:
        
        # look for "interstate" in text
        if 'Interstate' in item:
            
            # use regex to extract interstate and number from text
            i_string = re.search(r'Interstate (\S+)', item)
            interstates.append(i_string.group(0))
            
            # find "exit" in text
            if 'Exit' in item:
                
                # use regex to extract interstate and number from text
                e_string = re.search(r'Exit (\S+)', item)
                exits.append(e_string.group(0))
            
            # add "none" when no exit is found   
            else:
                exits.append("None")
                
        # add "none" to exits and interstates if no interstate is found
        else:
            interstates.append("None")
            exits.append("None")
            
    # create a new dataframe from the interstate and exit lists
    new_df = pd.DataFrame(data = interstates, columns = ['interstate'])
    new_df['exits'] = exits
    
    # loop through the new dataframe
    for index, row in new_df.iterrows():
        
        # find rows that have both an interstate and exit extracted
        if (row['interstate'] != "None") and (row['exits'] != "None"):
            
            # create filter mask
            mask = (i_df['interstate'].str.contains(row['interstate'])) & (i_df['exits'].str.contains(row['exits']))
            
            # attempt to add lat and long based on exit and interstate strings
            try:
                # add lat and long to list
                lat.append(i_df[mask].iloc[0]['lat'])
                long.append(i_df[mask].iloc[0]['long'])
            
            # if an error occurs, append null to lat and long
            # print index where error occured
            except:
                print(f"No exit found at {index}")
                
                lat.append(np.nan)
                long.append(np.nan)
                
        # if no exit is found, add null values to lat and long
        else:
            lat.append(np.nan)
            long.append(np.nan)
            
    # add lat and long to new dataframe
    new_df['lat'] = lat
    new_df['long'] = long
    
    # sreturn new dataframe
    return new_df

In [18]:
e_df = exit_extractor(test_df, 'modified_text', exits)

No exit found at 9
No exit found at 19
No exit found at 28
No exit found at 36
No exit found at 49
No exit found at 88
No exit found at 89
No exit found at 153
No exit found at 161


In [19]:
final_df = pd.concat([test_df, e_df], axis = 1)

In [20]:
final_df

Unnamed: 0,date,text,type,username,tweet,state,road_closure,modified_text,location,interstate,exits,lat,long
0,2016-10-04 23:58:11+00:00,new crash in duval on i-95 south before 8th s...,official,fl511_northeast,NEW Crash in Duval on I-95 south before 8th S...,Florida,1,At New Crash At Duval At Interstate 95 South ...,"{'Interstate 95 South', 'New Crash At Duval'}",Interstate 95,,,
1,2016-10-04 22:43:11+00:00,update traffic congestion in duval on i-95 so...,official,fl511_northeast,UPDATE Traffic congestion in Duval on I-95 so...,Florida,1,At Update Traffic Congestion At Duval At Inte...,{'Interstate 95 South From Exit 337 Interstate...,Interstate 95,Exit 337,30.15859,-81.55057
2,2016-10-04 22:32:41+00:00,update traffic congestion in duval on i-295 w...,official,fl511_northeast,UPDATE Traffic congestion in Duval on I-295 W...,Florida,1,At Update Traffic Congestion At Duval At Inte...,"{'Interstate 10', 'Exit 21', 'Normandy Blvd', ...",Interstate 295,Exit 21,30.31098,-81.76966
3,2016-10-04 22:28:37+00:00,update traffic congestion in duval on i-295 e...,official,fl511_northeast,UPDATE Traffic congestion in Duval on I-295 E...,Florida,1,At Update Traffic Congestion At Duval At Inte...,"{'Exit 56', 'Exit 53', 'Interstate 295 East So...",Interstate 295,Exit 53,30.257,-81.51787
4,2016-10-04 22:22:46+00:00,update traffic congestion in duval on i-95 no...,official,fl511_northeast,UPDATE Traffic congestion in Duval on I-95 no...,Florida,1,At Update Traffic Congestion At Duval At Inte...,"{'Interstate 10', 'Interstate 95 North From Ex...",Interstate 95,Exit 344,30.24393,-81.58858
5,2016-10-04 22:09:08+00:00,new traffic congestion in duval on i-95 south...,official,fl511_northeast,NEW Traffic congestion in Duval on I-95 south...,Florida,1,At New Traffic Congestion At Duval At Interst...,{'Interstate 95 South From Exit 337 Interstate...,Interstate 95,Exit 337,30.15859,-81.55057
6,2016-10-04 22:02:53+00:00,update traffic congestion in duval on i-295 w...,official,fl511_northeast,UPDATE Traffic congestion in Duval on I-295 W...,Florida,1,At Update Traffic Congestion At Duval At Inte...,"{'Wilson Blvd', 'Exit 25', 'Interstate 295 Wes...",Interstate 295,Exit 25,30.36906,-81.76069
7,2016-10-04 21:58:14+00:00,update disabled vehicle in duval on i-95 nort...,official,fl511_northeast,UPDATE Disabled vehicle in Duval on I-95 nort...,Florida,1,At Update Disabled Vehicle At Duval At Inters...,"{'Warren', 'Interstate 95 North'}",Interstate 95,,,
8,2016-10-04 21:58:12+00:00,update traffic congestion in duval on sr-202 ...,official,fl511_northeast,UPDATE Traffic congestion in Duval on SR-202 ...,Florida,1,At Update Traffic Congestion At Duval At Sr-2...,"{'Kernan Blvd', 'Blvd East From Southside'}",,,,
9,2016-10-04 21:58:11+00:00,update traffic congestion in duval on i-10 we...,official,fl511_northeast,UPDATE Traffic congestion in Duval on I-10 we...,Florida,1,At Update Traffic Congestion At Duval At Inte...,"{'Exit 357 Lane Ave', 'Interstate 10 West From...",Interstate 10,Exit 363,,


In [91]:
final_df.to_csv("../data/Loc_Extracted/tweet_locations_sample_07302019.csv", index = False)