# Location Extraction and Spacy Word Vectorization

In [5]:
import pandas as pd
import numpy as np
import re
import spacy
import string
import datetime

In [6]:
# Read in csv with Tweets
twitter_closures = pd.read_csv("../data/Cleaned_Tweets/cleaned_historic_official_07292019.csv")
exits = pd.read_csv("../data/interstate_exits.csv")

twitter_closures = twitter_closures[['date', 'text', 'type', 'username', 'tweet', 'road_closure']]

# Print DF shape
print(twitter_closures.shape)

# Show head 
twitter_closures.head()

(24054, 6)


Unnamed: 0,date,text,type,username,tweet,road_closure
0,2016-10-11 16:39:51+00:00,The PIOH for the SR 138 I-20 is going on now u...,official,GDOTATL,The PIOH for the SR 138 I-20 is going on now u...,0
1,2016-10-10 19:10:23+00:00,We appreciate all the hard work our crews have...,official,GDOTATL,We appreciate all the hard work our crews have...,0
2,2016-10-09 17:30:22+00:00,Need info on re-entry Check here Also follow...,official,GDOTATL,Need info on re-entry Check here Also follow...,0
3,2016-10-09 15:45:11+00:00,Do you need some updates You can find ALL of ...,official,GDOTATL,Do you need some updates You can find ALL of ...,0
4,2016-10-09 14:00:20+00:00,Did you know that the 511Georgia website and ...,official,GDOTATL,Did you know that the 511Georgia website and ...,0


## SpaCy Preprocessing

In [7]:
# Create new columns to transfer modified tweet text. Five versions of tweets will be created.
twitter_closures['modified_text'] = ''
twitter_closures['location'] = ''

# Show modified DF
twitter_closures.head(2)

Unnamed: 0,date,text,type,username,tweet,road_closure,modified_text,location
0,2016-10-11 16:39:51+00:00,The PIOH for the SR 138 I-20 is going on now u...,official,GDOTATL,The PIOH for the SR 138 I-20 is going on now u...,0,,
1,2016-10-10 19:10:23+00:00,We appreciate all the hard work our crews have...,official,GDOTATL,We appreciate all the hard work our crews have...,0,,


In [8]:
format_dict = {"hwy": "Highway ",
            "Blvd": "Boulevard",
            " st": "street",
           "CR ": "Country Road ",
           "SR ": "State Road",
           "I-": "Interstate ",
           "EB ": "Eastbound ",
           "WB ": "Westbound ",
           "SB ": "Southbound",
           "NB ": "Northbound",
           " on ": " at ",
           " E ": " East ",
           " W ": " West ",
           " S ": " South",
           " N ": " North",
           "mi ": "mile ",
           "between ": "at ",
           "Between ": "at ",
           "In ": "in ",
           " in ": " at "}

In [9]:
def spacy_cleaner(df, col, word_dict):
    modified_text = "At " + df[col].replace(word_dict, regex=True)
    modified_text = modified_text.str.title()
    return modified_text

In [10]:
# run the text cleaning function and test results
twitter_closures['modified_text'] = spacy_cleaner(twitter_closures, 'text', format_dict)

In [11]:
twitter_closures['username'].unique()

array(['GDOTATL', 'SCDOTMidlands', 'SCDOTPeeDee', 'SCDOTLowCountry',
       'SCDOTPiedmont', '511statewideva', 'fl511_panhandl', '511Georgia',
       'fl511_state', 'fl511_northeast', 'fl511_southeast',
       'fl511_southwest', 'fl511_tampabay', 'fl511_central',
       '511centralva', '511hamptonroads', '511northernva',
       'NCDOT_Westmtn', 'NCDOT_Triangle', 'NCDOT_Piedmont',
       'NCDOT_Charlotte', 'NCDOT_Asheville', 'NCDOT_Scoast',
       'NCDOT_Ncoast'], dtype=object)

In [12]:
twitter_closures['date'] = pd.to_datetime(twitter_closures['date'])

In [13]:
test_df = twitter_closures[(twitter_closures['road_closure'] == 1) & (twitter_closures['username'] == 'fl511_northeast')].head(100)

In [14]:
def get_loc(df, text_column, location_column):
    
    # Use Spacy to extract location names from `text` column
    for i in range(len(df)):
        
        #instantiate spacy model
        nlp = spacy.load("en_core_web_sm")
        
        # create documewnt from modified text column
        doc = nlp(df[text_column].iloc[i])
        
        locations = set()
        
        # loop through every entity in the doc
        for ent in doc.ents:
            
            # find entities labelled as places
            if (ent.label_=='GPE') or (ent.label_=='FAC') or (ent.label_ == 'LOC'):
                
                # put locations in a set
                locations.add(ent.text)
                df[location_column].iloc[i] = locations
                
    return df[location_column]

In [15]:
test = get_loc(test_df, 'modified_text', 'location')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [16]:
test_df['location'] = test
test_df.head()

Unnamed: 0,date,text,type,username,tweet,road_closure,modified_text,location
8976,2016-10-13 23:33:35+00:00,CLEARED Traffic congestion in Duval on I-295 ...,official,fl511_northeast,CLEARED Traffic congestion in Duval on I-295 ...,1,At Cleared Traffic Congestion At Duval At Int...,"{Exit 5 San Jose, Interstate 295 West North, E..."
8977,2016-10-13 23:08:56+00:00,CLEARED Traffic congestion in Duval on I-295 ...,official,fl511_northeast,CLEARED Traffic congestion in Duval on I-295 ...,1,At Cleared Traffic Congestion At Duval At Int...,"{Interstate 295 East South, Exit 53}"
8978,2016-10-13 23:03:56+00:00,UPDATE Disabled vehicle in Duval on I-95 sout...,official,fl511_northeast,UPDATE Disabled vehicle in Duval on I-95 sout...,1,At Update Disabled Vehicle At Duval At Inters...,{Interstate 95 South Ramp From Exit 344 Butler}
8979,2016-10-13 23:03:34+00:00,CLEARED Traffic congestion in Duval on I-295 ...,official,fl511_northeast,CLEARED Traffic congestion in Duval on I-295 ...,1,At Cleared Traffic Congestion At Duval At Int...,"{Interstate 295 East North, Exit 56}"
8980,2016-10-13 22:53:49+00:00,CLEARED Disabled vehicle in Duval on I-95 nor...,official,fl511_northeast,CLEARED Disabled vehicle in Duval on I-95 nor...,1,At Cleared Disabled Vehicle At Duval At Inter...,{Interstate 95 North Ramp To Exit 353B Union}


In [17]:
test_df.to_csv("../data/Loc_Extracted/tweet_locations_sample_07302019.csv", index = False)

In [18]:
test_df = pd.read_csv("../data/Loc_Extracted/tweet_locations_sample_07302019.csv")

In [146]:
def exit_extractor (df, col, i_df):
    
    exits = []
    interstates = []
    lat = []
    long = []
    direction = []
    
    for item in df[col]:
        if 'Interstate' in item:
            
            i_string = re.search(r'Interstate (\S+)', item)
            interstates.append(i_string.group(0))
            
            if 'Exit' in item:
                e_string = re.search(r'Exit (\S+)', item)
                exits.append(e_string.group(0))
            else:
                exits.append("None")
        else:
            interstates.append("None")
            exits.append("None")
    
    new_df = pd.DataFrame(data = interstates, columns = ['interstate'])
    new_df['exits'] = exits
    
    for index, row in new_df.iterrows():
        
        if (row['interstate'] != "None") and (row['exits'] != "None"):
            
            mask = (i_df['interstate'].str.contains(row['interstate'])) & (i_df['exits'].str.contains(row['exits']))
            
            try:
                print(i_df[mask]['exits'].iloc[0] + ' ' + i_df[mask]['interstate'].iloc[0] + ' ' + i_df[mask]['dir'].iloc[0])
                
                lat.append(i_df[mask].iloc[0]['lat'])
                long.append(i_df[mask].iloc[0]['long'])
                
            except:
                print(f"No exit found at {index}")
                
                lat.append(np.nan)
                long.append(np.nan)

        else:
            print("No exit found")
            lat.append(np.nan)
            long.append(np.nan)
            
    
    new_df['lat'] = lat
    new_df['long'] = long
    
    return new_df

In [68]:
exits[(exits['interstate'] == 'Interstate 295') & (exits['exits'] == "Exit 12")].iloc[0]['lat']

'30.19137'

In [147]:
e_df = exit_extractor(test_df, 'modified_text', exits)

Exit 3 Interstate 295 S
Exit 53 Interstate 295 S
Exit 344 Interstate 95 N
Exit 56 Interstate 295 S
Exit 353B Interstate 95 N
Exit 347 Interstate 95 N
Exit 56 Interstate 295 S
No exit found
No exit found
No exit found
Exit 344 Interstate 95 N
No exit found
Exit 3 Interstate 295 S
Exit 53 Interstate 295 S
Exit 353B Interstate 95 N
Exit 12 Interstate 295 N
No exit found
Exit 355 Interstate 95 N
Exit 347 Interstate 95 N
Exit 344 Interstate 95 N
Exit 337 Interstate 95 N
Exit 12 Interstate 295 N
No exit found
Exit 53 Interstate 295 S
Exit 346A Interstate 95 S
Exit 25 Interstate 295 N
Exit 337 Interstate 95 N
No exit found
Exit 52 Interstate 295 S
No exit found
Exit 52 Interstate 295 S
No exit found
No exit found
No exit found
Exit 360 Interstate 10 E
No exit found
Exit 25 Interstate 295 N
Exit 53 Interstate 295 S
Exit 25 Interstate 295 N
No exit found
Exit 337 Interstate 95 N
Exit 53 Interstate 295 S
No exit found
Exit 337 Interstate 95 N
No exit found
No exit found
No exit found
Exit 53 Int

In [150]:
e_df.shape

(100, 4)

In [138]:
exits[(exits['interstate'] == 'Interstate 95') & (exits['exits'] == 'Exit 348')]

Unnamed: 0.1,Unnamed: 0,exit,crossSt,dir,lat,long,interstate,exits
252,252,348,US-90 Atlantic Blvd,N,30.30263,-81.64124,Interstate 95,Exit 348


In [36]:
for item in exit_df.iterrows():
    if item[1]['exits'] != "None":
        print(item[1]['exits'])

Exit 3
Exit 53
Exit 344
Exit 56
Exit 353B
Exit 347
Exit 56
Exit 344
Exit 3
Exit 53
Exit 353B
Exit 12
Exit 355
Exit 347
Exit 344
Exit 337
Exit 12
Exit 53
Exit 346
Exit 25
Exit 337
Exit 52
Exit 52
Exit 360
Exit 25
Exit 53
Exit 25
Exit 337
Exit 53
Exit 337
Exit 53
Exit 360
Exit 360
Exit 61
Exit 360
Exit 360
Exit 355
Exit 61
Exit 3
Exit 3
Exit 349
Exit 58
Exit 346
Exit 3
Exit 356
Exit 51
Exit 344
Exit 354
Exit 349
Exit 350
Exit 52
Exit 58
Exit 53
Exit 337
Exit 346
Exit 345
Exit 355
Exit 350
Exit 344
Exit 347
Exit 53
Exit 355
Exit 346
Exit 346
Exit 354
Exit 353
Exit 3
Exit 3
Exit 354
Exit 354


In [87]:
exits

Unnamed: 0.1,Unnamed: 0,exit,crossSt,dir,lat,long,interstate,exits
0,0,1A,,S,25.75506,-80.20200,Interstate 95,Exit 1A
1,1,1B,,N,25.76255,-80.19960,Interstate 95,Exit 1B
2,2,1B,,S,25.77588,-80.19986,Interstate 95,Exit 1B
3,3,2A,,N,25.77139,-80.19882,Interstate 95,Exit 2A
4,4,2B,,N,25.77339,-80.19866,Interstate 95,Exit 2B
5,5,2C,,S,25.7752,-80.19976,Interstate 95,Exit 2C
6,6,2D,,N,25.78082,-80.19989,Interstate 95,Exit 2D
7,7,3A,,S,25.79316,-80.20553,Interstate 95,Exit 3A
8,8,3B,,S,25.7877,-80.20368,Interstate 95,Exit 3B
9,9,2D,,S,25.79003,-80.20486,Interstate 95,Exit 2D


In [151]:
exits = exit_extractor(test_df, 'modified_text', i95)
final_df = pd.concat([test_df, exits], axis = 1)
final_df.head()

Unnamed: 0,date,text,type,username,tweet,road_closure,modified_text,location,interstate,exits,lat,long
0,2016-11-29 23:59:23+00:00,"NEW Crash in Duval on I-295 E north beyond Philips Hwy, right lane blocked",official,fl511_northeast,"NEW Crash in Duval on I-295 E north beyond Philips Hwy, right lane blocked",1,"At New Crash At Duval At Interstate 295 East North Beyond Philips Hwy, Right Lane Blocked","{'Interstate 295 East North', 'New Crash At Duval', 'Right Lane'}",Interstate 295,,,
1,2016-11-29 23:59:15+00:00,CLEARED Traffic congestion in Duval on I-95 north from before Fuller Warren to ramp to Exit 351 I-10,official,fl511_northeast,CLEARED Traffic congestion in Duval on I-95 north from before Fuller Warren to ramp to Exit 351 I-10,1,At Cleared Traffic Congestion At Duval At Interstate 95 North From Before Fuller Warren To Ramp To Exit 351 Interstate 10,"{'Interstate 10', 'Interstate 95 North'}",Interstate 95,Exit 351,30.31568,-81.67278
2,2016-11-29 23:54:26+00:00,CLEARED Traffic congestion in Duval on I-295 W north from Exit 5 San Jose to at Buckman,official,fl511_northeast,CLEARED Traffic congestion in Duval on I-295 W north from Exit 5 San Jose to at Buckman,1,At Cleared Traffic Congestion At Duval At Interstate 295 West North From Exit 5 San Jose To At Buckman,"{'Interstate 295 West North', 'San Jose', 'Exit 5'}",Interstate 295,Exit 5,,
3,2016-11-29 23:49:10+00:00,UPDATE Traffic congestion in Duval on I-95 north from before Fuller Warren to ramp to Exit 351 I-10,official,fl511_northeast,UPDATE Traffic congestion in Duval on I-95 north from before Fuller Warren to ramp to Exit 351 I-10,1,At Update Traffic Congestion At Duval At Interstate 95 North From Before Fuller Warren To Ramp To Exit 351 Interstate 10,"{'Interstate 10', 'Interstate 95 North'}",Interstate 95,Exit 351,30.31568,-81.67278
4,2016-11-29 23:49:10+00:00,CLEARED Traffic congestion in Duval on I-295 E south from Exit 53 Butler to at Exit 56 Baymeadows,official,fl511_northeast,CLEARED Traffic congestion in Duval on I-295 E south from Exit 53 Butler to at Exit 56 Baymeadows,1,At Cleared Traffic Congestion At Duval At Interstate 295 East South From Exit 53 Butler To At Exit 56 Baymeadows,"{'Exit 56', 'Exit 53', 'Interstate 295 East South'}",Interstate 295,Exit 53,,


In [98]:
i95.head()

Unnamed: 0,exit,crossSt,dir,lat,long,interstate,exits
0,1A,,S,25.75506,-80.202,Interstate 95,Exit 1A
1,1B,,N,25.76255,-80.1996,Interstate 95,Exit 1B
2,1B,,S,25.77588,-80.19986,Interstate 95,Exit 1B
3,2A,,N,25.77139,-80.19882,Interstate 95,Exit 2A
4,2B,,N,25.77339,-80.19866,Interstate 95,Exit 2B


In [119]:
i95[i95['exits'].str.contains('Exit 351')].iloc[0]

exit                   351A
crossSt             Park St
dir                       N
lat                30.31568
long               -81.6728
interstate    Interstate 95
exits             Exit 351A
Name: 253, dtype: object

In [129]:
mylist = []
for row in final_df.iterrows():     
    if row[1]['interstate'] != "None":
            
        if (row[1]['interstate'] in i95['interstate'].unique()) and (row[1]['exits'] != "None"): 
            print(f"{row[1]['exits']} Lat: {i95[i95['exits'].str.contains(row[1]['exits'])]['lat'].iloc[0]}")
            mylist.append(row[1]['exits'])
        else:
            print("No Exit Found")
            mylist.append("None")

No Exit Found
Exit 351 Lat: 30.31568
No Exit Found
Exit 351 Lat: 30.31568
No Exit Found
Exit 344 Lat: 30.24393
No Exit Found
No Exit Found
No Exit Found
Exit 351 Lat: 30.31568
No Exit Found
No Exit Found
Exit 347 Lat: 30.288710000000002
Exit 344 Lat: 30.24393
No Exit Found
Exit 344 Lat: 30.24393
Exit 344 Lat: 30.24393
No Exit Found
No Exit Found
No Exit Found
Exit 337 Lat: 30.15859
No Exit Found
No Exit Found
Exit 347 Lat: 30.288710000000002
No Exit Found
Exit 347 Lat: 30.288710000000002
Exit 346 Lat: 30.27203
No Exit Found
Exit 351 Lat: 30.31568
Exit 337 Lat: 30.15859
No Exit Found
Exit 346 Lat: 30.27203
No Exit Found
No Exit Found
No Exit Found
Exit 344 Lat: 30.24393
No Exit Found
No Exit Found
No Exit Found
No Exit Found
No Exit Found
Exit 337 Lat: 30.15859
Exit 337 Lat: 30.15859
No Exit Found
Exit 353 Lat: 30.33461
No Exit Found
No Exit Found
No Exit Found
No Exit Found
Exit 347 Lat: 30.288710000000002
No Exit Found
Exit 351 Lat: 30.31568
No Exit Found
Exit 353 Lat: 30.33461
No Exi

In [131]:
len(mylist)

100

In [132]:
len(final_df)

100