### Named Entity Recognition with spacy library - Foretify.ai - Friday July 1

In [16]:
!pip install geopandas

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting geopandas
  Downloading geopandas-0.10.2-py2.py3-none-any.whl (1.0 MB)
[K     |████████████████████████████████| 1.0 MB 26.1 MB/s 
Collecting fiona>=1.8
  Downloading Fiona-1.8.21-cp37-cp37m-manylinux2014_x86_64.whl (16.7 MB)
[K     |████████████████████████████████| 16.7 MB 32.3 MB/s 
Collecting pyproj>=2.2.0
  Downloading pyproj-3.2.1-cp37-cp37m-manylinux2010_x86_64.whl (6.3 MB)
[K     |████████████████████████████████| 6.3 MB 21.1 MB/s 
Collecting click-plugins>=1.0
  Downloading click_plugins-1.1.1-py2.py3-none-any.whl (7.5 kB)
Collecting munch
  Downloading munch-2.5.0-py2.py3-none-any.whl (10 kB)
Collecting cligj>=0.5
  Downloading cligj-0.7.2-py3-none-any.whl (7.1 kB)
Installing collected packages: munch, cligj, click-plugins, pyproj, fiona, geopandas
Successfully installed click-plugins-1.1.1 cligj-0.7.2 fiona-1.8.21 geopandas-0.10.2 munch-2.5.0 pyproj-3.2.1


In [27]:
import spacy
from spacy import displacy
import os 
import pandas as pd
import geopandas as gpd
import geopy
import matplotlib.pyplot as plt
from geopy.extra.rate_limiter import RateLimiter
import folium
from folium.plugins import FastMarkerCluster

In [13]:
extract = spacy.load('en_core_web_sm')

doc = text('William Han was last seen in Torrance. He was riding his bike around Los Angeles, then rode the bus to Carson.')

displacy.render(doc, style='ent')

'<div class="entities" style="line-height: 2.5; direction: ltr">\n<mark class="entity" style="background: #aa9cfc; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    William Han\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">PERSON</span>\n</mark>\n was last seen in \n<mark class="entity" style="background: #feca74; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    Torrance\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">GPE</span>\n</mark>\n. He was riding his bike around \n<mark class="entity" style="background: #feca74; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    Los Angeles\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle

In [10]:
!ls

0.txt	 104.txt  1.txt    307.txt  311.txt  318.txt  sample_data
100.txt  105.txt  304.txt  308.txt  312.txt  319.txt
101.txt  106.txt  305.txt  309.txt  313.txt  31.txt
103.txt  10.txt   306.txt  310.txt  314.txt  32.txt


In [14]:
locations = []

for i in os.listdir('./'):
  if 'txt' in i:
    with open(f'./{i}', encoding='utf-8') as f:
      doc = extract(f.read())
      locations.extend([[i, ent.text, ent.start, ent.end]
                        for ent in doc.ents if ent.label_ in ['LOC']])
      
df = pd.DataFrame(locations, columns=['File', 'Location', 'start', 'end'])
df.head()


Unnamed: 0,File,Location,start,end
0,313.txt,Mamo,272,273
1,10.txt,Lake Manassas,424,426
2,0.txt,the Mississippi River,98,101
3,32.txt,25th District,34,36
4,104.txt,Mississippi State,357,359


In [18]:
locator = geopy.geocoders.Nominatim(user_agent='mygeocoder')
geocode = RateLimiter(locator.geocode, min_delay_seconds=1)
df['address'] = df['Location'].apply(geocode)

In [19]:
len(df)

20

In [22]:
df['address'][0]

Location(Mamo, Gwer West, Benue, Nigeria, (7.559, 8.0912, 0.0))

In [23]:
df['coordinates'] = df['address'].apply(lambda loc: tuple(loc.point) if loc else None)
df[['latitude', 'longitude', 'altitude']] = pd.DataFrame(df['coordinates'].tolist(), index=df.index)

df.latitude.isnull().sum()

df=df[pd.notnull(df['latitude'])]

In [29]:
folium_map = folium.Map(location=[59.338315, 18.089960], 
                        zoom_start = 1, tiles = 'CartoDB dark_matter')

FastMarkerCluster(data=list(zip(df['latitude'].values,
                                df['longitude'].values))).add_to(folium_map)
folium.LayerControl().add_to(folium_map)
folium_map 