# Named Entity Recognition

This project aims to take preconditioned text and apply transformations for:
+ Tagging named entities
+ Entity Recognition
+ Entity Disambiguation

**Extract text from an article**

In [1]:
# Download the raw HTML of the CNBC article from its URL.
# `import urllib` alone does not load the `request` submodule, so import it
# explicitly instead of relying on another package having imported it first.
import urllib.request

# The context manager closes the HTTP response once the body has been read.
with urllib.request.urlopen('https://www.cnbc.com/2020/06/27/us-coronavirus-cases-surge-by-more-than-45000-as-states-roll-back-reopenings.html') as response:
    html = response.read()

In [2]:
# Use BeautifulSoup (or another HTML parsing package) to extract text from the article.
from bs4 import BeautifulSoup
from bs4.element import Comment

def tag_visible(element):
    """Return True when a parsed text node should count as visible page text.

    A node is rejected when its parent tag's name is one of the non-rendered
    containers listed below, or when the node itself is a ``Comment``.
    """
    non_rendered_parents = ['style', 'script', 'head', 'title', 'meta', '[document]']
    is_hidden = element.parent.name in non_rendered_parents
    # The parent check runs first; the Comment check is only reached for
    # nodes whose parent is not in the list above.
    return not (is_hidden or isinstance(element, Comment))

def text_from_html(body):
    """Extract the visible text of an HTML document as one string.

    Args:
        body: HTML markup handed straight to ``BeautifulSoup``.

    Returns:
        str: every text node accepted by ``tag_visible``, stripped of
        surrounding whitespace and joined with a single-space separator.
        Whitespace-only nodes contribute empty strings, so runs of several
        spaces can appear in the result.
    """
    soup = BeautifulSoup(body, 'html.parser')
    # find_all(string=True) is the current spelling of the legacy
    # findAll(text=True); it returns the same text nodes.
    texts = soup.find_all(string=True)
    visible_texts = filter(tag_visible, texts)
    return " ".join(t.strip() for t in visible_texts)

# NOTE: despite its name, cnbc_url holds the article's extracted visible text
# (the string returned by text_from_html), not a URL.
cnbc_url=text_from_html(html)

**Extract Company/Organization and Geo entities**

---



In [3]:
import spacy as sp

In [4]:
# Load the pre-trained spaCy pipeline "en_core_web_sm".
# NOTE(review): this assumes the model package is already installed in the
# kernel's environment; no install step for it appears in this notebook.
nlp = sp.load("en_core_web_sm")

In [5]:
# Run the spaCy pipeline over the article text, then tally the recognised
# entities by their label.
from collections import Counter

article = nlp(cnbc_url)
labels = list(map(lambda ent: ent.label_, article.ents))
Counter(labels)

Counter({'CARDINAL': 26,
         'DATE': 53,
         'GPE': 46,
         'MONEY': 1,
         'NORP': 3,
         'ORG': 35,
         'PERCENT': 15,
         'PERSON': 17,
         'TIME': 4})

In [6]:
# Render the article inline with its entities highlighted, restricted to
# the ORG and GPE labels.
from spacy import displacy

options = dict(ents=["ORG", "GPE"])
displacy.render(
    article,
    style="ent",
    options=options,
    jupyter=True,
)

In [7]:
# Print every ORG and GPE entity from the article as "LABEL - text",
# in the order the entities appear in article.ents.
for ent in article.ents:
    if ent.label_ in ('ORG', 'GPE'):
        print(f"{ent.label_} - {ent.text}")

ORG - Watchlist Business Economy Finance Health & Science Media
ORG - White House
ORG - CNBC
ORG - Watchlist Business Economy Finance Health & Science Media
ORG - White House
ORG - CNBC
ORG - Science
GPE - U.S.
ORG - Updated Sun
ORG - Johns Hopkins University
GPE - U.S.
GPE - U.S.
GPE - Arizona
GPE - Texas
GPE - California
GPE - Florida
GPE - Nevada
GPE - Texas
GPE - Florida
GPE - Arizona
GPE - U.S.
ORG - Johns Hopkins University
ORG - Johns Hopkins
GPE - U.S.
ORG - Johns Hopkins
GPE - U.S.
GPE - U.S.
GPE - Arizona
GPE - Texas
GPE - California
GPE - Florida
GPE - Nevada
ORG - White House
ORG - CNBC
ORG - the Milken Institute
ORG - CNBC
ORG - Covid Tracking Project
ORG - White House
ORG - White House
GPE - California
GPE - Texas
GPE - Arizona
ORG - department of health
GPE - Washington
GPE - Texas
GPE - Austin
GPE - Texas
GPE - Texas
ORG - Bexar
GPE - Dallas
GPE - Houston
GPE - San Antonio
GPE - Dallas
GPE - Austin
GPE - Texas
ORG - John Hopkins
GPE - the State of Texas
GPE - Arizona
GP

**Spark Entities Matched**

In [8]:
!pip install pyspark



In [9]:
from pyspark.conf import SparkConf
from pyspark import SparkContext
from pyspark.sql import SQLContext

# getOrCreate() reuses an already-running context, so re-executing this cell
# no longer fails the way a second bare SparkContext() call does.
sc = SparkContext.getOrCreate()
config = sc.getConf()
# SQLContext is the entry point used below to read the CSV into a DataFrame.
sqlContext = SQLContext(sc)

print("Using Apache Spark Version", sc.version)

Using Apache Spark Version 3.0.0


In [10]:
# Read the comma-delimited CSV (first row is the header) into a Spark
# DataFrame, letting Spark infer the column types.
# NOTE(review): presumably the Crunchbase Open Data Map export, judging by
# the file name and the crunchbase_uuid column used later — confirm.
# The path assumes Google Drive is mounted at /content/drive.
df = (
    sqlContext.read
    .option("header", "true")
    .option("delimiter", ",")
    .option("inferSchema", "true")
    .csv("/content/drive/My Drive/data/cb_odm_092419.csv")
)

In [11]:
# Create the list of distinct Company/Organization (ORG) entities found in
# the article, keeping first-seen order. dict.fromkeys de-duplicates in a
# single pass instead of re-scanning the growing list for every entity.
org_list = list(dict.fromkeys(x.text for x in article.ents if x.label_ == 'ORG'))
org_list

['Watchlist Business Economy Finance Health & Science Media',
 'White House',
 'CNBC',
 'Science',
 'Updated Sun',
 'Johns Hopkins University',
 'Johns Hopkins',
 'the Milken Institute',
 'Covid Tracking Project',
 'department of health',
 'Bexar',
 'John Hopkins',
 'ICU',
 'CNN',
 'Nevada Nevada',
 'Nate Rattner',
 'CNBC PRO Licensing & Reprints',
 'Global Business and Financial News',
 'Market Data',
 'Disclaimers Data']

In [None]:
# Find rows of df whose `name` contains one of the Company/Organization
# entities collected in org_list, using rlike.
import re

for org in org_list:
    # rlike interprets its argument as a regular expression. Escape the entity
    # text so characters such as '.', '(', '+' or '&' are matched literally
    # instead of changing the pattern or raising a regex syntax error.
    match_df = df.filter(df['name'].rlike(re.escape(org)))
    if match_df.count() > 0:
        # Show at most 10 matching rows, without truncating long values.
        match_df.select('crunchbase_uuid', 'name', 'homepage_domain', 'stock_symbol').show(10, truncate=False)

+------------------------------------+---------------------------------------------------+-------------------------+------------+
|crunchbase_uuid                     |name                                               |homepage_domain          |stock_symbol|
+------------------------------------+---------------------------------------------------+-------------------------+------------+
|8c3d547e-2b14-2787-a593-c656d253e2fb|White House Business Solutions Pvt Ltd.,           |whitehouseit.com         |:           |
|65fb148f-9051-a982-4697-185c675cc21b|The White House                                    |whitehouse.gov           |:           |
|c5ff601c-17bf-af99-3cfe-068b92c86110|White House Black Market                           |whitehouseblackmarket.com|:           |
|ac9c0e82-2903-0923-2993-001e9b009add|White House Brothers                               |whitehousebrothers.com   |:           |
|48d207fb-221a-6048-1e83-219f35a473a8|Flat White Houseboat                               |