In [2]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

In [39]:
# Sample company description used as the input text for every NLTK and spaCy example below.
ex = 'A trading history dating back to 1961 in Abu Dhabi, UAE and the region with a combination of strategic sponsorships, agency agreements, alliances and joint ventures partnerships, UTS has been instrumental in introducing to the regional markets globally branded exclusive engineered products and services to its customers. To name a few of our global partners, Carrier, NSSMC – Nippon Steel & Sumitomo Metal, Yokogawa, Alfa Laval, Baker Hughes Drill Bits, OCS Total Facilities Management, Meco, Evac and Ecochlor in Water Desalination and Total Water Treatment, Roemex, ChemTreat, SNC Lavalin/Kentz Engineering Procurement and Contracting business.'

In [40]:
# Echo the sample text to check what was stored.
ex

'A trading history dating back to 1961 in Abu Dhabi, UAE and the region with a combination of strategic sponsorships, agency agreements, alliances and joint ventures partnerships, UTS has been instrumental in introducing to the regional markets globally branded exclusive engineered products and services to its customers. To name a few of our global partners, Carrier, NSSMC – Nippon Steel & Sumitomo Metal, Yokogawa, Alfa Laval, Baker Hughes Drill Bits, OCS Total Facilities Management, Meco, Evac and Ecochlor in Water Desalination and Total Water Treatment, Roemex, ChemTreat, SNC Lavalin/Kentz Engineering Procurement and Contracting business.'

In [41]:
def preprocess(sent):
    """Tokenize raw text and attach a part-of-speech tag to every token.

    The text is split with NLTK's word tokenizer and the tokens are passed
    to NLTK's POS tagger; the tagger's result (a list of ``(token, tag)``
    pairs) is returned as is.
    """
    tokens = nltk.word_tokenize(sent)
    return nltk.pos_tag(tokens)

In [42]:
# Show the (token, POS tag) pairs produced for the sample text.
preprocess(ex)

[('A', 'DT'),
 ('trading', 'NN'),
 ('history', 'NN'),
 ('dating', 'VBG'),
 ('back', 'RB'),
 ('to', 'TO'),
 ('1961', 'CD'),
 ('in', 'IN'),
 ('Abu', 'NNP'),
 ('Dhabi', 'NNP'),
 (',', ','),
 ('UAE', 'NNP'),
 ('and', 'CC'),
 ('the', 'DT'),
 ('region', 'NN'),
 ('with', 'IN'),
 ('a', 'DT'),
 ('combination', 'NN'),
 ('of', 'IN'),
 ('strategic', 'JJ'),
 ('sponsorships', 'NNS'),
 (',', ','),
 ('agency', 'NN'),
 ('agreements', 'NNS'),
 (',', ','),
 ('alliances', 'NNS'),
 ('and', 'CC'),
 ('joint', 'JJ'),
 ('ventures', 'NNS'),
 ('partnerships', 'NNS'),
 (',', ','),
 ('UTS', 'NNP'),
 ('has', 'VBZ'),
 ('been', 'VBN'),
 ('instrumental', 'JJ'),
 ('in', 'IN'),
 ('introducing', 'VBG'),
 ('to', 'TO'),
 ('the', 'DT'),
 ('regional', 'JJ'),
 ('markets', 'NNS'),
 ('globally', 'RB'),
 ('branded', 'VBD'),
 ('exclusive', 'JJ'),
 ('engineered', 'VBN'),
 ('products', 'NNS'),
 ('and', 'CC'),
 ('services', 'NNS'),
 ('to', 'TO'),
 ('its', 'PRP$'),
 ('customers', 'NNS'),
 ('.', '.'),
 ('To', 'TO'),
 ('name', 'VB'),
 

In [43]:
# Keep the tagged tokens; the chunk parser below takes them as its input.
sent=preprocess(ex)

### Chunking
Chunking is the process of extracting phrases from unstructured text. Instead of relying on single tokens, which may not capture the actual meaning of the text, it is advisable to treat a phrase such as “South Africa” as one unit rather than as the separate words “South” and “Africa”.

In [44]:
# Chunk grammar with a single rule: a noun phrase (NP) is formed from an
# optional determiner (DT), followed by any number of adjectives (JJ),
# followed by one noun tagged NN. Tokens tagged NNS or NNP do not satisfy the
# final <NN> element, so plural and proper nouns stay outside the chunks.
pattern='NP:{<DT>?<JJ>*<NN>}'

In [45]:
# Display the grammar string.
pattern

'NP:{<DT>?<JJ>*<NN>}'

In [46]:
# Build a regular-expression chunk parser from the grammar ...
cp=nltk.RegexpParser(pattern)
# ... and apply it to the POS-tagged tokens; the result is a chunk tree.
cs=cp.parse(sent)

In [47]:
# Print the chunk tree in bracketed form; NP chunks appear as nested (NP ...) groups.
print(cs)

(S
  (NP A/DT trading/NN)
  (NP history/NN)
  dating/VBG
  back/RB
  to/TO
  1961/CD
  in/IN
  Abu/NNP
  Dhabi/NNP
  ,/,
  UAE/NNP
  and/CC
  (NP the/DT region/NN)
  with/IN
  (NP a/DT combination/NN)
  of/IN
  strategic/JJ
  sponsorships/NNS
  ,/,
  (NP agency/NN)
  agreements/NNS
  ,/,
  alliances/NNS
  and/CC
  joint/JJ
  ventures/NNS
  partnerships/NNS
  ,/,
  UTS/NNP
  has/VBZ
  been/VBN
  instrumental/JJ
  in/IN
  introducing/VBG
  to/TO
  the/DT
  regional/JJ
  markets/NNS
  globally/RB
  branded/VBD
  exclusive/JJ
  engineered/VBN
  products/NNS
  and/CC
  services/NNS
  to/TO
  its/PRP$
  customers/NNS
  ./.
  To/TO
  name/VB
  a/DT
  few/JJ
  of/IN
  our/PRP$
  global/JJ
  partners/NNS
  ,/,
  Carrier/NNP
  ,/,
  NSSMC/NNP
  –/NNP
  Nippon/NNP
  Steel/NNP
  &/CC
  Sumitomo/NNP
  Metal/NNP
  ,/,
  Yokogawa/NNP
  ,/,
  Alfa/NNP
  Laval/NNP
  ,/,
  Baker/NNP
  Hughes/NNP
  Drill/NNP
  Bits/NNP
  ,/,
  OCS/NNP
  Total/NNP
  Facilities/NNP
  Management/NNP
  ,/,
  Meco/NNP
  ,/,
 

In [48]:
from nltk.chunk import conlltags2tree, tree2conlltags
from pprint import pprint

In [49]:
# Flatten the chunk tree into (word, POS tag, IOB chunk tag) triples.
iob_tagged=tree2conlltags(cs)

In [50]:
# Pretty-print the triples, one per line.
pprint(iob_tagged)

[('A', 'DT', 'B-NP'),
 ('trading', 'NN', 'I-NP'),
 ('history', 'NN', 'B-NP'),
 ('dating', 'VBG', 'O'),
 ('back', 'RB', 'O'),
 ('to', 'TO', 'O'),
 ('1961', 'CD', 'O'),
 ('in', 'IN', 'O'),
 ('Abu', 'NNP', 'O'),
 ('Dhabi', 'NNP', 'O'),
 (',', ',', 'O'),
 ('UAE', 'NNP', 'O'),
 ('and', 'CC', 'O'),
 ('the', 'DT', 'B-NP'),
 ('region', 'NN', 'I-NP'),
 ('with', 'IN', 'O'),
 ('a', 'DT', 'B-NP'),
 ('combination', 'NN', 'I-NP'),
 ('of', 'IN', 'O'),
 ('strategic', 'JJ', 'O'),
 ('sponsorships', 'NNS', 'O'),
 (',', ',', 'O'),
 ('agency', 'NN', 'B-NP'),
 ('agreements', 'NNS', 'O'),
 (',', ',', 'O'),
 ('alliances', 'NNS', 'O'),
 ('and', 'CC', 'O'),
 ('joint', 'JJ', 'O'),
 ('ventures', 'NNS', 'O'),
 ('partnerships', 'NNS', 'O'),
 (',', ',', 'O'),
 ('UTS', 'NNP', 'O'),
 ('has', 'VBZ', 'O'),
 ('been', 'VBN', 'O'),
 ('instrumental', 'JJ', 'O'),
 ('in', 'IN', 'O'),
 ('introducing', 'VBG', 'O'),
 ('to', 'TO', 'O'),
 ('the', 'DT', 'O'),
 ('regional', 'JJ', 'O'),
 ('markets', 'NNS', 'O'),
 ('globally', 'RB', '

In [51]:
# Run NLTK's named-entity chunker on the tokenized, POS-tagged sample text.
# The tokenize-then-tag input is built the same way preprocess() builds it.
ne_tree=nltk.ne_chunk(pos_tag(word_tokenize(ex)))

In [52]:
# Print the named-entity tree; recognised entities appear as labelled groups such as (GPE ...).
print(ne_tree)


(S
  A/DT
  trading/NN
  history/NN
  dating/VBG
  back/RB
  to/TO
  1961/CD
  in/IN
  (GPE Abu/NNP Dhabi/NNP)
  ,/,
  (ORGANIZATION UAE/NNP)
  and/CC
  the/DT
  region/NN
  with/IN
  a/DT
  combination/NN
  of/IN
  strategic/JJ
  sponsorships/NNS
  ,/,
  agency/NN
  agreements/NNS
  ,/,
  alliances/NNS
  and/CC
  joint/JJ
  ventures/NNS
  partnerships/NNS
  ,/,
  (ORGANIZATION UTS/NNP)
  has/VBZ
  been/VBN
  instrumental/JJ
  in/IN
  introducing/VBG
  to/TO
  the/DT
  regional/JJ
  markets/NNS
  globally/RB
  branded/VBD
  exclusive/JJ
  engineered/VBN
  products/NNS
  and/CC
  services/NNS
  to/TO
  its/PRP$
  customers/NNS
  ./.
  To/TO
  name/VB
  a/DT
  few/JJ
  of/IN
  our/PRP$
  global/JJ
  partners/NNS
  ,/,
  (PERSON Carrier/NNP)
  ,/,
  (ORGANIZATION NSSMC/NNP)
  –/NNP
  (PERSON Nippon/NNP Steel/NNP)
  &/CC
  (PERSON Sumitomo/NNP Metal/NNP)
  ,/,
  (GPE Yokogawa/NNP)
  ,/,
  (PERSON Alfa/NNP Laval/NNP)
  ,/,
  (PERSON Baker/NNP Hughes/NNP Drill/NNP Bits/NNP)
  ,/,
  (ORGANIZA

In [64]:
import spacy
from spacy import displacy
from collections import Counter
# Load the 'en_core_web_sm' spaCy pipeline; `nlp` is used for all spaCy processing below.
nlp=spacy.load('en_core_web_sm')


In [65]:
# NOTE(review): this repeats the spacy.load() call made at the end of the
# previous cell; one of the two loads can be dropped.
nlp=spacy.load('en_core_web_sm')

In [66]:
# Run the spaCy pipeline on the sample text.
doc=nlp(ex)

In [67]:
# List every entity spaCy found in the sample text together with its label.
[(ent.text, ent.label_) for ent in doc.ents]

[('1961', 'DATE'),
 ('Abu Dhabi', 'GPE'),
 ('UAE', 'GPE'),
 ('UTS', 'ORG'),
 ('Carrier, NSSMC – Nippon Steel & Sumitomo Metal', 'ORG'),
 ('Yokogawa', 'ORG'),
 ('Alfa Laval', 'PERSON'),
 ('Baker Hughes Drill Bits', 'PERSON'),
 ('OCS Total Facilities Management', 'ORG'),
 ('Meco', 'ORG'),
 ('Evac', 'PERSON'),
 ('Total Water Treatment', 'WORK_OF_ART'),
 ('Roemex', 'ORG'),
 ('ChemTreat', 'ORG'),
 ('SNC Lavalin/Kentz Engineering Procurement', 'ORG'),
 ('Contracting', 'NORP')]

In [68]:
from bs4 import BeautifulSoup
import requests,re

In [69]:
def url_to_string(url, timeout=10):
    """Download a web page and return its visible text as one string.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        The page text with ``<script>``, ``<style>`` and ``<aside>`` elements
        removed and every run of newlines/tabs replaced by a single space.

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with a 4xx/5xx status.
    """
    res=requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors so an error page is never analysed as if it
    # were the requested article.
    res.raise_for_status()
    soup=BeautifulSoup(res.text,'html5lib')
    # Remove non-content elements so their text does not end up in the output.
    for tag in soup(['script','style','aside']):
        tag.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))

In [70]:
# Fetch the UTS leadership page as plain text.
# NOTE(review): the name `ny_bb` says nothing about this page; consider
# renaming it together with its use in the next cell.
ny_bb=url_to_string('https://www.uts.ae/leadership')

In [71]:
# Run the spaCy pipeline over the scraped page text.
article=nlp(ny_bb)

In [72]:
# Number of entity spans spaCy found on the page.
len(article.ents)

369

In [73]:
# Tally how many entities of each label type were found on the page.
labels = [ent.label_ for ent in article.ents]
Counter(labels)

Counter({'ORG': 175,
         'CARDINAL': 8,
         'PERSON': 50,
         'DATE': 68,
         'GPE': 42,
         'NORP': 5,
         'WORK_OF_ART': 2,
         'ORDINAL': 1,
         'PRODUCT': 2,
         'LOC': 13,
         'FAC': 2,
         'LANGUAGE': 1})

In [74]:
# Collect the surface text of every entity on the page.
items = [ent.text for ent in article.ents]

In [75]:
# The five most frequent entity strings on the page.
Counter(items).most_common(5)

[('UTS', 32),
 ('United Technical Services', 7),
 ('Rick', 4),
 ('Dubai', 4),
 ('800-UTS', 3)]

In [76]:
# Materialise the document's sentence spans so they can be indexed.
sentences = list(article.sents)

In [77]:
# Pick one sentence (index 20) to inspect in detail in the following cells.
sentences[20]

He has also served on numerous public and private company boards and is at present a Director of the Emirates Cricket Board.                                                              

In [78]:
# Visualise the entities in the sample sentence. Its text is re-parsed as a
# stand-alone document before being handed to displaCy.
displacy.render(nlp(str(sentences[20])),jupyter=True,style='ent')

In [79]:
# (text, coarse POS, lemma) for each token of the sample sentence that is
# neither a stop word nor punctuation.
[(tok.orth_, tok.pos_, tok.lemma_)
 for tok in nlp(str(sentences[20]))
 if not (tok.is_stop or tok.pos_ == 'PUNCT')]

[('He', 'PRON', '-PRON-'),
 ('served', 'VERB', 'serve'),
 ('numerous', 'ADJ', 'numerous'),
 ('public', 'ADJ', 'public'),
 ('private', 'ADJ', 'private'),
 ('company', 'NOUN', 'company'),
 ('boards', 'NOUN', 'board'),
 ('present', 'ADJ', 'present'),
 ('Director', 'PROPN', 'director'),
 ('Emirates', 'PROPN', 'emirates'),
 ('Cricket', 'PROPN', 'cricket'),
 ('Board', 'PROPN', 'board'),
 ('                                                             ',
  'SPACE',
  '                                                             ')]

In [80]:
# Map each entity's text to its label for the sample sentence.
{str(ent): ent.label_ for ent in nlp(str(sentences[20])).ents}


{'the Emirates Cricket Board': 'ORG'}

In [81]:
# Token-level entity annotation of the sample sentence: the token, its IOB
# flag, and its entity type (empty string when the token is outside any entity).
print([(token, token.ent_iob_, token.ent_type_) for token in sentences[20]])


[(He, 'O', ''), (has, 'O', ''), (also, 'O', ''), (served, 'O', ''), (on, 'O', ''), (numerous, 'O', ''), (public, 'O', ''), (and, 'O', ''), (private, 'O', ''), (company, 'O', ''), (boards, 'O', ''), (and, 'O', ''), (is, 'O', ''), (at, 'O', ''), (present, 'O', ''), (a, 'O', ''), (Director, 'O', ''), (of, 'O', ''), (the, 'B', 'ORG'), (Emirates, 'I', 'ORG'), (Cricket, 'I', 'ORG'), (Board, 'I', 'ORG'), (., 'O', ''), (                                                             , 'O', '')]


In [82]:
# Render the entities of the whole page. The already-parsed `article` Doc is
# passed directly: calling str() on the sentence list would produce a Python
# list repr (surrounding brackets, comma separators) and force a second parse
# of that altered text.
displacy.render(article,jupyter=True,style='ent')