In [30]:
#install spacy on windows
!pip install -U spacy
!pip install -U spacy-lookups-data
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl (13.9 MB)
[K     |████████████████████████████████| 13.9 MB 5.1 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [31]:
from pandas.core.internals.construction import dataclasses_to_dicts
import urllib.request
import urllib.parse
import urllib
import json
import gzip
from io import StringIO, BytesIO
from pandas import Series,DataFrame
import pandas as pd

In [53]:
# use Spacy to add POS tags and lemmas
def process_spacy(pure_text):
  data_format = pd.DataFrame(columns=('token','lemma','pos','onset','offset','entity','babelfy_id(iob)','link'))
  nlp = spacy.load('en_core_web_sm')
  doc = nlp(pure_text)
  i= 0
  for token in doc:
    if len(token) <= 1:
      pass
    else:
      data_format.loc[i] = (token.text, token.lemma_, token.pos, None, None, None, None, None)
      i+=1
  return data_format

In [33]:
# use Babelfy to disambiguate and link to the entities
def process_babelfy(pure_text):
  service_url = 'https://babelfy.io/v1/disambiguate'

  text = pure_text
  lang = 'EN'
  key  = '01126cf0-7a1f-4276-9727-b742413aecf9'
  inf = 0
  params = {
    'text' : text,
    'lang' : lang,
    'key'  : key
  }

  url = service_url + '?' + urllib.parse.urlencode(params)
  request = urllib.request.Request(url)
  request.add_header('Accept-encoding', 'gzip')
  response = urllib.request.urlopen(request)

  data_Babelfy = pd.DataFrame(columns=('onset','offset','entity','babelfy_id(iob)','link','signal'))

  if response.info().get('Content-Encoding') == 'gzip':
    buf = BytesIO(response.read())
    f = gzip.GzipFile(fileobj=buf)
    data = json.loads(f.read())
    # retrieving data
    for result in data:
      # retrieving token fragment
      tokenFragment = result.get('tokenFragment')
      onset = tokenFragment.get('start')
      offset = tokenFragment.get('end')
      charFragment = result.get('charFragment')
      cfStart = charFragment.get('start')
      cfEnd = charFragment.get('end')
      entity = text[cfStart:cfEnd+1]
      babelfy_id = result.get('babelSynsetID')
      link = result.get('BabelNetURL')
      data_Babelfy.loc[inf] = (onset, offset, entity, babelfy_id, link, None)
      inf = inf+1
  #print(data_Babelfy)
  len(data_Babelfy)
  for i in range(0, len(data_Babelfy)):
    for j in range(0, len(data_Babelfy)):
      if i != j:
        if data_Babelfy.iat[i,2] in data_Babelfy.iat[j,2]:
          data_Babelfy.iat[i,5] = 1
  return data_Babelfy


In [62]:
# combine two format
def combine_format(data_format, data_Babelfy):
  for i in range(0, len(data_format)):
    for j in range(0, len(data_Babelfy)):
      #print(data_Babelfy.iat[j,2])
      if data_format.iat[i,0] in data_Babelfy.iat[j,2] and data_Babelfy.iat[j,5] != 1:
        data_format.iat[i,3] = data_Babelfy.iat[j,0]
        data_format.iat[i,4] = data_Babelfy.iat[j,1]
        data_format.iat[i,5] = data_Babelfy.iat[j,2]
        data_format.iat[i,6] = data_Babelfy.iat[j,3]
        data_format.iat[i,7] = data_Babelfy.iat[j,4]
  return data_format

In [66]:
# combine the previous algorithm steps
def process_text(pure_text):
  data_format = process_spacy(pure_text)
  data_Babelfy = process_babelfy(pure_text)
  data_format = combine_format(data_format, data_Babelfy)
  data_format = pd.concat([data_format, pd.DataFrame(columns=('TP','FP','FN'))])
  return data_format

In [67]:
data_format1 = process_text("Jelani Cobb, historian and co-editor of The Essential Kerner Commission Report, tells CNN that people and institutions already know what the problem is and that the only action that needs to be taken now is actually following the recommendations of the commission, and pay the price that comes with it.")
data_format2 = process_text("In a recently published interview with Variety, Field talked about her relationship with Reynolds, who she met on the set of their 1977 film  Smokey and the Bandit and dated off and on for five years.")
data_format3 = process_text("In the first pandemic-enforced lockdown in the United Kingdom, current West Ham manager David Moyes returned home to the Lancashire village in which his family lives, taking the opportunity to help out in the community, delivering fruit and vegetables for a local shop.")
data_format4 = process_text("Whatever the newest iPhone is called, investors are hoping that there will be enough new bells and whistles to live up to the hype -- and help dispel the notion that Apple has lost its innovative edge since Steve Jobs died and Tim Cook has led the company.")
data_format5 = process_text("Samsung is the latest tech company giving customers the ability to repair their own mobile devices amid pressure from consumers, regulators and even US President Joe Biden to ease restrictions on fixing products.")

In [69]:
data_format1.to_excel('example1.xlsx')
data_format2.to_excel('example2.xlsx')
data_format3.to_excel('example3.xlsx')
data_format4.to_excel('example4.xlsx')
data_format5.to_excel('example5.xlsx')

           token         lemma  pos onset offset                 entity  \
0             In            in   85  None   None                   None   
1       recently      recently   86     2      2               recently   
2      published       publish  100     3      3              published   
3      interview     interview   92     4      4              interview   
4           with          with   85  None   None                   None   
5        Variety       Variety   96     6      6                Variety   
6          Field         Field   96     8      8                  Field   
7         talked          talk  100     9      9                 talked   
8          about         about   85  None   None                   None   
9            her           her   95  None   None                   None   
10  relationship  relationship   92    12     12           relationship   
11          with          with   85  None   None                   None   
12      Reynolds      Rey