# Libraries and Dependencies

In [1]:
# Environment setup: each line is a shell escape (`!`) executed from the notebook.
# NOTE(review): none of the installs pin a version, so a fresh run may resolve to
# different releases than the ones this notebook was written against.

# Install spaCy and the spacy-entity-Linker package.
!pip install spacy
!pip install spacy-entity-Linker
# Download the two spaCy models loaded later via spacy.load():
# en_core_web_sm and en_core_web_lg.
!python -m spacy download en_core_web_sm
!python -m spacy download en_core_web_lg
# Run the linker package's "download_knowledge_base" command
# (presumably fetches the data EntityLinker reads at runtime -- TODO confirm).
!python -m spacyEntityLinker "download_knowledge_base"

# NOTE: Restart the runtime after running this cell.

Collecting spacy-entity-Linker
  Downloading https://files.pythonhosted.org/packages/a9/28/6c8279e9c4d89128e4bf3fd93b2180d4c74fce2a96e244b258401937c027/spacy_entity_linker-0.0.5-py3-none-any.whl
Installing collected packages: spacy-entity-Linker
Successfully installed spacy-entity-Linker-0.0.5
✔ Download and installation successful
You can now load the model via spacy.load('en_core_web_sm')
Collecting en_core_web_lg==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.2.5/en_core_web_lg-2.2.5.tar.gz (827.9MB)
     |████████████████████████████████| 827.9MB 1.4MB/s 
Building wheels for collected packages: en-core-web-lg
  Building wheel for en-core-web-lg (setup.py) ... done
  Created wheel for en-core-web-lg: filename=en_core_web_lg-2.2.5-cp37-none-any.whl size=829180944 sha256=829e72cab17dc6ea3eb27c674ee87dfe0f0bf4ff90e0be79b4c0b678b36d969e
  Stored in directory: /tmp/pip-ephem-wheel-cache-bf4mvrff/wheels/2

# Imports

In [2]:
import spacy
import pandas as pd
import json
import numpy as np
from spacyEntityLinker.EntityLinker import EntityLinker

# Data Loading

In [4]:

# Read the three dataset splits; each file is parsed as JSON Lines
# (one record per line) into a DataFrame.
trainData = pd.read_json("./train.jsonl", orient="records", lines=True)
valData = pd.read_json("./val.jsonl", orient="records", lines=True)
testData = pd.read_json("./test.jsonl", orient="records", lines=True)

# Training split: passages, questions, and labels cast to int.
trainPassages = trainData["passage"].values
trainQuestions = trainData["question"].values
trainAnswers = trainData["label"].values.astype(int)

# Validation split: same three columns as the training split.
valPassages = valData["passage"].values
valQuestions = valData["question"].values
valAnswers = valData["label"].values.astype(int)

# Test split: only passages and questions are read here (no labels).
testPassages = testData["passage"].values
testQuestions = testData["question"].values

# Combined set used by the entity cells: train, then validation, then test.
# Passages and questions are concatenated in the same order so they stay aligned.
finalPassages = np.concatenate((trainPassages, valPassages, testPassages))
finalQuestions = np.concatenate((trainQuestions, valQuestions, testQuestions))


In [5]:
# Load the spaCy pipeline that will host the linker component.
nlp = spacy.load('en_core_web_sm')

# Create the entity-linking component and register it as the last
# pipeline stage under the name "entityLinker".
entityLinker = EntityLinker()
nlp.add_pipe(entityLinker, name="entityLinker", last=True)


# Entity Linking

In [6]:
# Link the entities of every (question, passage) pair and store them as JSON Lines.
# Each output line is an object with two keys, "question" and "passage"; each value
# is a list of [label, Wikidata URL] pairs, one per linked entity.
#
# The output file is opened in a `with` block so that it is flushed and closed
# when the loop finishes or raises; previously the handle was never closed.
with open("allEntsInfo.jsonl", 'w') as eDictWriter:
  for question, passage in zip(finalQuestions, finalPassages):

    # Run the pipeline (which ends with the entity linker) on both texts
    q = nlp(question)
    p = nlp(passage)

    # Linked entities found in the question and in the passage
    qEnts = q._.linkedEntities
    pEnts = p._.linkedEntities

    # One [label, URL] pair per linked entity in the question
    eqList = [[qe.get_label(),
               "https://www.wikidata.org/wiki/Q{}".format(qe.get_id())]
              for qe in qEnts]

    # One [label, URL] pair per linked entity in the passage
    epList = [[pe.get_label(),
               "https://www.wikidata.org/wiki/Q{}".format(pe.get_id())]
              for pe in pEnts]

    # Key order ("question", then "passage") matches the original output format
    eDict = {"question": eqList, "passage": epList}

    # Write this pair's linked-entity information as one line of the file
    eDictWriter.write(json.dumps(eDict) + '\n')


# NER Tagger

In [None]:
# This was used to generate a knowledge base of all the entities present in our BOOLQ dataset
#
# For every (passage, question) pair, the named entities found by the
# en_core_web_lg pipeline are written to "entitiesAll.jsonl", one JSON object
# per line with the keys "passage" and "question". Each entity is stored as
# (text, label, start_char, end_char); json.dumps serialises the tuples as arrays.

ner = spacy.load("en_core_web_lg")
# Not used in this cell; kept in case a later cell refers to it.
entities = []

# The `with` block guarantees the file is flushed and closed when the loop
# finishes or raises; previously the handle was never closed.
with open("entitiesAll.jsonl", "w") as eDictWriter:
    for passage, question in zip(finalPassages, finalQuestions):
        neP = ner(passage)
        neQ = ner(question)

        # Entities recognised in the passage
        epList = [(eP.text, eP.label_, eP.start_char, eP.end_char)
                  for eP in neP.ents]

        # Entities recognised in the question
        eqList = [(eQ.text, eQ.label_, eQ.start_char, eQ.end_char)
                  for eQ in neQ.ents]

        # Key order ("passage", then "question") matches the original output format
        eDict = {"passage": epList, "question": eqList}

        eDictWriter.write(json.dumps(eDict) + '\n')