# Libraries and Dependencies

In [None]:
!pip install spacy
!pip install spacy-entity-Linker
!python -m spacy download en_core_web_sm
# !python -m spacy download en_core_web_lg
!python -m spacyEntityLinker "download_knowledge_base"

!git clone https://github.com/thunlp/OpenNRE.git

%cd OpenNRE/
!pip install -r requirements.txt
!python setup.py install 

# NOTE: Restart the runtime after running this cell.

# Imports

In [1]:
import spacy
import pandas as pd
import json
import numpy as np
from spacyEntityLinker.EntityLinker import EntityLinker
import os
import opennre as on

# Data Loading

In [2]:
# Read the three JSON-lines splits (one record per line) into DataFrames.
trainData, valData, testData = (
    pd.read_json(splitPath, lines=True, orient="records")
    for splitPath in ("./train.jsonl", "./val.jsonl", "./test.jsonl")
)

# Train split: passages, questions and integer-cast labels as NumPy arrays.
trainPassages = trainData["passage"].values
trainQuestions = trainData["question"].values
trainAnswers = trainData["label"].values.astype(int)

# Validation split: same three columns.
valPassages = valData["passage"].values
valQuestions = valData["question"].values
valAnswers = valData["label"].values.astype(int)

# Test split: only the passage and question columns are read here.
testPassages = testData["passage"].values
testQuestions = testData["question"].values


# Combined set: train, then validation, then test, joined end to end.
finalPassages = np.concatenate((trainPassages, valPassages, testPassages))
finalQuestions = np.concatenate((trainQuestions, valQuestions, testQuestions))


2021-05-09 19:57:28,452 - numexpr.utils - INFO - NumExpr defaulting to 2 threads.


In [3]:
# Create the entity-linking pipeline component (imported from spacyEntityLinker).
entityLinker = EntityLinker()

# Load the small English spaCy model; `nlp` is the pipeline used by augmentData below.
nlp = spacy.load('en_core_web_sm')

# Register the linker as the last pipeline stage under the name "entityLinker".
# Documents processed by `nlp` are later read through `doc._.linkedEntities`.
# NOTE(review): passing a component *object* together with `name=` looks like the
# spaCy v2 form of `add_pipe`; spaCy v3 expects a registered string name instead.
# Confirm which spaCy version this notebook is meant to run on.
nlp.add_pipe(entityLinker, last=True, name="entityLinker")


# Entity Linking & Relation Extraction

In [4]:
# Load the OpenNRE model registered under the name 'wiki80_bert_softmax'
# (the log line below shows it loads a BERT pre-trained checkpoint).
# NOTE(review): `model` is not referenced by any other cell visible in this notebook.
model = on.get_model('wiki80_bert_softmax')


2021-05-09 19:58:49,051 - root - INFO - Loading BERT pre-trained checkpoint.


In [5]:
# Load the OpenNRE model registered under the name 'wiki80_bertentity_softmax'.
# NOTE(review): `model1` is only referenced from the commented-out relation-extraction
# scratch cell at the end of the notebook (`model1.infer(...)`).
model1 = on.get_model('wiki80_bertentity_softmax')

2021-05-09 20:02:09,686 - root - INFO - Loading BERT pre-trained checkpoint.


In [55]:
def augmentData(questions, passages, labels, file=''):
  """Write entity-description-augmented examples to ``Augmented_<file>.jsonl``.

  Each question is run through the module-level ``nlp`` pipeline. For every
  linked entity of the question whose label occurs verbatim in the passage,
  the entity's description is appended to the passage (separated by a single
  space), unless that description is already contained in the passage.

  Args:
    questions: iterable of question strings.
    passages: iterable of passage strings, aligned with ``questions``.
    labels: iterable of values accepted by ``int()``, aligned with ``questions``.
    file: split name inserted into the output file name.

  Returns:
    None. One JSON object per line, with the keys ``question``, ``passage``
    and ``label``, is written to ``Augmented_<file>.jsonl`` in the current
    working directory (the file is overwritten if it exists).

  NOTE(review): examples whose question has no linked entities are NOT
  written at all. This mirrors the original implementation; confirm that
  dropping those examples is intended.
  """

  # `with` guarantees the file is flushed and closed, even if the loop raises.
  with open("Augmented_" + file + ".jsonl", 'w') as augData:

    for question, passage, label in zip(questions, passages, labels):

      # Linked entities found in the question only; the passage itself is
      # just searched as plain text, so it does not need to go through `nlp`.
      qEnts = nlp(question)._.linkedEntities

      # Preserved behaviour: skip examples without any linked entity.
      if not len(qEnts):
        continue

      for qe in qEnts:
        qeLabel = qe.get_label()
        if qeLabel is None or qeLabel not in passage:
          continue

        qeDes = qe.get_description()
        # Skip entities without a description (a None description would make
        # the substring check raise TypeError) and descriptions already present.
        if not qeDes or qeDes in passage:
          continue

        # Separate the appended description from the preceding text so that
        # words are not glued together.
        passage = passage + ' ' + qeDes

      qpDict = {
          'question': question,
          'passage': passage,
          'label': int(label),
      }
      augData.write(json.dumps(qpDict) + '\n')

In [56]:
# Produce Augmented_train.jsonl and Augmented_val.jsonl, in that order.
for splitName, splitQuestions, splitPassages, splitAnswers in (
    ('train', trainQuestions, trainPassages, trainAnswers),
    ('val', valQuestions, valPassages, valAnswers),
):
  augmentData(splitQuestions, splitPassages, splitAnswers, splitName)

In [None]:

    # print(question)
    # print(passage)

    # sents = passage.split('.')
    # # print(sents)

    # # check whether the question has any entities
    # if len(qEnts) > 0:

    #   # for every entity in question
    #   for qe in qEnts:
      
    #     qe = qe.get_label()
    #     qeLen = len(qe)

    #     # Perform relation extraction for entities present in both the question and the passage.
    #     # An entity that is present in both the question and the passage is used to find relations with all other entities in the passage.
    #     if qe in passage:

    #       # perform sentential relation extraction using every sentence of the passage
    #       # for sent in sents:


    #       for pe in pEnts:
    #         pe = pe.get_label()
    #         # if pe in sent and qe in sent and pe != qe:
              
    #         qeStart = passage.index(qe)
    #         qeEnd = qeStart + qeLen - 1

    #         peLen = len(pe)
    #         if pe in passage and pe != qe:
    #           peStart = passage.index(pe)
    #           peEnd = peStart + peLen - 1

    #           hPos = {'pos': (qeStart, qeEnd)}
    #           tPos = {'pos': (peStart, peEnd)}

              
    #           pred, conf = (model1.infer({'text': passage, 'h': hPos, 't': tPos}))

    #           if conf > 0.85:
    #             print("Qe: %s, Pe: %s" % (qe,pe))
    #             print(pred,conf)
    #   break
