In [1]:
# !pip install wikipedia
# !pip install -U spacy
# !python -m spacy download en_core_web_trf

In [1]:
import pandas as pd
import numpy as np
import json
import re
import pickle
from tqdm.auto import tqdm
import time

import wikipedia
import spacy
import unidecode
from tqdm.auto import tqdm
from multiprocessing import Process, Pool

from flair.data import Sentence
from flair.models import SequenceTagger

# Load the Flair NER tagger once at module level; get_enteties_flair() reuses it.
tagger = SequenceTagger.load('ner-fast') # model key: 'ner-fast'

# spaCy pipelines loaded once and shared by get_enteties_spacy_sm() (nlp1)
# and get_enteties_spacy_trf() (nlp2).
nlp1 = spacy.load("en_core_web_sm")
nlp2 = spacy.load("en_core_web_trf")

def getting_wiki_candidates_raw(query, n = 10):
    """Search Wikipedia for `query` and return the hit titles with underscores.

    Args:
        query: search string passed to `wikipedia.search`.
        n: value passed as the `results` argument of `wikipedia.search`.

    Returns:
        A list with one string per search hit, where every space in the
        title has been replaced by '_'.
    """
    candidates = []
    for title in wikipedia.search(query, results=n):
        candidates.append(title.replace(' ', '_'))
    return candidates

def get_enteties_spacy_sm(text):
    """Return the text of every entity the `nlp1` spaCy pipeline finds in `text`.

    Entities are returned in the order they appear in `doc.ents`.
    """
    return [span.text for span in nlp1(text).ents]

def get_enteties_spacy_trf(text):
    """Return the text of every entity the `nlp2` spaCy pipeline finds in `text`.

    Entities are returned in the order they appear in `doc.ents`.
    """
    return [span.text for span in nlp2(text).ents]

def get_enteties_flair(text):
    """Return the text of every 'ner' span the module-level Flair `tagger` finds in `text`."""
    flair_sentence = Sentence(text)
    # The prediction result is read back from `flair_sentence` below.
    tagger.predict(flair_sentence)
    return [span.text for span in flair_sentence.get_spans('ner')]

def query_extending_with_entities(text, ents, extend = True):
    """Optionally extend each entity with the parenthesised suffix that follows it in `text`.

    For an entity ``e``, if `text` contains ``"e (...)"`` (the entity, one
    space, then a parenthesised group), the entity is replaced by the first
    such occurrence, e.g. ``'Homeland'`` -> ``'Homeland (TV series)'``.

    Args:
        text: the string the entities were extracted from.
        ents: list of entity strings.
        extend: when False, `ents` is returned untouched.

    Returns:
        `ents` itself when `extend` is False; otherwise a new list of the
        same length and order in which every entity that has a parenthesised
        suffix in `text` is replaced by its extended form. The input list is
        not modified.
    """
    if not extend:
        return ents

    extended = []
    for entity in ents:
        # Build a fresh list rather than removing from / appending to `ents`
        # during iteration, which made the loop skip the following entity.
        # The entity is literal text, so escape it: names such as 'C++' or
        # 'AC/DC (' would otherwise be read as regex syntax.
        match = re.search(rf'{re.escape(entity)} \(.*?\)', text)
        extended.append(match.group(0) if match else entity)
    return extended

def getting_wiki_candidates_NER(i, query, 
                                ents_name: str = None, 
                                n: int = 10, 
                                extend: bool = False, 
                                separate: bool = True,
                                verbose = True):
    """Collect Wikipedia candidate titles for a query and for its named entities.

    Args:
        i: counter for the query; it is incremented by one and only used
            for the verbose printout.
        query: the text to search for.
        ents_name: which entity extractor to use: 'spacy_en'
            (get_enteties_spacy_sm), 'spacy_trf' (get_enteties_spacy_trf) or
            'flair' (get_enteties_flair). Any other value means no entities.
        n: number of results requested per Wikipedia search.
        extend: forwarded to query_extending_with_entities.
        separate: when True, run one search per entity; when False, run a
            single search on all entities joined by spaces.
        verbose: when True, print the counter, the query and the entities.

    Returns:
        A tuple ``(query, titles)`` where `titles` is the set of
        underscore-separated titles from the query search and the entity
        search(es).
    """
    i+=1

    # getting entities
    if ents_name == 'spacy_en':
        ents = get_enteties_spacy_sm(query)
    elif ents_name == 'spacy_trf':
        ents = get_enteties_spacy_trf(query)
    elif ents_name == 'flair':
        ents = get_enteties_flair(query)
    else:
        ents = []

    # extending entities if needed
    ents = query_extending_with_entities(query, ents, extend = extend)

    if verbose:
        print(i)
        print(query)
        print(ents)

    # getting_wiki_candidates_raw already returns underscore-separated titles,
    # so no further normalisation is needed here.
    titles = set(getting_wiki_candidates_raw(query, n=n))

    if separate:
        for e in ents:
            titles.update(getting_wiki_candidates_raw(e, n=n))
    elif ents:
        # Only issue the joined search when there is something to join:
        # with no entities the joined query would be the empty string.
        titles.update(getting_wiki_candidates_raw(' '.join(ents), n=n))

    return query, titles


# One dict per experiment setting; each row below is
# (ents_name, n, extend, separate) and is expanded into keyword arguments
# for getting_wiki_candidates_NER.
configs = [
    {'ents_name': ents_name, 'n': n, 'extend': extend, 'separate': separate}
    for ents_name, n, extend, separate in [
        (None, 10, False, True),
        (None, 30, False, True),
        (None, 50, False, True),
        ('spacy_en', 10, False, False),
        ('spacy_en', 30, False, False),
        ('spacy_en', 50, False, False),
        ('spacy_en', 10, False, True),
        ('spacy_en', 10, True, True),
        ('spacy_trf', 10, False, False),
        ('spacy_trf', 3, False, True),
        ('spacy_trf', 5, False, True),
        ('spacy_trf', 10, False, True),
        ('spacy_trf', 10, True, True),
        ('flair', 10, False, False),
        ('flair', 3, False, True),
        ('flair', 5, False, True),
        ('flair', 10, False, True),
        ('flair', 10, True, True),
    ]
]

# Keep only the first 10 rows of the claims file and pull out the claim texts.
all_fever = pd.read_csv('all_fever_upd.csv').head(10)
input_iterable = all_fever['claim'].values

# time_performance = []
# for conf in tqdm(configs):
#     start = time.time()
    
#     def getting_wiki_candidates_with_params(i, query):
#         return getting_wiki_candidates_NER(i, query, **conf)
    
#     pool = Pool(processes=20) # Initalize a pool of n processes
#     results = []
#     def get_result(result):
#         global results
#         results.append(result)
#     for i in range(0, len(input_iterable)):
#         pool.apply_async(getting_wiki_candidates_with_params, args = (i, input_iterable[i]), callback=get_result)
#     pool.close() # this means that no more tasks will be added to the pool
#     pool.join()  
#     end = time.time()
    
#     time_performance.append(end - start)
    
# print(time_performance)

2021-04-27 19:56:21,921 --------------------------------------------------------------------------------
2021-04-27 19:56:21,924 The model key 'ner-fast' now maps to 'https://huggingface.co/flair/ner-english-fast' on the HuggingFace ModelHub
2021-04-27 19:56:21,924  - The most current version of the model is automatically downloaded from there.
2021-04-27 19:56:21,925  - (you can alternatively manually download the original model at https://nlp.informatik.hu-berlin.de/resources/models/ner-fast/en-ner-fast-conll03-v0.4.pt)
2021-04-27 19:56:21,925 --------------------------------------------------------------------------------
2021-04-27 19:56:22,548 loading file /Users/ntr/.flair/models/ner-english-fast/4c58e7191ff952c030b82db25b3694b58800b0e722ff15427f527e1631ed6142.e13c7c4664ffe2bbfa8f1f5375bd0dced866b8c1dd7ff89a6d705518abf0a611


In [43]:
# Experiment settings, one dict per run; each row lists the values for
# ('ents_name', 'n', 'extend', 'separate') in that order.
configs = [
    dict(zip(('ents_name', 'n', 'extend', 'separate'), row))
    for row in [
        (None, 10, False, True),
        (None, 30, False, True),
        (None, 50, False, True),
        ('spacy_en', 10, False, False),
        ('spacy_en', 30, False, False),
        ('spacy_en', 50, False, False),
        ('spacy_en', 10, False, True),
        ('spacy_en', 10, True, True),
        ('spacy_trf', 10, False, False),
        ('spacy_trf', 3, False, True),
        ('spacy_trf', 5, False, True),
        ('spacy_trf', 10, False, True),
        ('spacy_trf', 10, True, True),
        ('flair', 10, False, False),
        ('flair', 3, False, True),
        ('flair', 5, False, True),
        ('flair', 10, False, True),
        ('flair', 10, True, True),
    ]
]

configs[0]

{'ents_name': None, 'n': 10, 'extend': False, 'separate': True}

In [2]:
# Smoke test on a single claim with configs[8]. With the default verbose=True
# the function prints i + 1, the query and the extracted entities before
# returning (query, set_of_titles).
getting_wiki_candidates_NER(1, 'Stranger Things is set in Bloomington, Indiana.', **configs[8])

2
Stranger Things is set in Bloomington, Indiana.
['Stranger Things', 'Bloomington', 'Indiana']


('Stranger Things is set in Bloomington, Indiana.',
 {'Alfred_Schütz',
  'Auschwitz_concentration_camp',
  'Breaking_Away',
  'Ceiling_Unlimited',
  'John_Mellencamp',
  'Lewis_Browne',
  'List_of_erotic_thriller_films',
  'Martin_Heidegger',
  'Orson_Welles_radio_credits',
  'Robby_Benson',
  "Schindler's_List",
  'Shawn_Pelton',
  'Sophist_(dialogue)',
  'Things_Fall_Apart'})

In [4]:
# Fetch the Wikipedia page for one of the candidate titles returned by the smoke test.
page = wikipedia.page('Alfred Schütz')

In [12]:
# Naive sentence split on '. '; section headings ('== ... ==') and newlines
# stay embedded in the resulting pieces.
page.content.split('. ')

['Alfred Schutz (; born Alfred Schütz, German: [ʃʏts]; 1899–1959) was an Austrian philosopher and social phenomenologist whose work bridged sociological and phenomenological traditions',
 "Schutz is gradually being recognized as one of the 20th century's leading philosophers of social science",
 "He related Edmund Husserl's work to the social sciences, using it to develop the philosophical foundations of Max Weber's sociology in his major work, Phenomenology of the Social World.\n\n\n== Biography ==\nSchutz was born on 13 April 1899 in Vienna, Austria, as an only child in an upper-middle-class Jewish family",
 'Following his graduation from high school, he was drafted into the Austrian Army where he quickly rose to the American equivalent rank of second lieutenant',
 'His army regiment was dispatched to fight in a series of heavy battles on the Italian front (WWI).\n\n\n=== Education and later life ===\nIn 1918, Schutz enrolled at the University of Vienna, where he earned his law degre

In [13]:
# Raw page text, for comparison with the split version.
page.content

'Alfred Schutz (; born Alfred Schütz, German: [ʃʏts]; 1899–1959) was an Austrian philosopher and social phenomenologist whose work bridged sociological and phenomenological traditions. Schutz is gradually being recognized as one of the 20th century\'s leading philosophers of social science. He related Edmund Husserl\'s work to the social sciences, using it to develop the philosophical foundations of Max Weber\'s sociology in his major work, Phenomenology of the Social World.\n\n\n== Biography ==\nSchutz was born on 13 April 1899 in Vienna, Austria, as an only child in an upper-middle-class Jewish family. Following his graduation from high school, he was drafted into the Austrian Army where he quickly rose to the American equivalent rank of second lieutenant. His army regiment was dispatched to fight in a series of heavy battles on the Italian front (WWI).\n\n\n=== Education and later life ===\nIn 1918, Schutz enrolled at the University of Vienna, where he earned his law degree. He also

In [17]:
# Inspect the page's `sections` attribute (it came back as an empty list in the saved run).
page.sections

[]

## Experiment: wall-clock time of candidate retrieval for each configuration

In [52]:
# Time how long candidate retrieval takes for every configuration in `configs`,
# running the claims of the 10-row sample through a pool of 20 worker processes.
all_fever = pd.read_csv('all_fever_upd.csv')
all_fever = all_fever.head(10)
input_iterable = all_fever.claim.values

# NOTE(review): the saved run of this cell ended in KeyboardInterrupt. If the
# kernel uses the 'spawn' start method, functions defined inside the notebook
# may not be importable in the worker processes -- confirm, and consider moving
# the helpers to a .py module if so.
time_performance = []
for conf in tqdm(configs):
    start = time.time()

    results = []  # (query, titles) tuples, appended as workers finish
    errors = []   # exceptions raised inside workers; previously dropped silently

    # The context manager terminates the workers if the loop is interrupted
    # or fails, so no stray processes are left behind.
    with Pool(processes=20) as pool:
        for i, claim in enumerate(input_iterable):
            # `kwds=conf` passes the configuration directly, so no wrapper
            # function has to be redefined on every iteration.
            pool.apply_async(getting_wiki_candidates_NER,
                             args=(i, claim),
                             kwds=conf,
                             callback=results.append,
                             error_callback=errors.append)
        pool.close() # this means that no more tasks will be added to the pool
        pool.join()
        end = time.time()

    if errors:
        # Surface failures: a timing over failed tasks would be misleading.
        print(f'{len(errors)} of {len(input_iterable)} tasks failed for {conf}: {errors[0]!r}')

    time_performance.append(end - start)

print(time_performance)

HBox(children=(FloatProgress(value=0.0, max=18.0), HTML(value='')))




KeyboardInterrupt: 

In [19]:
# Display the 10-row sample of claims used as experiment input.
all_fever

Unnamed: 0,id,verifiable,label,claim,evidence,evidence_sources,NER,valid_link
0,75397,VERIFIABLE,SUPPORTS,Nikolaj Coster-Waldau worked with the Fox Broa...,"[[[92206, 104971, 'Nikolaj_Coster-Waldau', 7],...","{'Fox_Broadcasting_Company', 'Nikolaj_Coster-W...","['Nikolaj Coster-Waldau', 'Fox Broadcasting Co...",True
1,150448,VERIFIABLE,SUPPORTS,Roman Atwood is a content creator.,"[[[174271, 187498, 'Roman_Atwood', 1]], [[1742...",{'Roman_Atwood'},['Roman Atwood'],True
2,214861,VERIFIABLE,SUPPORTS,"History of art includes architecture, dance, s...","[[[255136, 254645, 'History_of_art', 2]]]",{'History_of_art'},[],True
3,156709,VERIFIABLE,REFUTES,Adrienne Bailon is an accountant.,"[[[180804, 193183, 'Adrienne_Bailon', 0]]]",{'Adrienne_Bailon'},['Adrienne Bailon'],True
4,129629,VERIFIABLE,SUPPORTS,Homeland is an American television spy thrille...,"[[[151831, 166598, 'Homeland_-LRB-TV_series-RR...","{'Prisoners_of_War_(TV_series)', 'Homeland_(TV...","['American', 'Israeli', 'Prisoners of War']",True
5,33078,VERIFIABLE,SUPPORTS,The Boston Celtics play their home games at TD...,"[[[49158, 58489, 'Boston_Celtics', 3]], [[4915...",{'Boston_Celtics'},"['Boston Celtics', 'TD Garden']",True
6,6744,VERIFIABLE,SUPPORTS,The Ten Commandments is an epic film.,"[[[23513, 28977, 'The_Ten_Commandments_-LRB-19...",{'The_Ten_Commandments_(1956_film)'},['Ten Commandments'],True
7,226034,VERIFIABLE,SUPPORTS,Tetris has sold millions of physical copies.,"[[[269479, 265800, 'Tetris', 18]]]",{'Tetris'},['Tetris'],True
8,40190,VERIFIABLE,SUPPORTS,Cyndi Lauper won the Best New Artist award at ...,"[[[56492, 66697, 'Cyndi_Lauper', 2]]]",{'Cyndi_Lauper'},"['Cyndi Lauper', 'Best New Artist award', '27t...",True
9,76253,VERIFIABLE,SUPPORTS,There is a movie called The Hunger Games.,"[[[93100, 106004, 'The_Hunger_Games_-LRB-film-...",{'The_Hunger_Games_(film)'},['The Hunger Games'],True
