In [None]:
import pandas as pd
import numpy as np
import json
import re
from tqdm.auto import tqdm
from unidecode import unidecode


import sys

# Load the FEVER training split and the shared-task dev split (the latter is
# used as the test set throughout this notebook). Both are JSON Lines files.
train = pd.read_json(path_or_buf='../data/fever/train.jsonl', lines=True)
test = pd.read_json(path_or_buf='../data/fever/shared_task_dev.jsonl', lines=True)

In [None]:
# NOTE(review): hard-coded absolute path to a local checkout — this cell only
# runs on a machine with that exact directory; consider making it configurable.
sys.path.insert(0, "/Users/ntr/Documents/tresh/fairapi")
from modules.candidatets_picker import WikiCandidatesSelector

# Minimum sentence length (in characters) accepted by the sampling helpers below.
MIN_SEN_LEN = 10

# Candidate selector instance; used by the first `fill_NEI_text` definition.
sel = WikiCandidatesSelector()

In [None]:
# Peek at one dev-set row to check the schema.
test.head(1)

In [None]:
# Peek at the first training rows.
train.head(5)

In [None]:
def process_breckets(text):
    """Replace FEVER escape tokens in ``text`` with the literal characters.

    ``-RRB-`` becomes ``)``, ``-LRB-`` becomes ``(``, ``-COLON-`` becomes ``:``
    and tab characters become single spaces.

    Values that cannot be processed as strings (e.g. ``None`` or numbers) are
    returned unchanged, so the function can be applied to columns with
    missing values.
    """
    try:
        return text.replace('-RRB-', ')').replace('-LRB-', '(').replace('-COLON-', ':').replace('\t', ' ')
    except (AttributeError, TypeError):
        # Not a string-like value: keep it as is instead of failing. Only the
        # errors a non-string input produces are caught, so unrelated problems
        # (and KeyboardInterrupt) are no longer swallowed.
        return text

def get_all_evidence(x):
    """Return the set of normalised page ids referenced by an evidence list.

    ``x`` is a list of evidence groups; each group is a list of items whose
    element at index 2 is passed through ``process_link``. Items for which
    ``process_link`` returns ``None`` contribute ``None`` to the set.
    """
    return {process_link(item[2]) for group in x for item in group}

def process_link(x):
    """Normalise a page id so that it can be used as a lookup key.

    ``None`` is passed through. Otherwise the escape tokens are replaced via
    ``process_breckets`` and the result is transliterated with ``unidecode``.
    When transliteration changed the text, a trailing ``'_'`` is appended to
    the transliterated form.
    """
    if x is None:
        return None
    cleaned = process_breckets(x)
    ascii_form = unidecode(cleaned)
    # The '_' suffix marks ids that were altered by transliteration.
    return ascii_form if cleaned == ascii_form else ascii_form + '_'

# Add, per claim, the set of normalised page ids its evidence points to.
train['evidence_sources'] = train['evidence'].apply(get_all_evidence)
test['evidence_sources'] = test['evidence'].apply(get_all_evidence)

In [None]:
# Display the dev set with the new `evidence_sources` column.
test

In [None]:
# Union of every page id referenced as evidence in either split.
all_related_articles = set()
for split in (train, test):
    for ev in split.evidence_sources.values:
        all_related_articles.update(ev)

### removing diacritic symbols
# Evidence items without a page contribute None; drop it before transliterating.
all_related_articles = all_related_articles - {None}
all_related = {unidecode(s) for s in all_related_articles}

In [None]:
# Number of distinct evidence page ids across both splits.
len(all_related_articles)

### Reading wikipedia dump:

In [None]:
# Read the wiki-001 … wiki-109 shards and stack them into one frame.
dfs = []
for i in tqdm(range(1, 110)):
    # Shard numbers are zero-padded to three digits in the file names.
    s = str(i).zfill(3)
    dfs.append(pd.read_json(path_or_buf=f'../data/wiki-pages/wiki-pages/wiki-{s}.jsonl', lines=True))

df_wiki = pd.concat(dfs, axis = 0)
# Release the per-shard frames; only the concatenated frame is used below.
del dfs

In [None]:
### filtering
# Normalise the page ids the same way evidence page ids are normalised, so
# both can be used as keys into the same lookup.
df_wiki.id = df_wiki.id.apply(lambda x: process_link(x))
# df_wiki = df_wiki[df_wiki.id.isin(all_related)]

### Converting FEVER dataset into SNLI style data

#### Wikipedia dump to dict (consider using Redis for that)

In [None]:
# Lookup: page id -> {line number -> rest of the line}.
converted_dataset = {}

# A line is "<digits><rest>"; lines that do not start with a digit are skipped.
LINE_PATTERN = re.compile(r'(\d+)+(.*)')

# iterrows() has no length, so pass the total explicitly to get a real progress bar.
for i, row in tqdm(df_wiki.iterrows(), total=len(df_wiki)):
    page_text = {}
    for line in row.lines.split('\n'):
        m = LINE_PATTERN.match(line)
        if m is None:
            # Explicit skip instead of a bare `except: pass`, which would also
            # hide unrelated errors and KeyboardInterrupt.
            continue
        page_text[int(m.group(1))] = m.group(2)
    converted_dataset[row.id] = page_text

#### Converting relations to an SNLI-like dataset

In [None]:
def fever_oversample(articles_used, sentences_used):
    """Sample extra sentences from the articles a claim's evidence came from.

    Parameters
    ----------
    articles_used : list
        Keys into ``converted_dataset`` (normalised page ids).
    sentences_used : list
        Line numbers already used as evidence; sentences with these numbers
        are never sampled.

    Returns
    -------
    list
        Up to ``len(sentences_used)`` distinct sentence texts of at least
        ``MIN_SEN_LEN`` characters. Fewer (possibly none) are returned when
        the attempt budget runs out first.
    """
    returned_sentences = []
    iterator_lock = 0
    while len(returned_sentences) < len(sentences_used):
        article = np.random.choice(articles_used)
        sentences_set = converted_dataset.get(article, None)
        if sentences_set:
            picked_sentence = np.random.choice(list(sentences_set.keys()))
            sentence_text = sentences_set[picked_sentence]

            # The duplicate and blank checks must look at the sentence TEXT:
            # `returned_sentences` holds texts, and the key is a line number,
            # so comparing the key against them could never match.
            rejected = (
                picked_sentence in sentences_used
                or sentence_text in returned_sentences
                or len(sentence_text) < MIN_SEN_LEN
                or not sentence_text.strip()
            )
            if not rejected:
                returned_sentences.append(sentence_text)

        # Hard cap on attempts so that unlucky draws cannot loop forever.
        iterator_lock += 1
        if iterator_lock > 25:
            break

    return returned_sentences


def fill_NEI_text(claim):
    """Pick a sentence for ``claim`` from one of its candidate articles.

    Candidates come from ``sel.getting_wiki_candidates_NER(claim)``. For each
    candidate found in ``converted_dataset``, up to 26 random lines are tried
    and the first one with at least ``MIN_SEN_LEN`` characters is returned.
    If none qualifies, a random ". "-separated piece of the Wikipedia summary
    of a random candidate is returned. With no candidates at all, ``' '`` is
    returned.

    NOTE(review): a later cell redefines ``fill_NEI_text`` with a
    ``(claim, candidates)`` signature, which shadows this definition.
    """
    # Local import: at this point in the notebook `wikipedia` has not been
    # imported yet (the notebook-level import lives in a later cell).
    import wikipedia

    res = list(sel.getting_wiki_candidates_NER(claim))

    ### try to get from FEVER
    for candidate in res:
        article = converted_dataset.get(process_link(candidate))
        if article:
            keys = list(article.keys())
            # Bounded number of draws per article (same budget as before: 26).
            for _ in range(26):
                i = np.random.choice(keys)
                if len(article[i]) >= MIN_SEN_LEN:
                    return article[i]

    # np.random.choice raises on an empty sequence; return the blank
    # placeholder instead.
    if len(res) < 1:
        return ' '

    candidate = np.random.choice(res)
    return np.random.choice(wikipedia.page(candidate).summary.split(". "))
                    
            

    
def convert_to_snli_style(df, NEI_filling = False, FEVER_sampling = False):
    """Flatten FEVER rows into a (claim, hypothesis, label) frame.

    For every row of ``df``:

    * each distinct (page, line number) evidence pair with a non-``None``
      page yields one output row whose hypothesis is looked up in
      ``converted_dataset`` (``None`` when the lookup fails);
    * if ``NEI_filling`` is set and the label is ``'NOT ENOUGH INFO'``, two
      rows with a ``None`` hypothesis are appended as placeholders;
    * if ``FEVER_sampling`` is set and the row had evidence, sentences from
      ``fever_oversample`` are appended with label ``'NOT ENOUGH INFO'``.

    Returns a DataFrame with columns ``claim``, ``hypothesis``, ``label``.
    """
    labels, claims, hypothesis = [], [], []

    def add_row(label, claim, text):
        # Keep the three parallel output lists in step.
        labels.append(label)
        claims.append(claim)
        hypothesis.append(text)

    for idx, row in tqdm(df.iterrows()):
        # Distinct (page, line number) pairs; items without a page are skipped.
        evidences = {
            (item[2], item[3])
            for group in row.evidence
            for item in group
            if item[2] is not None
        }

        # ------ Rows for the evidence already present in the dataset ------ #
        articles_used = []
        ids_used = []
        for page, sentence_id in evidences:
            article_key = process_link(page)
            add_row(row.label, row.claim,
                    converted_dataset.get(article_key, {}).get(sentence_id, None))
            articles_used.append(article_key)
            ids_used.append(sentence_id)

        # ------ Placeholder rows for NEI claims, to be filled later ------ #
        if NEI_filling and row.label == 'NOT ENOUGH INFO':
            for _ in range(2):
                add_row(row.label, row.claim, None)

        # ------ Extra NEI rows sampled from the evidence articles ------ #
        if FEVER_sampling and len(articles_used) > 0:
            for sampled in fever_oversample(articles_used, ids_used):
                add_row('NOT ENOUGH INFO', row.claim, sampled)

    return pd.DataFrame({'claim':claims, 'hypothesis':hypothesis, 'label':labels})

# train_processed = convert_to_snli_style(train, FEVER_sampling = True, NEI_filling = False)
# test_processed = convert_to_snli_style(test, NEI_filling = True)

In [None]:
# train_processed.to_csv("../data/fever/train_snli_style_sampling_2.csv", index = False)
# test_processed.to_csv("../data/fever/test_snli_style_sampling.csv", index = False)

## Building the true FEVER test set

Pipeline: select candidate articles with model one (this step can be done in parallel) → get the corresponding texts from the FEVER Wikipedia dump → if no texts are found, fetch them directly from Wikipedia.


In [None]:
# Convert the dev set; NEI claims get two placeholder rows with a None hypothesis.
test_processed = convert_to_snli_style(test, NEI_filling = True)

In [None]:
import pickle
# NOTE(review): hard-coded absolute path; also, pickle.load executes arbitrary
# code from the file — only load pickles you produced yourself.
with open('/Users/ntr/Documents/tresh/parsed_candidates.pickle', 'rb') as handle:
    b = pickle.load(handle)

# `results` holds three parallel sequences: index, query and candidates.
results = b['results']
index, query, candidates = results[0], results[1], results[2]

# One row per distinct query (drop_duplicates keeps the first occurrence by default).
candidates_df = pd.DataFrame({'query':query, 'candidates':candidates})
candidates_df = candidates_df.drop_duplicates('query')

# merge() defaults to an inner join: claims without a matching query are dropped.
test_processed = test_processed.merge(candidates_df, left_on='claim', right_on='query')

In [None]:
# Attach the candidates to the raw dev set as well.
# NOTE(review): this overwrites `test`; re-running the cell merges again and
# produces suffixed duplicate columns.
test = test.merge(candidates_df, left_on='claim', right_on='query')

In [None]:
# ------------ filling_passed_queries ----------------- #
from flair.data import Sentence
from flair.models import SequenceTagger
import wikipedia

# Load the flair sequence tagger used by `get_enteties_flair` below.
tagger = SequenceTagger.load('ner-fast') #'ner-fast'

def getting_wiki_candidates_raw(query, n = 10):
    """Search Wikipedia for ``query`` and return the result titles.

    At most ``n`` results are requested; spaces in each title are replaced
    with underscores.
    """
    titles = wikipedia.search(query, results=n)
    return list(map(lambda title: title.replace(' ', '_'), titles))

def get_enteties_flair(text):
    """Return the texts of the ``'ner'`` spans the tagger finds in ``text``."""
    sentence = Sentence(text)
    # predict() annotates the sentence in place.
    tagger.predict(sentence)
    return [span.text for span in sentence.get_spans('ner')]

def getting_wiki_candidates_NER(i, query, 
                                n = 10, 
                                separate = True,
                                verbose = True):
    """Collect Wikipedia title candidates for ``query`` and its entities.

    The full query is searched first. The entities extracted from it are then
    searched either one by one (``separate=True``) or joined by spaces into a
    single search (``separate=False``). ``n`` is the result limit of each
    individual search. ``verbose`` is accepted but not used.

    Returns the tuple ``(i, query, candidates)`` where ``candidates`` is a
    set of titles with spaces replaced by underscores; ``i`` is passed
    through unchanged.
    """
    ents = get_enteties_flair(query)

    search_results = getting_wiki_candidates_raw(query, n=n)

    if separate:
        search_results_en = []
        for entity in ents:
            search_results_en.extend(getting_wiki_candidates_raw(entity, n=n))
    else:
        search_results_en = getting_wiki_candidates_raw(' '.join(ents), n=n)

    all_titles = search_results + search_results_en
    return i, query, {title.replace(' ', '_') for title in all_titles}

# Search settings shared by every call below.
conf = {'n':3, 'separate': True}
def getting_wiki_candidates_with_params(query):
    """Return only the candidate set for ``query``, searched with ``conf``."""
    _, _, found = getting_wiki_candidates_NER(0, query, **conf)
    return found


In [None]:
def fill_candidates(query):
    """Build fallback candidates for a query that has none.

    Two searches are combined: one over the words of ``query`` that start
    with an uppercase letter, and one over the first three space-separated
    words of ``query``.

    Returns the union of both candidate sets.
    """
    words = query.split(' ')

    ### - query with capital letter
    # `word[:1]` is '' for empty tokens (produced by consecutive spaces), and
    # ''.isupper() is False, so no try/except is needed to skip them.
    new_query = ''.join(word + ' ' for word in words if word[:1].isupper())
    res = getting_wiki_candidates_with_params(new_query)

    ### - query with only start of sentence
    res.update(getting_wiki_candidates_with_params(' '.join(words[:3])))

    return res

# Rows whose candidate collection is empty get fallback candidates; the same
# pass is applied to the converted frame and to the raw dev frame.
for frame in (test_processed, test):
    for index, row in tqdm(frame.iterrows()):
        if len(row['candidates']) == 0:
            candidates = fill_candidates(row['claim'])
            frame.loc[index, ['candidates']] = [candidates]

In [None]:
def fill_NEI_text(claim, candidates):
    """Pick a sentence from one of the candidate articles.

    Parameters
    ----------
    claim : str
        Accepted for interface symmetry; not used by the lookup itself.
    candidates : collection
        Candidate page titles.

    Returns
    -------
    str
        ``' '`` when ``candidates`` is empty. Otherwise the first randomly
        drawn line of at least ``MIN_SEN_LEN`` characters from a candidate
        found in ``converted_dataset`` (up to 26 draws per article). If none
        qualifies, a random ". "-separated piece of the Wikipedia summary of
        one randomly chosen candidate, or ``' '`` if fetching it fails 26
        times.
    """
    if len(candidates) < 1:
        return ' '

    ### try to get from FEVER
    for candidate in candidates:
        article = converted_dataset.get(process_link(candidate))
        if article:
            keys = list(article.keys())
            for _ in range(26):
                i = np.random.choice(keys)
                if len(article[i]) >= MIN_SEN_LEN:
                    return article[i]

    # Fall back to the live Wikipedia summary of one random candidate.
    candidate = np.random.choice(list(candidates))
    for _ in range(26):
        try:
            return np.random.choice(wikipedia.page(candidate).summary.split(". "))
        except Exception:
            # Best-effort lookup: retry on any library/network error, but let
            # KeyboardInterrupt and SystemExit through (a bare `except:` did not).
            continue
    return ' '
        

def fill_NEI_dataset(dataset):
    """Fill every ``None`` hypothesis of ``dataset`` via ``fill_NEI_text``.

    The ``hypothesis`` column is replaced in place; the same frame is
    returned. Rows whose hypothesis is not ``None`` keep their value.
    """
    rows = zip(dataset.claim.values, dataset.hypothesis.values, dataset.candidates.values)
    dataset['hypothesis'] = [
        fill_NEI_text(claim, candidates) if hypothesis is None else hypothesis
        for claim, hypothesis, candidates in tqdm(rows)
    ]
    return dataset

In [None]:
# Replace the None hypothesis placeholders with sampled sentences.
test_processed = fill_NEI_dataset(test_processed)

In [None]:
# Check the filled frame.
test_processed.head()

In [None]:
# Label distribution after conversion.
test_processed.label.value_counts()

In [None]:
# Persist the converted test set (written to the current working directory).
test_processed.to_csv('test_oversampled.csv', 
                      index = False)

### Texts collection and saving:

In [None]:
# ------------ collecting texts for FEVER validation ----------------- #
def get_texts_for_validation(candidates):
    """Map every candidate title to its article lines.

    Returns ``{candidate: {line number: text}}``; candidates that are missing
    from ``converted_dataset`` (or have no lines) map to an empty dict. An
    empty ``candidates`` collection yields an empty dict.

    NOTE(review): a later cell redefines this function.
    """
    ### try to get from FEVER
    if len(candidates) < 1:
        return dict()

    text_candidates = dict()
    for candidate in candidates:
        article = converted_dataset.get(process_link(candidate))
        # No live-Wikipedia fallback here: unknown articles stay empty.
        text_candidates[candidate] = article if article else {}
    return text_candidates

In [None]:
# claim -> {candidate title -> article lines}; a repeated claim keeps the last entry.
fever_full_test = {claim:get_texts_for_validation(candidates) for claim, candidates in tqdm(zip(test.claim.values, test.candidates.values))}

In [None]:
import pickle

# Save the claim -> candidate texts mapping for the end-to-end evaluation.
with open('fever_end_test.pickle', 'wb') as handle:
    pickle.dump(fever_full_test, handle, protocol=pickle.HIGHEST_PROTOCOL)

### Finding the rate of articles that were not found

In [None]:
not_found_at_all = 0       # claims for which no candidate has any text
not_found_candidates = 0   # candidates without text, summed over all claims
total_candidates = 0       # candidates summed over all claims

for key, texts_by_candidate in fever_full_test.items():
    total_items = len(texts_by_candidate)
    found_items = sum(1 for lines in texts_by_candidate.values() if len(lines) != 0)

    total_candidates += total_items
    not_found_candidates += total_items - found_items
    if found_items == 0:
        not_found_at_all += 1
    
    

In [None]:
# Claims without any text, candidates without text, total candidates.
print(not_found_at_all)
print(not_found_candidates)
print(total_candidates)

In [None]:
# NOTE(review): exact duplicate of the previous cell — candidate for removal.
print(not_found_at_all)
print(not_found_candidates)
print(total_candidates)

In [None]:
# NOTE(review): hard-coded numbers, presumably copied from the counts printed
# above (not-found candidates / total candidates) — TODO confirm, or compute
# from the variables instead.
14373/124843

## Getting candidates for aggregation training:

In [None]:
import pickle

# NOTE(review): pickle.load executes arbitrary code from the file — only load
# pickles you produced yourself. `b` is also reused from an earlier cell.
with open('nei_res.pickle', 'rb') as handle:
    b = pickle.load(handle)

# Iterated below as pairs.
d = b['results']

In [None]:
# Lookup from the first element of each pair to the second; used below with
# claims as keys.
di = {c:h for c,h in d}

In [None]:
# Attach the looked-up value per claim; claims missing from `di` get an empty list.
train['picked_article'] = train.claim.apply(lambda x: di.get(x, []))
train.head()

In [None]:
# Draw 5000 claim ids per label without replacement. NEI claims are only
# eligible when they have at least one picked article.
# NOTE(review): no random seed is set, so the sample differs on every run.
ids_to_use = []

ids_to_use.append(np.random.choice(train[(train.label == 'NOT ENOUGH INFO') & (train.picked_article.apply(len) > 0)].id.values, 5000, replace = False))
ids_to_use.append(np.random.choice(train[train.label == 'SUPPORTS'].id.values, 5000, replace = False))
ids_to_use.append(np.random.choice(train[train.label == 'REFUTES'].id.values, 5000, replace = False))

In [None]:
# Stack the three sampled per-label subsets into one training frame.
ranking_train = pd.concat([train[train.id.isin(ids_to_use[i])] for i in range(3)], axis = 0)



In [None]:
# Inspect the sampled frame.
ranking_train

In [None]:
# ------------ collecting texts for FEVER validation ----------------- #
def get_texts_for_validation(candidates):
    """Map every candidate title to its article lines.

    Returns ``{candidate: {line number: text}}``; candidates that are missing
    from ``converted_dataset`` (or have no lines) map to an empty dict.

    An empty dict is returned when ``candidates`` is ``None``, is empty, or
    when its first element (in iteration order) is ``None``.
    """
    ### try to get from FEVER
    # The emptiness check must come before the first element is inspected;
    # otherwise an empty collection raises IndexError.
    if candidates is None or len(candidates) < 1:
        return dict()
    if next(iter(candidates)) is None:
        return dict()

    text_candidates = dict()
    for candidate in candidates:
        article = converted_dataset.get(process_link(candidate))
        text_candidates[candidate] = article if article else {}
    return text_candidates

In [None]:
# (claim, label) -> article texts of the pages in the claim's evidence sources.
ranking_train_agg = {(claim, lable):get_texts_for_validation(candidates) for lable, claim, candidates in tqdm(zip(ranking_train.label.values, ranking_train.claim.values, ranking_train.evidence_sources.values))}

# Where a claim has picked articles, their texts replace the evidence-based entry.
for lable, claim, picked_article in tqdm(zip(ranking_train.label.values, ranking_train.claim.values, ranking_train.picked_article.values)):
    if len(picked_article)>0:
        ranking_train_agg.update({(claim, lable):get_texts_for_validation(picked_article)})

import pickle

# Persist the mapping (written to the current working directory).
with open('train_set_labling.pickle', 'wb') as handle:
    pickle.dump(ranking_train_agg, handle, protocol=pickle.HIGHEST_PROTOCOL)
    


In [None]:
# (claim, {(page, line number), ...}, label) for every sampled claim that has
# at least one evidence item with a page.
dataset_ev = []

claim_rows = zip(ranking_train.claim.values, ranking_train.evidence.values, ranking_train.label.values)
for claim, e, l in claim_rows:
    evidences = {
        (item[2], item[3])
        for group in e
        for item in group
        if item[2] is not None
    }

    if evidences:
        dataset_ev.append((claim, evidences, l))

In [None]:
# Persist the (claim, evidence pairs, label) triples.
with open('train_set_labling_ranking.pickle', 'wb') as handle:
    pickle.dump(dataset_ev, handle, protocol=pickle.HIGHEST_PROTOCOL)