In [None]:
# Switch the working directory to ../src (presumably so the project-local
# `get_evidences` module imported below can be found — confirm).
%cd ../src
# Enable autoreload so edited modules are re-imported before each cell runs.
%load_ext autoreload
%autoreload 2

In [None]:
from sentence_transformers import CrossEncoder
import requests
import pandas as pd
requests.packages.urllib3.disable_warnings() 

import get_evidences

In [None]:
def flatten_evidence(parsed_evidence):
    """Collapse three levels of nested text fragments into one string.

    ``parsed_evidence`` is iterated as groups -> sections -> fragments.
    Fragments of length 2 or less are replaced by an empty string (so the
    surrounding separators remain), and every level is joined with a single
    space. Each section and the final result are stripped of outer whitespace.
    """
    def _join_section(fragments):
        # Short fragments are blanked rather than removed, which keeps the
        # separator spaces around them.
        kept = [fragment if len(fragment) > 2 else '' for fragment in fragments]
        return ' '.join(kept).strip()

    group_texts = []
    for group in parsed_evidence:
        section_texts = [_join_section(section) for section in group]
        group_texts.append(' '.join(section_texts))

    return ' '.join(group_texts).strip()

In [None]:
def collect_evidence(claim, num_results):
    """Gather (link, text) evidence pairs for a claim.

    Asks ``get_evidences`` for the top ``num_results`` search links for
    ``claim``, fetches and flattens the relevant text of each link, and keeps
    only the links whose flattened text is non-empty.

    Returns:
        list of ``(link, text)`` tuples, in the order the links were returned.
    """
    urls = get_evidences.get_top_k_results_from_google(claim, k=num_results)

    collected = []
    for url in urls:
        page_text = flatten_evidence(get_evidences.get_relevant_text_from_webpage(url))
        # Skip pages that yielded no usable text.
        if len(page_text) > 0:
            collected.append((url, page_text))

    return collected

In [None]:
def parse_conclusions(conclusions):
    """Aggregate per-source verdicts into one score and print an overall verdict.

    Each ``'true'`` contributes +1, each ``'false'`` -1 and each ``'unsure'``
    -0.2. The total is compared against one third of the number of sources:
    below ``-n/3`` the claim is reported as very likely false, below ``n/3``
    as probably untrue, otherwise as plausible.

    Args:
        conclusions: list of verdict strings (``'true'``, ``'false'`` or
            ``'unsure'``), one per evidence source.

    Returns:
        float: the total score. ``0.0`` when ``conclusions`` is empty.
    """
    num_sources = len(conclusions)

    # Without any evidence the score is 0 and both thresholds are 0, which
    # would otherwise fall through to "plausible". Report it explicitly.
    if num_sources == 0:
        print(f'Conclusions: {conclusions}')
        print('No evidence was available, so this claim could not be assessed.')
        return 0.0

    false_score = conclusions.count('false')*-1
    unsure_score = conclusions.count('unsure')*-0.2
    true_score = conclusions.count('true')*1

    total_score = false_score + unsure_score + true_score

    print(f'Conclusions: {conclusions}')
    print(f'Computed score: {total_score}\n')

    threshold = num_sources / 3
    if total_score < -threshold:
        print('This claim is very likely to be false.')
    elif total_score < threshold:
        print('This claim is probably untrue.')
    else:
        print('This claim is plausible.')

    return total_score

In [None]:
def _as_evidence_frame(evidence):
    """Return ``evidence`` as a DataFrame with ``source`` and ``text`` columns."""
    if not isinstance(evidence, pd.DataFrame):
        # Freshly collected evidence is a list of (source, text) tuples.
        return pd.DataFrame(list(evidence), columns=['source', 'text'])
    if {'source', 'text'}.issubset(evidence.columns):
        return evidence.copy()
    # Cache written without column names (and with the index stored as an
    # extra leading column): the last two columns are source and text.
    frame = evidence.iloc[:, -2:].copy()
    frame.columns = ['source', 'text']
    return frame


def investigate_claim(claim, model):
    """Classify a claim against its evidence and print an overall verdict.

    Evidence is loaded from the local cache when available; otherwise it is
    collected (10 results) and saved. Every evidence text is scored together
    with the claim by ``model`` and the highest-scoring class is mapped to
    ``'false'``, ``'true'`` or ``'unsure'``.

    Args:
        claim: the claim to check, as a string.
        model: object whose ``predict`` takes a list of text pairs and returns
            an array of per-class scores with one row per pair.

    Returns:
        pandas.DataFrame with ``source``, ``text`` and ``conclusion`` columns.
    """
    # Index i is the label for class i of the model output.
    label_mapping = ['false', 'true', 'unsure']

    try:
        evidence = load_evidence(claim)
        print('Existing evidence results found locally, loading from disk.')
    except (OSError, pd.errors.EmptyDataError):
        # No usable cache for this claim: fetch the evidence and cache it.
        evidence = collect_evidence(claim, num_results=10)
        save_evidence(evidence, claim)

    # Both paths (cached frame, fresh list of tuples) end up in one schema.
    evidence = _as_evidence_frame(evidence)

    conclusions = []
    if len(evidence) > 0:
        # NOTE(review): NLI cross-encoders are normally fed (premise, hypothesis);
        # the claim is passed first here, as in the original — confirm the order.
        pairs = [(claim, str(text)) for text in evidence['text']]
        scores = model.predict(pairs)

        # Convert scores to labels
        conclusions = [label_mapping[score_max] for score_max in scores.argmax(axis=1)]

    parse_conclusions(conclusions)
    evidence['conclusion'] = conclusions

    return evidence

In [None]:
def _evidence_path(claim):
    """Return the cache file path for ``claim`` (spaces removed from the name)."""
    return f"../data/temp/{claim.replace(' ', '').strip()}.csv"


def save_evidence(evidence, claim):
    """Write the evidence for ``claim`` to its CSV cache file.

    Args:
        evidence: a DataFrame, or an iterable of ``(source, text)`` pairs.
        claim: the claim string; it determines the cache file name.

    The file is written with named ``source`` / ``text`` columns and without
    the index column, so it reads back with the same columns.
    """
    import os

    path = _evidence_path(claim)
    if isinstance(evidence, pd.DataFrame):
        frame = evidence
    else:
        frame = pd.DataFrame(list(evidence), columns=['source', 'text'])

    # Create the cache directory on first use instead of failing in to_csv.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    frame.to_csv(path, index=False)


def load_evidence(claim):
    """Read the cached evidence for ``claim``.

    Returns:
        pandas.DataFrame with ``source`` and ``text`` columns.

    Raises:
        FileNotFoundError: if no cache file exists for ``claim``.
    """
    evidence = pd.read_csv(_evidence_path(claim))

    # Older cache files have no column names and a leading index column;
    # their last two columns are source and text.
    if not {'source', 'text'}.issubset(evidence.columns) and evidence.shape[1] >= 2:
        evidence = evidence.iloc[:, -2:].copy()
        evidence.columns = ['source', 'text']

    return evidence

## Perform NLI on evidence of several claims

https://huggingface.co/cross-encoder/nli-roberta-base

In [None]:
# Load the cross-encoder checkpoint that is passed to investigate_claim in the cells below.
model = CrossEncoder('cross-encoder/nli-roberta-base')

## Brad Pitt married to Britney Spears? (fake)

In [None]:
# Claim labelled as fake in the section heading.
claim = "brad pitt is to marry with britney spears"
evidence = investigate_claim(claim, model)

## Joe Biden classified documents found? (true)

In [None]:
# Claim labelled as true in the section heading.
claim = "Joe Biden took home classified documents after leaving the vice-presidency"
evidence = investigate_claim(claim, model)

## COVID-19 vaccine causes infertility (fake)

In [None]:
# Claim labelled as fake in the section heading.
claim = "COVID-19 vaccine causes infertility"
evidence = investigate_claim(claim, model)

## Try other models

In [None]:
# snli_labels = ['true', 'unsure', 'false']
# Class names looked up by the argmax of the model scores in the cells below.
# NOTE(review): assumes this order matches the class order of the cross-encoder
# NLI checkpoints used here — confirm against the model card.
snli_labels = ['contradiction', 'entailment', 'neutral']

In [None]:
# Reload the cached evidence for this claim and use its first text as context.
# NOTE(review): relies on the cached CSV exposing a `text` column.
claim = 'COVID-19 vaccine causes infertility'
evidence = load_evidence(claim)
context = evidence.text[0]

### Q&A

In [None]:
# Imported here so this cell runs on a fresh kernel: the only other import of
# `pipeline` in this notebook is in a later cell.
from transformers import pipeline

summarizer = pipeline("summarization")
# truncation=True clips inputs longer than the model's maximum input length
# instead of failing on long web-page text.
summarized_context = summarizer(context, min_length=5, max_length=50, truncation=True)
# The pipeline returns a list with one dict per input; keep the summary string.
summarized_context = summarized_context[0]['summary_text']

In [None]:
model = CrossEncoder('cross-encoder/nli-distilroberta-base')

# Negated form of the claim, scored against every cached evidence text.
claim = 'COVID-19 vaccine does not cause infertility'
for ev in evidence['text'].values:
    # Score the current evidence text (the loop variable), not the fixed
    # `context`, so each iteration evaluates a different source.
    result = model.predict([(claim, ev)])
    print(result)
    print(snli_labels[result.argmax(axis=1)[0]])

In [None]:
model = CrossEncoder('cross-encoder/nli-distilroberta-base')

# Original form of the claim, scored against every cached evidence text.
claim = 'COVID-19 vaccine causes infertility'
for ev in evidence['text'].values:
    # Score the current evidence text (the loop variable), not the fixed
    # `context`, so each iteration evaluates a different source.
    result = model.predict([(claim, ev)])
    print(result)
    print(snli_labels[result.argmax(axis=1)[0]])

In [None]:
# Sanity check of the model on a fixed sentence pair
# (presumably taken from SNLI: https://huggingface.co/datasets/snli).
model = CrossEncoder('cross-encoder/nli-distilroberta-base')

result = model.predict([("This church choir sings to the masses as they sing joyous songs from the book at a church.", 
                         "The church is filled with song.")])
# Cell output: the label with the highest score for the single pair.
snli_labels[result.argmax(axis=1)[0]]

# Expected label: entailment

In [None]:
# Second sanity check of the model on a fixed sentence pair.
model = CrossEncoder('cross-encoder/nli-distilroberta-base')

result = model.predict([("A woman with a green headscarf, blue shirt and a very big grin.", 
                         "The woman is very happy.")])
# Cell output: the label with the highest score for the single pair.
snli_labels[result.argmax(axis=1)[0]]

# Expected label: entailment

In [None]:
# https://huggingface.co/datasets/snli
# Score the current claim against the summarised context instead of the full text.
model = CrossEncoder('cross-encoder/nli-distilroberta-base')

result = model.predict([(claim, summarized_context)])
print(result)
# Cell output: the label with the highest score.
snli_labels[result.argmax(axis=1)[0]]

### Zero-shot

In [None]:
from transformers import pipeline

# Pipeline built from the facebook/bart-large-mnli checkpoint; no task is given
# explicitly. The cells below call it with `candidate_labels`.
oracle = pipeline(model="facebook/bart-large-mnli")

In [None]:
# Classify the first evidence text against "<claim> is true/false" labels,
# with the "true" label listed first.
context = f"{evidence.text[0]}."
conclusion = oracle(context, candidate_labels=[f"{claim} is true", f"{claim} is false"])
print(conclusion)

In [None]:
# Same input as the previous cell, with the candidate labels in the opposite
# order ("false" first).
context = f"{evidence.text[0]}."
conclusion = oracle(context, candidate_labels=[f"{claim} is false", 
                                               f"{claim} is true"])
print(conclusion)

In [None]:
# Put the negated claim and the summarised context into one prompt and ask
# for a "False claim" / "True claim" label.
context = f"Claim: COVID-19 vaccine does not cause infertility. Information: {summarized_context}."
conclusion = oracle(context, candidate_labels=["False claim", "True claim"])
print(conclusion)

In [None]:
# Same prompt layout as the previous cell, using the current value of `claim`.
context = f"Claim: {claim}. Information: {summarized_context}."
conclusion = oracle(context, candidate_labels=["False claim", "True claim"])
print(conclusion)

In [None]:
# claim = 'COVID-19 vaccine causes infertility'
# for ev in evidence['text'].values:
#     context = f"{claim}. This is logically supported by the following passages. {ev}"
#     conclusion = oracle(context, candidate_labels=["true", "unsure", "false"])
#     print(conclusion)

## Text Generation

In [None]:
# Pipeline built from the gpt2 checkpoint, with max_length set to 100.
generator = pipeline(model="gpt2", max_length=100)

In [None]:
# Prompt the generator with the summary followed by a true/false question
# about the claim; sampling is disabled (do_sample=False).
generator(f"{summarized_context}. Therefore, is the claim '{claim}' true or false?", do_sample=False)