## FEVER: Fact Extraction and VERification

In [1]:
import numpy as np
import scipy.sparse as sp
import pandas as pd
import os
import json

from tqdm import tqdm
from collections import Counter
from itertools import product
from sklearn.linear_model import LogisticRegression

import fever
import utils

### Functionality of the `Oracle` class

In [2]:
# Presumably the locations of the FEVER SQLite database and the TF-IDF index.
# NOTE(review): neither constant is referenced by any visible cell
# (`fever.Oracle()` is constructed with no arguments), and the two paths use
# different relative roots (`../data` vs `data`) — confirm which working
# directory is intended, or whether these constants can be dropped.
DB_PATH = '../data/fever/fever.db'
MAT_PATH = 'data/index/tfidf-count-ngram=2-hash=16777216.npz'

In [3]:
oracle = fever.Oracle()

In [5]:
query = 'Tetris has sold millions of physical copies.'

In [6]:
oracle.closest_docs(query, k=4)

['Tetris',
 'Jolin_Tsai_discography',
 'List_of_best-selling_Game_Boy_video_games',
 'Eminem_discography']

In [7]:
oracle.doc_ids2texts(['Tetris'])

["Tetris -LRB- , pronounced -LSB- ˈtɛtrʲɪs -RSB- -RRB- is a tile-matching puzzle video game , originally designed and programmed by Russian game designer Alexey Pajitnov . It was released on June 6 , 1984 , while he was working for the Dorodnitsyn Computing Centre of the Academy of Science of the USSR in Moscow . He derived its name from the Greek numerical prefix tetra - -LRB- all of the game 's pieces contain four segments -RRB- and tennis , Pajitnov 's favorite sport .   Tetris was the first entertainment software to be exported from the USSR to the US , where it was published by Spectrum HoloByte for Commodore 64 and IBM PC . The Tetris game is a popular use of tetrominoes , the four-element special case of polyominoes . Polyominoes have been used in popular puzzles since at least 1907 , and the name was given by the mathematician Solomon W. Golomb in 1953 . However , even the enumeration of pentominoes is dated to antiquity .   The game -LRB- or one of its many variants -RRB- is a

In [8]:
oracle.get_sentence('Tetris', 0)

'Tetris -LRB- , pronounced -LSB- ˈtɛtrʲɪs -RSB- -RRB- is a tile-matching puzzle video game , originally designed and programmed by Russian game designer Alexey Pajitnov .'

In [9]:
oracle.choose_sents_from_doc_ids(query, oracle.closest_docs(query, k=4), k=3)

{('Tetris',
  12): 'In January 2010 , it was announced that the Tetris franchise had sold more than 170 million copies , approximately 70 million physical copies and over 100 million copies for cell phones , making it the best selling paid-downloaded game of all time .',
 ('Jolin_Tsai_discography',
  9): 'Her next release under Sony , Magic -LRB- 2003 -RRB- , was heralded as her comeback album , which sold more than 1.5 million copies in Asia , with more than 360,000 copies sold in Taiwan alone , and the album made her the best-selling female singer of the year in Taiwan .',
 ('Jolin_Tsai_discography',
  11): 'The album has sold over 2 million copies in Asia , with 300,000 copies sold in Taiwan alone , and made her the best-selling female singer of the year in Taiwan .'}

In [10]:
oracle.read(query)

{('Tetris',
  12): 'In January 2010 , it was announced that the Tetris franchise had sold more than 170 million copies , approximately 70 million physical copies and over 100 million copies for cell phones , making it the best selling paid-downloaded game of all time .',
 ('Jolin_Tsai_discography',
  9): 'Her next release under Sony , Magic -LRB- 2003 -RRB- , was heralded as her comeback album , which sold more than 1.5 million copies in Asia , with more than 360,000 copies sold in Taiwan alone , and the album made her the best-selling female singer of the year in Taiwan .',
 ('Jolin_Tsai_discography',
  11): 'The album has sold over 2 million copies in Asia , with 300,000 copies sold in Taiwan alone , and made her the best-selling female singer of the year in Taiwan .'}

### Dataset Structure

In [11]:
fever_iterator = iter(fever.TrainReader().read())

In [12]:
fever_ex = next(fever_iterator)

In [13]:
print(fever_ex)

Nikolaj Coster-Waldau worked with the Fox Broadcasting Company.
VERIFIABLE
SUPPORTS


In [14]:
fever_ex

"FEVER Example({'id': 75397, 'verifiable': 'VERIFIABLE', 'label': 'SUPPORTS', 'claim': 'Nikolaj Coster-Waldau worked with the Fox Broadcasting Company.', 'evidence': [[[92206, 104971, 'Nikolaj_Coster-Waldau', 7], [92206, 104971, 'Fox_Broadcasting_Company', 0]]]})

In [15]:
fever_ex.get_evidence_ids()

[('Nikolaj_Coster-Waldau', 7), ('Fox_Broadcasting_Company', 0)]

In [16]:
# Collect the label of every training example to inspect the class balance.
fever_labels = pd.Series(
    [ex.label for ex in fever.TrainReader().read()])

In [17]:
fever_labels.value_counts()

SUPPORTS           80035
NOT ENOUGH INFO    35639
REFUTES            29775
dtype: int64

In [18]:
# Collect the label of every dev example to inspect the class balance.
# NOTE: this rebinds `fever_labels`; the train-split series built in the
# earlier cell is no longer reachable under this name after this cell runs.
fever_labels = pd.Series(
    [ex.label for ex in fever.DevReader().read()])

In [19]:
fever_labels.value_counts()

SUPPORTS           3333
NOT ENOUGH INFO    3333
REFUTES            3333
dtype: int64

### Evaluating Document Retrieval and Sentence Selection

In [20]:
# TF-IDF document retrieval: report retrieval accuracy for several values of
# `num_docs` (the number of documents the oracle is allowed to return).
# NOTE(review): a fresh `TrainReader(samp_percentage=0.05)` is built on every
# iteration and the recorded outputs show a different example count per row
# (5455, 5454, 5461, 5618), so each setting appears to be scored on a
# different random sample — confirm whether the reader can be seeded so the
# rows are directly comparable.
for num_docs in [1,3,5,10]:
    fever.doc_retrieval_accuracy(reader=fever.TrainReader(samp_percentage=0.05),
                                oracle=oracle,
                                num_docs=num_docs)

Reading from dataset: 7243examples [12:17,  9.83examples/s]                    
Reading from dataset:   0%|          | 0/7199 [00:00<?, ?examples/s]

Num_docs = 1, accuracy 1265/5455 = 0.23189734188817598


Reading from dataset: 7248examples [12:01, 10.04examples/s]                    


Num_docs = 3, accuracy 2503/5454 = 0.45892922625595894


Reading from dataset:  98%|█████████▊| 7239/7388 [11:55<00:16,  9.06examples/s]
Reading from dataset:   0%|          | 0/7129 [00:00<?, ?examples/s]

Num_docs = 5, accuracy 3059/5461 = 0.5601538179820545


Reading from dataset: 7388examples [11:58,  7.98examples/s]                    

Num_docs = 10, accuracy 3824/5618 = 0.6806692773228907





In [21]:
# Sentence selection: report selection accuracy for several values of
# `num_sents` (the number of sentences the oracle is allowed to return).
# NOTE(review): as with document retrieval, a new 5% sample is drawn on each
# iteration (recorded denominators: 5602, 5537, 5398, 5356), so the rows are
# not computed on the same examples — confirm whether sampling can be seeded.
for num_sents in [1,3,5,10]:
    fever.sentence_selection_accuracy(reader=fever.TrainReader(samp_percentage=0.05),
                                oracle=oracle,
                                num_sents=num_sents)

Reading from dataset: 7423examples [02:54, 44.67examples/s]                    
Reading from dataset:   0%|          | 4/7388 [00:00<03:24, 36.05examples/s]

Num_sents = 1, accuracy 2870/5602 = 0.5123170296322742


Reading from dataset:  99%|█████████▉| 7306/7388 [02:44<00:01, 44.36examples/s]


Num_sents = 3, accuracy 3657/5537 = 0.660465956294022


Reading from dataset:  98%|█████████▊| 7181/7328 [02:40<00:03, 44.84examples/s]


Num_sents = 5, accuracy 3935/5398 = 0.7289736939607262


Reading from dataset:  98%|█████████▊| 7177/7354 [02:38<00:03, 45.30examples/s]

Num_sents = 10, accuracy 4367/5356 = 0.8153472740851382





### Sampling evidence for the NOT ENOUGH INFO class

In [24]:
def sampling_for_NEI(oracle, num_docs=5, num_sents=5,
                     data_dir='data/fever-data',
                     splits=('train', 'dev', 'test')):
    """Write `<split>_sampled.jsonl` files with oracle-retrieved evidence.

    Every non-blank line of `<data_dir>/<split>.jsonl` is parsed as a JSON
    object and written to `<data_dir>/<split>_sampled.jsonl`. The
    `evidence` field is replaced by the keys returned from `oracle.read`
    for *every* example of the `dev` and `test` splits, and for examples
    labelled ``"NOT ENOUGH INFO"`` in any other split. All remaining
    examples are copied through unchanged.

    Parameters
    ----------
    oracle : object
        Must expose `read(claim, num_docs=..., num_sents=...)` returning a
        mapping whose keys are `(doc_id, sentence_index)` pairs.
    num_docs : int
        Forwarded to `oracle.read`.
    num_sents : int
        Forwarded to `oracle.read`.
    data_dir : str
        Directory holding the `<split>.jsonl` files; output files are
        written next to them.
    splits : iterable of str
        Names of the splits to process, in order.

    Returns
    -------
    None
        The function is run for its side effect of writing files.

    """
    for name in splits:
        print('Working on {} split'.format(name))
        original_path = os.path.join(data_dir, '{}.jsonl'.format(name))
        sampling_path = os.path.join(data_dir, '{}_sampled.jsonl'.format(name))

        # dev/test examples always get retrieved evidence; the check is done
        # before touching `label` because that key may be absent there.
        resample_all = name == 'dev' or name == 'test'

        # Explicit UTF-8: claims contain non-ASCII text and the default
        # encoding of `open` depends on the platform locale.
        with open(original_path, 'r', encoding='utf-8') as src, \
                open(sampling_path, 'w', encoding='utf-8') as dst:
            # `readlines()` is kept so tqdm can display a total.
            for raw_line in tqdm(src.readlines()):
                if not raw_line.strip():
                    # Tolerate blank lines (e.g. a trailing newline).
                    continue
                example = json.loads(raw_line)

                if resample_all or example['label'] == 'NOT ENOUGH INFO':
                    evidences = oracle.read(
                        example['claim'],
                        num_docs=num_docs,
                        num_sents=num_sents).keys()
                    # FEVER evidence rows are 4-element lists; the first two
                    # slots are filled with 0 because no annotation ids exist
                    # for retrieved sentences.
                    example['evidence'] = [
                        [[0, 0, ev[0], ev[1]] for ev in evidences]]

                dst.write(json.dumps(example) + "\n")

In [25]:
sampling_for_NEI(oracle)

  0%|          | 0/145449 [00:00<?, ?it/s]

Working on train split


100%|██████████| 145449/145449 [3:01:26<00:00, 13.36it/s]  
  0%|          | 0/9999 [00:00<?, ?it/s]

Working on dev split


100%|██████████| 9999/9999 [51:59<00:00,  3.26it/s]  
  0%|          | 1/9999 [00:00<25:34,  6.52it/s]

Working on test split


100%|██████████| 9999/9999 [51:48<00:00,  4.85it/s]  


### RTE Training and Test

In [26]:
def word_overlap_phi(claim, evidence):
    """Binary word-overlap features between a claim and its evidence.
    This tends to produce very sparse representations.

    Parameters
    ----------
    claim : a string
        Tokenized with `utils.process_text`.
    evidence : a list of sentences
        Each sentence is tokenized with `utils.process_sent`; the tokens of
        all sentences are pooled.

    Returns
    -------
    collections.Counter
       Maps each token that occurs in both claim and evidence to 1.
       Repetitions are ignored. Empty when nothing overlaps.

    """
    # Pool evidence tokens into a set so each membership test below is O(1)
    # instead of a scan over a list. Assumes evidence tokens are hashable,
    # as the claim tokens already must be.
    evidence_vocab = set()
    for sent in evidence:
        evidence_vocab.update(utils.process_sent(sent))
    overlap = {tok for tok in utils.process_text(claim) if tok in evidence_vocab}
    # Counter over a set gives every shared token a count of exactly 1.
    return Counter(overlap)

In [27]:
def fit_maxent_classifier(X, y):
    """Train a Maximum Entropy (MaxEnt) classifier.

    Thin wrapper around `sklearn.linear_model.LogisticRegression`; "MaxEnt"
    is the name commonly used for logistic regression in the multiclass
    setting.

    Parameters
    ----------
    X : 2d np.array
        Feature matrix with one example per row.
    y : list
        Labels aligned with the rows of `X`.

    Returns
    -------
    sklearn.linear_model.LogisticRegression
        The fitted `LogisticRegression` instance.

    """
    # `fit` returns the estimator itself, so construction and training chain.
    return LogisticRegression(fit_intercept=True).fit(X, y)

In [28]:
percentage = 0.1

In [29]:
# Build a dataset from a sampled portion of the training split using the
# word-overlap feature function.
# NOTE(review): `dataset` is not referenced again in the visible cells — the
# `fever.experiment` calls are each given their own reader — so this looks
# like a smoke test of `build_dataset`; confirm before removing.
dataset = fever.build_dataset(fever.SampledTrainReader(samp_percentage=percentage), 
                              word_overlap_phi, oracle)

Reading from dataset: 14441examples [02:04, 115.78examples/s]                     


In [30]:
_ = fever.experiment(
    train_reader=fever.SampledTrainReader(samp_percentage=percentage), 
    phi=word_overlap_phi,
    oracle=oracle,
    train_func=fit_maxent_classifier,
    assess_reader=fever.SampledDevReader(),
    random_state=42)

Reading from dataset: 14781examples [02:04, 118.96examples/s]                     
Reading from dataset: 100%|██████████| 9999/9999 [04:57<00:00, 33.66examples/s]


                 precision    recall  f1-score   support

NOT ENOUGH INFO      0.363     0.222     0.275      3333
        REFUTES      0.353     0.038     0.069      3333
       SUPPORTS      0.333     0.758     0.463      3333

       accuracy                          0.340      9999
      macro avg      0.349     0.340     0.269      9999
   weighted avg      0.349     0.340     0.269      9999



In [31]:
def word_cross_product_phi(claim, evidence):
    """Cross-product features over claim tokens and evidence tokens.
    Representations built this way tend to be fairly dense.

    Parameters
    ----------
    claim : a string
        Tokenized with `utils.process_text`.
    evidence : a list of sentences
        Each sentence is tokenized with `utils.process_sent`; the tokens of
        all sentences are pooled, keeping repetitions.

    Returns
    -------
    collections.Counter
        Maps every pair (w1, w2), with w1 a claim token and w2 an evidence
        token, to the number of times that pair occurs. This is a multi-set
        cross-product, so repeated tokens raise the counts.

    """
    # Evidence is tokenized first, then the claim.
    evidence_tokens = [
        tok for sentence in evidence for tok in utils.process_sent(sentence)
    ]
    claim_tokens = utils.process_text(claim)

    pair_counts = Counter()
    pair_counts.update(product(claim_tokens, evidence_tokens))
    return pair_counts

In [32]:
_ = fever.experiment(
    train_reader=fever.SampledTrainReader(samp_percentage=percentage), 
    phi=word_cross_product_phi,
    oracle=oracle,
    train_func=fit_maxent_classifier,
    assess_reader=fever.SampledDevReader(),
    random_state=42)

Reading from dataset: 14713examples [02:14, 109.66examples/s]                     
Reading from dataset: 100%|██████████| 9999/9999 [05:17<00:00, 31.52examples/s]


                 precision    recall  f1-score   support

NOT ENOUGH INFO      0.361     0.402     0.381      3333
        REFUTES      0.556     0.233     0.328      3333
       SUPPORTS      0.372     0.547     0.443      3333

       accuracy                          0.394      9999
      macro avg      0.430     0.394     0.384      9999
   weighted avg      0.430     0.394     0.384      9999



In [33]:
def fit_maxent_with_crossvalidation(X, y):
    """A MaxEnt model of dataset with hyperparameter cross-validation.

    The search itself is delegated to
    `fever.fit_classifier_with_crossvalidation`, which receives the base
    model, the number of folds and the parameter grid defined here.

    Parameters
    ----------
    X : 2d np.array
        The matrix of features, one example per row.

    y : list
        The list of labels for rows in `X`.

    Returns
    -------
    sklearn.linear_model.LogisticRegression
        Whatever `fever.fit_classifier_with_crossvalidation` returns —
        expected to be the best model found.

    """
    # The grid below searches both L1 and L2 penalties, so the solver must
    # support both. It is pinned to 'liblinear' rather than left at the
    # library default: scikit-learn >= 0.22 defaults to 'lbfgs', which rejects
    # penalty='l1' and would make half of the grid fail.
    # NOTE(review): newer scikit-learn releases may warn about liblinear on
    # multiclass targets — verify when upgrading and consider solver='saga'.
    basemod = LogisticRegression(fit_intercept=True, solver='liblinear')
    cv = 3
    param_grid = {'C': [0.4, 0.6, 0.8, 1.0],
                  'penalty': ['l1', 'l2']}
    return fever.fit_classifier_with_crossvalidation(X, y, basemod, cv, param_grid)


In [30]:
# unigram+bigram result: word-overlap features with the cross-validated
# MaxEnt model, trained on the sampled train reader (no `samp_percentage`)
# and assessed with the sampled dev reader.
# NOTE(review): this cell's code is identical to the later "unigram" cell and
# its execution count (In [30]) is out of sequence, so whatever produced the
# unigram/bigram difference is not visible here — presumably the tokenisation
# in `utils` was edited between runs. Confirm, and make the n-gram setting an
# explicit argument so the result reproduces from a fresh kernel.
_ = fever.experiment(
    train_reader=fever.SampledTrainReader(), 
    phi=word_overlap_phi,
    oracle=oracle,
    train_func=fit_maxent_with_crossvalidation,
    assess_reader=fever.SampledDevReader())


Reading from dataset: 100%|██████████| 145449/145449 [20:28<00:00, 118.42examples/s]
Reading from dataset: 100%|██████████| 9999/9999 [04:55<00:00, 33.85examples/s]
  'precision', 'predicted', average, warn_for)


Best params {'C': 1.0, 'penalty': 'l2'}
Best score: 0.430
                 precision    recall  f1-score   support

NOT ENOUGH INFO      0.362     0.329     0.344      3333
        REFUTES      0.424     0.012     0.023      3333
       SUPPORTS      0.337     0.696     0.454      3333

       accuracy                          0.346      9999
      macro avg      0.374     0.346     0.274      9999
   weighted avg      0.374     0.346     0.274      9999



In [34]:
# unigram result: word-overlap features with the cross-validated MaxEnt
# model, trained on the sampled train reader (no `samp_percentage`).
# NOTE: although originally labelled a "test" result, the assessment reader
# is `fever.SampledDevReader()`, so the report is for the dev split.
_ = fever.experiment(
    train_reader=fever.SampledTrainReader(), 
    phi=word_overlap_phi,
    oracle=oracle,
    train_func=fit_maxent_with_crossvalidation,
    assess_reader=fever.SampledDevReader())

Reading from dataset: 100%|██████████| 145449/145449 [20:23<00:00, 118.89examples/s]
Reading from dataset: 100%|██████████| 9999/9999 [04:57<00:00, 33.59examples/s]


Best params {'C': 1.0, 'penalty': 'l2'}
Best score: 0.430
                 precision    recall  f1-score   support

NOT ENOUGH INFO      0.362     0.326     0.343      3333
        REFUTES      0.418     0.011     0.022      3333
       SUPPORTS      0.337     0.698     0.454      3333

       accuracy                          0.345      9999
      macro avg      0.372     0.345     0.273      9999
   weighted avg      0.372     0.345     0.273      9999



In [35]:
percentage = 0.2

In [36]:
_ = fever.experiment(
    train_reader=fever.SampledTrainReader(samp_percentage=percentage), 
    phi=word_cross_product_phi,
    oracle=oracle,
    train_func=fit_maxent_with_crossvalidation,
    assess_reader=fever.SampledDevReader(),
    random_state=42)

Reading from dataset: 28999examples [04:05, 118.24examples/s]                     
Reading from dataset: 100%|██████████| 9999/9999 [05:17<00:00, 31.52examples/s]


Best params {'C': 1.0, 'penalty': 'l1'}
Best score: 0.602
                 precision    recall  f1-score   support

NOT ENOUGH INFO      0.356     0.515     0.421      3333
        REFUTES      0.546     0.233     0.327      3333
       SUPPORTS      0.382     0.429     0.404      3333

       accuracy                          0.392      9999
      macro avg      0.428     0.392     0.384      9999
   weighted avg      0.428     0.392     0.384      9999

