# 0. Libraries

In [None]:
# ! pip install datasets -q
# !pip install transformers
from datasets import load_dataset
import torch
from transformers import BertTokenizer, BertModel, EncoderDecoderModel, AdamW
import pandas as pd

# 1. Load data

In [None]:
# Load the training split of the FEVER v1.0 claim/evidence annotations.
labels = load_dataset("fever", "v1.0", split="train")



In [None]:
# Peek at the first training record to see which fields each example carries.
labels[0]

{'id': 75397,
 'label': 'SUPPORTS',
 'claim': 'Nikolaj Coster-Waldau worked with the Fox Broadcasting Company.',
 'evidence_annotation_id': 92206,
 'evidence_id': 104971,
 'evidence_wiki_url': 'Nikolaj_Coster-Waldau',
 'evidence_sentence_id': 7}

In [None]:
# Load the FEVER "wiki_pages" configuration; the cells below read its
# 'wikipedia_pages' split.
wiki = load_dataset("fever", "wiki_pages")



  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Show the page record at index 1 (fields: id, text, lines). The wiki_df.head()
# output further down shows row 0 with empty fields.
wiki['wikipedia_pages'][1]

{'id': '1928_in_association_football',
 'text': 'The following are the football -LRB- soccer -RRB- events of the year 1928 throughout the world . ',
 'lines': '0\tThe following are the football -LRB- soccer -RRB- events of the year 1928 throughout the world .\n1\t'}

In [None]:
# Materialise the training split as a pandas DataFrame for tabular inspection.
labels_df = labels.to_pandas()
labels_df.head(n=5)

Unnamed: 0,id,label,claim,evidence_annotation_id,evidence_id,evidence_wiki_url,evidence_sentence_id
0,75397,SUPPORTS,Nikolaj Coster-Waldau worked with the Fox Broa...,92206,104971,Nikolaj_Coster-Waldau,7
1,75397,SUPPORTS,Nikolaj Coster-Waldau worked with the Fox Broa...,92206,104971,Fox_Broadcasting_Company,-1
2,150448,SUPPORTS,Roman Atwood is a content creator.,174271,187498,Roman_Atwood,1
3,150448,SUPPORTS,Roman Atwood is a content creator.,174271,187499,Roman_Atwood,3
4,214861,SUPPORTS,"History of art includes architecture, dance, s...",255136,254645,History_of_art,2


In [None]:
# Display the split's summary (feature names and row count).
wiki['wikipedia_pages']

Dataset({
    features: ['id', 'text', 'lines'],
    num_rows: 5416537
})

In [None]:
# Convert the whole 'wikipedia_pages' split to a DataFrame (columns: id, text, lines).
wiki_pages = wiki['wikipedia_pages']
wiki_df = wiki_pages.to_pandas()
wiki_df.head(n=5)

Unnamed: 0,id,text,lines
0,,,
1,1928_in_association_football,The following are the football -LRB- soccer -R...,0\tThe following are the football -LRB- soccer...
2,1986_NBA_Finals,The 1986 NBA Finals was the championship round...,0\tThe 1986 NBA Finals was the championship ro...
3,1901_Villanova_Wildcats_football_team,The 1901 Villanova Wildcats football team repr...,0\tThe 1901 Villanova Wildcats football team r...
4,1992_Northwestern_Wildcats_football_team,The 1992 Northwestern Wildcats team represente...,0\tThe 1992 Northwestern Wildcats team represe...


In [None]:
# Select the page whose id matches the evidence URL of the first claim shown above.
sample_row = wiki_df.loc[wiki_df['id'].eq('Nikolaj_Coster-Waldau')]

In [None]:
# Print the raw value arrays of the page's flat text and its numbered 'lines' field.
for column in ('text', 'lines'):
    print(sample_row[column].values)

["Nikolaj Coster-Waldau -LRB- -LSB- neɡ̊olaɪ̯ kʰʌsd̥ɐ ˈʋald̥ɑʊ̯ -RSB- ; born 27 July 1970 -RRB- is a Danish actor , producer and screenwriter . He graduated from Danish National School of Theatre in Copenhagen in 1993 . Coster-Waldau 's breakthrough performance in Denmark was his role in the film Nightwatch -LRB- 1994 -RRB- . Since then he has appeared in numerous films in his native Scandinavia and Europe in general , including Headhunters -LRB- 2011 -RRB- and A Thousand Times Good Night -LRB- 2013 -RRB- .   In the United States , his debut film role was in the war film Black Hawk Down -LRB- 2001 -RRB- , playing Medal of Honor recipient Gary Gordon . He then played Detective John Amsterdam in the short-lived Fox television series New Amsterdam -LRB- 2008 -RRB- , as well as appearing as Frank Pike in the 2009 Fox television film Virtuality , originally intended as a pilot . He became widely known to a broad audience for his current role as Ser Jaime Lannister , in the HBO series Game o

In [None]:
# Print the 'lines' string itself so its tab/newline-separated layout is rendered.
print(sample_row['lines'].values[0])

0	Nikolaj Coster-Waldau -LRB- -LSB- neɡ̊olaɪ̯ kʰʌsd̥ɐ ˈʋald̥ɑʊ̯ -RSB- ; born 27 July 1970 -RRB- is a Danish actor , producer and screenwriter .
1	He graduated from Danish National School of Theatre in Copenhagen in 1993 .	Danish National School of Theatre	Danish National School of Theatre and Contemporary Dance	Copenhagen	Copenhagen
2	Coster-Waldau 's breakthrough performance in Denmark was his role in the film Nightwatch -LRB- 1994 -RRB- .	Nightwatch	Nightwatch (1994 film)
3	Since then he has appeared in numerous films in his native Scandinavia and Europe in general , including Headhunters -LRB- 2011 -RRB- and A Thousand Times Good Night -LRB- 2013 -RRB- .	Headhunters	Headhunters (film)	A Thousand Times Good Night	A Thousand Times Good Night
4	
5	
6	In the United States , his debut film role was in the war film Black Hawk Down -LRB- 2001 -RRB- , playing Medal of Honor recipient Gary Gordon .	Black Hawk Down	Black Hawk Down (film)	Gary Gordon	Gary Gordon
7	He then played Detective John

In [None]:
# Print the claim text of the row labelled 0 in labels_df.
print(labels_df.loc[0, 'claim'])

Nikolaj Coster-Waldau worked with the Fox Broadcasting Company.


# 2. Isolate the true claims

In [None]:
# Count how many rows carry each label value.
labels_df['label'].value_counts()

SUPPORTS           193756
REFUTES             70066
NOT ENOUGH INFO     47609
Name: label, dtype: int64

In [None]:
# Keep only rows labelled SUPPORTS, and only the claim text plus the id of the
# evidence page; .copy() detaches the result from labels_df.
is_supported = labels_df['label'] == 'SUPPORTS'
true_claims = labels_df.loc[is_supported, ['claim', 'evidence_wiki_url']].copy()
true_claims.head(n=5)

Unnamed: 0,claim,evidence_wiki_url
0,Nikolaj Coster-Waldau worked with the Fox Broa...,Nikolaj_Coster-Waldau
1,Nikolaj Coster-Waldau worked with the Fox Broa...,Fox_Broadcasting_Company
2,Roman Atwood is a content creator.,Roman_Atwood
3,Roman Atwood is a content creator.,Roman_Atwood
4,"History of art includes architecture, dance, s...",History_of_art


In [None]:
# Drop the reference to the full labels frame; true_claims was created with
# .copy(), so it does not depend on labels_df.
del labels_df

In [None]:
# Keep an independent copy of just the page id and its flat text.
wiki_id_text_df = wiki_df.loc[:, ['id', 'text']].copy()

In [None]:
# Drop the reference to the full wiki frame (including its 'lines' column);
# wiki_id_text_df is a separate copy of the id/text columns.
del wiki_df

In [None]:
# Rename 'id' to 'evidence_wiki_url' so the frame shares a key column name with
# true_claims for the merge below.
merge_ready_columns = ['evidence_wiki_url', 'text']
wiki_id_text_df.columns = merge_ready_columns
wiki_id_text_df.head(n=5)

Unnamed: 0,evidence_wiki_url,text
0,,
1,1928_in_association_football,The following are the football -LRB- soccer -R...
2,1986_NBA_Finals,The 1986 NBA Finals was the championship round...
3,1901_Villanova_Wildcats_football_team,The 1901 Villanova Wildcats football team repr...
4,1992_Northwestern_Wildcats_football_team,The 1992 Northwestern Wildcats team represente...


In [None]:
# Attach the evidence page text to every supported claim. This is an inner join,
# so claim rows whose evidence_wiki_url has no matching page are dropped.
joint_df = true_claims.merge(wiki_id_text_df, how='inner', on='evidence_wiki_url')
joint_df.head(n=5)

Unnamed: 0,claim,evidence_wiki_url,text
0,Nikolaj Coster-Waldau worked with the Fox Broa...,Nikolaj_Coster-Waldau,Nikolaj Coster-Waldau -LRB- -LSB- neɡ̊olaɪ̯ kʰ...
1,Nikolaj Coster-Waldau worked with Peter Dinklage.,Nikolaj_Coster-Waldau,Nikolaj Coster-Waldau -LRB- -LSB- neɡ̊olaɪ̯ kʰ...
2,Nikolaj Coster-Waldau was in a film.,Nikolaj_Coster-Waldau,Nikolaj Coster-Waldau -LRB- -LSB- neɡ̊olaɪ̯ kʰ...
3,Nikolaj Coster-Waldau was in a film.,Nikolaj_Coster-Waldau,Nikolaj Coster-Waldau -LRB- -LSB- neɡ̊olaɪ̯ kʰ...
4,Game of Thrones (season 1) featured Danish act...,Nikolaj_Coster-Waldau,Nikolaj Coster-Waldau -LRB- -LSB- neɡ̊olaɪ̯ kʰ...


In [None]:
# Both merge inputs are no longer needed once joint_df exists; drop their references.
del true_claims, wiki_id_text_df

In [None]:
# (rows, columns) of the merged frame.
joint_df.shape

(192405, 3)

In [None]:
# Keep only the (claim, text) pair; the join key is not needed after the merge.
# Selecting the wanted columns, rather than dropping 'evidence_wiki_url', gives
# the same frame on the first run and keeps the cell safe to re-run: a second
# .drop() of an already-removed column would raise KeyError.
joint_df = joint_df[['claim', 'text']]
joint_df

Unnamed: 0,claim,text
0,Nikolaj Coster-Waldau worked with the Fox Broa...,Nikolaj Coster-Waldau -LRB- -LSB- neɡ̊olaɪ̯ kʰ...
1,Nikolaj Coster-Waldau worked with Peter Dinklage.,Nikolaj Coster-Waldau -LRB- -LSB- neɡ̊olaɪ̯ kʰ...
2,Nikolaj Coster-Waldau was in a film.,Nikolaj Coster-Waldau -LRB- -LSB- neɡ̊olaɪ̯ kʰ...
3,Nikolaj Coster-Waldau was in a film.,Nikolaj Coster-Waldau -LRB- -LSB- neɡ̊olaɪ̯ kʰ...
4,Game of Thrones (season 1) featured Danish act...,Nikolaj Coster-Waldau -LRB- -LSB- neɡ̊olaɪ̯ kʰ...
...,...,...
192400,A subtype of anti-nuclear antibodies are anti-...,Anti-SSA autoantibodies -LRB- Anti-Sjögren 's ...
192401,"In 2009, Joe Biden became the Vice President.","The 2008 presidential campaign of Joe Biden , ..."
192402,"In 2009, Joe Biden became the Vice President.","The 2008 presidential campaign of Joe Biden , ..."
192403,Sarah Michelle Gellar was in a movie.,Scooby-Doo -LRB- also known as Scooby-Doo : Th...


In [None]:
# Remove repeated rows, keeping the first occurrence, and renumber the index
# from 0 so it is contiguous again.
joint_df = joint_df.drop_duplicates(keep='first', ignore_index=True)
joint_df

Unnamed: 0,claim,text
0,Nikolaj Coster-Waldau worked with the Fox Broa...,Nikolaj Coster-Waldau -LRB- -LSB- neɡ̊olaɪ̯ kʰ...
1,Nikolaj Coster-Waldau worked with Peter Dinklage.,Nikolaj Coster-Waldau -LRB- -LSB- neɡ̊olaɪ̯ kʰ...
2,Nikolaj Coster-Waldau was in a film.,Nikolaj Coster-Waldau -LRB- -LSB- neɡ̊olaɪ̯ kʰ...
3,Game of Thrones (season 1) featured Danish act...,Nikolaj Coster-Waldau -LRB- -LSB- neɡ̊olaɪ̯ kʰ...
4,Nikolaj Coster-Waldau played Frank Pike in Vir...,Nikolaj Coster-Waldau -LRB- -LSB- neɡ̊olaɪ̯ kʰ...
...,...,...
94841,Texas shares a border with a place.,"Chihuahua -LRB- -LSB- tʃiˈwawa -RSB- -RRB- , o..."
94842,Taipei is on an island in Asia.,"New Taipei , is a special municipality and the..."
94843,A subtype of anti-nuclear antibodies are anti-...,Anti-SSA autoantibodies -LRB- Anti-Sjögren 's ...
94844,"In 2009, Joe Biden became the Vice President.","The 2008 presidential campaign of Joe Biden , ..."


In [None]:
# Persist the deduplicated pairs to the working directory. With to_csv defaults
# the index is written as well, as a first column with an empty header.
joint_df.to_csv(path_or_buf='joint_true_claims.csv')