In [2]:
# Adapted and modified from https://github.com/sheffieldnlp/fever-baselines/tree/master/src/scripts
# which is adapted from https://github.com/facebookresearch/DrQA/blob/master/scripts/retriever/build_db.py
# Copyright 2017-present, Facebook, Inc.
# All rights reserved.
import json
import random
import re
import os
import sys
from tqdm import tqdm
from collections import Counter
from nltk import word_tokenize


def parse_wiki(wikipedia_dir, doc_id_dir):
    """
    This function traverses all the jsonl files
    and returns a dictionary containing document ID and corresponding content

    Args
    wikipedia_dir: the parent directory of the jsonl files
    doc_id_dir: the location of wiki-pages

    Returns
    a dictionary: document ID as dictionary keys and document content as values.
    
    Remark: Saves the dictionary in ../data/doc_id_text to speed up subsequent passes.
    """
    # doc_id_text saves the title and content of each wiki-page
    doc_id_text=dict()
    try:
        with open(doc_id_dir, "r") as f:
            print("Reading from" + str(doc_id_dir) )
            for line in f:
                fields=line.rstrip("\n").split("\t")
                doc_id=fields[0]
                text=fields[1]
                doc_id_text[doc_id]=text
    except:
        print(doc_id_dir)
        with open(doc_id_dir,"w") as w:
            print("Constructing " + str(doc_id_dir))
            for i in tqdm(range(1,110)):# jsonl file number from 001 to 109
                jnum="{:03d}".format(i)
                fname=wikipedia_dir+"wiki-"+jnum+".jsonl"
                with open(fname) as f:
                    # point=f.tell()# file pointer starting from 0
                    line=f.readline()
                    while line:
                        data=json.loads(line.rstrip("\n"))
                        doc_id=data["id"]
                        text = data["text"]
                        lines=data["lines"]
                        if text != "":
                            w.write(doc_id+"\t"+text+"\n")
                            doc_id_text[doc_id]=text
                        # point=f.tell()
                        line=f.readline()
        
    return doc_id_text


def load_doclines(titles, t2jnum, filtering=True):
    """load all lines for provided titles
    Args
    titles: list of titles
    """
    if filtering:
        # select title from titles if this title is in the wiki-pages
        filtered_titles = [title for title in titles if title in t2jnum]
        print("mismatch: {} / {}".format(len(titles) - len(filtered_titles), len(titles)))
        titles = filtered_titles

    docs ={"dummy_id": [(title, "dummy_linum") for title in titles]}
    doclines = load_doc_lines(docs, t2jnum, wikipedia_dir="../data/wiki-pages/wiki-pages/")
    return doclines


def load_doc_lines(docs=dict(), t2jnum=dict(), wikipedia_dir="../data/wiki-pages/wiki-pages/"):
    """Returns a dictionary from titles to line numbers to line text.
    Args
    docs: {claim_id: [(title, sentence_num),  ...], ...}
    Input is a dictionary from claim ids to titles and line numbers,
    and a lookup from titles to filenumbers.
    """
    doclines = dict()
    jnums = dict()
    titles = set()
    ## cid is the claim id that is an integer
    for cid in docs:
        for title, sentence_num in docs[cid]:
            doclines[title] = dict()
            titles.add(title)
            if title in t2jnum:
                jnum, point = t2jnum[title]
                if jnum not in jnums:
                    jnums[jnum] = set()
                jnums[jnum].add(point)
            else:
                print(str(title) + " not in t2jnum!")
    for jnum in tqdm(jnums):
        points = sorted(list(jnums[jnum]))
        fname = wikipedia_dir + "wiki-" + jnum + ".jsonl"
        with open(fname) as f:
            for point in points:
                f.seek(point, 0)
                line = f.readline()
                data = json.loads(line.rstrip("\n"))
                title = data["id"]
                lines = data["lines"]
                assert title in titles
                if title in titles and lines != "":
                    for l in lines.split("\n"):
                        fields = l.split("\t")
                        if fields[0].isnumeric():
                            l_id = int(fields[0])
                            l_txt = fields[1]
                            doclines[title][l_id] = l_txt
    return doclines


def get_evidence_sentence_list(evidences, t2l2s, prependlinum=False, prependtitle=False):
    """lookup corresponding sentences and return list of sentences
    Args
    evidences: [(title, linum), ...]
    t2l2s: title2line2sentence <- output of load_doc_lines
    Returns
    list of evidence sentences
    """
    SEP = "#"
    def process_title(title):
        """ 'hoge_fuga_hoo' -> 'hoge fuga hoo' """
        return re.sub("_", " ", title)

    def maybe_prepend(title, linum):
        prep = list()
        if prependtitle:
            prep.append(title)
        if prependlinum:
            prep.append(str(linum))

        content = " {} ".format(SEP).join(prep)
        if prep:
            return "{0} {1} {0}".format(SEP, content)
        else:
            return content

    titles = [title for title, _ in evidences]
    linums = [linum for _, linum in evidences]

    return [ (maybe_prepend(process_title(title), linum) + " " + t2l2s[title][linum]).strip() for title, linum in zip(titles, linums)]



def load_dataset_json(path, instance_num=1e6):
    """
    Reads the Fever Training set, returns list of examples.
    instance_num: how many examples to load. Useful for debugging.
    """
    data = []
    with open(path, 'r') as openfile:
        for iline, line in enumerate(openfile.readlines()):
            data.append(json.loads(line))
            if iline+1 >= instance_num:
                break
    return data

def load_dataset(set_type,instance_num=1e6):
    """Reads the Fever train/dev set used on the paper.
    """
    if set_type == 'train':
        dataset = load_dataset_json(path="/home/ubuntu/efs/fever_data/train.jsonl", instance_num=instance_num)
    if set_type == 'dev':
        dataset = load_dataset_json(path="/home/ubuntu/efs/fever_data/dev.jsonl", instance_num=instance_num)
    return dataset


if __name__ == "__main__":
    # load fever training data

    train_path = '/Users/cengqiqi/Desktop/DM_working/dataset/train.jsonl'
    train_data = load_dataset_json(path=train_path,instance_num=20)
    print(len(train_data))

    dev_path = '/Users/cengqiqi/Desktop/DM_working/dataset/shared_task_dev.jsonl'
    dev_data = load_dataset_json(path=dev_path, instance_num=20)
    print(len(dev_data))


    for sample in train_data[:1]:
        print(sample)

In [8]:
wikipedia_dir = '/Users/cengqiqi/Desktop/DM_working/dataset/wiki-pages/wiki-pages/'
doc_id_dir = '/Users/cengqiqi/Desktop/DM_working/trytry'
x = parse_wiki(wikipedia_dir, doc_id_dir)

/Users/cengqiqi/Desktop/DM_working/trytry
Constructing /Users/cengqiqi/Desktop/DM_working/trytry


100%|██████████| 109/109 [02:07<00:00,  1.18s/it]


In [11]:
doc_id_text=dict()
with open(doc_id_dir, "r") as f:
            print("Reading from" + str(doc_id_dir) )
            for line in f:
                fields=line.rstrip("\n").split("\t")
                doc_id=fields[0]
                text=fields[1]
                doc_id_text[doc_id]=text

Reading from/Users/cengqiqi/Desktop/DM_working/trytry


In [12]:
len(doc_id_text)

5396106

In [17]:
doc_id_text

{'1928_in_association_football': 'The following are the football -LRB- soccer -RRB- events of the year 1928 throughout the world . ',
 '1986_NBA_Finals': "The 1986 NBA Finals was the championship round of the 1985 -- 86 NBA season . It pitted the Eastern Conference champion Boston Celtics against the Western Conference champion Houston Rockets , in a rematch of the 1981 Finals -LRB- only Allen Leavell and Robert Reid remained from the Rockets ' 1981 team -RRB- . The Celtics defeated the Rockets four games to two to win their 16th NBA championship . The championship would be the Celtics ' last until the 2008 NBA Finals . Larry Bird was named the Finals MVP .   On another note , this series marked the first time the `` NBA Finals '' branding was officially used , as they dropped the `` NBA World Championship Series '' branding which had been in use since the beginning of the league , though it had been unofficially called the `` NBA Finals '' for years .   Until the 2011 series , this wa

In [13]:
doc_id_textpath_windows = "N:\\DesktopSettings\\Desktop\\DM_working\\dataset\\wiki_id_text"
path_mac = "/Users/cengqiqi/Desktop/DM_working/trytry"
wikipage = pd.read_table(path_mac,header = None)

In [15]:
len(wikipage)

5395867

In [16]:
wikipage.head()

Unnamed: 0,0,1
0,1928_in_association_football,The following are the football -LRB- soccer -R...
1,1986_NBA_Finals,The 1986 NBA Finals was the championship round...
2,1901_Villanova_Wildcats_football_team,The 1901 Villanova Wildcats football team repr...
3,1992_Northwestern_Wildcats_football_team,The 1992 Northwestern Wildcats team represente...
4,1897_Princeton_Tigers_football_team,The 1897 Princeton Tigers football team repres...


In [20]:
doc_id_text['1992_Northwestern_Wildcats_football_team']

"The 1992 Northwestern Wildcats team represented Northwestern University during the 1992 NCAA Division I-A football season . In their first year under head coach Gary Barnett , the Wildcats compiled a 3 -- 8 record -LRB- 3 -- 5 against Big Ten Conference opponents -RRB- and finished in ninth place in the Big Ten Conference .   The team 's offensive leaders were quarterback Len Williams with 2,110 passing yards , Dennis Lundy with 688 rushing yards , and Lee Gissendaner with 846 receiving yards . Gissendaner was also selected by the Associated Press as a first-team wide receiver on the 1992 All-Big Ten Conference football team . "