# Extract claim–evidence pairs from the training set

## Construct index dict

In [1]:
import os
import json

# Input locations: the wiki sentence dump and the claim files.
wiki_file_path = "./new-wiki-pages-text/"
train_file_path = "./JSONFiles/" + "train.json"
dev_file_path = "./JSONFiles/" + "devset.json"

# Output locations for the generated claim/evidence CSV files.
train_output_path = "./TrainSentence/" + "training.csv"
dev_output_path = "./TrainSentence/" + "dev.csv"

# Build two lookup tables over every file in the wiki directory.
#   evi_index:  (head, sentence_number) -> (file name, 0-based line position)
#   head_index: head -> list of sentence numbers seen for that head
# Each line is split on single spaces; the first field is used as the head
# and the second field is parsed as the sentence number.
evi_index = {}
head_index = {}
for wiki_name in os.listdir(wiki_file_path):
    print("processing file: " + wiki_name)
    file_name = wiki_file_path + wiki_name
    # Read as UTF-8 so that the keys match what linecache (which defaults to
    # UTF-8) sees when the text is fetched later, regardless of the platform's
    # default encoding.
    with open(file_name, encoding="utf-8") as f:
        for line_position, line in enumerate(f):
            words = line.split(" ")
            try:
                head = words[0]
                sentence_number = int(words[1])
            except (IndexError, ValueError):
                # Skip lines that have no second field or whose second field
                # is not an integer.
                continue

            evi_index[(head, sentence_number)] = (wiki_name, line_position)
            head_index.setdefault(head, []).append(sentence_number)


processing file: wiki-007.txt
processing file: wiki-081.txt
processing file: wiki-008.txt
processing file: wiki-107.txt
processing file: wiki-078.txt
processing file: wiki-050.txt
processing file: wiki-098.txt
processing file: wiki-055.txt
processing file: wiki-043.txt
processing file: wiki-002.txt
processing file: wiki-044.txt
processing file: wiki-082.txt
processing file: wiki-072.txt
processing file: wiki-070.txt
processing file: wiki-011.txt
processing file: wiki-053.txt
processing file: wiki-010.txt
processing file: wiki-022.txt
processing file: wiki-030.txt
processing file: wiki-091.txt
processing file: wiki-077.txt
processing file: wiki-009.txt
processing file: wiki-031.txt
processing file: wiki-065.txt
processing file: wiki-102.txt
processing file: wiki-006.txt
processing file: wiki-052.txt
processing file: wiki-092.txt
processing file: wiki-075.txt
processing file: wiki-012.txt
processing file: wiki-059.txt
processing file: wiki-064.txt
processing file: wiki-023.txt
processing

In [8]:
# head_index.get("José_Ferrer")

## Build the claim–evidence dataset from the selected file (training or dev set)

In [5]:
import pandas as pd
import linecache
import random

# Selects which claim file is processed (and, further down, which CSV is
# written).
use_file = 0    # 0 for training set , 1 for dev set

# Resolve the input path up front so an unsupported selector fails here with a
# clear message instead of leaving `train` undefined.
if use_file == 0:
    claims_path = train_file_path
elif use_file == 1:
    claims_path = dev_file_path
else:
    raise ValueError(
        "use_file must be 0 (training set) or 1 (dev set), got %r" % (use_file,)
    )

# read training data
# JSON interchange text is UTF-8; do not rely on the platform default encoding.
with open(claims_path, 'r', encoding="utf-8") as f:
    train = json.load(f)

def get_evi_text(evi) -> str:
    """Return the sentence text for one evidence reference.

    Args:
        evi: Indexable pair ``(head, sentence_number)``. The second element is
            converted with ``int()``, so an int or a numeric string both work.

    Returns:
        The indexed wiki line with its first two space-separated fields (the
        head and the sentence number) and its trailing line terminator
        removed. An empty string is returned when the line has no text after
        those two fields, or when ``linecache`` cannot read the line.

    Raises:
        KeyError: If the pair is not present in ``evi_index``.
    """
    evidence = (evi[0], int(evi[1]))
    location = evi_index.get(evidence)
    if location is None:
        raise KeyError("no indexed wiki sentence for evidence %r" % (evidence,))
    wiki_name, line_position = location

    # evi_index stores 0-based line positions; linecache counts lines from 1.
    line = linecache.getline(wiki_file_path + wiki_name, line_position + 1)

    # Split off exactly the two leading fields. Cutting a fixed number of
    # characters first would also eat the sentence number when the head is a
    # single character, which then drops the first word of the sentence.
    fields = line.rstrip("\r\n").split(" ", 2)
    return fields[2] if len(fields) > 2 else ""
    

# Parallel column buffers: index k of each list describes output row k.
claim_list = []
evi_list = []
claim_evi_list = []
label_list = []

# For every claim that has evidence, pair the claim with every indexed
# sentence of every page (head) its evidence refers to. A pair is labelled 1
# when that exact (head, sentence number) is listed in the claim's evidence
# and 0 otherwise.
for item in train.values():
    evidence_list = item['evidence']
    if not evidence_list:
        continue
    claim = item['claim']

    # Set of evidence entries for constant-time label lookup.
    gold_evidence = {tuple(evi) for evi in evidence_list}
    # De-duplicate heads while keeping first-seen order. Iterating a plain set
    # of strings would make the row order change from run to run.
    heads = dict.fromkeys(evi[0] for evi in evidence_list)

    for head in heads:
        sentence_numbers = head_index.get(head)
        if sentence_numbers is None:
            # The head does not occur in the wiki index; nothing to pair with.
            continue
        for number in sentence_numbers:
            candidate = (head, number)
            evidence_text = get_evi_text(candidate)

            claim_list.append(claim)
            evi_list.append(evidence_text)
            claim_evi_list.append(claim + " ||| " + evidence_text)
            label_list.append(1 if candidate in gold_evidence else 0)

result_dict = {
    "claim": claim_list,
    "evidence": evi_list,
    "claim_evi_pair": claim_evi_list,
    "label": label_list
}
result_df = pd.DataFrame(result_dict)

# Write the CSV that corresponds to the selected split; any other selector
# value writes nothing.
output_path = {0: train_output_path, 1: dev_output_path}.get(use_file)
if output_path is not None:
    # Create the target directory first so the export cannot fail on a
    # missing folder after all pairs have been built.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    result_df.to_csv(output_path, index=False)


        

In [6]:
result_df

Unnamed: 0,claim,claim_evi_pair,evidence,label
0,The Wizard of Oz is based on a novel.,The Wizard of Oz is based on a novel. ||| The ...,The Wizard of Oz 1939 film The Wizard of Oz is...,1
1,The Wizard of Oz is based on a novel.,The Wizard of Oz is based on a novel. ||| The ...,The Wizard of Oz 1939 film The film stars Judy...,0
2,The Wizard of Oz is based on a novel.,The Wizard of Oz is based on a novel. ||| The ...,The Wizard of Oz 1939 film The co-stars are Ra...,0
3,The Wizard of Oz is based on a novel.,The Wizard of Oz is based on a novel. ||| The ...,The Wizard of Oz 1939 film Notable for its use...,0
4,The Wizard of Oz is based on a novel.,The Wizard of Oz is based on a novel. ||| The ...,The Wizard of Oz 1939 film It was nominated fo...,0
5,The Wizard of Oz is based on a novel.,The Wizard of Oz is based on a novel. ||| The ...,The Wizard of Oz 1939 film It did win in two o...,0
6,The Wizard of Oz is based on a novel.,The Wizard of Oz is based on a novel. ||| The ...,"The Wizard of Oz 1939 film However , the film ...",0
7,The Wizard of Oz is based on a novel.,The Wizard of Oz is based on a novel. ||| The ...,The Wizard of Oz 1939 film It was MGM 's most ...,0
8,The Wizard of Oz is based on a novel.,The Wizard of Oz is based on a novel. ||| The ...,The Wizard of Oz 1939 film The 1956 broadcast ...,0
9,The Wizard of Oz is based on a novel.,The Wizard of Oz is based on a novel. ||| The ...,The Wizard of Oz 1939 film The film was named ...,0
