# Crawling data code

In [None]:
import json

# Load the table records to filter; the loop below reads the 'title',
# 'header' and 'data' fields of each record.
with open('processed_table.json') as f:
    data = json.load(f)
    
import sys
import requests
import urllib.request, urllib.error, urllib.parse
import os

# Filter the loaded tables and download the Wikipedia page of every table that
# passes the filters into htmls/<title>.html. Tables whose page cannot be
# fetched or saved are reported on stderr and left out of the result.
print('original data has {} entries'.format(len(data)))
new_data = []
for i, d in enumerate(data):
    sys.stdout.write("finished {}/{} \r".format(i, len(data)))
    # The page name is the title with spaces replaced by underscores.
    title = '_'.join(d['title'].split(' '))
    page = 'https://en.wikipedia.org/wiki/{}'.format(title)

    # Keep tables with 6-39 rows whose first row has at least 4 cells.
    if not (5 < len(d['data']) < 40 and len(d['data'][0]) >= 4):
        continue

    # Skip tables whose header entries are not unique.
    if len(set(d['header'])) != len(d['header']):
        continue

    # Process only if there are enough hyperlinks: the number of first-row
    # cells with a non-None second field must reach 30% of the header width.
    cols = len(d['header'])
    count = sum(1 for g in d['data'][0] if g[1] is not None)
    if count < 0.3 * cols:
        continue

    html_path = 'htmls/{}.html'.format(title)
    if not os.path.exists(html_path):
        try:
            # Context managers close the response and the file even on error.
            with urllib.request.urlopen(page) as response:
                webContent = response.read()
            with open(html_path, 'wb') as f:
                f.write(webContent)
        except Exception as err:
            # Best effort: a page that cannot be fetched or stored only
            # drops this table, but the reason is no longer hidden.
            sys.stderr.write('skipping {}: {!r}\n'.format(page, err))
            continue

    d['page'] = '{}.html'.format(title)
    new_data.append(d)

print('filtered data has {} entries'.format(len(new_data)))
with open('processed_table_with_page.json', 'w') as f:
    json.dump(new_data, f, indent=2)

In [None]:
import os
from bs4 import BeautifulSoup
import sys
import json
import re
from multiprocessing import Pool
import multiprocessing
from utils import *

def process_link(text):
    """Collect the anchor texts and link targets found inside ``text``.

    Anchors whose stripped text is empty are ignored. An anchor whose href
    starts with '/wiki/' contributes that href; any other anchor contributes
    the placeholder '#'.

    Returns:
        A pair ``(labels, links)``: the anchor texts joined with ',' and the
        targets joined with ' '. ``links`` is None when every target is the
        placeholder (including when there are no anchors at all).
    """
    labels, targets = [], []
    for anchor in text.find_all('a'):
        label = anchor.get_text().strip()
        if not label:
            continue
        labels.append(label)
        is_wiki = 'href' in anchor.attrs and anchor['href'].startswith('/wiki/')
        targets.append(anchor['href'] if is_wiki else '#')

    joined = ','.join(labels).strip()
    if any(target != '#' for target in targets):
        return joined, ' '.join(targets)
    return joined, None

def remove_ref(text):
    """Detach every <sup> element found under ``text`` and return ``text``.

    The element is modified in place through ``extract()``; the same object
    is handed back so the call can be used inline.
    """
    superscripts = text.find_all('sup')
    for marker in superscripts:
        marker.extract()
    return text

def get_section_title(r):
    """Return the heading path of the section that precedes element ``r``.

    Walks backwards through the previous siblings of ``r`` to the nearest
    h2/h3 element and reads the text of its 'mw-headline' child. When that
    heading is an h3, the walk continues to the preceding h2 and the result
    becomes '<h2> -- <h3>'. The joined title is passed through ``normalize``.

    Returns:
        The normalized title, or '' when no h2/h3 sibling precedes ``r``.
    """
    node = r.previous_sibling
    while node is not None and (node == '\n' or node.name not in ['h2', 'h3']):
        node = node.previous_sibling

    if node is None:
        return ''

    title_hierarchy = [node.find(class_='mw-headline').text]
    if node.name in ['h3']:
        # Keep walking back to find the h2 that owns this sub-section.
        while node is not None and (node == '\n' or node.name not in ['h2']):
            node = node.previous_sibling
        if node is not None:
            title_hierarchy.append(node.find(class_='mw-headline').text)

    # Headings were collected innermost-first; present them outermost-first.
    return normalize(' -- '.join(reversed(title_hierarchy)))

def get_section_text(r):
    """Gather the text between element ``r`` and the heading above it.

    Walks backwards over the previous siblings of ``r`` until an h1-h4
    element (or the start of the sibling list) is reached. Newline nodes are
    skipped, and so is any sibling whose lower-cased text contains one of the
    mask words. Each remaining text is normalized and prepended, so the
    result reads in document order.

    Returns:
        The collected text joined by single spaces, or '' if nothing was kept.
    """
    mask = ['note', 'indicate', 'incomplete', 'source', 'reference']
    section_text = ''
    node = r.previous_sibling
    while node is not None:
        if node == '\n':
            node = node.previous_sibling
            continue
        if node.name in ['h1', 'h2', 'h3', 'h4']:
            break

        raw = node.text
        if raw and not any(word in raw.lower() for word in mask):
            piece = normalize(raw)
            # Siblings are visited bottom-up, so earlier text goes in front.
            section_text = piece + ' ' + section_text if section_text else piece
        node = node.previous_sibling
    return section_text

def normalize(string):
    """Strip ``string``, turn its newlines into spaces and tokenize it.

    The result is whatever ``tokenize`` (star-imported from ``utils``)
    returns for the flattened text.
    """
    flattened = string.strip().replace('\n', ' ')
    return tokenize(flattened)
    
def _parse_cell(cell):
    """Turn one <th>/<td> element into a ``(text, links)`` pair.

    Footnote markers are removed first. Cells containing anchors are handled
    by ``process_link``; other cells yield their stripped text and None.
    """
    cell = remove_ref(cell)
    if len(cell.find_all('a')) > 0:
        return process_link(cell)
    return (cell.get_text().strip(), None)


def sub_func(f_name):
    """Extract the usable tables from one saved page under ``htmls/``.

    For every element selected by ``class_='wikitable sortable'`` the first
    <tr> becomes the header and the following rows become data; rows whose
    cells are all empty are dropped and at most the first 20 rows are kept.
    A table is skipped when any kept row has a different width than the
    header or when fewer than 8 rows remain.

    Args:
        f_name: File name (not path) of the HTML file inside ``htmls/``.

    Returns:
        A list of dicts with keys 'url', 'title', 'header', 'data',
        'section_title' and 'section_text'.
    """
    results = []
    # The pages were stored as raw bytes fetched from Wikipedia, so decode
    # them as UTF-8 rather than with the platform default encoding.
    with open('htmls/' + f_name, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f, 'html.parser')

    for r in soup.find_all(class_='wikitable sortable'):
        heads = []
        rows = []
        for i, t_row in enumerate(r.find_all('tr')):
            cells = [_parse_cell(h) for h in t_row.find_all(['th', 'td'])]
            if i == 0:
                heads = cells
            elif not all(len(cell[0]) == 0 for cell in cells):
                rows.append(cells)

        rows = rows[:20]
        if any(len(row) != len(heads) for row in rows) or len(rows) < 8:
            continue

        section_title = get_section_title(r)
        section_text = get_section_text(r)
        print(section_title, "||||", section_text)
        title = soup.title.string
        title = re.sub(' - Wikipedia', '', title)
        url = 'https://en.wikipedia.org/wiki/{}'.format('_'.join(title.split(' ')))
        results.append({'url': url, 'title': title, 'header': heads, 'data': rows,
                        'section_title': section_title, 'section_text': section_text})
    return results

# Parse every file under htmls/; each entry of `rs` is the list of table
# records that sub_func returned for one file.
rs = []
for f in os.listdir('htmls/'):
    tmp = sub_func(f)
    rs.append(tmp)
# The string literal below is disabled code that would flatten `rs` into a
# single `results` list; the commented lines after it would write that list
# to processed_new_table.json.
"""
results = []
for r in rs:
    results = results + r
"""
#sub_func('Shortest_tennis_match_records.html')
#with open('processed_new_table.json', 'w') as f:
#    json.dump(results, f, indent=2)

In [None]:
import re

# Scratch check of the score-spacing regex. The pattern requires the hyphen
# to sit directly between the two numbers, so the already spaced '11 - 1'
# does not match and is returned unchanged.
string = '11 - 1'
re.sub(r'([0-9]{1,2})-([0-9]{1,2})', r'\1 - \2', string)

In [None]:
import json

# Post-filter the extracted tables:
#   1. drop 'Coordinates'/'Image' columns and columns filled in fewer than
#      half of the rows,
#   2. drop rows filled in fewer than half of their cells,
#   3. discard tables that end up with more than 6 or at most 2 columns, that
#      have no content, or whose non-empty cells are at least 70% unlinked.
# A cell is read as (texts, links): cell[0][0] is its first text and an
# unlinked cell has links == [None].
with open('processed_new_table.json', 'r') as f:
    tables = json.load(f)

deletes = []
for i, table in enumerate(tables):
    if not table['data']:
        # No rows at all: the fill ratios below would divide by zero.
        deletes.append(i)
        continue

    # Remove sparse columns
    total = len(table['data'])
    keep_cols = []
    for j, h in enumerate(table['header']):
        if 'Coordinates' in h[0][0] or 'Image' in h[0][0]:
            continue
        count = sum(1 for d in table['data'] if d[j][0][0] != '')
        if count / total >= 0.5:
            keep_cols.append(j)

    table['header'] = [table['header'][j] for j in keep_cols]
    table['data'] = [[row[j] for j in keep_cols] for row in table['data']]

    # Remove sparse rows
    table['data'] = [
        row for row in table['data']
        if sum(1 for cell in row if cell[0][0] != '') >= 0.5 * len(row)
    ]

    if len(table['header']) > 6 or len(table['header']) <= 2:
        deletes.append(i)
        continue

    # Share of non-empty cells that carry no hyperlink.
    count = 0
    total = 0
    for row in table['data']:
        for cell in row:
            # Compare the text itself; comparing its length to '' was always true.
            if cell[0][0] != '':
                if cell[1] == [None]:
                    count += 1
                total += 1
    if total == 0 or count / total >= 0.7:
        deletes.append(i)

print('out of {} tables, {} need to be deleted'.format(len(tables), len(deletes)))

doomed = set(deletes)
tables = [table for i, table in enumerate(tables) if i not in doomed]

with open('processed_new_table_postfiltering.json', 'w') as f:
    json.dump(tables, f, indent=2)

In [None]:
# coding: utf-8
import pandas
import json
from yattag import Doc
from yattag import indent
import random

# Render up to 200 randomly chosen tables into index.html for manual
# inspection. Linked cell entries point at the local query service.
with open('processed_new_table_postfiltering.json', 'r') as f:
    tables = json.load(f)

doc, tag, text = Doc().tagtext()

cache = ''

style = """
    th {
        padding-top: 12px;
        padding-bottom: 12px;
        text-align: left;
        background-color: #c9c9c9;
        color: black;
    }
    td, th {
        border: 1px solid #dddddd;
        text-align: left;
        padding: 8px;
    }
    td {
        padding-top: 12px;
        padding-bottom: 12px;
        text-align: left;
        background-color: #f0f0f0;
        color: black;
    }
    """


def _render_cell(cell):
    """Write one header or data cell into the currently open tag.

    ``cell`` is read as ``(texts, links)``. Each text with a link becomes an
    anchor, followed by ', ' unless it is at the last link position; texts
    without a link are written as plain text. Header and data cells share
    this routine so that both use the same separator rule.
    """
    surfaces, links = cell[0], cell[1]
    if links is None:
        text(surfaces)
        return

    last = len(links) - 1
    for count, (t, s) in enumerate(zip(surfaces, links)):
        if s is None:
            text(t)
            continue
        with tag('a', href='http://edward.cs.ucsb.edu:6007/query?name=' + s):
            text(t)
        if count < last:
            text(', ')


doc.asis('<!DOCTYPE html>')
with tag('html'):
    with tag('head'):
        with tag('style'):
            doc.asis(style)

        doc.asis('<meta charset=\"utf-8\">')
        doc.asis('<meta name=\"viewport\" content=\"width=device-width, initial-scale=1\">')
        doc.asis('<title>Demonstration</title>')
        doc.asis('<link rel="icon" href="">')  # Edit here to set the favicon
        doc.asis('<script defer src="https://use.fontawesome.com/releases/v5.3.1/js/all.js"></script>')

    with tag('body'):
        random.shuffle(tables)
        for table in tables[:200]:
            with tag('h3'):
                with tag('a', href=table['url']):
                    text(table['title'])

            with tag('h4'):
                text(table['context'])

            with tag('table', klass='wikitable', style="border:1"):
                with tag('tbody'):
                    with tag('tr'):
                        for cell in table['header']:
                            with tag('th'):
                                _render_cell(cell)

                    for row in table['data']:
                        with tag('tr'):
                            for cell in row:
                                with tag('td'):
                                    _render_cell(cell)
            doc.stag('br')

result = doc.getvalue()

# The page declares charset utf-8, so write it with that encoding.
with open('index.html', 'w', encoding='utf-8') as f:
    f.write(indent(result))

In [None]:
import json
from bs4 import BeautifulSoup

def get_text(text):
    """Return the plain-text introduction of one article dump entry.

    Everything from the first 'Section::::' marker onwards is discarded. The
    remainder is split on blank lines and the second chunk is taken as the
    introduction; its markup is removed with Beautiful Soup.

    Args:
        text: Raw article text.

    Returns:
        The stripped introduction, or 'N/A' when there is no second chunk or
        the markup cannot be parsed.
    """
    marker = text.find('Section::::')
    if marker != -1:
        text = text[:marker]

    paragraphs = text.split('\n\n')
    if len(paragraphs) < 2:
        return 'N/A'

    try:
        # Name the parser explicitly, as the other cells do, so the result
        # does not depend on which optional parsers are installed.
        return BeautifulSoup(paragraphs[1], 'html.parser').get_text().strip()
    except Exception:
        return 'N/A'

# Build a map from page name (title with spaces replaced by underscores) to
# its introduction text, reading one JSON record per line of en.json.
dictionary = {}
with open('en.json') as f:
    for i, line in enumerate(f):
        d = json.loads(line.strip())
        page = '_'.join(d['title'].split(' '))
        dictionary[page] = get_text(d['text'])
        # NOTE(review): `sys` is not imported in this cell; this relies on an
        # earlier cell having imported it.
        sys.stdout.write('finished {}/5989879 \r'.format(i))
        
with open('wiki-intro-with-ents-dict.json', 'w') as f:
    json.dump(dictionary, f)

In [None]:
# Count how many linked pages referenced from table cells are present in
# `database`, printing every page that is missing.
# NOTE(review): `database` is not defined anywhere in the visible cells, and
# `dictionary` / `missed` are assigned here but never used - confirm intent.
with open('processed_new_table_postfiltering.json', 'r') as f:
    tables = json.load(f)

dictionary = {}
missed = []

succ, fail = 0, 0
for table in tables:
    for row in table['data']:
        for cell in row:
            pages = cell[1]
            if pages is not None:
                # Links are stored space-separated in a single string here.
                for page in pages.split(' '):
                    # Drop the first 6 characters (presumably the '/wiki/'
                    # prefix) and any '#fragment'.
                    page = page[6:].split('#')[0]
                    if page not in database:
                        fail += 1
                        print(page)
                        #database[page]
                    else:
                        succ += 1

sys.stdout.write('success/fail = {}/{} \r'.format(succ, fail))

In [None]:
import urllib3
from bs4 import BeautifulSoup
import time
import re

http = urllib3.PoolManager()
urllib3.disable_warnings()

def get_summary(page):
    """Fetch a Wikipedia page and return the text of its first real paragraph.

    Args:
        page: A full 'https...' URL, a '/wiki/...' path, or a bare page name.

    Returns:
        The stripped text of the first non-empty direct <p> child of the
        'mw-parser-output' div that starts with plain text or with an
        <a>, <b>, <i> or <br> tag; 'N/A' when no such paragraph (or no such
        div) exists.

    Raises:
        RuntimeError: If the server answers with a status other than 200
            or 429.
    """
    if page.startswith('https'):
        pass
    elif page.startswith('/wiki'):
        page = 'https://en.wikipedia.org{}'.format(page)
    else:
        page = 'https://en.wikipedia.org/wiki/{}'.format(page)

    # Retry while rate limited (429). A loop is used instead of recursion so
    # a long throttling period cannot exhaust the call stack.
    while True:
        r = http.request('GET', page)
        if r.status != 429:
            break
        time.sleep(1)

    if r.status != 200:
        # The original bare `raise` had no active exception and therefore
        # surfaced as an uninformative RuntimeError; keep the type, add detail.
        raise RuntimeError('GET {} returned HTTP status {}'.format(page, r.status))

    data = r.data.decode('utf-8')
    data = data.replace('</p><p>', ' ')
    soup = BeautifulSoup(data, 'html.parser')

    body = soup.body
    div = body.find("div", {"class": "mw-parser-output"}) if body is not None else None
    if div is None:
        return 'N/A'

    for child in div.findChildren("p", recursive=False):
        plain = child.get_text().strip()
        if plain == "":
            continue
        # Look at what immediately follows the opening <p ...> tag.
        html = str(child)
        html = html[html.index('>') + 1:].strip()
        if not html.startswith('<') or \
                html.startswith(('<a>', '<b>', '<i>', '<a ', '<br>')):
            return plain
    return 'N/A'

get_summary('/wiki/Soe_Myint')

In [None]:
import os

# Merge every JSON file under hyperlinks/ into one dictionary, then record
# which keys map to the failure marker 'N/A'.
# NOTE(review): `json` is not imported in this cell; relies on an earlier cell.
dictionary = {}
for f in os.listdir('hyperlinks/'):
    if f.endswith('json'):
        with open('hyperlinks/' + f, 'r') as fw:
            d = json.load(fw)
            # Later files overwrite earlier ones on duplicate keys.
            dictionary.update(d)

print('totally {}'.format(len(dictionary)))

failed = [k for k, v in dictionary.items() if v == 'N/A']
print('failed {} items'.format(len(failed)))

with open('wikipedia/round1.json', 'w') as f:
    json.dump(dictionary, f, indent=2)
    
with open('wikipedia/round1_failed.json', 'w') as f:
    json.dump(failed, f, indent=2)

In [None]:
# Load the round-1 results and overlay the round-2 results on top of them;
# round-2 values win for keys present in both files.
with open('wikipedia/round1.json') as f:
    dictionary = json.load(f)
    
with open('wikipedia/round2.json') as f:
    dictionary.update(json.load(f))

In [None]:
#print(len(dictionary))
import re
import json
import urllib.parse

#for k, v in dictionary.items():
#    dictionary[k] = re.sub(r'\[[\d]+\]', '', v).strip()
# Re-key the merged dictionary with percent-decoded keys and save it.
# If two keys decode to the same string, the later one overwrites the earlier.
with open('wikipedia/merged.json') as f:
    dictionary = json.load(f)

merged_unquote = {}
for k, v in dictionary.items():
    merged_unquote[urllib.parse.unquote(k)] = v

with open('wikipedia/merged_unquote.json', 'w') as f:
    json.dump(merged_unquote, f, indent=2)

In [None]:
import json

# Split the filtered table list into one file per table, tables/<idx>.json,
# storing the list position in the record as 'idx'.
with open('processed_new_table_postfiltering.json', 'r') as f:
    data = json.load(f)

for i, d in enumerate(data):
    d['idx'] = i
    with open('tables/{}.json'.format(i), 'w') as f:
        json.dump(d, f, indent=2)

# Code for Generating the request data

In [None]:
import os
import json
import sys
from utils import *
import re
import copy
from shutil import copyfile

def recover(string):
    """Drop the first six characters of ``string`` and turn '_' into ' '.

    Example: '/wiki/New_York_City' becomes 'New York City'.
    """
    return string[6:].replace('_', ' ')
    
def clean_text(k, string):
    """Clean one scraped page summary.

    Args:
        k: Key of the summary; only used as a fallback, via ``recover(k)``,
            when the summary contains 'Initial visibility'.
        string: The scraped summary text.

    Returns:
        The cleaned text after ``filter_firstKsents(string, 12)``, or
        ``recover(k)`` for the fallback case.
    """
    if "Initial visibility" in string:
        return recover(k)

    position = string.find("mw-parser-output")
    if position != -1:
        # Cut the parenthesised group around the leaked marker. Only do so
        # when an opening '(' really precedes it: with no '(' the index is -1
        # and slicing with it would silently drop the last character instead.
        left_quote = string.rfind('(', 0, position)
        if left_quote != -1:
            right_quote = string.find(')', position + 1)
            if right_quote == -1:
                right_quote = len(string)
            string = string[:left_quote] + " " + string[right_quote + 1:]

        # If the marker is still present, drop it up to the end of its line.
        position = string.find("mw-parser-output")
        if position != -1:
            right_quote = string.find('\n', position + 1)
            if right_quote == -1:
                right_quote = len(string)
            string = string[:position] + string[right_quote + 1:]

    # Normalise special spaces, dashes and quotes. The order matters: the
    # literal dash on the fourth line is replaced before the \u2013 rule.
    string = string.replace(u'\xa0', u' ')
    string = string.replace('\ufeff', '')
    string = string.replace(u'\u200e', u' ')
    string = string.replace('–', '-')
    string = string.replace(u'\u2009', u' ')
    string = string.replace(u'\u2010', u' - ')
    string = string.replace(u'\u2011', u' - ')
    string = string.replace(u'\u2012', u' - ')
    string = string.replace(u'\u2013', u' - ')
    string = string.replace(u'\u2014', u' - ')
    string = string.replace(u'\u2015', u' - ')
    string = string.replace(u'\u2018', u'')
    string = string.replace(u'\u2019', u'')
    string = string.replace(u'\u201c', u'')
    string = string.replace(u'\u201d', u'')

    string = string.replace(u'"', u'')
    # Collapse runs of newlines, dots and spaces.
    string = re.sub(r'[\n]+', '\n', string)
    string = re.sub(r'\.+', '.', string)
    string = re.sub(r' +', ' ', string)

    string = filter_firstKsents(string, 12)

    return string

# Load the summaries and clean each one in place; the key is passed along
# because clean_text falls back to it for some entries.
with open('wikipedia/merged_unquote.json', 'r') as f:
    merged_unquote = json.load(f)

for k in merged_unquote:
    merged_unquote[k] = clean_text(k, merged_unquote[k])

def func(f_id):
    """Write the summaries of every page linked from one table.

    Reads ``tables/<f_id>``, collects the links of all header and data cells,
    looks each percent-decoded link up in the global ``merged_unquote`` and
    writes the resulting mapping to ``request_wo_filter/<f_id>``.

    Args:
        f_id: File name inside ``tables/``. Names that do not end in '.json'
            are ignored (previously they crashed on an unbound ``table``).

    Raises:
        KeyError: If a link has no entry in ``merged_unquote``.
    """
    # Imported locally because this cell does not import urllib itself.
    import urllib.parse

    if not f_id.endswith('.json'):
        return

    with open('tables/' + f_id) as f:
        table = json.load(f)

    # Header cells first, then data cells row by row, as before.
    cells = list(table['header'])
    for row in table['data']:
        cells.extend(row)

    local_dict = {}
    for cell in cells:
        for url in cell[1]:
            if url:
                url = urllib.parse.unquote(url)
                local_dict[url] = merged_unquote[url]

    with open('request_wo_filter/{}'.format(f_id), 'w') as f:
        json.dump(local_dict, f, indent=2)

from multiprocessing import Pool

# Run func over every entry of tables/ with 64 worker processes, then copy
# the two example request files next to the generated ones.
pool = Pool(64)
results_func = pool.map(func, os.listdir('tables/'))

pool.close()
pool.join()

# Note the differing names: 'example.json' is copied to 'examples.json'.
copyfile('request/example.json', 'request_wo_filter/examples.json')
copyfile('request/example_numeric.json', 'request_wo_filter/example_numeric.json')

In [None]:
# Scratch check of the two year regexes on "1999-present": the first pattern
# (a 'built' prefix glued to a year) does not match here, the second inserts
# spaces around a hyphen that directly follows a four-digit number.
string = "1999-present"
string = re.sub(r'(built)([0-9]{4}) ', r'\1 \2 ', string)
string = re.sub(r'\b([0-9]{4})-', r'\1 - ', string)
print(string)

In [None]:
import json
import os
import re

def clean_cell_text(string):
    """Normalise the text of one table cell.

    Steps, in order: remove double quotes and trailing '^' characters; put a
    space between 'built' and a directly following year; space out a hyphen
    that follows a four-digit number; map special spaces, dashes, quotes and
    invisible marks to plain ASCII (or nothing); collapse repeated spaces and
    strip the ends.
    """
    # Every key is one character and no replacement text contains a key, so a
    # single translate pass gives the same result as a chain of replace calls.
    substitutions = {
        '\u00a0': ' ',
        '\n': ' ',
        '\u200e': '',
        '\ufeff': '',
        '\u2009': ' ',
        '\u2010': ' - ',
        '\u2011': ' - ',
        '\u2012': ' - ',
        '\u2013': ' - ',
        '\u2014': ' - ',
        '\u2015': ' - ',
        '\u2018': '',
        '\u2019': '',
        '\u201c': '',
        '\u201d': '',
    }
    # The literal dash rule is applied first in the replace chain, so it takes
    # precedence over any escaped rule for the same character.
    substitutions['–'] = '-'

    cleaned = string.replace('"', '').rstrip('^')
    cleaned = re.sub(r'(built)([0-9]{4}) ', r'\1 \2 ', cleaned)
    cleaned = re.sub(r'\b([0-9]{4})-', r'\1 - ', cleaned)
    cleaned = cleaned.translate(str.maketrans(substitutions))
    return re.sub(r' +', ' ', cleaned).strip()
# Disabled code kept as a string literal: when enabled it rewrites every file
# in tables/ with clean_cell_text applied to each non-empty header and cell
# text.
"""
for fn in os.listdir('tables/'):
    with open('tables/{}'.format(fn)) as f:
        table = json.load(f)
    
    for row_idx, row in enumerate(table['data']):
        for col_idx, cell in enumerate(row):
            for i, ent in enumerate(cell[0]):
                if ent:
                    table['data'][row_idx][col_idx][0][i] = clean_cell_text(ent)
    
    for col_idx, header in enumerate(table['header']):
        for i, ent in enumerate(header[0]):
            if ent:
                table['header'][col_idx][0][i] = clean_cell_text(ent)
    
    with open('tables/{}'.format(fn), 'w') as f:
        json.dump(table, f, indent=2)
"""

In [None]:
# Spot check of clean_cell_text on a cell holding two year ranges.
clean_cell_text("2004-2005, 2008-present")

## Adding context information 

In [None]:
import glob
import json
# Copy the section context and uid from the reference copy of each table
# (stored under data/<same relative path>) into the working copies in
# tables/ and tables_tok/.
for pattern in ('tables/*.json', 'tables_tok/*.json'):
    for f in glob.glob(pattern):
        # Finish reading before the same path is reopened for writing.
        with open(f) as fn:
            table = json.load(fn)
        with open('data/' + f) as fn:
            reference_table = json.load(fn)
        #del table['context']
        table['section_title'] = reference_table['section_title']
        table['section_text'] = reference_table['section_text']
        table['uid'] = reference_table['uid']
        with open(f, 'w') as fn:
            json.dump(table, fn, indent=2)

In [None]:
# Collect the 'idx' of every table in tables/ that has a non-empty
# section title and save the list.
good_tables = []
for f in glob.glob('tables/*.json'):
    with open(f) as fn:
        table = json.load(fn)
        if table['section_title'] != '':
            good_tables.append(table['idx'])
print("there are {} good tables".format(len(good_tables)))
with open('good_table_context.json', 'w') as f:
    json.dump(good_tables, f, indent=2)

In [None]:
import json
from transformers import *
import torch

# Scratch experiment: encode the requested documents of one table and each of
# its questions with BERT, then print the cosine similarity between every
# question and every document. Only the first entry of `data` is processed
# (see the `break` at the end).
device = torch.device('cuda:5')

pretrained_weights = 'bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
model = BertModel.from_pretrained(pretrained_weights)

model.to(device)

with open('Mixed-Reasoning/collected_data.json') as f:
    data = json.load(f)

for d in data:
    # d[0] is the table id; the remaining items are (question, answer) pairs.
    table_id = d[0]
    with open('request/{}.json'.format(table_id)) as f:
        requested_documents = json.load(f)

    idx2key = []
    tmp = []
    for k, v in requested_documents.items():
        # Cap every paragraph at 512 tokens. The previous `< 512` guard only
        # sliced inputs that were already short, so long ones were never cut.
        tokenized_paragraph = tokenizer.tokenize(v)[:512]

        idxs = tokenizer.convert_tokens_to_ids(tokenized_paragraph)
        tensor = torch.LongTensor(idxs).unsqueeze(0).to(device)
        _, r2 = model(tensor)
        tmp.append(r2)
        idx2key.append(k)

    requested_repr = torch.cat(tmp, 0)
    for q, a in d[1:]:
        idxs = tokenizer.encode(q)
        print(idxs)
        tensor = torch.LongTensor(idxs).unsqueeze(0).to(device)
        _, r2 = model(tensor)
        # Repeat the question vector once per document for a row-wise compare.
        r2 = r2.repeat(requested_repr.shape[0], 1)

        similarity = torch.nn.functional.cosine_similarity(r2, requested_repr, dim=1)
        print(similarity)

    break

# Step-1 Preprocessing

In [None]:
import json
import json
from transformers import *
import torch
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise_distances
import nltk.data
from difflib import SequenceMatcher
from fuzzywuzzy import fuzz
from utils import *
import re

# English stop-word set from NLTK.
stopWords = set(stopwords.words('english'))
# TF-IDF vectorizer over 2- to 3-grams with accent stripping and the
# stop-word set above.
tfidf = TfidfVectorizer(strip_accents="unicode", ngram_range=(2, 3), stop_words=stopWords)
# NOTE: rebinds the name `tokenizer` to the object loaded from NLTK's English
# Punkt pickle.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    
def longestSubstringFinder(S,T):
    """Find the longest common substring(s) of ``S`` and ``T``, ignoring case.

    Returns:
        A pair ``(length, substrings)`` where ``substrings`` is the set of all
        lower-cased common substrings having that maximal length. For inputs
        with nothing in common the result is ``(0, set())``.
    """
    left, right = S.lower(), T.lower()
    width = len(right) + 1

    best = 0
    found = set()
    # previous[j + 1] is the length of the common run ending at the previous
    # character of `left` and at right[j].
    previous = [0] * width
    for i, ch_left in enumerate(left):
        current = [0] * width
        for j, ch_right in enumerate(right):
            if ch_left != ch_right:
                continue
            run = previous[j] + 1
            current[j + 1] = run
            piece = left[i - run + 1:i + 1]
            if run > best:
                best = run
                found = {piece}
            elif run == best:
                found.add(piece)
        previous = current

    return best, found

def longest_match_distance(str1s, str2s):
    """Build a distance matrix from longest-common-substring lengths.

    Entry ``[a][b]`` is ``1 - L / len(str1s[a])`` where ``L`` is the length of
    the longest common substring of ``str1s[a]`` and ``str2s[b]``.

    Returns:
        A list with one row per element of ``str1s``; each row has one value
        per element of ``str2s``.
    """
    return [
        [1 - longestSubstringFinder(str1, str2)[0] / len(str1) for str2 in str2s]
        for str1 in str1s
    ]

def searchForAnswer(answer, table, passages, mapping_entity):
    """Locate ``answer`` in the table cells or, failing that, in the passages.

    Table search, per cell: each (text, url) entry is compared
    case-insensitively. An exact match records the entry and ends the scan of
    that cell; a whole-word containment records the entry and remembers its
    text as the correction. If no exact match occurred and the cell has
    several entries, the entries joined with ' , ' are tested the same way
    (exact match is case-sensitive; containment needs an answer longer than
    3 characters).

    Passage search (only when the table gave nothing): for every passage that
    contains the answer as whole words, all cell locations mapped to that
    passage key are recorded.

    Returns:
        ``(correction, results)``; ``results`` holds tuples
        ``(text, location, url_or_key, 'table' | 'passage')`` and
        ``correction`` is None for passage hits.
    """
    needle = answer.lower()
    padded_needle = " " + needle + " "

    results = []
    correction = None
    for i, row in enumerate(table['data']):
        for j, cell in enumerate(row):
            surfaces, urls = cell[0], cell[1]
            for content, url in zip(surfaces, urls):
                lowered = content.lower()
                if needle == lowered:
                    results.append((content, (i, j), url, 'table'))
                    break
                if padded_needle in " " + lowered + " ":
                    correction = content
                    results.append((content, (i, j), url, 'table'))
            else:
                # No exact match in this cell: try its entries as one string.
                if len(surfaces) > 1:
                    joined = ' , '.join(surfaces)
                    if answer == joined:
                        results.append((joined, (i, j), None, 'table'))
                    elif len(answer) > 3 and padded_needle in " " + joined.lower() + " ":
                        correction = joined
                        results.append((joined, (i, j), None, 'table'))

    if results:
        return correction, results

    for k, v in passages.items():
        if padded_needle in " " + v.lower() + " ":
            for content, locs in mapping_entity[k].items():
                results.extend((content, loc, k, 'passage') for loc in locs)

    return None, results

def searchForAnswerWithoutSpace(answer, passages, mapping_entity):
    """Find the first passage in which some word starts with ``answer``.

    The match is case-insensitive and only needs to begin at a word start;
    the matched text is then extended to the next space (or the end of the
    passage) and returned as the correction. Only the first matching passage
    is used.

    Returns:
        ``(correction, results)`` where ``results`` lists
        ``(text, location, passage_key, 'passage')`` for every location mapped
        to the matching passage key, or ``(None, [])`` without a match.
    """
    needle = " " + answer.lower()
    for k, v in passages.items():
        # The padding space shifts indices by one, so an index into the padded
        # text is the index in `v` where the answer itself begins.
        start = (" " + v.lower()).find(needle)
        if start == -1:
            continue

        end = v.find(" ", start + len(answer))
        if end == -1:
            end = len(v)

        results = [
            (content, loc, k, 'passage')
            for content, locs in mapping_entity[k].items()
            for loc in locs
        ]
        return v[start:end], results

    return None, []

def get_edit_distance_equal_1(answer, table):
    """List the table entries whose edit distance to ``answer`` is exactly 1.

    Returns:
        Tuples ``(text, (row, col), url, 'table')`` in row-major order, using
        ``nltk.edit_distance`` for the comparison.
    """
    return [
        (tmp, (i, j), url, 'table')
        for i, row in enumerate(table['data'])
        for j, cell in enumerate(row)
        for tmp, url in zip(cell[0], cell[1])
        if nltk.edit_distance(answer, tmp) == 1
    ]

def fixing_answer(string):
    """Propose alternative spellings for an answer that could not be located.

    The first applicable rule wins:
      * contains ',': the part before the first comma, unless it is all
        digits (then there is no proposal);
      * contains ' and ': the parts around ' and ';
      * contains both '(' and ')': the text before a trailing
        ' (...)' group and the text inside it;
      * contains '-': the string with the spacing around hyphens toggled,
        plus the first two hyphen-separated parts;
      * starts with '#': the string without the leading '#'.

    Returns:
        A list of candidate strings, or None when no rule applies.
    """
    if ',' in string:
        head = string.split(',')[0].strip()
        return None if head.isdigit() else [head]

    if ' and ' in string:
        return [part.strip() for part in string.split(' and ')]

    # Both parentheses are required. The previous `'(' and ')' in string` only
    # tested for ')' because a non-empty string literal is always truthy.
    if '(' in string and ')' in string:
        tmp = re.sub(r'([^\(\)]+) \((.+)\)$', r'\1###\2', string)
        return [part.strip() for part in tmp.split('###')]

    if '-' in string:
        if ' - ' in string:
            tmp = string.replace(' - ', '-')
        else:
            tmp = string.replace('-', ' - ')
        pieces = string.split('-')
        return [tmp, pieces[0].strip(), pieces[1].strip()]

    if string.startswith('#'):
        return [string.lstrip('#').strip()]

    return None
        
def func(d):
    """Ground the question/answer pairs of one table in its cells and passages.

    Args:
        d: sequence whose first item is the table id and whose remaining
            items are (question, answer) pairs.

    Returns:
        A list with one dict per question, with the keys 'table_id',
        'question', 'answer-text', 'answer-node', 'tf-idf', 'string-overlap'
        and 'link'.

    Reads 'request_tok/<table_id>.json' and 'tables_tok/<table_id>.json'.
    Relies on names that are not defined in this function: searchForAnswer,
    fuzz, tokenizer, tfidf, longest_match_distance and pairwise_distances.
    """
    results = []
    table_id = d[0]
    #if table_id != 351:
    #    return []

    with open('request_tok/{}.json'.format(table_id)) as f:
        requested_documents = json.load(f)
    
    with open('tables_tok/{}.json'.format(table_id)) as f:
        table = json.load(f)
    
    # A cell is linked to a question only when fuzz.partial_ratio exceeds this.
    threshold = 90

    #title_wiki = table['url'][len('https://en.wikipedia.org'):]
    #del requested_documents[title_wiki]
    # Finding the answer and links to table
    qs = []
    ans = []
    links = []
    
    # Mapping entity link to cell, entity link to surface word
    # Shape: {entity link: {surface text: [(row_idx, col_idx), ...]}}
    #mapping_entity_loc = {}
    mapping_entity = {}
    for row_idx, row in enumerate(table['data']):
        for col_idx, cell in enumerate(row):
            # cell[0] holds the surface strings, cell[1] the parallel entity links
            for i, ent in enumerate(cell[1]):
                if ent:
                    if ent not in mapping_entity:
                        mapping_entity[ent] = {cell[0][i]: [(row_idx, col_idx)]}
                    else:
                        if cell[0][i] not in mapping_entity[ent]:
                            mapping_entity[ent][cell[0][i]] = [(row_idx, col_idx)]
                        else:
                            mapping_entity[ent][cell[0][i]] = mapping_entity[ent][cell[0][i]] + [(row_idx, col_idx)]
    
    # Header links are recorded the same way, using row index -1.
    for col_idx, header in enumerate(table['header']):
        for i, ent in enumerate(header[1]):
            if ent:
                if ent not in mapping_entity:
                    mapping_entity[ent] = {header[0][i]: [(-1, col_idx)]}
                else:
                    if header[0][i] not in mapping_entity[ent]:
                        mapping_entity[ent][header[0][i]] = [(-1, col_idx)]
                    else:
                        mapping_entity[ent][header[0][i]] = mapping_entity[ent][header[0][i]] + [(-1, col_idx)]
    
    # loop through the qa pairs
    for q, a in d[1:]:
        correction, tmp = searchForAnswer(a, table, requested_documents, mapping_entity)
        # Fallback chain, tried only for non-numeric answers of 3+ characters
        # that produced no match: word-prefix match in the passages, then
        # edit distance 1 against table cells, then the rewrites proposed by
        # fixing_answer.
        if len(tmp) == 0 and len(a) >= 3 and not a.isdigit():
            # See if the space becomes a problem
            correction, tmp = searchForAnswerWithoutSpace(a, requested_documents, mapping_entity)
            if len(tmp) > 0:
                print("correct span! {} -> {}".format(a, correction))
                pass
            else:
                # correct the spelling                
                tmp = get_edit_distance_equal_1(a, table)
                if len(tmp) > 0:
                    print("correct spelling! {} -> {}".format(a, tmp[0][0]))
                    correction = tmp[0][0]
                else:
                    # Split the answer
                    fixed_as = fixing_answer(a)
                    if fixed_as:
                        # `correction` is left on the last candidate tried,
                        # even when none of them matched (tmp is then empty).
                        for correction in fixed_as:
                            #if correction:
                            _, tmp = searchForAnswer(correction, table, requested_documents, mapping_entity)
                            if len(tmp) > 0:
                                print("correct splitting! {} -> {}".format(a, correction))
                                break
            
            # More than 4 candidate nodes is treated as too ambiguous: keep
            # the original answer text and drop all candidates.
            if len(tmp) > 4:
                print("many uncertainties for {}, decide not to replace it".format(a))
                tmp = []
            elif correction and len(tmp) > 0:
                a = correction.strip()
            else:
                pass
            
        elif correction:
            # A correction returned by searchForAnswer itself is only applied
            # when it is shorter than twice the original answer.
            if len(correction) < 2 * len(a) : 
                print("cell correction! {} -> {}".format(a, correction))
                a = correction.strip()
        
        # Store the (possibly corrected) answer text with its candidate nodes.
        ans.append((a, tmp))
        qs.append(q)
        
        #if len(tmp) == 0:
        #    print("FAILED with {} {}".format(table_id, a))
        
        # LINKING THE CELL DATA
        # Every non-empty cell string that fuzzily matches the question is
        # recorded as (text, (row, col), None, None, score in 0..1).
        tmp_link = []
        for row_idx, row in enumerate(table['data']):
            for col_idx, cell in enumerate(row):
                if cell[0] != ['']:
                    for ent in cell[0]:
                        ratio = fuzz.partial_ratio(' ' + ent + ' ', ' ' + q + ' ')
                        if ratio > threshold:
                            tmp_link.append((ent, (row_idx, col_idx), None, None, ratio / 100))

        links.append(tmp_link)
    
    # Flatten the documents into parallel lists: paras[j] is one piece
    # returned by tokenizer.tokenize (presumably a sentence; confirm against
    # the tokenizer in use) and keys[j] is the document key it came from.
    keys = []
    paras = []
    for k, v in requested_documents.items():
        for _ in tokenizer.tokenize(v):
            keys.append(k)
            paras.append(_)
    
    para_feature = tfidf.fit_transform(paras)    
    
    q_feature = tfidf.transform(qs)
    
    # Two question-by-piece distance tables: one from longest_match_distance,
    # one from cosine distance over the tfidf features.
    dist_match = longest_match_distance(qs, paras)
    dist = pairwise_distances(q_feature, para_feature, 'cosine')
    
    for i in range(len(qs)):
        # Best match starts as a 2-tuple ('N/A', 1.) and becomes a 3-tuple
        # (key, piece, distance); index -1 is the distance in both shapes.
        # NOTE: the loop variable `d` below shadows the function parameter.
        min_dist = {}
        tfidf_best_match = ('N/A', 1.)
        for k, para, d in zip(keys, paras, dist[i]):
            if d < min_dist.get(k, 1):
                min_dist[k] = d
                if d < tfidf_best_match[-1]:
                    tfidf_best_match = (k, para, d)

        min_dist = {}
        string_best_match = ('N/A', 1.)
        
        for k, para, d in zip(keys, paras, dist_match[i]):
            if d < min_dist.get(k, 1):
                min_dist[k] = d
                if d < string_best_match[-1]:
                    string_best_match = (k, para, d)
        
        # Expand the best-matching document key into every table location
        # that links to it: (surface, (row, col), key, piece, distance).
        tfidf_nodes = []
        if tfidf_best_match[0] != 'N/A':
            k = tfidf_best_match[0]
            for content, locs in mapping_entity[k].items():
                for loc in locs:
                    tfidf_nodes.append((content, loc, k, tfidf_best_match[1], tfidf_best_match[2]))
        
        string_nodes = []
        if string_best_match[0] != 'N/A':
            k = string_best_match[0]
            for content, locs in mapping_entity[k].items():
                for loc in locs:
                    string_nodes.append((content, loc, k, string_best_match[1], string_best_match[2]))   
        
        results.append({'table_id': table_id, 'question': qs[i], 'answer-text': ans[i][0], 
                        'answer-node': ans[i][1], 'tf-idf': tfidf_nodes, 
                        'string-overlap': string_nodes, 'link': links[i]})
    
    return results

with open('Mixed-Reasoning/collected_data.json') as f:
    data = json.load(f)

from multiprocessing import Pool

# Run `func` on every entry of `data` across 64 worker processes, then shut
# the pool down before touching the results.
pool = Pool(64)
results_func = pool.map(func, data)
pool.close()
pool.join()

# Each worker call returns a list of records; flatten them into one list.
results = [record for per_table in results_func for record in per_table]

# Disabled single-process variant, kept as an inert string literal.
"""
results = []
for d in data:
    results.extend(func(d))
"""

with open('Mixed-Reasoning/processed_step1.json', 'w') as f:
    json.dump(results, f, indent=2)

In [None]:
import json
import random

# Load the step-1 output and reset the tallies that the classification loop
# increments.
with open('Mixed-Reasoning/processed_step1.json', 'r') as f:
    processed = json.load(f)

# Counts per question category.
easy = medium = hard = no_answer = number = yesorno = 0

# Counts per origin of the first answer node.
from_passage = from_cell = 0

def hash_string(string):
    """Return the first 16 hex characters of the SHA-256 digest of `string`."""
    import hashlib
    return hashlib.sha256(string.encode()).hexdigest()[:16]

new_processed = []
for p in processed:
    question_type = ''
    where_from = ''

    # Yes/no questions are counted and then left out of the output.
    if p['answer-text'].lower() in ['yes', 'no']:
        yesorno += 1
        #question_type = 'binary'
        continue

    number_trigger = ['how many', 'how much', 'how long', 'how far', 'how old', 'difference', 'total']
    answer_node = p['answer-node']

    if len(answer_node) == 0:
        # No grounded answer: keep the question only when its wording
        # contains one of the numeric trigger phrases.
        question_text = p['question'].lower()
        if not any(trigger in question_text for trigger in number_trigger):
            no_answer += 1
            continue
        number += 1
        question_type = 'numeric'
        where_from = 'calculation'
    else:
        # The origin of the first answer node decides where the answer is from.
        where_from = answer_node[0][-1]
        if where_from == 'passage':
            from_passage += 1
        else:
            from_cell += 1

        # Cells pointed at by the retrieval signals and by fuzzy linking.
        matching_cells = []
        for signal in ('tf-idf', 'string-overlap'):
            if p[signal]:
                matching_cells.extend(tuple(node[1]) for node in p[signal])
        linking_cells = [tuple(node[1]) for node in p['link']]

        evidence_cells = set(matching_cells + linking_cells)
        #print(answer_node)
        answer_cells = {tuple(node[1]) for node in answer_node}

        if evidence_cells & answer_cells:
            # easy: an answer node sits in a cell that is itself evidence.
            p['answer-node'] = [node for node in answer_node if tuple(node[1]) in evidence_cells]
            easy += 1
            question_type = 'easy'
        else:
            answer_row = {cell[0] for cell in answer_cells}
            evidence_row = {cell[0] for cell in evidence_cells}
            intersect_row = answer_row & evidence_row

            if intersect_row:
                # medium: answer and evidence only share a row.
                p['answer-node'] = [node for node in answer_node if node[1][0] in intersect_row]
                medium += 1
                question_type = 'medium'
            else:
                # hard: no shared cell and no shared row.
                hard += 1
                question_type = 'hard'

    p['type'] = question_type
    p['where'] = where_from
    p['question_id'] = hash_string(p['question'])
    new_processed.append(p)

print("easy: {}, medium: {}, hard: {}, no answer: {}, yes/no: {}, number: {}".format(easy, medium, hard, no_answer, yesorno, number))
print("from cell: {}, from passage: {}".format(from_cell, from_passage))

# Order the records by question id before writing them out.
new_processed.sort(key=lambda item: item['question_id'])
#random.shuffle(new_processed)

with open('Mixed-Reasoning/processed_step2.json', 'w') as f:
    json.dump(new_processed, f, indent=2)

# Done with the pre-processing

In [None]:
import os

# Pass 1: when the first header is blank, remove the first column from the
# header and from every row, then rewrite the file.
for fn in os.listdir('tables/'):
    if not fn.endswith('.json'):
        continue

    path = 'tables/{}'.format(fn)
    with open(path) as f:
        table = json.load(f)

    headers = table['header']
    if headers[0][0] != ['']:
        continue

    for row in table['data']:
        del row[0]
    del headers[0]

    with open(path, 'w') as f:
        json.dump(table, f, indent=2)

# Pass 2: for tables with a 'Rank' or a 'Place' header whose first row has a
# blank first cell, fill every blank first-column cell with its 1-based row
# number and rewrite the file.
for fn in os.listdir('tables/'):
    if not fn.endswith('.json'):
        continue

    path = 'tables/{}'.format(fn)
    with open(path) as f:
        table = json.load(f)

    headers = table['header']

    for ordinal_title in ('Rank', 'Place'):
        if not any([header[0] == [ordinal_title] for header in headers]):
            continue
        if table['data'][0][0][0] != ['']:
            continue

        for i, row in enumerate(table['data']):
            if row[0][0] == ['']:
                row[0][0] = [str(i + 1)]

        with open(path, 'w') as f:
            json.dump(table, f, indent=2)

In [None]:
import json

def is_num(num):
    """Return True when `float(num)` succeeds, False when it raises."""
    try:
        float(num)
    except Exception:
        return False
    return True

# Collect the ids (7000..14999) of tables that have at least one column whose
# first cell string is numeric in every row.
plausible_tables = []
for i in range(7000, 15000):
    with open('tables/{}.json'.format(i)) as f:
        table = json.load(f)

    rows = table['data']
    has_numeric_column = any(
        all([is_num(row[column_idx][0][0]) for row in rows])
        for column_idx in range(len(table['header']))
    )
    if has_numeric_column:
        plausible_tables.append(i)

print(len(plausible_tables))

with open('Mixed-Reasoning/numeric_tables.json', 'w') as f:
    json.dump(plausible_tables, f, indent=2)

In [None]:
with open('Mixed-Reasoning/processed_step2.json', 'r') as f:
    data = json.load(f)

# For answers that come from the table, compare the answer text with the
# text of the first answer node and print every pair that differs.
match = mismatch = 0
for d in data:
    if d['where'] != 'table':
        continue

    node_text = d['answer-node'][0][0]
    if d['answer-text'] == node_text:
        match += 1
    else:
        print(node_text, '#', d['answer-text'])
        mismatch += 1

print("final match = {}, partial match = {}".format(match, mismatch))

In [None]:
# Keep only the records typed 'easy' and write them to their own file.
easy_set = [record for record in new_processed if record['type'] == 'easy']

with open('Mixed-Reasoning/processed_step2_easy_split.json', 'w') as f:
    json.dump(easy_set, f, indent=2)

In [None]:
import random
import json
from utils import url2dockey, filter_firstKsents
from multiprocessing import Pool

with open('Mixed-Reasoning/processed_step2.json', 'r') as f:
    data = json.load(f)

# question_id is a hash of the question text, so a question that occurs more
# than once contributes the same id several times. Deduplicate (keeping first
# occurrence order) before shuffling; otherwise one id can land on both sides
# of the cut and appear in train_ids.json and dev_ids.json at the same time.
keys = list(dict.fromkeys(k['question_id'] for k in data))
random.shuffle(keys)

# 90% of the unique ids go to train, the rest to dev.
tr_size = int(len(keys) * 0.9)

train_keys = set(keys[:tr_size])
dev_keys = set(keys[tr_size:])

# Sorted so the files do not depend on set iteration order.
with open('Mixed-Reasoning/train_ids.json', 'w') as f:
    json.dump(sorted(train_keys), f)
with open('Mixed-Reasoning/dev_ids.json', 'w') as f:
    json.dump(sorted(dev_keys), f)

# Creating the stage1/2/3 training data

In [None]:
with open('Mixed-Reasoning/processed_step2.json', 'r') as f:
    data = json.load(f)

with open('Mixed-Reasoning/train_ids.json', 'r') as f:
    train_keys = set(json.load(f))
with open('Mixed-Reasoning/dev_ids.json', 'r') as f:
    dev_keys = set(json.load(f))

train_split, dev_split = [], []

for d in data:
    # Only questions typed 'medium' or 'easy' produce stage-1 examples.
    if d['type'] not in ['medium', 'easy']:
        continue

    table_id = d['table_id']
    with open('tables_tok/{}.json'.format(table_id), 'r') as f:
        table = json.load(f)
    headers = [" , ".join(cell[0]) for cell in table['header']]

    answer_nodes = d['answer-node']
    answer_rows = {node[1][0] for node in answer_nodes}

    # Each candidate node is extended with its column header and the name of
    # the signal that produced it; its label is 1 when it lies in a row that
    # holds an answer node.
    tmp = []
    labels = []
    for signal in ('tf-idf', 'string-overlap', 'link'):
        for node in d[signal]:
            tmp.append(node + [headers[node[1][1]], signal])
            labels.append(1 if node[1][0] in answer_rows else 0)

    example = {'question': d['question'], 'question_id': d['question_id'], 'table_id': d['table_id'],
               'nodes': tmp, 'labels': labels}
    if d['question_id'] in train_keys:
        train_split.append(example)
    else:
        dev_split.append(example)

with open('Mixed-Reasoning/stage1_training_data.json', 'w') as f:
    json.dump(train_split, f, indent=2)

with open('Mixed-Reasoning/stage1_dev_data.json', 'w') as f:
    json.dump(dev_split, f, indent=2)

print("Done with Stage1 Data Processing")


In [None]:
def func(d):
    """Build the stage-2 examples for one processed question.

    For every candidate node (from 'tf-idf', 'string-overlap' and 'link')
    that lies in a data row holding an answer node, one example is emitted:
    the node becomes 'current', every non-empty cell string of that row
    becomes a 'target', and 'labels' marks the targets that equal the row's
    answer node (by url for linked strings, by text otherwise).

    Returns:
        (train_split, dev_split): two lists of example dicts. Both are empty
        for questions whose type is neither 'medium' nor 'easy'.
    """
    train_split, dev_split = [], []
    if d['type'] not in ['medium', 'easy']:
        return train_split, dev_split

    table_id = d['table_id']
    with open('tables_tok/{}.json'.format(table_id), 'r') as f:
        table = json.load(f)

    with open('request_tok/{}.json'.format(table_id), 'r') as f:
        requested_document = json.load(f)

    headers = [" , ".join(cell[0]) for cell in table['header']]

    # Row index -> answer node; a later node in the same row wins.
    answer_by_row = {node[1][0]: node for node in d['answer-node']}

    sources = (
        ('tf-idf', d['tf-idf']),
        ('string-overlap', d['string-overlap']),
        ('link', d['link']),
    )
    for name, source in sources:
        for node in source:
            i = node[1][0]
            # Skip rows without an answer and negative (header) row indices.
            if i not in answer_by_row or i < 0:
                continue

            answer = answer_by_row[i]
            tmp = {'question': d['question'], 'question_id': d['question_id'], 'table_id': d['table_id'], 'current': node + [headers[node[1][1]], name]}
            target_nodes = []
            labels = []

            for j, cell in enumerate(table['data'][i]):
                for content, url in zip(cell[0], cell[1]):
                    if len(content) == 0:
                        continue
                    if url:
                        # Linked string: attach the first sentence(s) of its
                        # document and compare by url.
                        intro = filter_firstKsents(requested_document[url], 1)
                        target_nodes.append((content, (i, j), url, headers[j], intro))
                        labels.append(1 if url == answer[2] else 0)
                    else:
                        target_nodes.append((content, (i, j), None, headers[j], ''))
                        labels.append(1 if content == answer[0] else 0)

                # A cell with several strings is also offered as one joined target.
                if len(cell[0]) > 1:
                    content = ' , '.join(cell[0])
                    labels.append(1 if content == answer[0] else 0)
                    target_nodes.append((content, (i, j), None, headers[j], ''))

            tmp['labels'] = labels

            assert sum(labels) > 0, d['question_id']

            tmp['target'] = target_nodes

            if tmp['question_id'] in train_keys:
                train_split.append(tmp)
            else:
                dev_split.append(tmp)

    return train_split, dev_split


# Build the stage-2 examples in parallel. The pool is always closed and
# joined so the 64 worker processes do not outlive this cell, also when a
# worker raises (func asserts on its labels).
pool = Pool(64)
try:
    results = pool.map(func, data)
finally:
    pool.close()
    pool.join()

# Every call returns a (train, dev) pair of lists; merge them.
train_split = []
dev_split = []
for r1, r2 in results:
    train_split.extend(r1)
    dev_split.extend(r2)

with open('Mixed-Reasoning/stage2_training_data.json', 'w') as f:
    json.dump(train_split, f, indent=2)

with open('Mixed-Reasoning/stage2_dev_data.json', 'w') as f:
    json.dump(dev_split, f, indent=2)

print("Done with Stage2 Data Processing")

In [None]:
def _locate_answer(context, answer):
    """Return (start, text) for the first case-insensitive hit of `answer`.

    Returns None when `answer` is empty or does not occur in `context`.
    """
    if not answer:
        return None

    start = context.lower().find(answer.lower())
    if start == -1:
        return None

    # The index comes from the lower-cased copy; step back until the original
    # text lines up with the first character of the answer. The bounds keep
    # the index inside `context`.
    start = min(start, len(context) - 1)
    first = answer[0].lower()
    while start > 0 and context[start].lower() != first:
        start -= 1

    return start, context[start:start + len(answer)]


# Stage-3 span-extraction examples, one per distinct document referenced by
# the answer nodes of a question.
train_split = []
dev_split = []
inside, self = 0, 0
not_found = 0
for d in data:
    where = d['where']
    if where not in ('passage', 'table'):
        continue

    table_id = d['table_id']

    with open('request_tok/{}.json'.format(table_id)) as f:
        requested_documents = json.load(f)

    #tmp = mapping.get(str(table_id), [])

    used = set()
    for node in d['answer-node']:
        url = node[2]
        if url in used:
            continue
        # Table answers without a linked document yield no example.
        if where == 'table' and not url:
            continue

        context = 'Title : {} . '.format(node[0]) + requested_documents[url]

        # Passage answers search for the answer text; table answers search
        # for the cell string itself (a "self loop").
        orig_answer = d['answer-text'] if where == 'passage' else node[0]

        located = _locate_answer(context, orig_answer)
        if located is None:
            # This used to drop into pdb and then continue with start == -1,
            # which produced a wrong span or an IndexError. Skip and report.
            not_found += 1
            print("answer not found in context, skipped: {} / {}".format(d['question_id'], orig_answer))
            continue
        start, answer = located
        #assert(answer.lower() == orig_answer.lower(), "{} -> {}".format(answer, orig_answer))

        example = {'context': context, 'title': table_id,
                   'question': d['question'], 'question_id': d['question_id'],
                   'answers': [{'answer_start': start, 'text': answer}]}
        if d['question_id'] in train_keys:
            train_split.append(example)
        else:
            dev_split.append(example)

        if where == 'passage':
            inside += 1
        else:
            self += 1
        used.add(url)

with open('Mixed-Reasoning/stage3_training_data.json', 'w') as f:
    json.dump(train_split, f, indent=2)

with open('Mixed-Reasoning/stage3_dev_data.json', 'w') as f:
    json.dump(dev_split, f, indent=2)

#print("Total amount of training instance = {} and dev instance = {}".format(len(training_data), len(dev_data)))
print("Looking inside the passage = {} and self loop = {}".format(inside, self))
if not_found:
    print("Skipped {} answer nodes whose answer was not found in the context".format(not_found))

# Transforming all files into gzip

In [None]:
from utils import *

# Compress every stage file (training and dev) with compressGZip.
fs = [
    'Mixed-Reasoning/stage1_training_data.json',
    'Mixed-Reasoning/stage1_dev_data.json',
    'Mixed-Reasoning/stage2_training_data.json',
    'Mixed-Reasoning/stage2_dev_data.json',
    'Mixed-Reasoning/stage3_training_data.json',
    'Mixed-Reasoning/stage3_dev_data.json',
]

for stage_file in fs:
    compressGZip(stage_file)

# Statistics of the table/passages.

In [None]:
# Totals accumulated over tables 0 .. total-1 and averaged when printed.
column_num = row_num = cell_length = 0

total = 13000
total_cell = total_url = 0
for i in range(total):
    with open('tables_tok/{}.json'.format(i)) as f:
        table = json.load(f)

    column_num += len(table['header'])
    row_num += len(table['data'])

    for row in table['data']:
        for cell in row:
            # Cell length is the number of space-separated pieces of the
            # joined cell strings.
            cell_length += len(' , '.join(cell[0]).split(' '))
            total_cell += 1
            # Count only the non-empty links.
            total_url += sum(1 for url in cell[1] if url)

print('column num = {}; row num = {}'.format(column_num / total, row_num / total))
print('cell num = {}; average length/cell = {}; average url/table = {}'.format(
    total_cell / total, cell_length / total_cell, total_url / total))

In [None]:
import nltk.data

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

with open('wikipedia/merged_unquote.json', 'r') as f:
    passages = json.load(f)

# Keep at most the first 12 tokenizer pieces of every passage.
vs = [tokenizer.tokenize(text)[:12] for text in passages.values()]

In [None]:
# Average number of pieces per passage, and average number of
# space-separated words in those pieces joined together.
sent_length = sum(len(v) for v in vs)
word_length = sum(len(" ".join(v).split(' ')) for v in vs)

print("sentence length = {}, word length = {}".format(sent_length / len(vs), word_length / len(vs)))

In [8]:
import glob
import shutil
import os

# Move every table whose file name is not an integer id below 17000, together
# with its request file, into the *_name directories.
for f in glob.glob('tables_tok/*.json'):
    stem = os.path.basename(f).replace('.json', '')

    # Explicit check instead of `assert` inside try/except: asserts are
    # removed under `python -O`, which would silently stop moving files with
    # ids of 17000 and above.
    try:
        keep_in_place = int(stem) < 17000
    except ValueError:
        keep_in_place = False

    if keep_in_place:
        continue

    f2 = f.replace('tables_tok', 'request_tok')
    shutil.move(f, 'tables_tok_name/')
    shutil.move(f2, 'request_tok_name/')

In [None]:
with open('../OpenDomainWikiTables/released_data/train_dev_test_table_ids.json') as f:
    all_table_ids = json.load(f)

# Copy the files of every table id listed in the train, dev and test splits.
all_table_ids = all_table_ids['train'] + all_table_ids['dev'] + all_table_ids['test']
for table_id in all_table_ids:
    # The '{}' placeholder has to be filled with the table id; the unformatted
    # template pointed at a literal file called '{}.json'.
    shutil.copy('../OpenDomainWikiTables/table_crawling/data/tables_tok/{}.json'.format(table_id), 'tables_tok_name/')
    # NOTE(review): the original repeated the tables_tok copy here. The cell
    # that moves files handles tables_tok and request_tok as a pair, so this
    # line presumably should copy the request file; confirm that
    # table_crawling/data/request_tok exists.
    shutil.copy('../OpenDomainWikiTables/table_crawling/data/request_tok/{}.json'.format(table_id), 'request_tok_name/')