In [2]:
import numpy as np
import scipy.sparse as sp
import json
import re
from bs4 import BeautifulSoup
# Matches any run of one or more whitespace characters (spaces, tabs, newlines);
# clean_data() substitutes a single space for each match.
no_space = re.compile(r"\s+")
# Matches http:// or https:// URLs, and bare "www." links, each extending up to
# the next whitespace character; clean_data() replaces each match with a space.
url_pattern = re.compile(r'https?://\S+|www\.\S+')

In [3]:
def clean_data(text):
    """Normalise a raw text field into a single lowercase line.

    The input is parsed as HTML and reduced to its text content, URLs are
    replaced by a space, every run of whitespace is collapsed to one space,
    and the result is stripped and lowercased.

    Args:
        text: Raw string, possibly containing HTML markup and URLs. ``None``
            and the empty string are accepted.

    Returns:
        The cleaned string; ``""`` when ``text`` is ``None`` or empty.
    """
    # Fields such as a paper's title/abstract can be None in the source data;
    # BeautifulSoup cannot parse None, so short-circuit instead of raising.
    if not text:
        return ""
    soup = BeautifulSoup(text, "html.parser")
    text = soup.get_text()
    text = url_pattern.sub(" ", text)
    # Collapsing whitespace also removes newlines, so the result is always a
    # single line.
    text = no_space.sub(" ", text)
    return text.strip().lower()

In [None]:
# Directory the notebook reads its inputs from and writes its outputs to; a
# relative path, so it is resolved against the kernel's working directory.
# NOTE(review): the value already ends in "/", and the cells below build paths as
# f"{data_dir}/name", producing "../data//name" — confirm before changing it.
data_dir = "../data/"

In [4]:
def _iter_json_lines(path):
    """Yield one decoded JSON object per non-blank line of the UTF-8 file at ``path``."""
    with open(path, 'r', encoding="utf-8") as file:
        # Stream the file line by line instead of materialising it with
        # readlines(), and skip blank lines (e.g. a trailing empty line) that
        # would otherwise make json.loads raise.
        for line in file:
            if line.strip():
                yield json.loads(line)


# Training split: cleaned question text, cleaned body text and the list of
# paper ids stored under 'pids' for each record.
train_ques, train_body, train_pids = [], [], []
for qa_pair in _iter_json_lines(f"{data_dir}/qa_train.txt"):
    train_ques.append(clean_data(qa_pair['question']))
    train_body.append(clean_data(qa_pair['body']))
    train_pids.append(qa_pair['pids'])

# Validation split: same text fields, but 'pids' is not read from this file.
valid_ques, valid_body = [], []
for qa_pair in _iter_json_lines(f"{data_dir}/qa_valid_wo_ans.txt"):
    valid_ques.append(clean_data(qa_pair['question']))
    valid_body.append(clean_data(qa_pair['body']))

print(len(train_body), len(valid_body))

  soup = BeautifulSoup(text, "html.parser")


In [None]:
# Load the paper metadata. The cells below iterate it as a mapping from paper id
# to a record with 'title' and 'abstract' entries.
# The context manager closes the file handle once parsing is done (the bare
# open() previously left it open for the lifetime of the kernel).
with open(f"{data_dir}/pid_to_title_abs.json", encoding='utf-8') as f:
    papers = json.load(f)

In [None]:
# Build two parallel lists over every paper: its id, and its cleaned
# "title [SEP] abstract" text (abstract only when the title is missing).
all_pids, all_tabs = [], []
for pid, paper in papers.items():
    all_pids.append(pid)
    # A None abstract is treated as empty text so that the paper still gets an
    # entry and all_tabs stays aligned with all_pids.
    # The local is deliberately not called `abs`, which would shadow the builtin
    # for every later cell.
    abstract = clean_data(paper['abstract'] or "")
    if paper['title'] is not None:
        title = clean_data(paper['title'])
        all_tabs.append(f"{title} [SEP] {abstract}")
    else:
        all_tabs.append(abstract)

# Column index of each paper id: its position in all_pids.
label_pid_to_id = dict(zip(all_pids, range(len(all_pids))))

# One (question row, paper column) coordinate per pid listed for a training
# question, in question order.
coords = [
    (ques_idx, label_pid_to_id[paper_id])
    for ques_idx, paper_ids in enumerate(train_pids)
    for paper_id in paper_ids
]
row = [ques_idx for ques_idx, _ in coords]
col = [paper_idx for _, paper_idx in coords]
data = [1] * len(coords)

# Sparse question x paper matrix with a 1 at every coordinate collected above,
# saved next to the other generated files.
matrix_shape = (len(train_ques), len(all_pids))
train_Q_A = sp.csr_matrix((data, (row, col)), shape = matrix_shape)
sp.save_npz(f"{data_dir}/train_Q_A.npz", train_Q_A)

In [5]:
def _write_lines(path, lines):
    """Write ``lines`` to ``path`` as UTF-8 text, newline-separated, with no trailing newline."""
    with open(path, "w", encoding='utf-8') as file:
        # write() emits the joined string in one call; writelines() on a single
        # string would iterate over it character by character.
        file.write('\n'.join(lines))


# Each output file holds one item per line, in the same order as the list it
# was produced from.
for out_path, lines in [
    (f"{data_dir}/train_ques.raw.txt", train_ques),
    (f"{data_dir}/train_body.raw.txt", train_body),
    (f"{data_dir}/valid_ques.raw.txt", valid_ques),
    (f"{data_dir}/valid_body.raw.txt", valid_body),
    (f"{data_dir}/papers.raw.txt", all_tabs),
    (f"{data_dir}/all_pids.raw.txt", all_pids),
]:
    _write_lines(out_path, lines)

In [5]:
## Build the title / abstract pretraining files

# Keep only papers that have both a title and an abstract, and whose cleaned
# title has at least 5 characters and cleaned abstract at least 50.
pretrain_title, pretrain_abs = [], []
for paper in papers.values():
    if paper['title'] is not None and paper['abstract'] is not None:
        title_text = clean_data(paper['title']).lower()
        abs_text = clean_data(paper['abstract']).lower()
        if len(title_text) >= 5 and len(abs_text) >= 50:
            pretrain_title.append(title_text)
            pretrain_abs.append(abs_text)

# De-duplicate by title: np.unique returns the distinct titles in sorted order
# together with the index of each title's first occurrence, which selects the
# abstract kept for that title.
unique_titles, indices = np.unique(pretrain_title, return_index = True)
pretrain_title = list(unique_titles)
pretrain_unq_abs = [pretrain_abs[first_idx] for first_idx in indices]

# The two files are line-aligned: line i of the abstract file belongs to the
# title on line i of the title file.
for out_path, texts in (
    (f"{data_dir}/pretrain_title.raw.txt", pretrain_title),
    (f"{data_dir}/pretrain_abstract.raw.txt", pretrain_unq_abs),
):
    with open(out_path, "w", encoding='utf-8') as file:
        file.write('\n'.join(texts))