In [None]:
import os
import re
import sys
import requests
import pickle
import getopt
import subprocess as sp
import numpy as np
from pprint import pprint
from lxml import etree
from tqdm.notebook import tqdm

In [None]:
# arguments
# Every data directory lives under one root, so the notebook can be pointed at
# another copy of the data by setting the ACS_DATA_ROOT environment variable
# instead of editing four separate paths. The default is the original location.
data_root = os.path.abspath(os.environ.get("ACS_DATA_ROOT", "/mnt/data1/jiawei/acs-data"))
input_path = os.path.join(data_root, "article-files")     # article xml files are searched here
output_path = os.path.join(data_root, "processed-files")  # one pickle per article is written here
txt_path = os.path.join(data_root, "txt-files")           # plain-text bodies are written below here
suppl_path = os.path.join(data_root, "suppl-files")       # supplementary files are searched here

In [None]:
# local file parameters
# Both paths are relative ("./..."), so os.path.abspath resolves them against
# the working directory of the kernel at the moment this cell runs.
valid_cache_path = os.path.abspath("./sbol-validation-cache.pkl")  # pickle holding cached validator results
non_acs_article_path = os.path.abspath("./non-acs-article.txt")  # text file read line by line; each line is compared to a publication number

In [None]:
def collect_files(root, res, pattern="", collect_dirs=True, min_depth=None, max_depth=None):
    """Walk ``root`` recursively and append every matching path to ``res``.

    Args:
        root: directory whose entries are listed.
        res: list that receives the matching paths; it is mutated in place.
        pattern: regular expression applied with ``re.match`` (anchored at the
            start) to the joined path of each entry. The default ``""``
            matches everything.
        collect_dirs: when False, directories are never appended (they are
            still descended into).
        min_depth: ``None`` to collect at every level; otherwise entries are
            only collected from the level where the remaining value has
            dropped to 1 or below (direct children of ``root`` are level 1).
        max_depth: ``None`` for unlimited recursion; otherwise the number of
            levels that may still be listed. The walk stops when it is
            exactly 0.

    Returns:
        None. Results are delivered through ``res``.
    """
    # a depth budget of exactly zero means this directory must not be listed
    if max_depth is not None and max_depth == 0:
        return

    # the same for every entry of this directory, so work them out once
    depth_reached = min_depth is None or min_depth <= 1
    child_min_depth = min_depth - 1 if min_depth is not None else None
    child_max_depth = max_depth - 1 if max_depth is not None else None

    for entry in os.listdir(root):
        entry_path = os.path.join(root, entry)
        is_directory = os.path.isdir(entry_path)

        # keep the entry when deep enough, matching, and of an accepted kind
        if depth_reached and re.match(pattern, entry_path) and (collect_dirs or not is_directory):
            res.append(entry_path)

        # descend into sub directories with the reduced depth budgets
        if is_directory:
            collect_files(entry_path, res, pattern, collect_dirs, child_min_depth, child_max_depth)

In [None]:
def p_helper(node):
    """Flatten a paragraph element into one whitespace-normalised line.

    The line is built from the paragraph's leading text, the text found inside
    each child element (except ``named-content`` and ``inline-formula``
    children, whose content is skipped) and the tail text following each child.

    Args:
        node: paragraph element exposing ``.text``, iteration over children,
            and on each child ``.tag``, ``.tail`` and ``.xpath``.

    Returns:
        The cleaned paragraph text; ``""`` for a paragraph with no text at all.
    """
    # node.text is None for an empty <p/> but also when the paragraph starts
    # with a child element, so a missing leading text must not end the
    # extraction - the children and their tails still carry content
    line_list = [node.text or ""]
    for child in node:

        # get the text inside the child if the tag isn't
        # named-content or inline-formula
        if child.tag not in ("named-content", "inline-formula"):
            line_list.append(" ".join(child.xpath(".//text()")))

        # the text following the child; it is None when nothing follows,
        # and str.join would raise TypeError on a None entry
        if child.tail is not None:
            line_list.append(child.tail)

    # put the whole paragraph on one line
    line = " ".join(line_list)
    line = line.strip()
    line = line.replace("\n", " ")

    # clean up consecutive spaces
    line = re.sub(r"\s+", " ", line)

    # fix the space around punctuation:
    # no space before . , ) : ;  no space after (  no space around - and /
    line = re.sub(r"\s([.,\):;])", r"\1", line)
    line = re.sub(r"\(\s", r"(", line)
    line = re.sub(r"\s*([-/])\s*", r"\1", line)
    return line

In [None]:
def kwd_helper(node):
    """Return the text of a keyword element as one clean string.

    All text nodes below ``node`` are joined with spaces, newlines are turned
    into spaces, the ends are trimmed and runs of whitespace are collapsed.

    Args:
        node: element exposing ``.xpath``.

    Returns:
        The normalised keyword string.
    """
    kwd_tokens = node.xpath(".//text()")
    kwd = " ".join(kwd_tokens).replace("\n", " ").strip()

    # raw string: "\s" in a normal string literal is an invalid escape sequence
    return re.sub(r"\s+", " ", kwd)

In [None]:
def title_helper(node):
    """Map a section title element to the section categories it mentions.

    The title text is lower-cased and checked for a fixed set of substrings,
    e.g. "intro" -> "introduction", "method" or "procedure" -> "method".

    Args:
        node: title element exposing ``.xpath``.

    Returns:
        A list of category names in the fixed order introduction, result,
        discussion, materials, method, summary. The list is empty (not None)
        when the title matches none of them.
    """
    # extract text from the title node and normalise it
    title = " ".join(node.xpath(".//text()"))
    title = title.replace("\n", " ")
    title = re.sub(r"\s+", " ", title)
    title = title.strip().lower()

    # category name -> substrings that select it; order defines output order
    categories = (
        ("introduction", ("intro",)),
        ("result", ("result",)),
        ("discussion", ("discuss",)),
        ("materials", ("material",)),
        ("method", ("method", "procedure")),
        ("summary", ("summary",)),
    )
    return [
        category
        for category, needles in categories
        if any(needle in title for needle in needles)
    ]

In [None]:
def extract_body(root):
    """Collect the body paragraphs of an article with their section labels.

    Paragraph and title elements below ``/article/body`` are visited in the
    order the query returns them (titles inside a caption are excluded). A
    title that ``title_helper`` can categorise becomes the current section; a
    title it cannot categorise leaves the current section unchanged.

    Args:
        root: root element of the article, exposing ``.xpath``.

    Returns:
        A list of ``{"text": ..., "section": ...}`` dicts, one per paragraph,
        where "section" is the category list of the most recent categorised
        title (an empty list before the first one).
    """
    body_query = "/article/body//*[self::p or (self::title and not(ancestor::caption))]"

    paragraphs = []
    active_section = []
    for element in root.xpath(body_query):
        tag = element.tag

        if tag == "p":
            # paragraph: record it under whatever section is active
            paragraphs.append({"text": p_helper(element), "section": active_section})

        elif tag == "title":
            # title: switch section only when it maps to a known category
            labels = title_helper(element)
            if labels:
                active_section = labels

    return paragraphs

In [None]:
def extract_abstract(root):
    """Return the text of the first abstract paragraph, wrapped in a list.

    Only the first element matched by ``//abstract/p`` is used; any further
    abstract paragraphs are ignored.

    Args:
        root: root element of the article, exposing ``.xpath``.

    Returns:
        A list with one cleaned paragraph string, or an empty list when the
        document has no abstract paragraph.
    """
    leading_paragraphs = root.xpath("//abstract/p")[:1]
    return [p_helper(paragraph) for paragraph in leading_paragraphs]

In [None]:
def extract_keywords(root):
    """Return the cleaned text of every keyword in the document.

    Args:
        root: root element of the article, exposing ``.xpath``.

    Returns:
        A list with one string per element matched by ``//kwd-group/kwd``,
        in the order the query returns them; empty when there are none.
    """
    return [kwd_helper(kwd_node) for kwd_node in root.xpath("//kwd-group/kwd")]

In [None]:
def extract_date(root):
    """Read the issue and electronic publication dates of an article.

    Every ``/article/front/article-meta/pub-date`` element is inspected. One
    whose ``date-type`` attribute equals ``"issue-pub"`` sets the issue date;
    any other one sets the electronic date. When several elements fall into
    the same group the last one wins.

    A pub-date that lacks a year, month or day element (or has one without
    text) cannot be written in the ``month/day/year`` format and is skipped,
    so it no longer aborts the caller with IndexError / AttributeError.

    Args:
        root: root element of the article, exposing ``.xpath``.

    Returns:
        ``(issue_pub_date, electron_pub_date)``; each is a ``"month/day/year"``
        string or None when no usable date of that kind was found.
    """

    def _component(date_node, name):
        # stripped text of the first <name> child, or None if missing/empty
        matches = date_node.xpath("./" + name)
        if not matches or matches[0].text is None:
            return None
        return matches[0].text.strip()

    issue_pub_date = None
    electron_pub_date = None

    for node in root.xpath("/article/front/article-meta/pub-date"):
        year = _component(node, "year")
        month = _component(node, "month")
        day = _component(node, "day")

        # incomplete date: leave the corresponding result untouched
        if None in (year, month, day):
            continue

        formatted = "%s/%s/%s" % (month, day, year)
        if node.attrib.get("date-type") == "issue-pub":
            issue_pub_date = formatted
        else:
            electron_pub_date = formatted

    return issue_pub_date, electron_pub_date

In [None]:
# collect all xml files
# (raw string: "\." in a normal string literal is an invalid escape sequence)
xml_paths = []
collect_files(input_path, xml_paths, pattern=r".*\.xml$", collect_dirs=False)
print("total xml files: %d" % len(xml_paths))

In [None]:
# load the publication numbers listed in the non acs article file;
# articles found in this set are flagged as not being research articles
non_acs_article_list = set()
with open(non_acs_article_path, "r") as f:
    non_acs_article_list.update(line.strip() for line in f)

In [None]:
# parse the files
# processed_files maps a publication number to the data extracted from its xml
processed_files = {}
for xml_path in tqdm(xml_paths):

    # the publication number is the file name up to its first dot
    pub_num = xml_path.rsplit("/", 1)[-1].split(".", 1)[0]

    # load the document and read both publication dates in one go
    root = etree.parse(xml_path).getroot()
    issue_pub_date, electron_pub_date = extract_date(root)

    # gather everything extracted from this article;
    # suppl_files starts empty and is filled by a later cell
    xml_data = {
        "is_research": pub_num not in non_acs_article_list,
        "keywords": extract_keywords(root),
        "abstract": extract_abstract(root),
        "body": extract_body(root),
        "issue_pub_date": issue_pub_date,
        "electron_pub_date": electron_pub_date,
        "suppl_files": [],
    }

    # a later file with the same publication number replaces the earlier one
    processed_files[pub_num] = xml_data

In [None]:
# number of parsed articles; entries are keyed by publication number, so xml
# files sharing a publication number count once
print(len(processed_files))

In [None]:
# find out all the zip files
# suppl_files_zip = []
# collect_files(suppl_path, suppl_files_zip, pattern=".*\.zip$", collect_dirs=False, min_depth=3)
# print("files: %d" % len(suppl_files_zip))

In [None]:
# extract all the zip files in place
# for zip_file in suppl_files_zip:
#     zip_file_dir = re.sub("/[^/]*$", "", zip_file)
#     res = sp.run(["unzip", "-n", zip_file, "-d", zip_file_dir])
#     if res.returncode != 0:
#         print(res)

In [None]:
# collect all suppl files (files only, taken from the third directory level
# below suppl_path downwards), then drop anything whose path mentions __MACOSX
suppl_files_all = []
collect_files(suppl_path, suppl_files_all, pattern="", collect_dirs=False, min_depth=3)
suppl_files_all = list(
    filter(lambda x: re.match(".*__MACOSX.*", x) is None, suppl_files_all)
)
print("files: %d" % len(suppl_files_all))

In [None]:
# attach all the suppl path to processed files

# start every article from an empty list so that re-running this cell does not
# append the same supplementary files a second time
for article in processed_files.values():
    article["suppl_files"] = []

suppl_ext = set() # debug
for path in suppl_files_all:

    # path relative to the supplementary root; its first component is the
    # publication number and its last component is the file name.
    # relpath is anchored on suppl_path itself, so a parent directory that
    # happens to share the name of the suppl directory cannot be mistaken for it
    rpath = os.path.relpath(path, suppl_path)
    rpath_parts = rpath.split(os.sep)
    pub_num = rpath_parts[0]
    suppl_filename = rpath_parts[-1]

    # create info dict; is_sequence is decided later by the validation cell
    suppl_info = {
        "suppl_filename": suppl_filename,
        "rpath": rpath,
        "is_sequence": False
    }

    # push it into the processed files dict
    # (files of publications that were not parsed are ignored)
    if pub_num in processed_files:
        processed_files[pub_num]["suppl_files"].append(suppl_info)

    # collect file extension (for debug); a name without a dot is added whole
    ext = suppl_filename.split(".")[-1]
    suppl_ext.add(ext)

In [None]:
# debug: display the set of file extensions seen among the supplementary files
suppl_ext

In [None]:
# request params
# endpoint the validation requests are posted to
sbol_validator_url = "https://validator.sbolstandard.org/validate/"

# only supplementary files with one of these extensions are sent for validation
allowed_file_type = {"gb", "fasta", "sbol", "txt", "xml"}

# json body of a validation request; main_file is filled in per request
validator_param = dict(
    options=dict(
        language="SBOL2",
        test_equality=False,
        check_uri_compliance=False,
        check_completeness=False,
        check_best_practices=False,
        fail_on_first_error=True,
        provide_detailed_stack_trace=False,
        subset_uri="",
        uri_prefix="dummy",
        version="",
        insert_type=False,
    ),
    main_file=None,
    return_file=True,
)

In [None]:
def validate_sequence(file, timeout=300, max_size=64 * 2 ** 20):
    """Ask the SBOL validator whether ``file`` is a valid sequence file.

    Args:
        file: path of the file to check.
        timeout: seconds to wait for the validator before requests raises a
            timeout error; without it a stalled connection would block the
            notebook forever.
        max_size: files of this many bytes or more are not sent
            (default 64 MiB).

    Returns:
        ``(is_valid, response)``. ``response`` is the decoded json reply of the
        validator and ``is_valid`` its ``"valid"`` entry. Files that are too
        large or that are not utf-8 text yield ``(False, None)`` without any
        request being made.

    Raises:
        Whatever ``requests.post`` / ``.json()`` raise on network or decoding
        problems, and KeyError if the reply has no ``"valid"`` entry.
    """
    # restrict file size
    if os.path.getsize(file) >= max_size:
        return False, None

    # try to read the content; binary files cannot be sequence text
    try:
        with open(file, "r", encoding="utf-8") as f:
            content = f.read()
    except UnicodeDecodeError:
        return False, None

    # build the request body from the shared template without modifying it,
    # so the content of one file never lingers in the module level dict
    payload = dict(validator_param, main_file=content)

    # validate file
    res = requests.post(sbol_validator_url, json=payload, timeout=timeout).json()
    return res["valid"], res

In [None]:
# use the api to check if the file is a sequence file
# we will cache the request result to reduce server load
# NOTE: pickle.load can execute arbitrary code stored in the file, so the cache
# file must only ever be one that this notebook wrote itself
if os.path.exists(valid_cache_path):
    with open(valid_cache_path, "rb") as f:
        valid_res_cache = pickle.load(f)
else:
    valid_res_cache = {}

try:
    # go through all the suppl files
    for pub_num, item in tqdm(processed_files.items()):
        for suppl_file in item["suppl_files"]:

            # get the name of the supplementary file
            name = suppl_file["suppl_filename"]

            # only allow the following extension
            ext = name.split(".")[-1]
            if ext not in allowed_file_type:
                continue

            # check file
            rpath = suppl_file["rpath"]
            path = os.path.join(suppl_path, rpath)
            print(f"{path}")

            # use api or cache; the cache is keyed by the relative path and
            # stores the (is_valid, response) pair
            if rpath not in valid_res_cache:
                valid_res_cache[rpath] = validate_sequence(path)
            is_valid, data = valid_res_cache[rpath]
            suppl_file["is_sequence"] = is_valid
            print(f"{name}, {is_valid}")
finally:
    # save cache, also when the loop is interrupted or a request fails, so the
    # answers collected so far do not have to be requested again
    with open(valid_cache_path, "wb") as f:
        pickle.dump(valid_res_cache, f)

In [None]:
# spot check: display the supplementary file records of one article
# (raises KeyError when that publication number was not parsed)
processed_files["sb500234s"]["suppl_files"]

In [None]:
# pickle the files
# make sure every directory written to below exists
os.makedirs(output_path, exist_ok=True)
for text_subdir in ("research", "non-research"):
    os.makedirs(os.path.join(txt_path, text_subdir), exist_ok=True)

for pub_num, data in tqdm(processed_files.items()):
    with open(os.path.join(output_path, pub_num + ".pkl"), "wb") as out:
        pickle.dump(data, out)

    # one body paragraph per line
    text_data = [d["text"] + "\n" for d in data["body"]]

    # save plain text, split by article kind
    if data["is_research"]:
        text_data_path = os.path.join(txt_path, "research")
    else:
        text_data_path = os.path.join(txt_path, "non-research")

    # explicit utf-8: the platform default encoding may be unable to
    # represent characters that occur in the article text
    with open(os.path.join(text_data_path, pub_num + ".txt"), "w", encoding="utf-8") as out:
        out.writelines(text_data)

In [None]:
# publication number and number of attached supplementary files, per article
pprint([
    (article_id, len(article_data["suppl_files"]))
    for article_id, article_data in processed_files.items()
])

In [None]:
# check one pickle: read one of the written files back and pretty print it
sample_pickle_path = os.path.join(output_path, "sb500234s.pkl")
with open(sample_pickle_path, "rb") as ifile:
    sample_record = pickle.load(ifile)
    pprint(sample_record)