In [None]:
import os
import re
import sys
import requests
import pickle
import subprocess as sp
import numpy as np
import pandas as pd
from pprint import pprint
from lxml import etree
from tqdm.notebook import tqdm

In [None]:
# arguments
# root directory of the acs data dump; it defaults to the original location and can
# be overridden with the ACS_DATA_ROOT environment variable, so the notebook can run
# on another machine without editing this cell
data_root = os.environ.get("ACS_DATA_ROOT", "/mnt/data1/jiawei/acs-data")

# the directory that stores our supplementary data dump
# since the supplementary data dump also contains the original article
# we can use this as the input path
input_path = os.path.abspath(os.path.join(data_root, "suppl-files"))

# the directory that stores our processed pickle files
output_path = os.path.abspath(os.path.join(data_root, "processed-files"))

# we also provide extracted text in plain text format, this is the output path
# of the plain text
txt_path = os.path.abspath(os.path.join(data_root, "txt-files"))

In [None]:
# local file parameters; don't change these unless you know what you are doing
# both paths are relative, so they are resolved against the current working directory

# cache of sbol api query results, used to speed up processing on later runs
valid_cache_path = os.path.abspath("./sbol-validation-cache.pkl")

# deprecated: the list is inaccurate (false negatives: some non-research articles are not in the list)
non_acs_article_path = os.path.abspath("./non-acs-article.txt")

In [None]:
# sbol api request params
sbol_validator_url = "https://validator.sbolstandard.org/validate/"

# extensions (without the dot) of the supplementary files that are sent to the validator
sbol_allowed_file_type = {"gb", "fasta", "sbol", "txt", "xml", "dna"}

# option flags sent along with every validation request
_validator_options = {
    "language": "FASTA",
    "test_equality": False,
    "check_uri_compliance": False,
    "check_completeness": False,
    "check_best_practices": False,
    "fail_on_first_error": True,
    "provide_detailed_stack_trace": False,
    "subset_uri": "",
    "uri_prefix": "dummy",  # must not be empty, otherwise the api returns an error
    "version": "",
    "insert_type": False,
}

# request body template: "main_file" carries the content of the file being validated
validator_param = {
    "options": _validator_options,
    "main_file": None,
    "return_file": True,
}

In [None]:
# recursive file/directory finder, basically a fancy find
def collect_files(root, res, pattern="", collect_dirs=True, min_depth=None, max_depth=None):
    """Walk ``root`` recursively and append every matching path to ``res``.

    Depth is counted from ``root``: its direct children are at depth 1.

    root         -- directory to list
    res          -- list that receives the matching paths (modified in place)
    pattern      -- regular expression applied with re.match to the joined path
    collect_dirs -- when true, matching directories are collected as well as files
    min_depth    -- entries above this depth are not collected (None: no lower limit)
    max_depth    -- entries below this depth are not visited (None: no upper limit)

    Nothing is returned; the result is whatever ends up in ``res``.
    """

    # depth budget used up: do not look inside this directory at all
    if max_depth == 0:
        return

    # entries of this directory may be collected once the minimum depth is reached
    depth_reached = min_depth is None or min_depth <= 1

    # limits handed to the next level down
    child_min_depth = min_depth - 1 if min_depth is not None else None
    child_max_depth = max_depth - 1 if max_depth is not None else None

    for name in os.listdir(root):
        full_path = os.path.join(root, name)
        is_directory = os.path.isdir(full_path)

        # directories only count as results when collect_dirs is set
        if depth_reached and re.match(pattern, full_path) and (collect_dirs or not is_directory):
            res.append(full_path)

        # descend whether or not the directory itself was collected
        if is_directory:
            collect_files(full_path, res, pattern, collect_dirs, child_min_depth, child_max_depth)

In [None]:
# helps to extract text from paragraph
def p_helper(node):
    """Flatten a paragraph node into one cleaned-up line of text.

    node -- element exposing ``.text``, iteration over its children, and on each
            child ``.tag``, ``.tail`` and ``.xpath(".//text()")``

    Returns the paragraph text on a single line, with whitespace collapsed and
    the spacing around punctuation repaired. Returns "" for an empty paragraph.

    A missing ``node.text`` or ``child.tail`` (None) is skipped, so a paragraph
    that starts with a child element keeps its content and a child without
    trailing text does not break the join.
    """

    # the text inside these tags is left out (mathematical formulas)
    skipped_tags = ("named-content", "inline-formula")

    # the paragraph's own leading text, then for every child the text inside it
    # (unless skipped) followed by the text after it; any of these can be None
    pieces = [node.text]
    for child in node:
        if child.tag not in skipped_tags:
            pieces.append(" ".join(child.xpath(".//text()")))
        pieces.append(child.tail)

    # each paragraph is put into one line
    line = " ".join(piece for piece in pieces if piece)
    line = line.strip()
    line = line.replace("\n", " ")

    # clean up consecutive spaces
    line = re.sub(r"\s+", " ", line)

    # fix the space around punctuation: none before . , ) : ; none after (
    # and none on either side of - and /
    line = re.sub(r"\s([.,\):;])", r"\1", line)
    line = re.sub(r"\(\s", r"(", line)
    line = re.sub(r"\s*([-/])\s*", r"\1", line)
    return line

In [None]:
# strip format from keyword nodes
def kwd_helper(node):
    """Return the text of a keyword node as one plain string.

    node -- element exposing ``.xpath(".//text()")``

    All text fragments under the node are joined with spaces, newlines are
    replaced by spaces, the ends are trimmed and runs of whitespace collapsed.
    """
    kwd_tokens = node.xpath(".//text()")
    kwd = " ".join(kwd_tokens).replace("\n", " ").strip()

    # raw string: "\s" in a normal string literal is an invalid escape sequence
    return re.sub(r"\s+", " ", kwd)

In [None]:
# this returns the interesting categories of a title
# for example: intro, method, and results
def title_helper(node):
    """Map a section title node to the list of categories it mentions.

    node -- element exposing ``.xpath(".//text()")``

    Returns a list containing any of "introduction", "result", "discussion",
    "materials", "method" and "summary", in that order, based on substring
    matches against the lower-cased title. A title that matches nothing gives
    an empty list (not None).
    """

    # extract text from title node
    title = " ".join(node.xpath(".//text()"))
    title = title.replace("\n", " ")
    title = re.sub(r"\s+", " ", title)
    title = title.strip().lower()

    # category -> substrings that select it; the order here is the output order
    categories = (
        ("introduction", ("intro",)),
        ("result", ("result",)),
        ("discussion", ("discuss",)),
        ("materials", ("material",)),
        ("method", ("method", "procedure")),
        ("summary", ("summary",)),
    )
    return [
        category
        for category, needles in categories
        if any(needle in title for needle in needles)
    ]

In [None]:
# extract text from xpath nodes
def extract_body(root):
    """Collect the titles and paragraphs of the article body in document order.

    root -- root element of the parsed article xml

    Returns a list of dicts with the keys "text" (cleaned title or paragraph
    text) and "section" (the category list from title_helper for the most
    recent title that matched at least one category; [] before the first such
    title). Titles inside a caption are not considered.
    """

    # we are interested in the text in the body section
    body_nodes = root.xpath("/article/body//*[self::p or (self::title and not(ancestor::caption))]")

    current_sections = []
    entries = []
    for body_node in body_nodes:

        # handle title
        if body_node.tag == "title":

            # a title without any known category keeps the previous section
            sections = title_helper(body_node)
            if len(sections) > 0:
                current_sections = sections

            # the title itself is stored as text too, on one line
            heading = " ".join(body_node.xpath(".//text()"))
            heading = heading.replace("\n", " ")
            heading = re.sub(r"\s+", " ", heading)
            heading = heading.strip()
            entries.append({
                "text": heading,
                "section": current_sections
            })

        # handle paragraph
        elif body_node.tag == "p":
            entries.append({
                "text": p_helper(body_node),
                "section": current_sections
            })
    return entries

In [None]:
# extract abstract
def extract_abstract(root):
    """Return the first abstract paragraph as a one-element list.

    root -- root element of the parsed article xml

    Only the first node matched by ``//abstract/p`` is used; the result is an
    empty list when there is no such node.
    """
    paragraphs = root.xpath("//abstract/p")
    if not paragraphs:
        return []
    return [p_helper(paragraphs[0])]

In [None]:
# extract_keywords from meta data
def extract_keywords(root):
    """Return the cleaned text of every ``kwd-group/kwd`` node, in document order.

    root -- root element of the parsed article xml
    """
    return [kwd_helper(kwd_node) for kwd_node in root.xpath("//kwd-group/kwd")]

In [None]:
# extract date information from meta data
def extract_date(root):
    """Return ``(issue_pub_date, electron_pub_date)`` as "month/day/year" strings.

    root -- root element of the parsed article xml

    A pub-date node whose "date-type" attribute is "issue-pub" sets the issue
    date; every other pub-date node sets the electronic date. When several
    nodes fall in the same group the last one wins, and a date that never
    occurs stays None. Each pub-date node must have year, month and day
    children, otherwise an IndexError is raised.
    """
    issue_pub_date = electron_pub_date = None

    for pub_date in root.xpath("/article/front/article-meta/pub-date"):

        # read the three components in the order year, month, day
        year, month, day = (
            pub_date.xpath(query)[0].text.strip()
            for query in ("./year", "./month", "./day")
        )
        formatted = "%s/%s/%s" % (month, day, year)

        if pub_date.attrib.get("date-type") == "issue-pub":
            issue_pub_date = formatted
        else:
            electron_pub_date = formatted

    return issue_pub_date, electron_pub_date

In [None]:
# extract article-id from meta data
def extract_id(root):
    """Return the stripped text of the first ``article-id`` node in the article meta.

    root -- root element of the parsed article xml
    """
    article_id_nodes = root.xpath("/article/front/article-meta/article-id")
    return article_id_nodes[0].text.strip()

In [None]:
# extract internal id, the one in the first line of the xml file
def extract_internal_id(root):
    """Return the "id" attribute of the ``/article`` element.

    root -- root element of the parsed article xml
    """
    return root.xpath("/article")[0].attrib["id"]

In [None]:
# extract a list of history from the meta data
def extract_history(root):
    """Return the history dates of the article as a list of event dicts.

    root -- root element of the parsed article xml

    Every ``history/date`` node becomes ``{"event": <its date-type attribute>,
    "time": "month/day/year"}``, in document order.
    """

    def _component(date_node, query):
        # stripped text of the first node matched below date_node
        return date_node.xpath(query)[0].text.strip()

    history = []
    for date_node in root.xpath("/article/front/article-meta/history/date"):
        year = _component(date_node, "./year")
        month = _component(date_node, "./month")
        day = _component(date_node, "./day")
        history.append({
            "event": date_node.attrib["date-type"],
            "time": "%s/%s/%s" % (month, day, year)
        })
    return history

In [None]:
# extract the subject of the article from meta data
def extract_article_type(root):
    """Return the text of the first ``subj-group`` node of the article categories.

    root -- root element of the parsed article xml

    The text fragments of the node are concatenated, trimmed, and stripped of
    newlines. A notice is printed when there is more than one subj-group; only
    the first one is used. Raises IndexError when there is none.
    """
    subj_groups = root.xpath("/article/front/article-meta/article-categories/subj-group")
    if len(subj_groups) > 1:
        print("article have 2 or more types")

    # clean up
    fragments = subj_groups[0].xpath(".//text()")
    return ''.join(fragments).strip().replace("\n", "")

In [None]:
# collect all xml files found one or two levels below input_path
xml_paths = []
# raw string: "\." in a normal string literal is an invalid escape sequence
collect_files(input_path, xml_paths, pattern=r".*\.xml$", collect_dirs=False, min_depth=1, max_depth=2)
print("total xml files: %d" % len(xml_paths))

In [None]:
# deprecated: get a list of non research article
# every line of the list file, stripped of surrounding whitespace, becomes one entry
non_acs_article_list = set()
with open(non_acs_article_path, "r") as list_file:
    non_acs_article_list.update(line.strip() for line in list_file)

In [None]:
# parse the files
# once this cell has run, everything extracted lives in processed_files
# processed_files: key(pub_num) -> value(article_info)
processed_files = {}
for xml_path in tqdm(xml_paths):

    # pub_num is the file name up to its first dot
    xml_name = xml_path.split("/")[-1]
    pub_num = xml_name.split(".")[0]

    # parse the document and pull out the values that are needed more than once
    root = etree.parse(xml_path).getroot()
    issue_pub_date, electron_pub_date = extract_date(root)
    article_type = extract_article_type(root)

    # store everything extracted from this article under its pub_num
    # an article counts as research when its type text contains "research"
    processed_files[pub_num] = {
        "is_research": "research" in article_type.lower(),
        "keywords": extract_keywords(root),
        "abstract": extract_abstract(root),
        "body": extract_body(root),
        "issue_pub_date": issue_pub_date,
        "electron_pub_date": electron_pub_date,
        "article_id": extract_id(root),
        "internal_id": extract_internal_id(root),
        "history": extract_history(root),
        "type": article_type
    }

In [None]:
# print one sample for inspection
# NOTE: "sb6b00034" is a hard-coded pub_num; this raises KeyError when that
# article is not among the parsed files
pprint(processed_files["sb6b00034"])

In [None]:
# show some stats: number of articles per type and number of history events per event name
type_count = {}
event_count = {}
for article_info in processed_files.values():

    # one tick for the type of this article
    kind = article_info["type"]
    type_count[kind] = type_count.get(kind, 0) + 1

    # one tick for every history event of this article
    for history_entry in article_info["history"]:
        event_name = history_entry["event"]
        event_count[event_name] = event_count.get(event_name, 0) + 1

pprint(type_count)
print()
pprint(event_count)

In [None]:
# find out all the zip files and unzip them
# do this once when there is new data
# suppl_files_zip = []
# collect_files(input_path, suppl_files_zip, pattern=".*\.zip$", collect_dirs=False, min_depth=3)
# print("zip files: %d" % len(suppl_files_zip))
# for zip_file in suppl_files_zip:
#     zip_file_dir = re.sub("/[^/]*$", "", zip_file)
#     res = sp.run(["unzip", "-n", zip_file, "-d", zip_file_dir])
#     if res.returncode != 0:
#         print(res)

In [None]:
# collect all suppl files: every file three or more levels below input_path
collected_paths = []
collect_files(input_path, collected_paths, pattern="", collect_dirs=False, min_depth=3)

# leave out anything with __MACOSX in its path
suppl_files_all = []
for candidate in collected_paths:
    if re.match(".*__MACOSX.*", candidate) is None:
        suppl_files_all.append(candidate)
print("suppl files: %d" % len(suppl_files_all))

In [None]:
# attach all the suppl path to processed files
# clear the suppl list of each article first to make the following code idempotent
for article_info in processed_files.values():
    article_info["suppl_files"] = []

for path in suppl_files_all:

    # path relative to input_path; computing it with relpath (instead of searching
    # the path for a component named like the input directory) stays correct when
    # an ancestor directory happens to share that name
    rpath = os.path.relpath(path, input_path)

    # the first component below input_path is the article's pub_num
    pub_num = rpath.split(os.sep)[0]

    # create info dict; "sequences" is filled in by the later sequence extraction cells
    suppl_info = {
        "suppl_filename": os.path.basename(path),
        "rpath": rpath,
        "sequences": None
    }

    # push it into the processed files dict
    # files below a directory that has no parsed article are ignored
    if pub_num in processed_files:
        processed_files[pub_num]["suppl_files"].append(suppl_info)

In [None]:
# get supplementary files by extension
def filter_suppl_file_by_ext(all_articles, allowed_ext):
    """Return the supplementary files whose extension is in ``allowed_ext``.

    all_articles -- dict of article_info dicts, each with a "suppl_files" list
                    whose entries have a "suppl_filename"
    allowed_ext  -- container of extensions without the leading dot

    Returns a list of ``(suppl_info, article_info)`` tuples in iteration order.
    The extension is the part after the last dot; it matches either as written
    or lower-cased, so "S1.PDF" is found with {"pdf"}. A file name without any
    dot has no extension and never matches.
    """
    suppl_files_to_check = []
    for article_info in all_articles.values():
        for suppl_info in article_info["suppl_files"]:
            _, dot, ext = suppl_info["suppl_filename"].rpartition(".")

            # no dot at all: a file called "txt" is not a .txt file
            if not dot:
                continue

            # only allow the given extensions, ignoring upper/lower case
            if ext in allowed_ext or ext.lower() in allowed_ext:
                suppl_files_to_check.append((suppl_info, article_info))
    return suppl_files_to_check

In [None]:
# use sbol api to validate sequence files
def validate_sequence(file):
    """Send the content of ``file`` to the sbol validator api.

    file -- path of the file to validate

    Returns ``(valid, response)`` where ``response`` is the decoded json answer
    of the api and ``valid`` is its "valid" entry. Returns ``(False, None)``
    without contacting the api when the file is 64mb or larger or cannot be
    read as utf-8 text.

    The request is sent with a timeout, so an unresponsive server raises a
    requests timeout exception instead of blocking forever.
    """

    # restrict file size to be less than 64mb
    # this is an api restriction
    max_upload_bytes = 64 * 2 ** 20
    if os.path.getsize(file) >= max_upload_bytes:
        return False, None

    # try to read the content
    try:
        with open(file, "r", encoding="utf-8") as f:
            content = f.read()
    except UnicodeDecodeError:
        return False, None

    # build the request body from the shared template without modifying it, so
    # the (possibly large) file content does not stay behind in validator_param
    payload = dict(validator_param, main_file=content)

    # (connect, read) timeout in seconds; the read limit is generous because the
    # upload can be large
    request_timeout = (10, 600)

    # validate file
    res = requests.post(sbol_validator_url, json=payload, timeout=request_timeout).json()
    return res["valid"], res

In [None]:
# use the api to check if the file is a sequence file
# we will cache the request result to reduce server load
# valid_res_cache: key(rpath of the suppl file) -> value((is_valid, api response))
if os.path.exists(valid_cache_path):
    # NOTE: pickle.load can run arbitrary code, only load a cache file written by this notebook
    with open(valid_cache_path, "rb") as f:
        valid_res_cache = pickle.load(f)
else:
    valid_res_cache = {}

# create a list of all the suppl files that we need to check
sbol_suppl_files_to_check = filter_suppl_file_by_ext(processed_files, sbol_allowed_file_type)

try:
    for suppl_info, article_info in tqdm(sbol_suppl_files_to_check):

        # use api or cache to get convert and store the sequence file in fasta format
        rpath = suppl_info["rpath"]
        if rpath not in valid_res_cache:
            path = os.path.join(input_path, rpath)
            valid_res_cache[rpath] = validate_sequence(path)
        is_valid, data = valid_res_cache[rpath]
        suppl_info["sequences"] = data["result"] if is_valid else None
finally:
    # save cache
    # done even when the loop is interrupted by an error, so the api calls that
    # already finished are not repeated on the next run
    with open(valid_cache_path, "wb") as f:
        pickle.dump(valid_res_cache, f)

In [None]:
# finds any strings that looks like a sequence in pdf
# need to have pdftotext installed
def extract_sequence_from_pdf(path):
    """Return the sequence-like strings found in the text of a pdf file.

    path -- path of the pdf file

    The text is extracted with ``pdftotext -raw`` and upper-cased. Two kinds of
    matches are returned, in this order:
      1. runs of A/T/C/G of at least 3 characters that continue over two or
         more consecutive lines, joined into one string each
      2. runs of A/T/C/G of at least 10 characters on a single line, searched
         after the multi-line matches have been removed from the text

    Returns [] (after printing a notice) when pdftotext exits with an error.
    The text is read from the standard output of pdftotext rather than from a
    shared temporary file, so a failed conversion can never hand back the text
    of a previously processed pdf.
    """

    # "-" as output file makes pdftotext write the text to stdout
    proc = sp.run(
        ["pdftotext", "-raw", path, "-"],
        stdout=sp.PIPE,
        stderr=sp.PIPE,
        encoding="utf-8",
        errors="replace",
    )
    if proc.returncode != 0:
        print("pdftotext failed on %s (exit code %d)" % (path, proc.returncode))
        return []

    text = proc.stdout.upper()
    seqs = []

    # first find all the multiline sequences
    multi_line_pattern = r"([ATCG]{3,}(\n[ATCG]{3,}){1,})"
    for multi_line_seq in re.findall(multi_line_pattern, text):
        # findall returns one tuple per match, item 0 is the whole sequence
        seqs.append(multi_line_seq[0].replace("\n", ""))

    # remove extracted sequences to avoid double counting
    text = re.sub(multi_line_pattern, " ###REMOVED### ", text)

    # then find all the one line sequences
    seqs.extend(re.findall(r"[ATCG]{10,}", text))
    return seqs

In [None]:
# extract sequences from pdf
# every pdf supplementary file is a candidate
pdf_suppl_files_to_check = filter_suppl_file_by_ext(processed_files, {"pdf"})

for suppl_info, article_info in tqdm(pdf_suppl_files_to_check):

    # use pdftotext to extract the sequences from pdf
    pdf_path = os.path.join(input_path, suppl_info["rpath"])
    sequences = extract_sequence_from_pdf(pdf_path)

    # nothing found: leave the "sequences" entry of this file untouched
    if not sequences:
        continue

    # one header line (internal id, file name, running index) plus one sequence line per hit
    suppl_info["sequences"] = "".join(
        f">{article_info['internal_id']}_{suppl_info['suppl_filename']}_{i}\n" + s + "\n"
        for i, s in enumerate(sequences)
    )

In [None]:
# def extract_sequence_from_csv(path):
#     print(path)
#     seqs = []
#     with open(path, "r") as ifile:
#         for line in ifile:
#             for single_line_seq in re.findall("[ATCG]{10,}", line):
#                 seqs.append(single_line_seq)
#     return seqs

In [None]:
# xlsx_files_to_check = filter_suppl_file_by_ext(processed_files, ["csv"])
# for suppl_info, article_info in xlsx_files_to_check:
#     path = os.path.join(suppl_path, suppl_info["rpath"])
#     extract_sequence_from_csv(path)

In [None]:
# write the sequence files
# one file per article that has at least one suppl file with sequences, named
# <pub_num>_<internal_id>.seq.txt, inside the sequence-files directory
# (relative to the current working directory)
sequence_dir = "sequence-files"

# make sure the directory exists, otherwise the first open() below fails
os.makedirs(sequence_dir, exist_ok=True)

for pub_num, article_info in processed_files.items():
    seq_to_write = [
        suppl_info["sequences"]
        for suppl_info in article_info["suppl_files"]
        if suppl_info["sequences"] is not None
    ]
    if seq_to_write:
        seq_file_name = pub_num + "_" + article_info["internal_id"] + ".seq.txt"
        # explicit encoding so the result does not depend on the system locale
        with open(os.path.join(sequence_dir, seq_file_name), "w", encoding="utf-8") as outfile:
            outfile.writelines(seq_to_write)

In [None]:
# pickle the processed files and save the body text as plain text
# the text of research articles goes to <txt_path>/research, everything else to
# <txt_path>/non-research
research_txt_path = os.path.join(txt_path, "research")
non_research_txt_path = os.path.join(txt_path, "non-research")

# make sure all output directories exist, otherwise the open() calls below fail
for out_dir in (output_path, research_txt_path, non_research_txt_path):
    os.makedirs(out_dir, exist_ok=True)

for pub_num, data in tqdm(processed_files.items()):
    with open(os.path.join(output_path, pub_num + ".pkl"), "wb") as out:
        pickle.dump(data, out)

    # one line per title/paragraph of the body
    text_data = [d["text"] + "\n" for d in data["body"]]

    # save plain text
    text_data_path = research_txt_path if data["is_research"] else non_research_txt_path

    # explicit encoding: the article text is not limited to ascii and the default
    # encoding depends on the system locale
    with open(os.path.join(text_data_path, pub_num + ".txt"), "w", encoding="utf-8") as out:
        out.writelines(text_data)

In [None]:
# check one pickle: load a single saved article back and show its type
with open(os.path.join(output_path, "sb300092n.pkl"), "rb") as ifile:
    loaded_article = pickle.load(ifile)
pprint(loaded_article["type"])