In [1]:
import os
import re
import pickle
import numpy as np
from pprint import pprint
from lxml import etree
from tqdm.notebook import tqdm

In [2]:
# Absolute locations of the raw XML dump (input) and of the pickled
# results (output), both resolved against the current working directory.
input_path, output_path = map(
    os.path.abspath,
    ("../acs-data/unzip-files/", "../acs-data/processed-files/"),
)

In [3]:
# function to collect matching files and dirs
def collect_files(root, res, pattern="", collect_dirs=True):
    """Recursively append paths under ``root`` that match ``pattern`` to ``res``.

    Args:
        root: directory to scan.
        res: list that receives the matching paths (mutated in place).
        pattern: regular expression applied with ``re.match`` to the full
            joined path of every entry; the default ``""`` matches everything.
        collect_dirs: when False, matching directories are not recorded
            (they are still descended into).

    Returns:
        None. Results are delivered through ``res``.
    """
    for entry in os.listdir(root):
        entry_path = os.path.join(root, entry)
        entry_is_dir = os.path.isdir(entry_path)

        # record the entry when its path matches; directories only on request
        if re.match(pattern, entry_path) and (collect_dirs or not entry_is_dir):
            res.append(entry_path)

        # descend into sub-directories whether or not they were recorded
        if entry_is_dir:
            collect_files(entry_path, res, pattern, collect_dirs)

In [4]:
# collect all xml files
xml_paths = []
# raw string so that "\." reaches the regex engine instead of being an
# invalid string escape
collect_files(input_path, xml_paths, pattern=r".*\.xml$", collect_dirs=False)
print(f"total xml files: {len(xml_paths)}")

total xml files: 1545


In [5]:
# helps to extract text from paragraph
def p_helper(node):
    """Flatten a paragraph element into a single cleaned-up line of text.

    The line is assembled from the paragraph's leading text, the text found
    inside each direct child (skipped for ``named-content`` and
    ``inline-formula`` children) and the tail text following each child.

    Args:
        node: paragraph element exposing ``text`` and iteration over its
            children; each child must expose ``tag``, ``tail`` and
            ``xpath`` (lxml-style element API).

    Returns:
        str: the paragraph on one line with whitespace collapsed and the
        spacing around punctuation tightened; ``""`` when there is no text.
    """
    # children whose own content is dropped (their tail text is still kept)
    skipped_tags = ("named-content", "inline-formula")

    # each paragraph is put into a line; node.text is None both for <p/> and
    # for a paragraph that opens directly with a child element, so it must
    # not be used to decide that the paragraph is empty
    pieces = [node.text]
    for child in node:
        if child.tag not in skipped_tags:
            pieces.append(" ".join(child.xpath(".//text()")))
        pieces.append(child.tail)

    # text/tail are None when absent; drop them so join() cannot raise
    line = " ".join(piece for piece in pieces if piece is not None)
    line = line.strip()

    # clean up newlines and consecutive spaces
    line = re.sub(r"\s+", " ", line)

    # fix the space around punctuation
    line = re.sub(r"\s([.,\):;])", r"\1", line)
    line = re.sub(r"\(\s", r"(", line)
    line = re.sub(r"\s*([-/])\s*", r"\1", line)
    return line

In [6]:
def kwd_helper(node):
    """Return the text of a keyword node as one whitespace-normalised string.

    Args:
        node: element exposing an lxml-style ``xpath`` method.

    Returns:
        str: all text fragments below ``node`` joined with single spaces,
        without leading or trailing whitespace.
    """
    # gather every text fragment below the keyword node
    kwd = " ".join(node.xpath(".//text()"))
    kwd = kwd.replace("\n", " ").strip()

    # raw string: "\s" is a regex class, not a (invalid) string escape
    return re.sub(r"\s+", " ", kwd)

In [7]:
def extract_body(root):
    """Return the cleaned text of every paragraph in the article body.

    The query selects ``<p>`` elements and ``<title>`` elements that are not
    inside a caption, in document order. Titles are currently ignored, so
    only paragraphs contribute to the result.

    Args:
        root: root element of the parsed article (lxml-style ``xpath``).

    Returns:
        list[str]: one ``p_helper`` line per body paragraph.
    """
    body_nodes = root.xpath("/article/body//*[self::p or (self::title and not(ancestor::caption))]")

    # titles are matched by the query but intentionally left out of the output
    return [p_helper(body_node) for body_node in body_nodes if body_node.tag == "p"]

In [8]:
def extract_abstract(root):
    """Return the first abstract paragraph as a one-element list.

    Args:
        root: root element of the parsed article (lxml-style ``xpath``).

    Returns:
        list[str]: ``[cleaned paragraph]`` for the first ``abstract/p`` match,
        or ``[]`` when the document has no such node.
    """
    paragraphs = root.xpath("//abstract/p")

    # only the first matching paragraph is kept
    return [p_helper(paragraphs[0])] if paragraphs else []

In [9]:
def extract_keywords(root):
    """Return the cleaned text of every ``kwd`` under a ``kwd-group``.

    Args:
        root: root element of the parsed article (lxml-style ``xpath``).

    Returns:
        list[str]: one ``kwd_helper`` string per keyword node, in document
        order; empty when no keyword is present.
    """
    return [kwd_helper(kwd_node) for kwd_node in root.xpath("//kwd-group/kwd")]

In [10]:
# parse the files
# make sure the destination exists so the first pickle can be written
os.makedirs(output_path, exist_ok=True)

for xml_path in tqdm(xml_paths):

    print("parsing %s" % xml_path)

    # get the root of the xml
    root = etree.parse(xml_path).getroot()

    # create a dictionary holding the xml data
    xml_data = {
        "keywords": extract_keywords(root),
        "abstract": extract_abstract(root),
        "body": extract_body(root)
    }

    # pickle the data
    # name the file to <pub #>.pkl; basename() handles every path separator,
    # whereas splitting on "/" leaves the full path intact on Windows
    pub_num = os.path.basename(xml_path).split(".")[0]
    with open(os.path.join(output_path, pub_num + ".pkl"), "wb") as f:
        pickle.dump(xml_data, f)

HBox(children=(FloatProgress(value=0.0, max=1545.0), HTML(value='')))

parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb7b00050/sb7b00050.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb6b00301/sb6b00301.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb300068g/sb300068g.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb6b00050/sb6b00050.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb6b00034/sb6b00034.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb5b00187/sb5b00187.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb7b00088/sb7b00088.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb8b00471/sb8b00471.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb400072v/sb400072v.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb300069k/sb300069k.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb8b00194/sb8b00194.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb500331x/sb5003

parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb7b00459/sb7b00459.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb500345t/sb500345t.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb400137b/sb400137b.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb500273n/sb500273n.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb9b00080/sb9b00080.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb400205x/sb400205x.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb6b00166/sb6b00166.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb8b00484/sb8b00484.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb6b00206/sb6b00206.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb7b00214/sb7b00214.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb500252a/sb500252a.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb500035y/sb5000

parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb8b00260/sb8b00260.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb8b00038/sb8b00038.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb500272d/sb500272d.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb500351f/sb500351f.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb9b00019/sb9b00019.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb5b00249/sb5b00249.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb5b00085/sb5b00085.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb400073z/sb400073z.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb3000904/sb3000904.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb9b00226/sb9b00226.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb8b00529/sb8b00529.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb2000116/sb2000

parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb4000417/sb4000417.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb8b00229/sb8b00229.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb8b00105/sb8b00105.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb8b00336/sb8b00336.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb7b00256/sb7b00256.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb8b00273/sb8b00273.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb300028q/sb300028q.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb7b00342/sb7b00342.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb8b00124/sb8b00124.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb5b00287/sb5b00287.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb5001406/sb5001406.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb9b00051/sb9b00

parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb500359q/sb500359q.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb7b00398/sb7b00398.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb8b00063/sb8b00063.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb5b00132/sb5b00132.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb500307w/sb500307w.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb8b00191/sb8b00191.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb300034m/sb300034m.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb8b00055/sb8b00055.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb300050j/sb300050j.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb500226j/sb500226j.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb7b00128/sb7b00128.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb200021s/sb2000

parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb6b00293/sb6b00293.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb3000657/sb3000657.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb8b00234/sb8b00234.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb6b00150/sb6b00150.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb6b00361/sb6b00361.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb5b00219/sb5b00219.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb500233u/sb500233u.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb5b00052/sb5b00052.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb8b00146/sb8b00146.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb500240p/sb500240p.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb400082j/sb400082j.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb9b00203/sb9b00

parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb8b00452/sb8b00452.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb300077b/sb300077b.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb400128g/sb400128g.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb5b00145/sb5b00145.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb6b00252/sb6b00252.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb400042a/sb400042a.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb6b00160/sb6b00160.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb5b00161/sb5b00161.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb6b00138/sb6b00138.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb5b00253/sb5b00253.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb500243c/sb500243c.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb5b00229/sb5b00

parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb8b00225/sb8b00225.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb8b00242/sb8b00242.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb7b00424/sb7b00424.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb6b00392/sb6b00392.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb7b00430/sb7b00430.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb9b00036/sb9b00036.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb7b00169/sb7b00169.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb8b00534/sb8b00534.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb5000143/sb5000143.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb9b00238/sb9b00238.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb8b00535/sb8b00535.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb400201u/sb4002

parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb6b00038/sb6b00038.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb3001112/sb3001112.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb400166j/sb400166j.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb7b00142/sb7b00142.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb5b00160/sb5b00160.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb6b00224/sb6b00224.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb7b00304/sb7b00304.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb8b00338/sb8b00338.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb8b00324/sb8b00324.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb9b00044/sb9b00044.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb5b00212/sb5b00212.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb5b00222/sb5b00

parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb5000884/sb5000884.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb8b00290/sb8b00290.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb8b00188/sb8b00188.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb500324p/sb500324p.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb6b00354/sb6b00354.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb400067v/sb400067v.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb7b00057/sb7b00057.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb8b00411/sb8b00411.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb6b00017/sb6b00017.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb500042m/sb500042m.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb8b00342/sb8b00342.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb300120n/sb3001

parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb6b00064/sb6b00064.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb500113b/sb500113b.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb9b00103/sb9b00103.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb6b00239/sb6b00239.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb6b00371/sb6b00371.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb300055e/sb300055e.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb8b00309/sb8b00309.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb400133g/sb400133g.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb8b00317/sb8b00317.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb7b00112/sb7b00112.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb3000589/sb3000589.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb8b00505/sb8b00

parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb8b00361/sb8b00361.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb5b00165/sb5b00165.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb9b00005/sb9b00005.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb5b00087/sb5b00087.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb5002533/sb5002533.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb8b00432/sb8b00432.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb400003y/sb400003y.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb5b00054/sb5b00054.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb200001q/sb200001q.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb6b00165/sb6b00165.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb8b00526/sb8b00526.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb500249g/sb5002

parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb6b00390/sb6b00390.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb9b00032/sb9b00032.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb5b00043/sb5b00043.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb7b00403/sb7b00403.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb9b00219/sb9b00219.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb9b00114/sb9b00114.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb9b00066/sb9b00066.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb5b00141/sb5b00141.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb9b00123/sb9b00123.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb500270h/sb500270h.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb500018f/sb500018f.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb300075t/sb3000

parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb300010a/sb300010a.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb6b00330/sb6b00330.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb4000614/sb4000614.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb300052u/sb300052u.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb9b00092/sb9b00092.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb8b00192/sb8b00192.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb6b00306/sb6b00306.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb5b00201/sb5b00201.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb3001276/sb3001276.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb8b00521/sb8b00521.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb300092n/sb300092n.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb300060e/sb3000

parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb500302y/sb500302y.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb400047f/sb400047f.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb6b00007/sb6b00007.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb7b00442/sb7b00442.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb7b00270/sb7b00270.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb300033q/sb300033q.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb8b00541/sb8b00541.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb300018h/sb300018h.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb7b00308/sb7b00308.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb9b00175/sb9b00175.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb5b00246/sb5b00246.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb300041z/sb3000

parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb7b00078/sb7b00078.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb300110b/sb300110b.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb6b00040/sb6b00040.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb8b00533/sb8b00533.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb500255k/sb500255k.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb8b00510/sb8b00510.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb7b00462/sb7b00462.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb500225r/sb500225r.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb9b00131/sb9b00131.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb3000645/sb3000645.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb500195w/sb500195w.xml
parsing /home/jiawei/sbks-ucsd/biobert/acs-data/unzip-files/sb7b00167/sb7b00

In [11]:
# load some file to check
pkl_paths = []
# raw string so that "\." reaches the regex engine instead of being an
# invalid string escape
collect_files(output_path, pkl_paths, pattern=r".*\.pkl$", collect_dirs=False)
print("pkl files found: %d" % len(pkl_paths))

pkl files found: 1545


In [12]:
# spot-check one pickled article (index 101 is an arbitrary pick)
with open(pkl_paths[101], mode="rb") as f:
    sample = pickle.load(f)
pprint(sample)

{'abstract': ['Metabolic engineering and synthetic biology usually require '
              'universal expression systems for stable and efficient gene '
              'expression in various organisms. In this study, a '
              'host-independent and stable T7 expression system had been '
              'developed by integrating T7 RNA polymerase and its cognate '
              'transcriptional units in single plasmid. The expression of T7 '
              'RNA polymerase was restricted below its lethal threshold using '
              'a T7 RNA polymerase antisense gene cassette, which allowed long '
              'periods of cultivation and protein production. In addition, by '
              'designing ribosome binding sites, we further tuned the '
              'expression capacity of this novel T7 system within a wide '
              'range. This host-independent expression system efficiently '
              'expressed genes in five different Gram-negative strains and one '
     

          'designed RBSs were constructed using standard cloning techniques, '
          'including PCR, restriction enzyme digestion and ligation. For '
          'application in P. putida and T. morbirosei, a HITES fragment was '
          'amplified from pET30-HITES with primers HITES-KT-F and HITES-KT-R, '
          'the plasmid fragment was amplified from pBBR1MCS-5 with primers '
          'pBBR1MCS5-KT-F and pBBR1MCS5-KT-R, and the HITES fragment was '
          'cloned into the plasmid fragment by Gibson assembly, resulting in '
          'pBBR1MCS5-KT-HITES. (Figure S5) For application in Sinorhizobium, a '
          'HITES fragment was amplified from pET30-HITES with primers '
          'HITES-Sino-F and HITES-Sino-R, the plasmid fragment was amplified '
          'from pBBR1MCS-5 with primers pBBR1MCS5-Sino-F and pBBR1MCS5-Sino-R, '
          'and the HITES fragment was cloned into the plasmid fragment by '
          'Gibson assembly, resulting in pBBR1MCS5-Sino-HITES. (Figu