In [68]:
from crossref.restful import Works
import re
import pandas as pd
import os
import time
import requests

from citeproc import CitationStylesStyle, CitationStylesBibliography
from citeproc import Citation, CitationItem
from citeproc import formatter
from citeproc.source.json import CiteProcJSON
import numpy as np
from tqdm import tqdm
import warnings
import json

# HTTP Accept header sent with every DOI lookup; it requests the CSL-JSON
# media type so the response body can be parsed as JSON.
CSL_JSON_HEADER = {"Accept": "application/vnd.citationstyles.csl+json"}

In [69]:
# Leading reference label to strip from a bibliography line. Three label shapes
# are accepted -- "12.", "[12]" and a parenthesised label such as "(12)" -- and
# the label must be followed by whitespace.
# * Raw string: the non-raw literal relied on invalid escape sequences
#   ("\.", "\[", "\s"), which recent Python versions warn about.
# * "[^()]+" instead of ".+": the greedy ".+" ran on to the LAST ")" followed by
#   whitespace, so "(1) Author (2019) Title" lost "(1) Author (2019)" instead
#   of just "(1)".
bibnumber_pattern = r"^([0-9]+\.|\[[0-9]+\]|\([^()]+\))\s+"
# Pause, in seconds, passed to time.sleep() between consecutive web requests.
delay = 0.25
# Directory for the exported text/CSV files; created up front if it is missing.
output_dir = "output"
os.makedirs(output_dir, exist_ok=True)

In [52]:
# Read the raw reference list (one reference per line) and split every line
# into its leading label (matched by bibnumber_pattern) and the reference text.
input_file_path = "./testdata.txt"
bibliography_lines = []
bibnumbers = []
# Explicit encoding so author names with non-ASCII characters are decoded the
# same way on every platform (assumes the input file is UTF-8 -- adjust if not).
with open(input_file_path, "r", encoding="utf-8") as f:
    for raw_line in f:
        line = raw_line.strip()
        if not line:
            # Blank lines carry no reference to look up.
            continue
        label_match = re.match(bibnumber_pattern, line)
        if label_match is None:
            # Unnumbered reference: keep the whole line and store an empty label
            # (previously this case raised AttributeError on None.groups()).
            bibnumbers.append("")
            bibliography_lines.append(line)
            continue
        bibnumbers.append(label_match.group(1))
        # Slice off only the matched prefix; str.replace() would also delete
        # later occurrences of the label text (e.g. "1." inside "2011.").
        bibliography_lines.append(line[label_match.end():].strip())
bibliography_df = pd.DataFrame({"bibnumber": bibnumbers, "bibliography": bibliography_lines})

In [53]:
# Preview the first rows of the parsed reference table.
# NOTE(review): the saved output shows an "Unnamed: 0" column and float
# bibnumbers, which the loading cell does not produce -- presumably the frame
# was re-read from a CSV in a cell that is no longer in the notebook; confirm.
bibliography_df.head()

Unnamed: 0,bibnumber,bibliography
0,1.0,Barresi M J F and Gilbert S F (2019) Developme...
1,2.0,Ladoux B and Mège R-M (2017) Mechanobiology of...
2,3.0,"Hollandi R, Moshkov N, Paavolainen L, Tasnadi ..."
3,4.0,"Maška M, Ulman V, Delgado-Rodriguez P, Gómez-d..."
4,5.0,Anlaş A A and Nelson C M (2018) Tissue mechani...


In [54]:
# Crossref REST client; the lookup loop calls works.query(...) on it once per
# reference.
works = Works()

In [56]:
# For every reference, ask Crossref for the most relevant DOIs and download the
# CSL-JSON metadata of each candidate from doi.org. The candidates are stored
# per row as a JSON string in the "csl_json_data" column.
max_candidates = 2    # number of top Crossref hits kept per reference
request_timeout = 30  # seconds; without a timeout requests.get() can block forever

csl_json_column = []
# Passing total= keeps the progress bar without materialising iterrows() first.
for _, row in tqdm(bibliography_df.iterrows(), total=len(bibliography_df)):
    line = row["bibliography"]
    query = works.query(bibliographic=line).sort('relevance').select("DOI")
    # zip() against a short range stops after max_candidates hits.
    res = [r for _, r in zip(range(max_candidates), query)]
    time.sleep(delay)

    csl_json_data = []
    for r in res:
        try:
            d = requests.get(
                f"https://doi.org/{r['DOI']}",
                headers=CSL_JSON_HEADER,
                timeout=request_timeout,
            )
            if d.status_code == 200:
                csl_json_data.append(d.json())
        except (requests.RequestException, ValueError) as err:
            # One unreachable DOI or non-JSON body must not abort the whole
            # (long-running) loop; skip the candidate and report it.
            warnings.warn(f"Skipping DOI {r['DOI']}: {err}")
        finally:
            time.sleep(delay)
    csl_json_column.append(json.dumps(csl_json_data))
# One positional column assignment; the list has exactly one entry per row.
bibliography_df["csl_json_data"] = csl_json_column
bibliography_df.to_csv("testdata_csl_json.csv")

100%|██████████| 188/188 [17:10<00:00,  5.48s/it]


In [71]:
def _calc_similarity(b1,b2):
    s1 = set(b1.split())
    s2 = set(b2.split())
#    print(s1.union(s2))
#    print(s1.intersection(s2))
    return len(s1.intersection(s2))/len(s1.union(s2))

In [93]:
# Render every candidate record of a reference with the CSL style and keep the
# rendering that is most similar (token overlap) to the original reference text.
# Rows for which nothing can be rendered get an empty string.
bib_style = CitationStylesStyle("./microscopy.csl", validate=False)
for i, row in bibliography_df.iterrows():
    # A candidate without a DOI cannot be given an id below, so drop it.
    csl_json_data = [d for d in json.loads(row["csl_json_data"]) if "DOI" in d]
    for d in csl_json_data:
        d["id"] = d["DOI"]
    if not csl_json_data:
        # No candidate at all: np.argmax() on an empty list would raise.
        bibliography_df.loc[i, "formatted_bibliography"] = ""
        continue
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", UserWarning)
        try:
            bib_source = CiteProcJSON(csl_json_data)
        except Exception:
            # Write through .loc: `row` is a copy produced by iterrows(), so
            # assigning to it never reached the DataFrame.
            bibliography_df.loc[i, "formatted_bibliography"] = ""
            continue
    bibliography = CitationStylesBibliography(bib_style, bib_source, formatter.plain)
    for d in csl_json_data:
        c = Citation([CitationItem(d["id"])])
        bibliography.register(c)

    formatted_bibliography = []
    scores = []
    for item in bibliography.bibliography():
        # The rendered entry may start with its own label; strip it like the input.
        formatted = re.sub(bibnumber_pattern, "", str(item)).strip()
        formatted_bibliography.append(formatted)
        scores.append(_calc_similarity(row["bibliography"], formatted))
    if not scores:
        bibliography_df.loc[i, "formatted_bibliography"] = ""
        continue
    best = int(np.argmax(scores))
    bibliography_df.loc[i, "formatted_bibliography"] = formatted_bibliography[best].replace("\n", " ")

In [94]:
# Write two parallel text files, one line per reference and in the same order:
# the reference as it was entered and the reference as rendered from the
# looked-up metadata. Both lines are prefixed with the original label, if any.
inputted_bibliography_path = os.path.join(output_dir, "inputted_bibliography.txt")
searched_bibliography_path = os.path.join(output_dir, "searched_bibliography.txt")
# Explicit UTF-8 so non-ASCII author names can be written on every platform.
with open(inputted_bibliography_path, "w", encoding="utf-8") as f1, \
        open(searched_bibliography_path, "w", encoding="utf-8") as f2:
    for _, row in bibliography_df.iterrows():
        label = row["bibnumber"]
        # The f-string also copes with a non-string label (e.g. a float after
        # the frame has been re-read from CSV); a missing label yields no prefix.
        bibnumber = "" if pd.isna(label) or label == "" else f"{label} "
        formatted = row["formatted_bibliography"]
        if pd.isna(formatted):
            # Rows that were never formatted hold NaN, which cannot be
            # concatenated with a string.
            formatted = ""
        f1.write(bibnumber + row["bibliography"] + "\n")
        f2.write(bibnumber + formatted + "\n")

In [102]:
# Collect (full title, short title) pairs from every candidate record that
# carries both fields, then turn them into a two-column table.
title_pairs = [
    (entry["container-title"], entry["container-title-short"])
    for _, record in bibliography_df.iterrows()
    for entry in json.loads(record["csl_json_data"])
    if "container-title" in entry and "container-title-short" in entry
]
container_titles = [full_title for full_title, _ in title_pairs]
container_titles_short = [short_title for _, short_title in title_pairs]
container_df = pd.DataFrame(
    {
        "container-title": container_titles,
        "container-title-short": container_titles_short,
    }
)

In [105]:
# Save each distinct (full title, short title) pair once, without the index.
container_titles_path = os.path.join(output_dir, "container_titles.csv")
unique_container_df = container_df.drop_duplicates()
unique_container_df.to_csv(container_titles_path, index=False)