In [97]:
from crossref.restful import Works
import re
import pandas as pd
import os
import time
import requests

from citeproc import CitationStylesStyle, CitationStylesBibliography
from citeproc import Citation, CitationItem
from citeproc import formatter
from citeproc.source.json import CiteProcJSON
from citeproc.source.bibtex import BibTeX
import numpy as np
from tqdm import tqdm
import warnings
import json

CSL_JSON_HEADER = {
        "Accept": "application/vnd.citationstyles.csl+json"
}
BIBTEX_HEADER = {
        "Accept": "application/x-bibtex"
}

In [98]:
bibnumber_pattern = "^([0-9]+\.|\[[0-9]+\]|\(.+\))\s+"
delay = 0.25
output_dir = "output"
os.makedirs(output_dir, exist_ok=True)

In [99]:
input_file_path = "./testdata.txt"
bibliography_lines = []
bibnumbers = []
with open(input_file_path,"r") as f:
    for l in f:
        bibnumber = re.search(bibnumber_pattern,l).groups()
        assert len(bibnumber) <= 1
        bibnumber = bibnumber[0] if len(bibnumber) == 1 else ""
        bibnumbers.append(bibnumber)
        bibliography_lines.append(l.replace(bibnumber,"").strip())
bibliography_df = pd.DataFrame({"bibnumber":bibnumbers,"bibliography":bibliography_lines})

In [100]:
bibliography_df.head()

Unnamed: 0,bibnumber,bibliography
0,1.0,Barresi M J F and Gilbert S F (2019) Developme...
1,2.0,Ladoux B and Mège R-M (2017) Mechanobiology of...
2,3.0,"Hollandi R, Moshkov N, Paavolainen L, Tasnadi ..."
3,4.0,"Maška M, Ulman V, Delgado-Rodriguez P, Gómez-d..."
4,5.0,Anlaş A A and Nelson C M (2018) Tissue mechani...


In [101]:
works = Works()

In [102]:
for i, row in tqdm(list(bibliography_df.iterrows())):
    line = row["bibliography"]
    query = works.query(bibliographic=line).sort('relevance').select("DOI")
    res = [r for _,r in zip(range(2),query)]
    time.sleep(delay)

    csl_json_data = []
    bibtex_data = []
    for r in res:
        d = requests.get(f"http://doi.org/{r['DOI']}", headers = CSL_JSON_HEADER)
        if d.status_code == 200:
            csl_json_data.append(d.json())
        time.sleep(delay)
        d = requests.get(f"http://doi.org/{r['DOI']}", headers = BIBTEX_HEADER)
        if d.status_code == 200:
            bibtex_data.append(d.text)
        time.sleep(delay)
    bibliography_df.loc[i,"csl_json_data"] = json.dumps(csl_json_data)
    bibliography_df.loc[i,"bibtex_data"] = "\n".join(bibtex_data)
bibliography_df.to_csv("testdata_with_metadata.csv")

100%|██████████| 188/188 [26:30<00:00,  8.46s/it]


In [104]:
bibliography_df = pd.read_csv("testdata_with_metadata.csv",index_col=0,converters={"bibnumber":str})

In [139]:
# XXX Ad hoc fix for a encoding bug; Should be fixed later
bibliography_df["bibtex_data"] = bibliography_df["bibtex_data"].str.replace("â€“","-")
bibliography_df.loc[11,"bibtex_data"]

In [218]:
d = json.loads(bibliography_df.loc[5,"csl_json_data"])
with open("devbio.json","w") as f:
    json.dump(d[0],f)

In [219]:
d = json.loads(bibliography_df.loc[10,"csl_json_data"])
with open("science.json","w") as f:
    json.dump(d[0],f)

In [273]:
bibliography_df.loc[10,"bibtex_data"]

' @article{Keller_2013, title={Imaging Morphogenesis: Technological Advances and Biological Insights}, volume={340}, ISSN={1095-9203}, url={http://dx.doi.org/10.1126/science.1234168}, DOI={10.1126/science.1234168}, number={6137}, journal={Science}, publisher={American Association for the Advancement of Science (AAAS)}, author={Keller, Philipp J.}, year={2013}, month=jun }\n\n @inbook{1966, ISSN={0065-2962}, url={http://dx.doi.org/10.1016/b978-1-4831-9952-8.50012-2}, DOI={10.1016/b978-1-4831-9952-8.50012-2}, booktitle={Advances in Morphogenesis}, publisher={Elsevier}, year={1966}, pages={337-340} }\n'

In [232]:
d = json.loads(bibliography_df.loc[32,"csl_json_data"])
with open("elife.json","w") as f:
    json.dump(d[0],f)

In [254]:
d = json.loads(bibliography_df.loc[19,"csl_json_data"])
with open("wrong-year.json","w") as f:
    json.dump(d[0],f)

In [141]:
bibliography_df.head()

Unnamed: 0,bibnumber,bibliography,csl_json_data,bibtex_data,formatted_bibliography,selected_csl_json_data,container-title,abbreviation,bibliography2,formatted_bibliography2
0,1.0,Barresi M J F and Gilbert S F (2019) Developme...,"[{""indexed"": {""date-parts"": [[2022, 8, 5]], ""d...","@article{Belousov_2011, title={Scott F. Gilbe...","Sperber G H (1995) Developmental biology, 4th ...","{""indexed"": {""date-parts"": [[2023, 10, 27]], ""...",American Journal of Medical Genetics,Am. J. Med. Genet.,Barresi M J F and Gilbert S F (2019) Developme...,"Sperber G H (1995) Developmental biology, 4th ..."
1,2.0,Ladoux B and Mège R-M (2017) Mechanobiology of...,"[{""indexed"": {""date-parts"": [[2024, 1, 7]], ""d...","@article{Ladoux_2017, title={Mechanobiology o...",Ladoux B and Mège R-M (2017) Mechanobiology of...,"{""indexed"": {""date-parts"": [[2024, 1, 7]], ""da...",Nature Reviews Molecular Cell Biology,Nat. Rev. Mol. Cell Biol.,Ladoux B and Mège R-M (2017) Mechanobiology of...,Ladoux B and Mège R-M (2017) Mechanobiology of...
2,3.0,"Hollandi R, Moshkov N, Paavolainen L, Tasnadi ...","[{""indexed"": {""date-parts"": [[2024, 1, 4]], ""d...","@article{Hollandi_2022, title={Nucleus segmen...","Hollandi R, Moshkov N, Paavolainen L, Tasnadi ...","{""indexed"": {""date-parts"": [[2024, 1, 4]], ""da...",Trends in Cell Biology,Trends Cell Biol.,"Hollandi R, Moshkov N, Paavolainen L, Tasnadi ...","Hollandi R, Moshkov N, Paavolainen L, Tasnadi ..."
3,4.0,"Maška M, Ulman V, Delgado-Rodriguez P, Gómez-d...","[{""indexed"": {""date-parts"": [[2023, 12, 31]], ...","@article{Ma_ka_2023, title={The Cell Tracking...","Maška M, Ulman V, Delgado-Rodriguez P, Gómez-d...","{""indexed"": {""date-parts"": [[2023, 12, 31]], ""...",Nature Methods,Nat. Methods,"Maška M, Ulman V, Delgado-Rodriguez P, Gómez-d...","Maška M, Ulman V, Delgado-Rodriguez P, Gómez-d..."
4,5.0,Anlaş A A and Nelson C M (2018) Tissue mechani...,"[{""indexed"": {""date-parts"": [[2023, 12, 30]], ...","@article{Anla__2018, title={Tissue mechanics ...",Anlaş A A and Nelson C M (2018) Tissue mechani...,"{""indexed"": {""date-parts"": [[2023, 12, 30]], ""...",Current Opinion in Cell Biology,Curr. Opin. Cell Biol.,Anlaş A A and Nelson C M (2018) Tissue mechani...,Anlaş A A and Nelson C M (2018) Tissue mechani...


In [142]:
def _calc_similarity(b1,b2):
    s1 = set(b1.split())
    s2 = set(b2.split())
#    print(s1.union(s2))
#    print(s1.intersection(s2))
    return len(s1.intersection(s2))/len(s1.union(s2))

In [143]:
import io


In [257]:
bib_style = CitationStylesStyle("./microscopy.csl", validate=False)
for i, row in bibliography_df.iterrows():
    
    csl_json_data = json.loads(row["csl_json_data"])
    for d in csl_json_data:
        d["id"] = d["DOI"]
        # XXX dirty logic!
        if "page" not in d:
            if "article-number" in d:
                d["page"] = d["article-number"]
            else:
                if "DOI" in d:
                    doi = d["DOI"]
                    if d["container-title"] == "Development":
                        d["page"] = doi.split("/")[-1].replace(".","")
                    elif d["container-title"] == "Science":
                        d["page"] = doi.split("/")[-1].split(".")[-1]
                        print(d["page"])
                    elif d["container-title"] == "eLife":
                        d["page"] = "e"+doi.split("/")[-1].split(".")[-1]
                    else:
                        d["page"] = doi.split("/")[-1].split(".")[-1]
                        print(d["page"])
        if "issued" in d and "published-print" in d:
            d["issued"] = d["published-print"].copy()
            print(d["issued"])
 
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", UserWarning)
        try:
            #with io.StringIO(row["bibtex_data"]) as f:
            #    bib_source = BibTeX(f) 
            bib_source = CiteProcJSON(csl_json_data)
        except:
            row["formatted_bibliography"] = ""
            continue
    bibliography = CitationStylesBibliography(bib_style, bib_source, formatter.plain)
    for d in bib_source:
        c = Citation([CitationItem(d)])
        bibliography.register(c)
    
    formatted_bibliography = []
    scores = []
    for item in bibliography.bibliography():
        formatted = re.sub(bibnumber_pattern,"",str(item)).strip()
        formatted_bibliography.append(formatted)
        scores.append(_calc_similarity(row["bibliography"],formatted))
    ind = np.argmax(scores)
    bibliography_df.loc[i,"formatted_bibliography"] = formatted_bibliography[ind].replace("\n"," ")
    bibliography_df.loc[i,"selected_csl_json_data"] = json.dumps(csl_json_data[ind])

{'date-parts': [[2011, 9]]}
{'date-parts': [[1995, 7, 17]]}
{'date-parts': [[2017, 12]]}
{'date-parts': [[2017, 12]]}
{'date-parts': [[2022, 4]]}
814962
{'date-parts': [[2023, 7]]}
450019
{'date-parts': [[2018, 10]]}
{'date-parts': [[2014, 2]]}
{'date-parts': [[2020, 3, 1]]}
{'date-parts': [[2017, 7]]}
{'date-parts': [[2021, 1]]}
{'date-parts': [[2021, 6]]}
{'date-parts': [[2021, 12]]}
630272
{'date-parts': [[2022, 3]]}
{'date-parts': [[2014, 11]]}
{'date-parts': [[2022, 2]]}
{'date-parts': [[2020, 4]]}
1234168
{'date-parts': [[2013, 6, 7]]}
{'date-parts': [[1966]]}
aba1195
{'date-parts': [[2020, 7, 24]]}
487968
{'date-parts': [[2023, 1, 6]]}
487968
4715
{'date-parts': [[2023]]}
{'date-parts': [[2023, 2]]}
{'date-parts': [[2013, 2, 26]]}
{'date-parts': [[2013, 4, 9]]}
{'date-parts': [[2015, 5]]}
{'date-parts': [[2015, 11]]}
{'date-parts': [[2009, 11]]}
s001
{'date-parts': [[2005, 12]]}
{'date-parts': [[2006, 3]]}
{'date-parts': [[2015, 1]]}
{'date-parts': [[2013, 7]]}
{'date-parts': [[

In [258]:
bibliography_df

Unnamed: 0,bibnumber,bibliography,csl_json_data,bibtex_data,formatted_bibliography,selected_csl_json_data,container-title,abbreviation,bibliography2,formatted_bibliography2,selected_csl_json_data2
0,1.,Barresi M J F and Gilbert S F (2019) Developme...,"[{""indexed"": {""date-parts"": [[2022, 8, 5]], ""d...","@article{Belousov_2011, title={Scott F. Gilbe...","Sperber G H (1995) Developmental biology, 4th ...","{""indexed"": {""date-parts"": [[2023, 10, 27]], ""...",American Journal of Medical Genetics,Am. J. Med. Genet.,Barresi M J F and Gilbert S F (2019) Developme...,"Sperber G H (1995) Developmental biology, 4th ...","{""indexed"": {""date-parts"": [[2023, 10, 27]], ""..."
1,2.,Ladoux B and Mège R-M (2017) Mechanobiology of...,"[{""indexed"": {""date-parts"": [[2024, 1, 7]], ""d...","@article{Ladoux_2017, title={Mechanobiology o...",Ladoux B and Mège R-M (2017) Mechanobiology of...,"{""indexed"": {""date-parts"": [[2024, 1, 7]], ""da...",Nature Reviews Molecular Cell Biology,Nat. Rev. Mol. Cell Biol.,Ladoux B and Mège R-M (2017) Mechanobiology of...,Ladoux B and Mège R-M (2017) Mechanobiology of...,"{""indexed"": {""date-parts"": [[2024, 1, 7]], ""da..."
2,3.,"Hollandi R, Moshkov N, Paavolainen L, Tasnadi ...","[{""indexed"": {""date-parts"": [[2024, 1, 4]], ""d...","@article{Hollandi_2022, title={Nucleus segmen...","Hollandi R, Moshkov N, Paavolainen L, Tasnadi ...","{""indexed"": {""date-parts"": [[2024, 1, 4]], ""da...",Trends in Cell Biology,Trends Cell Biol.,"Hollandi R, Moshkov N, Paavolainen L, Tasnadi ...","Hollandi R, Moshkov N, Paavolainen L, Tasnadi ...","{""indexed"": {""date-parts"": [[2024, 1, 4]], ""da..."
3,4.,"Maška M, Ulman V, Delgado-Rodriguez P, Gómez-d...","[{""indexed"": {""date-parts"": [[2023, 12, 31]], ...","@article{Ma_ka_2023, title={The Cell Tracking...","Maška M, Ulman V, Delgado-Rodriguez P, Gómez-d...","{""indexed"": {""date-parts"": [[2023, 12, 31]], ""...",Nature Methods,Nat. Methods,"Maška M, Ulman V, Delgado-Rodriguez P, Gómez-d...","Maška M, Ulman V, Delgado-Rodriguez P, Gómez-d...","{""indexed"": {""date-parts"": [[2023, 12, 31]], ""..."
4,5.,Anlaş A A and Nelson C M (2018) Tissue mechani...,"[{""indexed"": {""date-parts"": [[2023, 12, 30]], ...","@article{Anla__2018, title={Tissue mechanics ...",Anlaş A A and Nelson C M (2018) Tissue mechani...,"{""indexed"": {""date-parts"": [[2023, 12, 30]], ""...",Current Opinion in Cell Biology,Curr. Opin. Cell Biol.,Anlaş A A and Nelson C M (2018) Tissue mechani...,Anlaş A A and Nelson C M (2018) Tissue mechani...,"{""indexed"": {""date-parts"": [[2023, 12, 30]], ""..."
...,...,...,...,...,...,...,...,...,...,...,...
183,184.,"Goltsev Y, Samusik N, Kennedy-Darling J, Bhate...","[{""indexed"": {""date-parts"": [[2024, 1, 6]], ""d...","@article{Goltsev_2018, title={Deep Profiling ...","Goltsev Y, Samusik N, Kennedy-Darling J, Bhate...","{""indexed"": {""date-parts"": [[2024, 1, 6]], ""da...",Cell,Cell,"Goltsev Y, Samusik N, Kennedy-Darling J, Bhate...","Goltsev Y, Samusik N, Kennedy-Darling J, Bhate...","{""indexed"": {""date-parts"": [[2024, 1, 6]], ""da..."
184,185.,"Li X, Zhang Y, Wu J, and Dai Q (2023) Challeng...","[{""indexed"": {""date-parts"": [[2023, 12, 31]], ...","@article{Li_2023, title={Challenges and oppor...","Li X, Zhang Y, Wu J, and Dai Q (2023) Challeng...","{""indexed"": {""date-parts"": [[2023, 12, 31]], ""...",Nature Methods,Nat. Methods,"Li X, Zhang Y, Wu J, and Dai Q (2023) Challeng...","Li X, Zhang Y, Wu J, and Dai Q (2023) Challeng...","{""indexed"": {""date-parts"": [[2023, 12, 31]], ""..."
185,186.,"Nogare D D, Hartley M, Deschamps J, Ellenberg ...","[{""indexed"": {""date-parts"": [[2023, 12, 28]], ...","@article{Nogare_2023, title={Using AI in bioi...","Nogare D D, Hartley M, Deschamps J, Ellenberg ...","{""indexed"": {""date-parts"": [[2023, 12, 28]], ""...",Nature Methods,Nat. Methods,"Nogare D D, Hartley M, Deschamps J, Ellenberg ...","Nogare D D, Hartley M, Deschamps J, Ellenberg ...","{""indexed"": {""date-parts"": [[2023, 12, 28]], ""..."
186,187.,"Carpenter A E, Cimini B A, and Eliceiri K W (2...","[{""indexed"": {""date-parts"": [[2023, 10, 28]], ...","@article{Carpenter_2023, title={Smart microsc...","Carpenter A E, Cimini B A, and Eliceiri K W (2...","{""indexed"": {""date-parts"": [[2023, 10, 28]], ""...",Nature Methods,Nat. Methods,"Carpenter A E, Cimini B A, and Eliceiri K W (2...","Carpenter A E, Cimini B A, and Eliceiri K W (2...","{""indexed"": {""date-parts"": [[2023, 10, 28]], ""..."


In [259]:
container_titles = []
container_titles_short = []
for i, row in bibliography_df.iterrows():
    try:
        csl_json_datum = json.loads(row["selected_csl_json_data"])
    except:
        continue
    title = csl_json_datum["container-title"]
    bibliography_df.loc[i,"container-title"] = title if isinstance(title,str) else ""
    if "container-title" in csl_json_datum and "container-title-short" in csl_json_datum:
        container_titles.append(csl_json_datum["container-title"])
        container_titles_short.append(csl_json_datum["container-title-short"])
container_df = pd.DataFrame({"container-title":container_titles,"container-title-short":container_titles_short})
container_df.drop_duplicates().to_csv(os.path.join(output_dir,"container_titles.csv"),index=False)
container_df2 = pd.read_csv(os.path.join(output_dir,"container_titles.csv"))

In [260]:
bibliography_df.head()

Unnamed: 0,bibnumber,bibliography,csl_json_data,bibtex_data,formatted_bibliography,selected_csl_json_data,container-title,abbreviation,bibliography2,formatted_bibliography2,selected_csl_json_data2
0,1.0,Barresi M J F and Gilbert S F (2019) Developme...,"[{""indexed"": {""date-parts"": [[2022, 8, 5]], ""d...","@article{Belousov_2011, title={Scott F. Gilbe...","Sperber G H (1995) Developmental biology, 4th ...","{""indexed"": {""date-parts"": [[2023, 10, 27]], ""...",American Journal of Medical Genetics,Am. J. Med. Genet.,Barresi M J F and Gilbert S F (2019) Developme...,"Sperber G H (1995) Developmental biology, 4th ...","{""indexed"": {""date-parts"": [[2023, 10, 27]], ""..."
1,2.0,Ladoux B and Mège R-M (2017) Mechanobiology of...,"[{""indexed"": {""date-parts"": [[2024, 1, 7]], ""d...","@article{Ladoux_2017, title={Mechanobiology o...",Ladoux B and Mège R-M (2017) Mechanobiology of...,"{""indexed"": {""date-parts"": [[2024, 1, 7]], ""da...",Nature Reviews Molecular Cell Biology,Nat. Rev. Mol. Cell Biol.,Ladoux B and Mège R-M (2017) Mechanobiology of...,Ladoux B and Mège R-M (2017) Mechanobiology of...,"{""indexed"": {""date-parts"": [[2024, 1, 7]], ""da..."
2,3.0,"Hollandi R, Moshkov N, Paavolainen L, Tasnadi ...","[{""indexed"": {""date-parts"": [[2024, 1, 4]], ""d...","@article{Hollandi_2022, title={Nucleus segmen...","Hollandi R, Moshkov N, Paavolainen L, Tasnadi ...","{""indexed"": {""date-parts"": [[2024, 1, 4]], ""da...",Trends in Cell Biology,Trends Cell Biol.,"Hollandi R, Moshkov N, Paavolainen L, Tasnadi ...","Hollandi R, Moshkov N, Paavolainen L, Tasnadi ...","{""indexed"": {""date-parts"": [[2024, 1, 4]], ""da..."
3,4.0,"Maška M, Ulman V, Delgado-Rodriguez P, Gómez-d...","[{""indexed"": {""date-parts"": [[2023, 12, 31]], ...","@article{Ma_ka_2023, title={The Cell Tracking...","Maška M, Ulman V, Delgado-Rodriguez P, Gómez-d...","{""indexed"": {""date-parts"": [[2023, 12, 31]], ""...",Nature Methods,Nat. Methods,"Maška M, Ulman V, Delgado-Rodriguez P, Gómez-d...","Maška M, Ulman V, Delgado-Rodriguez P, Gómez-d...","{""indexed"": {""date-parts"": [[2023, 12, 31]], ""..."
4,5.0,Anlaş A A and Nelson C M (2018) Tissue mechani...,"[{""indexed"": {""date-parts"": [[2023, 12, 30]], ...","@article{Anla__2018, title={Tissue mechanics ...",Anlaş A A and Nelson C M (2018) Tissue mechani...,"{""indexed"": {""date-parts"": [[2023, 12, 30]], ""...",Current Opinion in Cell Biology,Curr. Opin. Cell Biol.,Anlaş A A and Nelson C M (2018) Tissue mechani...,Anlaş A A and Nelson C M (2018) Tissue mechani...,"{""indexed"": {""date-parts"": [[2023, 12, 30]], ""..."


In [261]:
container_df2.head()

Unnamed: 0,container-title,container-title-short
0,American Journal of Medical Genetics,Am. J. Med. Genet.
1,Nature Reviews Molecular Cell Biology,Nat Rev Mol Cell Biol
2,Trends in Cell Biology,Trends in Cell Biology
3,Nature Methods,Nat Methods
4,Current Opinion in Cell Biology,Current Opinion in Cell Biology


In [262]:
from pyiso4.ltwa import Abbreviate
abbr = Abbreviate.create()
abbr("Journal of Microscopy")

'J. Microsc.'

In [263]:
container_df2["abbreviation"] = container_df2["container-title"].map(abbr)
abbreviation_df = container_df2.set_index("container-title")
abbreviation_df

Unnamed: 0_level_0,container-title-short,abbreviation
container-title,Unnamed: 1_level_1,Unnamed: 2_level_1
American Journal of Medical Genetics,Am. J. Med. Genet.,Am. J. Med. Genet.
Nature Reviews Molecular Cell Biology,Nat Rev Mol Cell Biol,Nat. Rev. Mol. Cell Biol.
Trends in Cell Biology,Trends in Cell Biology,Trends Cell Biol.
Nature Methods,Nat Methods,Nat. Methods
Current Opinion in Cell Biology,Current Opinion in Cell Biology,Curr. Opin. Cell Biol.
...,...,...
Developmental Biology,Developmental Biology,Dev. Biol.
"Development, Growth &amp; Differentiation",Dev Growth Differ,Dev. Growth &amp; Differ.
WIREs Developmental Biology,WIREs Developmental Biology,WIREs Dev. Biol.
Journal of Biological Chemistry,Journal of Biological Chemistry,J. Biol. Chem.


In [264]:
bibliography_df

Unnamed: 0,bibnumber,bibliography,csl_json_data,bibtex_data,formatted_bibliography,selected_csl_json_data,container-title,abbreviation,bibliography2,formatted_bibliography2,selected_csl_json_data2
0,1.,Barresi M J F and Gilbert S F (2019) Developme...,"[{""indexed"": {""date-parts"": [[2022, 8, 5]], ""d...","@article{Belousov_2011, title={Scott F. Gilbe...","Sperber G H (1995) Developmental biology, 4th ...","{""indexed"": {""date-parts"": [[2023, 10, 27]], ""...",American Journal of Medical Genetics,Am. J. Med. Genet.,Barresi M J F and Gilbert S F (2019) Developme...,"Sperber G H (1995) Developmental biology, 4th ...","{""indexed"": {""date-parts"": [[2023, 10, 27]], ""..."
1,2.,Ladoux B and Mège R-M (2017) Mechanobiology of...,"[{""indexed"": {""date-parts"": [[2024, 1, 7]], ""d...","@article{Ladoux_2017, title={Mechanobiology o...",Ladoux B and Mège R-M (2017) Mechanobiology of...,"{""indexed"": {""date-parts"": [[2024, 1, 7]], ""da...",Nature Reviews Molecular Cell Biology,Nat. Rev. Mol. Cell Biol.,Ladoux B and Mège R-M (2017) Mechanobiology of...,Ladoux B and Mège R-M (2017) Mechanobiology of...,"{""indexed"": {""date-parts"": [[2024, 1, 7]], ""da..."
2,3.,"Hollandi R, Moshkov N, Paavolainen L, Tasnadi ...","[{""indexed"": {""date-parts"": [[2024, 1, 4]], ""d...","@article{Hollandi_2022, title={Nucleus segmen...","Hollandi R, Moshkov N, Paavolainen L, Tasnadi ...","{""indexed"": {""date-parts"": [[2024, 1, 4]], ""da...",Trends in Cell Biology,Trends Cell Biol.,"Hollandi R, Moshkov N, Paavolainen L, Tasnadi ...","Hollandi R, Moshkov N, Paavolainen L, Tasnadi ...","{""indexed"": {""date-parts"": [[2024, 1, 4]], ""da..."
3,4.,"Maška M, Ulman V, Delgado-Rodriguez P, Gómez-d...","[{""indexed"": {""date-parts"": [[2023, 12, 31]], ...","@article{Ma_ka_2023, title={The Cell Tracking...","Maška M, Ulman V, Delgado-Rodriguez P, Gómez-d...","{""indexed"": {""date-parts"": [[2023, 12, 31]], ""...",Nature Methods,Nat. Methods,"Maška M, Ulman V, Delgado-Rodriguez P, Gómez-d...","Maška M, Ulman V, Delgado-Rodriguez P, Gómez-d...","{""indexed"": {""date-parts"": [[2023, 12, 31]], ""..."
4,5.,Anlaş A A and Nelson C M (2018) Tissue mechani...,"[{""indexed"": {""date-parts"": [[2023, 12, 30]], ...","@article{Anla__2018, title={Tissue mechanics ...",Anlaş A A and Nelson C M (2018) Tissue mechani...,"{""indexed"": {""date-parts"": [[2023, 12, 30]], ""...",Current Opinion in Cell Biology,Curr. Opin. Cell Biol.,Anlaş A A and Nelson C M (2018) Tissue mechani...,Anlaş A A and Nelson C M (2018) Tissue mechani...,"{""indexed"": {""date-parts"": [[2023, 12, 30]], ""..."
...,...,...,...,...,...,...,...,...,...,...,...
183,184.,"Goltsev Y, Samusik N, Kennedy-Darling J, Bhate...","[{""indexed"": {""date-parts"": [[2024, 1, 6]], ""d...","@article{Goltsev_2018, title={Deep Profiling ...","Goltsev Y, Samusik N, Kennedy-Darling J, Bhate...","{""indexed"": {""date-parts"": [[2024, 1, 6]], ""da...",Cell,Cell,"Goltsev Y, Samusik N, Kennedy-Darling J, Bhate...","Goltsev Y, Samusik N, Kennedy-Darling J, Bhate...","{""indexed"": {""date-parts"": [[2024, 1, 6]], ""da..."
184,185.,"Li X, Zhang Y, Wu J, and Dai Q (2023) Challeng...","[{""indexed"": {""date-parts"": [[2023, 12, 31]], ...","@article{Li_2023, title={Challenges and oppor...","Li X, Zhang Y, Wu J, and Dai Q (2023) Challeng...","{""indexed"": {""date-parts"": [[2023, 12, 31]], ""...",Nature Methods,Nat. Methods,"Li X, Zhang Y, Wu J, and Dai Q (2023) Challeng...","Li X, Zhang Y, Wu J, and Dai Q (2023) Challeng...","{""indexed"": {""date-parts"": [[2023, 12, 31]], ""..."
185,186.,"Nogare D D, Hartley M, Deschamps J, Ellenberg ...","[{""indexed"": {""date-parts"": [[2023, 12, 28]], ...","@article{Nogare_2023, title={Using AI in bioi...","Nogare D D, Hartley M, Deschamps J, Ellenberg ...","{""indexed"": {""date-parts"": [[2023, 12, 28]], ""...",Nature Methods,Nat. Methods,"Nogare D D, Hartley M, Deschamps J, Ellenberg ...","Nogare D D, Hartley M, Deschamps J, Ellenberg ...","{""indexed"": {""date-parts"": [[2023, 12, 28]], ""..."
186,187.,"Carpenter A E, Cimini B A, and Eliceiri K W (2...","[{""indexed"": {""date-parts"": [[2023, 10, 28]], ...","@article{Carpenter_2023, title={Smart microsc...","Carpenter A E, Cimini B A, and Eliceiri K W (2...","{""indexed"": {""date-parts"": [[2023, 10, 28]], ""...",Nature Methods,Nat. Methods,"Carpenter A E, Cimini B A, and Eliceiri K W (2...","Carpenter A E, Cimini B A, and Eliceiri K W (2...","{""indexed"": {""date-parts"": [[2023, 10, 28]], ""..."


In [276]:
for i, row in bibliography_df.iterrows():
    title = row["container-title"]
    if title in abbreviation_df.index:
        abbreviated_title = abbreviation_df.loc[title,"abbreviation"]
        short_title = abbreviation_df.loc[title,"container-title-short"]
        patterns = [
            re.escape(abbreviated_title),
            re.escape(abbreviated_title.replace(".","")),
            re.escape(abbreviated_title.replace("\&amp;","")),
            re.escape(abbreviated_title.replace("\&amp;","").replace(".","")),
            re.escape(short_title),
            re.escape(short_title.replace(".","")),
            re.escape(short_title.replace("\&amp;","")),
            re.escape(short_title.replace("\&amp;","").replace(".","")),
        ]
        pattern = "|".join(patterns)
        print(pattern)
        bibliography_df.loc[i,"abbreviation"] = abbreviated_title
        bibliography_df.loc[i,"bibliography2"] = re.sub(pattern,title,bibliography_df.loc[i,"bibliography"])
        bibliography_df.loc[i,"formatted_bibliography2"] = re.sub(pattern,title,bibliography_df.loc[i,"formatted_bibliography"])
    else:
        bibliography_df.loc[i,"abbreviation"] = ""
        bibliography_df.loc[i,"bibliography2"] = bibliography_df.loc[i,"bibliography"]
        bibliography_df.loc[i,"formatted_bibliography2"] = bibliography_df.loc[i,"formatted_bibliography"]

Am\.\ J\.\ Med\.\ Genet\.|Am\ J\ Med\ Genet|Am\.\ J\.\ Med\.\ Genet\.|Am\ J\ Med\ Genet|Am\.\ J\.\ Med\.\ Genet\.|Am\ J\ Med\ Genet|Am\.\ J\.\ Med\.\ Genet\.|Am\ J\ Med\ Genet
Nat\.\ Rev\.\ Mol\.\ Cell\ Biol\.|Nat\ Rev\ Mol\ Cell\ Biol|Nat\.\ Rev\.\ Mol\.\ Cell\ Biol\.|Nat\ Rev\ Mol\ Cell\ Biol|Nat\ Rev\ Mol\ Cell\ Biol|Nat\ Rev\ Mol\ Cell\ Biol|Nat\ Rev\ Mol\ Cell\ Biol|Nat\ Rev\ Mol\ Cell\ Biol
Trends\ Cell\ Biol\.|Trends\ Cell\ Biol|Trends\ Cell\ Biol\.|Trends\ Cell\ Biol|Trends\ in\ Cell\ Biology|Trends\ in\ Cell\ Biology|Trends\ in\ Cell\ Biology|Trends\ in\ Cell\ Biology
Nat\.\ Methods|Nat\ Methods|Nat\.\ Methods|Nat\ Methods|Nat\ Methods|Nat\ Methods|Nat\ Methods|Nat\ Methods
Curr\.\ Opin\.\ Cell\ Biol\.|Curr\ Opin\ Cell\ Biol|Curr\.\ Opin\.\ Cell\ Biol\.|Curr\ Opin\ Cell\ Biol|Current\ Opinion\ in\ Cell\ Biology|Current\ Opinion\ in\ Cell\ Biology|Current\ Opinion\ in\ Cell\ Biology|Current\ Opinion\ in\ Cell\ Biology
Dev\.\ Cell|Dev\ Cell|Dev\.\ Cell|Dev\ Cell|Developmental\ C

In [277]:
inputted_bibliography_path = os.path.join(output_dir,"inputted_bibliography.txt")
searched_bibliography_path = os.path.join(output_dir,"searched_bibliography.txt")
with open(inputted_bibliography_path,"w") as f1, open(searched_bibliography_path,"w") as f2:
    for i, row in bibliography_df.iterrows():
        bibnumber = row["bibnumber"] + " " if row["bibnumber"] != "" else ""
        f1.write(bibnumber+row["bibliography2"]+"\n")
        f2.write(bibnumber+row["formatted_bibliography2"]+"\n")