In [60]:
!pip install requests crossrefapi
!pip install dehyphen

Collecting dehyphen
  Downloading dehyphen-0.3.4-py3-none-any.whl (18 kB)
Collecting clean-text[gpl] (from dehyphen)
  Downloading clean_text-0.6.0-py3-none-any.whl (11 kB)
Collecting pd3f-flair (from dehyphen)
  Downloading pd3f_flair-0.6.0.1-py3-none-any.whl (188 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m188.5/188.5 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting emoji<2.0.0,>=1.0.0 (from clean-text[gpl]->dehyphen)
  Downloading emoji-1.7.0.tar.gz (175 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m175.4/175.4 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting ftfy<7.0,>=6.0 (from clean-text[gpl]->dehyphen)
  Downloading ftfy-6.1.3-py3-none-any.whl (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.4/53.4 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting unidecode<2.0.0,>=1.1.1 (from clean

# Retrieve Crossref data

In [309]:
import requests
import json
from time import sleep
from tqdm import tqdm
import difflib
import re
from unicodedata import normalize
import pickle

In [432]:
def get_article_info_crossref(title,container_title=None):
    base_url = "https://api.crossref.org/works"
    params = {"query.title": title}
    if container_title:
        params.update({"query.container-title":container_title})
    response = requests.get(base_url, params=params)
    
    if response.status_code == 200:
        data = response.json()
        if data['message']['items']:
            article = data['message']['items'][:3]
            return article
        else:
            return {}
    else:
        raise RuntimeError("Failed to retrieve data.")

In [433]:
def get_article_info_crossref2(text):
    base_url = "https://api.crossref.org/works"
    params = {"query.bibliographic": text}
    response = requests.get(base_url, params=params)
    
    if response.status_code == 200:
        data = response.json()
        if data['message']['items']:
            article = data['message']['items'][0]
            return article
        else:
            return {}
    else:
        raise RuntimeError("Failed to retrieve data.")

In [434]:
with open("testdata.txt","r") as f:
    original_data = f.readlines()
original_data_with_ref_num = {}
for d in original_data:
    res = re.search("([0-9]+)\.",d)
    k = res.groups(0)[0]
    original_data_with_ref_num[k] = d
original_data_with_ref_num

{'1': '1. Barresi M J F and Gilbert S F (2019) Developmental Biology. Twelve edition, (Sinauer Associates Inc, New York).\n',
 '2': '2. Ladoux B and Mège R-M (2017) Mechanobiology of collective cell behaviours. Nat. Rev. Mol. Cell Biol. 18: 743–757.\n',
 '3': '3. Hollandi R, Moshkov N, Paavolainen L, Tasnadi E, Piccinini F, and Horvath P (2022) Nucleus segmentation: towards automated solutions. Trends Cell Biol. 32: 295–310.\n',
 '4': '4. Maška M, Ulman V, Delgado-Rodriguez P, Gómez-de-Mariscal E, Nečasová T, Guerrero Peña FA, Ren TI, Meyerowitz EM, Scherr T, Löffler K, and Mikut R (2023) The cell tracking challenge: 10 years of objective benchmarking. Nat. Methods 20: 1–11.\n',
 '5': '5. Anlaş A A and Nelson C M (2018) Tissue mechanics regulates form, function, and dysfunction. Curr. Opin. Cell Biol. 54: 98–105.\n',
 '6': '6. Chan C J and Hiiragi T (2020) Integration of luminal pressure and signalling in tissue self-organization. Development 147: dev181297.\n',
 '7': '7. Goodwin K and

In [435]:
with open("testdata-csl.json") as f:
    records = json.load(f)
records[0]

{'citation-number': '1.',
 'author': [{'family': 'F', 'given': 'Barresi M.J.'},
  {'family': 'F', 'given': 'Gilbert S.'}],
 'title': 'Developmental Biology',
 'edition': 'Twelve',
 'publisher': 'Sinauer Associates Inc',
 'language': 'pt',
 'type': 'book',
 'issued': {'date-parts': [[2019]]},
 'publisher-place': 'New York'}

In [440]:
for record in tqdm(records):
    """
    if "title" in record:
        record["matched_result"]=get_article_info_crossref(record["title"],record.get('container-title',None))
    else:
        print(record)
        record["matched_result"]=False
    """
    original_line = original_data_with_ref_num[re.sub("\.$","",record["citation-number"])]
    original_line = re.sub("^[0-9]\.","",original_line)
    record["matched_result"]=get_article_info_crossref2(original_line)
    sleep(1)
with open("testdata_retrieved2.pickle","wb") as f:
    pickle.dump(records,f)

100%|██████████| 188/188 [10:00<00:00,  3.19s/it]


In [441]:

#with open("testdata_retrieved.pickle","rb") as f:
#    records=pickle.load(f)

In [447]:
def title_normalize(s):
    import re
    s= s.replace("-","").replace("‐","").replace("’","'").replace(" "," ").replace("–","").replace(chr(160)," ").lower()
    s = re.sub("\.$","",s)
    return s

In [448]:
title_matched = []
title_unmatched = []
for record in records:
    t1=title_normalize(record.get("title",""))
    t2 = title_normalize(record["matched_result"].get("title",[""])[0] if record["matched_result"] else "")
    if record["matched_result"] and  t1 == t2:
        title_matched.append(record.copy())
    else:
        print(t1)
        print(t2)
        title_unmatched.append(record.copy())
print(len(title_matched),len(title_unmatched))

developmental biology
scott f. gilbert—developmental biology, 2010, sinauer associates, inc., sunderland, ma ninth edition
role of ca 2+ transients at the node of the mouse embryo in breaking of leftright symmetry
role of ca
            <sup>2+</sup>
            transients at the node of the mouse embryo in breaking of leftright symmetry
lightsheet microscopy reveals dorsoventral asymmetric membrane dynamics of amoeba proteus during pressuredriven locomotion
lightsheet microscopy reveals dorsoventral asymmetric membrane dynamics of<i>amoeba proteus</i>during pressuredriven locomotion
cell segmentation: 50 years down the road [life sciences
cell segmentation: 50 years down the road [life sciences]
napari: a multidimensional image viewer for python
napari: a python multidimensional image viewer platform for the research community

dynamic hydrologic simulation of the bear brook watershed in maine (bbwm)
image data science with python and napari @epfl
napari: a python multidimensional ima

# Analysis for title-unmatched entries

In [449]:
def print_ndiff(d):
    res = []
    chars = []
    for i,s in enumerate(d):
        if s[0]==' ': continue
        elif s[0]=='-':
            chars.append(s[-1])
            res.append(u'Delete "{}" from position {}'.format(ord(s[-1]),i))
        elif s[0]=='+':
            chars.append(s[-1])
            res.append(u'Add "{}" to position {}'.format(ord(s[-1]),i))   
    return res, chars

In [450]:
for record in title_unmatched:
    n = record["citation-number"]
    t1 = record["matched_result"]["title"][0] if record["matched_result"] else ""
    t2 = record["title"] if "title" in record else ""
    res, chars = print_ndiff(difflib.ndiff(title_normalize(t1),title_normalize(t2)))
    if len(res)>5:
        continue
        print("Too much")
    print(n)
    print(t1)
    print(t2)
    for r in res:
        print(r)


41.
Cell Segmentation: 50 Years Down the Road [Life Sciences]
Cell segmentation: 50 years down the road [life sciences
Delete "93" from position 56


In [451]:
with open("title_unmatched.txt","w") as f:
    for record in title_unmatched:
        n = record["citation-number"]
        f.write(original_data_with_ref_num[re.sub("\.$","",n)])

# Analysis for title-matched citations

## Authors

In [452]:
def normalize_name(name):
    return name.lower()

def compare_initials(initials, full_name):
    # Split the initials and full name into parts
    initials_parts = re.split("[\-\s\.]",initials)
    full_name_parts = re.split("[\-\s]",full_name)

    # Compare each part
    for initial, name in zip(initials_parts, full_name_parts):
        if not name.startswith(initial.replace(".","")):
            return False

    return True

def compare_names(name1, name2):
    # Normalize names
    surname1, given_name1 = map(normalize_name, name1)
    surname2, given_name2 = map(normalize_name, name2)

    # Compare surnames
    if normalize("NFC",surname1).replace("’","'").replace("-","") != normalize("NFC",surname2).replace("’","'").replace("-",""):
        return False

    # Compare given names
    if len(given_name1) == len(given_name2):
        return given_name1 == given_name2
    elif len(given_name1) < len(given_name2):
        return compare_initials(given_name1, given_name2)
    else:
        return compare_initials(given_name2, given_name1)

# Test the function
names1 = [('Ladoux', 'B'), ('Mège', 'R-M')]
names2 = [('Ladoux', 'Benoit'), ('Mège', 'René-Marc')]
names3 = [('Ladoux', 'Benoit'), ('Mège', 'René-Narc')]
names4 = [('Anlaş', 'Alişya A')]
names5 = [('Anlaş', 'A A')]
#names6 = [('Müller', 'Claus B.'), ('Enderlein', 'Jörg')]
#names7 = [('Müller', 'C B'), ('Enderlein', 'J')]

assert all(compare_names(n1, n2) for n1, n2 in zip(names1, names2))  # Should return True
assert not all(compare_names(n1, n2) for n1, n2 in zip(names1, names3))  # Should return False
#assert all(compare_names(n1, n2) for n1, n2 in zip(names5, names4))  # Should return False
#assert all(compare_names(n1, n2) for n1, n2 in zip(names7, names6))  # Should return False


In [453]:
def split_initials(s):
    names = s.split()
    fn = []
    gn = []
    for n in names:
        n = n.strip(".")
        if len(n)>1 and not all([len(nn)==1 for nn in n.split("-")]):
            fn.append(n)
        else:
            gn.append(n)
    return " ".join(fn), " ".join(gn)

In [454]:
for j,record in enumerate(title_matched):
    original_line = original_data_with_ref_num[re.sub("\.$","",record["citation-number"])]
    # author
    author1 = re.search("^[0-9]+\. (.+)\([0-9]",original_line).groups()[0]
    author1 = [s.strip() for s in re.split("\,| and",author1) if s]
    author1 = [split_initials(a) for a in author1]
    if "author" in record["matched_result"]:
        author2 = [(a["family"],a["given"]) for a in record['matched_result']["author"] if "family" in a and "given" in a]
    else:
        author2 = []
    record["author1"] = author1
    record["author2"] = author2
    if len(author1) != len(author2):
        record["author_need_et_al"] = True
    else:
        record["author_need_et_al"] = False
        #print(author1,author2)
    record["author_valid"] = all(compare_names(n1, n2) for n1, n2 in zip(author2, author1))
    if not record["author_valid"]:
        for a2, a1 in zip(author2, author1):
            if not compare_names(a2,a1):
                print(a1,a2)


('Guerrero Peña FA', '') ('Guerrero Peña', 'Fidel A.')
('Ren TI', '') ('Ren', 'Tsang Ing')
('Meyerowitz EM', '') ('Meyerowitz', 'Elliot M.')
('Hammer III', 'J A') ('Hammer', 'John A.')
('Pérez', 'F') ('Perez', 'Fernando')
('da Silveira', 'S R') ('Rodrigues da Silveira', 'Sylvia')
('Dollár', 'P') ('Dollar', 'Piotr')
('Dengel', 'A') ('Dale', 'Timothy')
('Ahmed', 'S') ('Dengel', 'Andreas')
('Trygg', 'J') ('Ahmed', 'Sheraz')
('Sjögren', 'R') ('Trygg', 'Johan')
('Chamier von', 'L') ('von Chamier', 'Lucas')
('Troys', 'M V') ('Van Troys', 'Marleen')
('Löffler', 'K') ('Loffler', 'Katharina')
('Magnusson KE', '') ('Magnusson', 'Klas E G')
('Karhohs KW', '') ('Karhohs', 'Kyle W.')
('Cimini BA', '') ('Cimini', 'Beth A.')
('Toman-cak', 'P') ('Haase', 'Robert')
('Haase', 'R') ('Jug', 'Florian')
('Campàs (2018) fluid-to-solid jamming transition underlies vertebrate body axis elongation Nature 561', 'O A') ('Campàs', 'Otger')
('Kuo IY', '') ('Kuo', 'Ivana Y.')


In [455]:
for j,record in enumerate(title_matched):
    original_line = original_data_with_ref_num[re.sub("\.$","",record["citation-number"])]
    if not record["author_valid"]:
        print(original_line.strip())
        for a2, a1 in zip(record["author2"],record["author1"]):
            if not compare_names(a2,a1):
                print(" > ", " ".join(a1) , "->", " ".join(a2))
        print()

4. Maška M, Ulman V, Delgado-Rodriguez P, Gómez-de-Mariscal E, Nečasová T, Guerrero Peña FA, Ren TI, Meyerowitz EM, Scherr T, Löffler K, and Mikut R (2023) The cell tracking challenge: 10 years of objective benchmarking. Nat. Methods 20: 1–11.
 >  Guerrero Peña FA  -> Guerrero Peña Fidel A.
 >  Ren TI  -> Ren Tsang Ing
 >  Meyerowitz EM  -> Meyerowitz Elliot M.

27. Chen B-C, Legant W R, Wang K, Shao L, Milkie D E, David-son M W, Janetopoulos C, Wu X S, Hammer III J A, Liu Z, and English B P (2014) Lattice Light-Sheet Microscopy: Imaging Molecules to Embryos at High Spatiotemporal Resolution. Science 346: 417–23.
 >  Hammer III J A -> Hammer John A.

46. Granger B E, and Pérez F (2021) Jupyter: Thinking and Story-telling With Code and Data. Comput. Sci. Eng. 23: 7–14.
 >  Pérez F -> Perez Fernando

50. Strauss S, Runions A, Lane B, Eschweiler D, Bajpai N, Trozzi N, Routier-Kierzkowska A L, Yoshida S, da Silveira S R, Vijayan A, and Tofanelli R (2022) Using positional information to pro

In [456]:
for j,record in enumerate(title_matched):
    original_line = original_data_with_ref_num[re.sub("\.$","",record["citation-number"])]
    if record["author_need_et_al"]:
        print(original_line.strip())

4. Maška M, Ulman V, Delgado-Rodriguez P, Gómez-de-Mariscal E, Nečasová T, Guerrero Peña FA, Ren TI, Meyerowitz EM, Scherr T, Löffler K, and Mikut R (2023) The cell tracking challenge: 10 years of objective benchmarking. Nat. Methods 20: 1–11.
14. Katoh T A, Omori T, Mizuno K, Sai X, Minegishi K, Ikawa Y, Nishimura H, Itabashi T, Kajikawa E, Hiver S, and Iwane A H (2023) Immotile cilia mechanically sense the direction of fluid flow for left-right determination. Science 379: 66–71.
16. Shimozawa T, Yamagata K, Kondo T, Hayashi S, Shitamukai A, Konno D, Matsuzaki F, Takayama J, Onami S, Nakayama H, and Kosugi Y (2013) Improving spinning disk confocal microscopy by preventing pinhole cross-talk for intravital imaging. Proc. Natl. Acad. Sci. U.S.A 110: 3399–3404.
27. Chen B-C, Legant W R, Wang K, Shao L, Milkie D E, David-son M W, Janetopoulos C, Wu X S, Hammer III J A, Liu Z, and English B P (2014) Lattice Light-Sheet Microscopy: Imaging Molecules to Embryos at High Spatiotemporal Resolut

## Journal etc

### Journal

In [457]:
def compare_titles(j1,j2):
    if len(j1.split()) != len(j2.split()):
        return False
    for s1, s2 in zip(j1.split(),j2.split()):
        s1 = s1.replace(".","")
        s2 = s2.replace(".","")
        l = min(len(s1),len(s2))
        if s1[:l] != s2[:l]:
            return False
    return True
def _normalize_journal(s):
    return s.replace("&amp;"," ").replace(" in "," ").replace(" of "," ")

for j,record in enumerate(title_matched):
    original_line = original_data_with_ref_num[re.sub("\.$","",record["citation-number"])]
    if 'short-container-title' in record["matched_result"] and "container-title" in record:
        j1 = _normalize_journal(record["container-title"].lower())
        j2 = _normalize_journal(record["matched_result"]['short-container-title'][0].lower())
        if not compare_titles(j1,j2):
            print(original_line.strip())
            print(j1,j2)
        record["title-checked"] = True
    elif 'container-title' in record["matched_result"] and "container-title" in record:
        j1 = _normalize_journal(record["container-title"].lower())
        j2 = _normalize_journal(record["matched_result"]['container-title'][0].lower())
        if not compare_titles(j1,j2):
            print(original_line.strip())
            print(j1,j2)
        record["title-checked"] = True

    else:
        record["title-checked"] = False

8. Paci G and Mao Y (2021) Forced into shape: mechanical forces in Drosophila development and homeostasis. Semin. Cell Dev. Biol.120: 160–170.
semin. cell dev seminars cell   developmental biology
9. Valet M, Siggia E D, and Brivanlou A H (2022) Mechanical regulation of early vertebrate embryogenesis. Nat. Rev. Mol. Cell Biol.23: 169–184.
nat. rev. mol. cell nat rev mol cell biol
15. Katoh T A, Omori T, Ishikawa T, Okada Y, and Hamada H (2023) Biophysical analysis of mechanical signals in immotile cilia of mouse embryonic nodes using advanced microscopic techniques. Bio. Protoc. 13: e4715.
bio. protoc bio-protocol
38. de Medeiros G, Ortiz R, Strnad P, Boni A, Moos F, Repina N, Challet Meylan L, Maurer F, and Liberali P (2022) Multi-scale light-sheet organoid imaging framework. Nat. Commun.13: 4864.
nat nat commun
51. Aigouy B, Umetsu D, and Eaton S (2016) Segmentation and quantitative analysis of epithelial tissues. In: Dahmann C (ed.), Drosophila: Methods and Protocols, Methods in Mol

In [458]:
for j,record in enumerate(title_matched):
    original_line = original_data_with_ref_num[re.sub("\.$","",record["citation-number"])]
    if not record["title-checked"]:
        print(original_line.strip())
        break

28. Liu T-L, Upadhyayula S, Milkie D E, Singh V, Wang K, Swinburne I A, Mosaliganti K R, Collins Z M, Hiscock T W, Shea J, Kohrman A Q (2018) Observing the Cell in Its Native State: Imaging Subcellular Dynamics in Multicellular Organisms. Science 360: eaaq1392.


### year, volume page

In [477]:
v1

'124'

In [483]:
for j,record in enumerate(title_matched):
    original_line = original_data_with_ref_num[re.sub("\.$","",record["citation-number"])]
    original_line = original_line.replace("Methods","Methods ")
    if not re.search("([0-9]+):",original_line):
        continue
    r = re.search("[\s\.]([0-9]+)\s*:",original_line)
    if r is None:
        r = ""
    else:
        r = r.groups()[0]
    v1=record.get("volume",r)
    if v1=="":
        v1=r
    if v1 is None:
        v1=""
    else:
        v1 = v1.split(".")[-1]
        
    if v1!=record["matched_result"].get("volume"):
        print(original_line.strip())
        print(v1,"/",record["matched_result"].get("volume"))

28. Liu T-L, Upadhyayula S, Milkie D E, Singh V, Wang K, Swinburne I A, Mosaliganti K R, Collins Z M, Hiscock T W, Shea J, Kohrman A Q (2018) Observing the Cell in Its Native State: Imaging Subcellular Dynamics in Multicellular Organisms. Science 360: eaaq1392.
360 / None
87. Ershov D, Phan M S, Pylvänäinen J W, Rigaud S U, Le Blanc L, Charles-Orszag A, Conway J R, Laine R F, Roy N H, Bonazzi D, and Duménil G (2022) TrackMate 7: integrating state-of-the-art segmentation algorithms into tracking pipelines. Nat. Methods 19: 829–832.
 / 19
88. Fukai Y T and Kawaguchi K (2023) LapTrack: linear assign-ment particle tracking with tunable metrics. Bioinformatics 39: btac799.
39 / None
108. Arzt M, Deschamps J, Schmied C, Pietzsch T, Schmidt D, Toman-cak P, Haase R, and Jug F (2022) LABKIT: labeling and seg-mentation toolkit for big image data. Front. Comput. Sci. 4: 777728.
4 / None
122. Schott B, Traub M, Schlagenhauf C, Takamiya M, Antrit-ter T, Bartschat A, Löffler K, Blessing D, Otte J C,

In [495]:
for j,record in enumerate(title_matched):
    original_line = original_data_with_ref_num[re.sub("\.$","",record["citation-number"])]
    p1=record.get("page","").replace("–","-")
    p2=record["matched_result"].get("page")
    if p2 is None and record["matched_result"].get("container-title",[""])[0] in ["Science","Development","Physical Review Letters","Nature Communications"]:
        p2 = record["matched_result"]["DOI"].split(".")[-1]
    if p1 != p2:
        print(original_line.strip())
        print(p1,"/",p2)

4. Maška M, Ulman V, Delgado-Rodriguez P, Gómez-de-Mariscal E, Nečasová T, Guerrero Peña FA, Ren TI, Meyerowitz EM, Scherr T, Löffler K, and Mikut R (2023) The cell tracking challenge: 10 years of objective benchmarking. Nat. Methods 20: 1–11.
1-11 / 1010-1020
15. Katoh T A, Omori T, Ishikawa T, Okada Y, and Hamada H (2023) Biophysical analysis of mechanical signals in immotile cilia of mouse embryonic nodes using advanced microscopic techniques. Bio. Protoc. 13: e4715.
4715 / None
25. Gao L, Shao L, Chen B-C, and Betzig E (2014) 3D Live fluo-rescence imaging of cellular dynamics using bessel beam plane illumination microscopy. Nat. Protoc. 9: 1083–101.
1083-101 / 1083-1101
26. Planchon T A, Gao L, Milkie D E, Davidson M W, Galbraith J A, Galbraith C G, and Betzig E (2011) Rapid Three-Dimensional Isotropic Imaging of Living Cells Using Bessel Beam Plane Illumination. Nat. Methods 8: 417–23.
417-23 / 417-423
27. Chen B-C, Legant W R, Wang K, Shao L, Milkie D E, David-son M W, Janetopoul

In [487]:
record["matched_result"]

{'institution': [{'name': 'bioRxiv'}],
 'indexed': {'date-parts': [[2022, 10, 15]],
  'date-time': '2022-10-15T10:27:08Z',
  'timestamp': 1665829628941},
 'posted': {'date-parts': [[2018, 1, 8]]},
 'group-title': 'Cell Biology',
 'reference-count': 38,
 'publisher': 'Cold Spring Harbor Laboratory',
 'content-domain': {'domain': [], 'crossmark-restriction': False},
 'accepted': {'date-parts': [[2018, 1, 9]]},
 'abstract': '<jats:title>Abstract</jats:title><jats:p>True physiological imaging of subcellular dynamics requires studying cells within their parent organisms, where all the environmental cues that drive gene expression, and hence the phenotypes we actually observe, are present. A complete understanding also requires volumetric imaging of the cell and its surroundings at high spatiotemporal resolution without inducing undue stress on either. We combined lattice light sheet microscopy with two-channel adaptive optics to achieve, across large multicellular volumes, noninvasive aberr