### Extract bibliographies

In [128]:
import sys
sys.path.append('biblib')

In [129]:
import biblib.bib
import biblib.algo
# Bibliography to process, followed by small fixture files kept for manual testing.
file_name = 'NuyujukianRefs.bib'
test_file = 'test-bibs/coleman_test.bib'
test_file = 'test-bibs/multiple_first_authors_test.bib'
# To run against a fixture instead, open `test_file` here rather than `file_name`.
f = open(file_name)

In [130]:
# Parse the open bibliography handle `f`, sending parser diagnostics to stderr.
try:
    db = biblib.bib.Parser(month_style='mine').parse(f, log_fp=sys.stderr).get_entries()
finally:
    # The handle is not needed once parsing has finished (or failed), so
    # release it here instead of leaving it open for the rest of the session.
    f.close()
# Fill in fields that entries inherit through `crossref`.
db = biblib.bib.resolve_crossrefs(db)

MONTH STYLE mine


In [131]:
def get_first_and_rest_authors(authors):
    """Split a BibTeX author string into co-first and remaining last names.

    The first listed author is always a first author. Each immediately
    following author whose first name contains ``'*'`` is treated as a
    co-first author; the run ends at the first author without that marker.

    Args:
        authors: Raw BibTeX ``author`` field value, handed to
            ``biblib.algo.parse_names``.

    Returns:
        A ``(first, rest)`` tuple of lists of last names. ``first`` holds the
        first author plus any co-first authors; ``rest`` holds every author
        listed after them, so the two lists never overlap.

    Raises:
        ValueError: If no author names could be parsed.
    """
    authors_list = biblib.algo.parse_names(authors)
    if not authors_list:
        raise ValueError("ZERO AUTHORS")

    # Count how many authors belong to the leading (co-)first group.
    num_first = 1
    for name in authors_list[1:]:
        if '*' not in name.first:
            break
        num_first += 1

    # Slice at the end of the first-author group. Slicing at index 1 would
    # list every co-first author a second time in `rest`.
    clean_first = [name.last for name in authors_list[:num_first]]
    clean_rest = [name.last for name in authors_list[num_first:]]
    return clean_first, clean_rest

In [132]:
def get_pdf_name(ent, print_vals=False):
    """Build the expected PDF file name and a link (or title) for an entry.

    The name has the form ``<authors>[EtAl]<journal><year>``, where the
    journal part is whatever is left of the entry key after dropping its
    last four characters, the author last names and any et-al marker.

    Args:
        ent: Bibliography entry supporting ``in``, item access and ``.key``.
        print_vals: Unused; accepted for call compatibility.

    Returns:
        A ``(name, url)`` tuple. ``url`` is the ``bdsk-url-1`` field when
        present, otherwise the title. When the entry has no ``author``
        field, ``name`` is simply the entry key.

    Raises:
        Exception: If the entry has no ``title``, or has an ``author`` but
            no ``year``.
    """
    if 'title' not in ent:
        raise Exception("NO TITLE FOUND")
    # Prefer the stored URL and fall back to the title as a search handle.
    url = ent['bdsk-url-1'] if 'bdsk-url-1' in ent else ent['title']
    if 'author' not in ent:
        print("NO AUTHOR FOUND")
        return ent.key, url
    if 'year' not in ent:
        raise Exception("NO YEAR FOUND")

    lead_authors, other_authors = get_first_and_rest_authors(ent['author'])
    everyone = lead_authors + other_authors

    # Recover the journal abbreviation from the key: drop the trailing four
    # characters, then remove one occurrence of each author name and of
    # each et-al spelling (including the misspelled variants).
    journal = ent.key[:-4]
    for surname in everyone:
        journal = journal.replace(surname, '', 1)
    for marker in ('EtAl', 'ElAl', 'Etl'):
        journal = journal.replace(marker, '', 1)
    suffix = journal + ent['year']

    if len(everyone) <= 2:
        # At most two authors: list them all, no et al.
        prefix = ''.join(everyone)
    elif len(lead_authors) == 2:
        # Exactly two co-first authors: list both before the et al.
        prefix = ''.join(lead_authors) + 'EtAl'
    else:
        # One, or more than two, first authors: list only the very first.
        prefix = lead_authors[0] + 'EtAl'
    return prefix + suffix, url

In [134]:
def debug_db(db):
    """Print every entry whose generated PDF name differs from its key.

    For each mismatch this prints the entry key and type, then the
    ``(name, url)`` tuple produced by ``get_pdf_name``, then a blank line.

    Args:
        db: Mapping of entries; only ``db.values()`` is used.

    Returns:
        The number of entries whose generated name differs from the key.
    """
    count = 0
    for ent in db.values():
        # Compute the name once and reuse it for both the check and the report.
        result = get_pdf_name(ent)
        if result[0] != ent.key:
            print('{ent.key} ({ent.typ}):'.format(ent=ent))
            print(result)
            print()
            count += 1
    return count

In [135]:
# Report the entries whose key does not match the generated PDF name.
debug_db(db)

ZijlmansElAlNatRevNeurol2019 (article):
('ZijlmansEtAlNatRevNeurol2019', 'https://doi.org/10.1038/s41582-019-0224-y')

SalomonLancet2012 (article):
('SalomonEtAlLancet2012', 'http://www.ncbi.nlm.nih.gov/pubmed/23245605')

TatumClinNeurophys2018 (article):
('TatumEtAlClinNeurophys2018', 'http://www.ncbi.nlm.nih.gov/pubmed/29483017')

NO AUTHOR FOUND
NO AUTHOR FOUND
BurstenElecClinNeuro1957 (article):
('BURSTENBurstenElecClinNeuro1957', 'https://doi.org/10.1016/0013-4694(57)90043-3')

VesunaKauvarEtAlNature2020 (article):
('VesunaEtAlNature2020', 'https://doi.org/10.1038/s41586-020-2731-9')

MehrotraDasguptaEtAlLCTES2018 (inproceedings):
('MehrotraEtAlLCTES2018', 'https://doi.org/10.1145/3211332.3211344')

NO AUTHOR FOUND
NO AUTHOR FOUND
NO AUTHOR FOUND
NO AUTHOR FOUND
NO AUTHOR FOUND
SternsonEtAlAnnRevNeuro2014 (article):
('SternsonRothAnnRevNeuro2014', 'http://dx.doi.org/10.1146/annurev-neuro-071013-014048')

BaniAhmedThesis2013 (jurthesis):
('Bani-AhmedBaniAhmedThesis2013', 'https://k

In [136]:
# One (pdf_name, url) pair per bibliography entry, in database order.
pdf_names = list(map(get_pdf_name, db.values()))

NO AUTHOR FOUND
NO AUTHOR FOUND
NO AUTHOR FOUND
NO AUTHOR FOUND
NO AUTHOR FOUND
NO AUTHOR FOUND
NO AUTHOR FOUND
NO AUTHOR FOUND
NO AUTHOR FOUND
NO AUTHOR FOUND


In [137]:
# Entries with a DOI link are written as "name,url" lines.
pdf_names_w_doi_link = []
# Stays empty: DOI URLs are stored on the same line as their name above.
doi_links = []

# Entries with a non-DOI web link: names and URLs are kept in parallel lists.
pdf_names_w_link = []
other_links = []

# Entries with no web link are written as "name,<fallback text>" lines.
pdf_names_wo_link = []

for pdf_name, url in pdf_names:
    # `url` may actually be a title (the fallback when an entry has no
    # stored URL), so require a real http(s) scheme before treating it as a
    # link. A bare substring test would file a title that merely contains
    # "doi" or "http" as a link.
    is_web_link = url.lstrip().startswith(('http://', 'https://'))
    if is_web_link and 'doi' in url:
        pdf_names_w_doi_link.append('{},{}\n'.format(pdf_name, url))
    elif is_web_link:
        pdf_names_w_link.append(pdf_name + '\n')
        other_links.append(url + '\n')
    else:
        pdf_names_wo_link.append('{},{}\n'.format(pdf_name, url))
# Sanity check: every entry landed in exactly one bucket.
assert((len(pdf_names_w_doi_link) + len(pdf_names_w_link) + len(pdf_names_wo_link)) == len(pdf_names))

In [139]:
# Bucket sizes: (DOI links, other web links, no link).
len(pdf_names_w_doi_link), len(pdf_names_w_link), len(pdf_names_wo_link)

(260, 20, 180)

In [138]:
import os

test_output = 'test-output/'
# Create the output directory up front so the writes below do not fail when
# it is missing.
os.makedirs(test_output, exist_ok=True)

# Each output file paired with the lines it receives, in write order.
outputs = [
    ('pdf_names_w_doi_link.txt', pdf_names_w_doi_link),
    ('doi_links.txt', doi_links),
    ('pdf_names_w_link.txt', pdf_names_w_link),
    ('other_links.txt', other_links),
    ('pdf_names_wo_link.txt', pdf_names_wo_link),
]
for out_name, out_lines in outputs:
    # `with` closes each file as soon as it is written, which flushes the
    # data to disk instead of leaving it in an open, unflushed handle.
    with open(test_output + out_name, "w") as out_file:
        out_file.writelines(out_lines)

In [127]:
# Number of entries that have a DOI link.
len(pdf_names_w_doi_link)

260