In [1]:
import json
import re

from bs4 import BeautifulSoup

In [2]:
def remove_titles(name):
    '''
    There are so many titles in this data. 
    This function removes as many as I could identify.
    It also makes the name string lowercase and strips white space from both sides.
    '''
    name = name.lower().strip()
    titles = [
        'prof.', 'prof ', 'professor ', \
'dr ',  'dr. ',\
'mr ', 'mr. ', 'mrs ', \
'md', 'm.d.'
', phd', ', ph.d.', 'ph.d', 'pharm.d.', 'phd',\
'fccp', 'facc', 'faha', 'fapha','fashp', 'frsb', 'fbphs',\
'frse', 'frcp', 'faap', 'facp','fgsa', ', f-abft', 'fesc',\
', m.p.h.', 'm.s.', 'b.s.pharm.', 'bcps', 'bsc(hons)', 'mph',\
'fast', 'mace', 'cbe', 'frs', '\n', '\n            ', ',', 'dds', ':'
              
             ]
    for title in titles:
        name = name.replace(title, '')
        
    return name.strip()

In [3]:
'''
This cell deals with journals where names are embedded in mailto links in <a> tags.
'''
oxford = {}
oxford_questionable = {}
with open('../../data/oxford.jl', 'r') as f:
    for line in f:
        jobj = json.loads(line)
        journal = '_'.join(jobj['journal'].split(' '))
        if 'editors' not in jobj:
            continue
        soup = BeautifulSoup(jobj['editors'], 'html.parser')
        names = set()
        for ln in soup.find_all('a'):
            if 'mailto' in str(ln) and '@' not in ln.text:
                name = ln.text.strip()
                name = remove_titles(name)
                names.add(name)
        if len(names) > 5:
            oxford[journal] = names
            print('Journal: {} # Editors: {}'.format(journal, len(oxford[journal])))
        else:
            oxford_questionable[journal] = names

Journal: Annals_of_Botany # Editors: 39
Journal: AoB_PLANTS # Editors: 50
Journal: Systematic_Biology # Editors: 45
Journal: Statute_Law_Review # Editors: 7
Journal: Socio-Economic_Review # Editors: 7
Journal: The_Quarterly_Journal_of_Mathematics # Editors: 21
Journal: The_English_Historical_Review # Editors: 6
Journal: DNA_Research # Editors: 26
Journal: Acta_Biochimica_et_Biophysica_Sinica # Editors: 108
Journal: Nucleic_Acids_Research # Editors: 19
Journal: National_Science_Review # Editors: 6
Journal: Molecular_Biology_and_Evolution # Editors: 23
Journal: Law,_Probability_and_Risk # Editors: 9
Journal: Journal_of_Plant_Ecology # Editors: 50
Journal: Journal_of_Petrology # Editors: 29
Journal: Journal_of_Human_Rights_Practice # Editors: 6
Journal: The_Journal_of_Deaf_Studies_and_Deaf_Education # Editors: 10
Journal: Journal_of_Complex_Networks # Editors: 41
Journal: International_Mathematics_Research_Papers # Editors: 51
Journal: International_Mathematics_Research_Notices # Editors:

In [4]:
## Defining "Done" as having more than 5 editors for now...not a good criteria
oxford_done = set(oxford.keys())

In [5]:
oxford_questionable.keys()

dict_keys(['Past_&_Present', 'American_Journal_of_Hypertension', 'Rheumatology_Advances_in_Practice', "Journal_of_Crohn's_and_Colitis", 'Communication,_Culture_and_Critique', 'Behavioral_Ecology', 'Virus_Evolution', 'Journal_of_International_Criminal_Justice', 'Rheumatology', 'European_Heart_Journal_-_Cardiovascular_Pharmacotherapy', 'American_Entomologist', 'Industrial_Law_Journal', 'Journal_of_Language_Evolution', 'Yearbook_of_European_Law', 'Oxford_Journal_of_Legal_Studies', 'The_Condor:_Ornithological_Applications', 'Biostatistics', 'European_Journal_of_Cardio-Thoracic_Surgery', 'Animal_Frontiers', 'Journal_of_Integrable_Systems', 'Radiation_Protection_Dosimetry', 'Literature_and_Theology', 'Mind', 'Inflammatory_Bowel_Diseases', 'The_Gerontologist', 'ICSID_Review_-_Foreign_Investment_Law_Journal', 'American_Journal_of_Agricultural_Economics', 'Journal_of_Legal_Analysis', 'Social_Work', 'Human_Reproduction', 'Paediatrics_&_Child_Health', 'GigaScience', 'European_Journal_of_Public_He

In [6]:
'''
This cell deals with names that are listed in <strong> tags
'''
with open('../../data/oxford.jl', 'r') as f:
    for line in f:
        jobj = json.loads(line)
        journal = '_'.join(jobj['journal'].split(' '))
        if journal in oxford_done:
            continue

        if 'editors' not in jobj:
            continue
        soup = BeautifulSoup(jobj['editors'], 'html.parser')
        names = set()
        for ln in soup.find_all('strong'):
            name = ln.text.strip()
            if 'editor' in name.lower() or 'director' in name.lower() or name.strip() == '':
                continue

            name = remove_titles(name)
            names.add(name)
        if len(names) > 5:
            oxford[journal] = names
            print('Journal: {} # Editors: {}'.format(journal, len(oxford[journal])))
        else:
            oxford_questionable[journal] = names

Journal: Annals_of_the_Entomological_Society_of_America # Editors: 25
Journal: Annals_of_Oncology # Editors: 96
Journal: Antibody_Therapeutics # Editors: 30
Journal: Arbitration_International # Editors: 20
Journal: Arthropod_Management_Tests # Editors: 22
Journal: The_World_Bank_Economic_Review # Editors: 27
Journal: Virus_Evolution # Editors: 57
Journal: Transportation_Safety_and_Environment # Editors: 32
Journal: Toxicological_Sciences # Editors: 31
Journal: Synthetic_Biology # Editors: 45
Journal: Review_of_Finance # Editors: 46
Journal: QJM:_An_International_Journal_of_Medicine # Editors: 18
Journal: Public_Policy_&_Aging_Report # Editors: 40
Journal: Progress_of_Theoretical_and_Experimental_Physics # Editors: 116
Journal: Clean_Energy # Editors: 47
Journal: Clinical_Infectious_Diseases # Editors: 16
Journal: Clinical_Kidney_Journal # Editors: 34
Journal: Pathogens_and_Disease # Editors: 19
Journal: Pain_Medicine # Editors: 21
Journal: The_Computer_Journal # Editors: 35
Journal: Cr

In [7]:
print(len(oxford))

132


In [8]:
for journal in oxford:
    print('Journal: {}'.format(journal))
    print(oxford[journal])
    print('\n\n')

Journal: National_Science_Review
{'xiuling xu', 'suzhen liu', 'bingzi zhang', 'yuan gao', 'weijie zhao', 'xiaoling yu'}



Journal: The_English_Historical_Review
{'peter marshall', 'catherine wright', 'kim reynolds', 'stephen conway', 'catherine holmes', 'hannah skoda'}



Journal: FEMS_Microbiology_Reviews
{'staffan kjelleberg', 'william margolin', 'oscar kuipers', 'bernardo gonzalez', 'birgitta henriques normark', 'aimee shen', 'bart thomma', 'christiaan van ooij', 'ehud banin', 'franz narberhaus', 'kenn gerdes', 'suzana salcedo', 'wilbert bitter', 'mecky pohlschroder', 'karin sauer', 'sonja-verena albers', 'tâm mignot', 'miguel camara', 'justin nodwell', 'karine gibbs', 'marie-therese giudici-orticoni', 'jan roelof van der meer', 'erh-min lai', 'david blackbourn', 'gerhard h. braus', 'bart tomma', 'corina p. d. brussaard', 'urs greber', 'grzegorz wegrzyn', 'michael bott', 'chris whitfield', 'antoine danchin', 'christoph dehio'}



Journal: Journal_of_Human_Rights_Practice
{'brian ph

In [9]:
'''
These journals were hand identified as having issues given above output. They remain in questionable.
'''
del oxford['Crohn\'s_&_Colitis_360']
del oxford['EP_Europace']
del oxford['Clinical_Kidney_Journal']
del oxford['Innovation_in_Aging']
del oxford['Journal_of_the_Endocrine_Society']
del oxford['Pain_Medicine']
del oxford['Endocrine_Reviews']
del oxford['Virus_Evolution']
del oxford['The_Journal_of_Applied_Poultry_Research']
del oxford['The_American_Journal_of_Clinical_Nutrition']
del oxford['Cardiovascular_Research']
del oxford['Operative_Neurosurgery']
del oxford['The_Journal_of_Clinical_Endocrinology_&_Metabolism']
del oxford['International_Journal_of_Neuropsychopharmacology']

In [10]:
len(oxford)

118

In [11]:
print(oxford)

{'National_Science_Review': {'xiuling xu', 'suzhen liu', 'bingzi zhang', 'yuan gao', 'weijie zhao', 'xiaoling yu'}, 'The_English_Historical_Review': {'peter marshall', 'catherine wright', 'kim reynolds', 'stephen conway', 'catherine holmes', 'hannah skoda'}, 'FEMS_Microbiology_Reviews': {'staffan kjelleberg', 'william margolin', 'oscar kuipers', 'bernardo gonzalez', 'birgitta henriques normark', 'aimee shen', 'bart thomma', 'christiaan van ooij', 'ehud banin', 'franz narberhaus', 'kenn gerdes', 'suzana salcedo', 'wilbert bitter', 'mecky pohlschroder', 'karin sauer', 'sonja-verena albers', 'tâm mignot', 'miguel camara', 'justin nodwell', 'karine gibbs', 'marie-therese giudici-orticoni', 'jan roelof van der meer', 'erh-min lai', 'david blackbourn', 'gerhard h. braus', 'bart tomma', 'corina p. d. brussaard', 'urs greber', 'grzegorz wegrzyn', 'michael bott', 'chris whitfield', 'antoine danchin', 'christoph dehio'}, 'Journal_of_Human_Rights_Practice': {'brian phillips', 'sheray warmington',