In [1]:
import os
import requests
import json
import csv
import pandas as pd
from dotenv import load_dotenv
from time import sleep
from pprint import pprint

In [2]:
# OpenAlex API endpoints.
WORKS_ENDPOINT = "https://api.openalex.org/works?"
AUTHORS_ENDPOINT = "https://api.openalex.org/authors?"

# Read the key from the environment instead of pasting it into the notebook,
# so the secret is never saved alongside the code. Falls back to the previous
# default (an empty string) when OPENALEX_API_KEY is not set.
API_KEY = os.environ.get("OPENALEX_API_KEY", "")

# HTTP headers passed along with API requests.
header = {
    "apikey": API_KEY,
}

In [3]:
# Input: CSV holding the names to look up. Output: where the results are saved.
PATH_TO_INPUT_CSV = "../../authorlist.csv"
OUTPUT_FILEPATH = "oa_authorlist.csv"

# Institution names a profile's affiliations are compared against.
affiliation_list = [
    'University of British Columbia',
    'Simon Fraser University',
    'University of Victoria',
    'University of Washington',
]

# Subject names a profile's research areas are compared against.
subjects_list = [
    'Biology',
    'Medicine',
    'Computer science',
    'Psychology',
]

# Legacy notes, apparently from an Elsevier author-search version of this
# notebook (kept for reference only, not used below):
# https://dev.elsevier.com/sc_author_search_tips.html for list of subjects
# affil_id_list = ['60023077', '60010365', '60012423'] #[UBC faculty of medicine, UBC, UBC hospital]
# affil_search_string = "affil(UBC or University of British Columbia or Djavad Mowafaghian or Simon Fraser University or University of Victoria or University of Washington') and subjarea(NEUR or BIOC or MEDI or PSYC or HEAL or IMMU or NURS or PHAR)"

# Column names of the output table, in output order.
cols_of_interest = [
    "Authorlist Name",
    "OpenAlex Name",
    "Author ID",
    "cited_by_count",
    "works_count",
    "h_index",
    "i10_index",
    "Affiliation",
    "ORCID",
    "Research Areas",
    "Warning",
]


In [4]:
def get_api_response(param, url, timeout=30):
    '''
    Sends a GET request and returns the decoded JSON body.

    Failures are printed, never raised: the caller must check for None.

    Params
    ------
    param (dict)
        query-string parameters forwarded to requests.get
    url (str)
        endpoint to query
    timeout (float)
        seconds to wait on the server before giving up; without a timeout
        a stalled connection would block the notebook indefinitely

    Returns
    -------
    the decoded JSON payload, or None if the request or the decoding failed
    '''
    try:
        response = requests.get(url=url, headers=header, params=param, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.HTTPError as errh:
        print("Http Error:", errh)
        return None
    except requests.exceptions.ConnectionError as errc:
        print("Error Connecting:", errc)
        return None
    except requests.exceptions.Timeout as errt:
        print("Timeout Error:", errt)
        return None
    except requests.exceptions.RequestException as err:
        print("Something Else", err)
        return None

    # Decode separately from the request: in recent requests releases
    # (>= 2.27) the JSON decode error is also a RequestException, so a shared
    # try block would report it as "Something Else" and never show the raw body.
    try:
        return response.json()
    except ValueError as e:
        print("Decoding JSON has failed:", e)
        print("Raw Response:", response.text)
        return None
    

In [5]:
def is_matching_subjects(author_subjects, subjects=None) -> bool:
    '''
    Checks whether any target subject appears in the author's subjects.

    Params
    ------
    author_subjects (list of str, or None)
        subject names taken from the author's profile
    subjects (list of str, optional)
        target subjects to look for; defaults to the notebook-level subjects_list

    Returns
    -------
    True if at least one target subject is in author_subjects, otherwise False
    (also False, with a printed note, when author_subjects is not a container)
    '''
    if subjects is None:
        subjects = subjects_list
    try:
        # generator form stops at the first hit
        return any(topic in author_subjects for topic in subjects)
    except TypeError:
        # raised by `in` when author_subjects is None / not a container;
        # anything else (e.g. KeyboardInterrupt) is left to propagate
        print("no subjects on profile")
        return False

In [6]:
def is_matching_affils(author_affils, affiliations=None) -> bool:
    '''
    Checks whether any target institution appears in the author's affiliations.

    Params
    ------
    author_affils (list of str, or None)
        institution names taken from the author's profile
    affiliations (list of str, optional)
        target institutions to look for; defaults to the notebook-level affiliation_list

    Returns
    -------
    True if at least one target institution is in author_affils, otherwise False
    (also False, with a printed note, when author_affils is not a container)
    '''
    if affiliations is None:
        affiliations = affiliation_list
    try:
        # generator form stops at the first hit
        return any(affil in author_affils for affil in affiliations)
    except TypeError:
        # raised by `in` when author_affils is None / not a container;
        # anything else (e.g. KeyboardInterrupt) is left to propagate
        print("no affiliation on profile")
        return False

In [7]:
def is_target_profile(search_json) -> bool:
    '''
    Criteria for whether or not a profile matches the target requirements.

    A profile is accepted when either its level-0 concepts pass
    is_matching_subjects or its institution names pass is_matching_affils.

    Params
    ------
    search_json (dict)
        one author record; the "x_concepts" and "affiliations" keys are read,
        and a missing or null value for either is treated as an empty list

    Returns
    -------
    True if the profile matches on subjects or on affiliations, otherwise False
    '''
    # `or []` covers both an absent key and an explicit null value
    top_level_subjects = [
        concept.get("display_name")
        for concept in search_json.get("x_concepts") or []
        if concept.get("level") == 0
    ]
    if is_matching_subjects(top_level_subjects):
        return True

    # an affiliation entry without an institution contributes None, which
    # simply never equals a target name
    institution_names = [
        (entry.get("institution") or {}).get("display_name")
        for entry in search_json.get("affiliations") or []
    ]
    return is_matching_affils(institution_names)

In [8]:
def add_author_row(out_dict, authorlist_name, search_json=None, has_results:bool=False, warning="", affil_override="", subj_override=""):
    '''
    Appends one author's row to the output dictionary, in place.

    Params
    ------
    out_dict (dict)
        maps each output column name to a list; one value is appended to
        every column written here
    authorlist_name (str)
        the original name from authorlist
    search_json (json object/dict)
        one author record from the OpenAlex Author API response; only read
        when has_results is true. Keys used: 'id', 'orcid', 'display_name',
        'works_count', 'cited_by_count', 'summary_stats', 'affiliations',
        'x_concepts'
    has_results (bool)
        true to record search_json, false to record a blank row
    warning (str)
        note stored in the "Warning" column of a found profile; blank rows
        always get "no profiles found"
    affil_override, subj_override
        accepted so existing callers keep working; currently not written
        to the output

    Raises
    ------
    KeyError
        if has_results is true and search_json lacks 'display_name', 'id',
        'cited_by_count', 'works_count' or 'summary_stats', or if out_dict
        lacks one of the output columns
    '''
    if has_results:
        profile = search_json if search_json is not None else {}

        display_name = profile["display_name"]
        print("adding " + display_name + "...\n")

        # First listed affiliation (assumed to be the most recent one - TODO
        # confirm against the OpenAlex docs). Authors without any affiliation
        # get a blank cell instead of an IndexError that would abort the run.
        affiliations = profile.get("affiliations") or []
        first_institution = (affiliations[0].get("institution") or {}) if affiliations else {}
        affil_name = first_institution.get("display_name") or ""

        author_id = profile["id"].split("/")[-1]

        # ORCID is stored as a URL; a missing or null value becomes an empty cell
        orcid_url = profile.get("orcid")
        orcid = orcid_url.split("/")[-1] if isinstance(orcid_url, str) else ""

        subject = [
            concept["display_name"]
            for concept in profile.get("x_concepts") or []
            if concept["level"] == 0
        ]

        row = {
            "Authorlist Name": authorlist_name,
            "OpenAlex Name": display_name,
            "Author ID": author_id,
            "cited_by_count": profile["cited_by_count"],
            "works_count": profile["works_count"],
            "h_index": profile["summary_stats"]["h_index"],
            "i10_index": profile["summary_stats"]["i10_index"],
            "Affiliation": affil_name,
            "ORCID": orcid,
            "Research Areas": subject,
            "Warning": warning,
        }
    else:
        # blank row so the author still shows up in the output
        print("No OpenAlex profile found\n")
        row = {
            "Authorlist Name": authorlist_name,
            "OpenAlex Name": "",
            "Author ID": "",
            "cited_by_count": "",
            "works_count": "",
            "h_index": "",
            "i10_index": "",
            "Affiliation": "",
            "ORCID": "",
            "Research Areas": "",
            "Warning": "no profiles found",
        }

    # the row is fully built before anything is appended, so a bad profile
    # cannot leave the columns with unequal lengths
    for column, value in row.items():
        out_dict[column].append(value)
            

In [9]:
def compare_all_profiles(authors):
    '''
    Scans the profiles in order and returns the index of the first one
    accepted by is_target_profile, or None when no profile is accepted.
    '''
    for position, profile in enumerate(authors):
        print(f"looking at profile {position}")
        if is_target_profile(profile):
            return position
    return None

In [11]:
# read member csv file accordingly (expects a "Name" column)
authorlist_df = pd.read_csv(PATH_TO_INPUT_CSV)

# Blank cells are read by pandas as NaN; drop them and trim stray whitespace
# so that only real names are sent to the API as search terms.
member_arr = [
    name.strip()
    for name in authorlist_df["Name"].dropna().astype(str)
    if name.strip()
]

# skip header rows
# member_arr = member_arr[1:]
print(len(member_arr))

46


In [12]:
# if changing columns, make sure to change code in add_author_row() as well!
output_dict = {key: [] for key in cols_of_interest}

# go through all members; every member ends up with exactly one output row
for member in member_arr:
    sleep(1)  # pause for one second to prevent API overload

    print(f"searching for author {member}")

    s = get_api_response(param={
        "search": member,
        "per_page": 200,
    }, url=AUTHORS_ENDPOINT)

    # get_api_response returns None when the request or the JSON decoding
    # failed; record a blank row and move on rather than crash and lose
    # every row collected so far
    if not isinstance(s, dict) or "results" not in s:
        print("API request failed, skipping\n")
        add_author_row(
            authorlist_name=member,
            out_dict=output_dict,
            has_results=False)
        # add_author_row labels every blank row "no profiles found"; relabel
        # this one so a failed request is not mistaken for an empty search
        output_dict["Warning"][-1] = "!!! API request failed"
        continue

    authors = s["results"] or []
    num_authors_found = len(authors)

    if num_authors_found == 0:
        # if there are no profiles, add blank row
        add_author_row(
            authorlist_name=member,
            out_dict=output_dict,
            has_results=False)
        continue

    if num_authors_found == 1:
        # a single profile is kept without a warning only if it passes the criteria
        best_match_author = 0 if is_target_profile(authors[0]) else None
        fallback_warning = "!!! profile does not pass addition condition"
    else:
        # several profiles: take the first one that passes the criteria
        best_match_author = compare_all_profiles(authors)
        fallback_warning = "!!! FIRST profile returned"

    if best_match_author is not None:
        add_author_row(
            authorlist_name=member,
            out_dict=output_dict,
            has_results=True,
            search_json=authors[best_match_author])
        continue

    # nothing passed the criteria: fall back to the first result, flagged
    # with a warning so it can be reviewed by hand
    first_profile = authors[0]
    add_author_row(
        authorlist_name=member,
        out_dict=output_dict,
        has_results=True,
        search_json=first_profile,
        warning=fallback_warning,
        affil_override=[
            (affils.get("institution") or {}).get("display_name")
            for affils in first_profile.get("affiliations") or []
        ],
        subj_override=[
            concept.get("display_name")
            for concept in first_profile.get("x_concepts") or []
            if concept.get("level") == 0
        ])
    if num_authors_found > 1:
        print("no match found among profiles\n")

searching for author Tim H. Murphy
looking at profile 0
adding Timothy H. Murphy...

searching for author Annie Ciernia
looking at profile 0
adding Annie Vogel Ciernia...

searching for author Brian MacVicar
looking at profile 0
adding Brian A. MacVicar...

searching for author Fidel Vila-Rodriguez
looking at profile 0
adding Fidel Vila‐Rodriguez...

searching for author Shernaz Bamji
adding Shernaz X. Bamji...

searching for author Lara Boyd
looking at profile 0
adding Lara A. Boyd...

searching for author Paul Pavlidis
looking at profile 0
adding Paul Pavlidis...

searching for author Martin McKeown
looking at profile 0
adding Martin J. McKeown...

searching for author A Jon Stoessl
looking at profile 0
adding A. Jon Stoessl...

searching for author Peter Cripton
looking at profile 0
adding Peter A. Cripton...

searching for author Jason Snyder
looking at profile 0
adding Jason S. Snyder...

searching for author Wolfram Tetzlaff
looking at profile 0
adding Wolfram Tetzlaff...

search

In [14]:
# Write the collected rows to disk as a "|"-separated file without the index column.
df = pd.DataFrame(output_dict)
df.to_csv(OUTPUT_FILEPATH, index=False, sep="|")
# Alternative with pandas' default separator and index:
# df.to_csv(OUTPUT_FILEPATH)