OpenAlex: Merging Author IDs
Goal: reduce the number of author IDs returned per first-and-last-name search, and make sure IDs are merged only into the correct person
Last updated: 4/05/2023
Author: Ashley You

In [2]:
import openalexapi
import requests, json
import pandas as pd
import ast
import math
import heapq
import itertools
import csv
import os

# Author names to look up, written as lower-case "first last".
# NOTE(review): not referenced by any function in the visible part of the file.
name_search = [
    "william pao",
    "frederick suchy",
    "malcolm cox",
    "nancy cooke",
    "allan sniderman",
    "vincent dennis",
    "alessandra pernis",
    "john minna",
    "dennis bier",
    "roger pomerantz",
]


# Root of the OpenAlex REST API; endpoint paths are appended to it.
base_url = 'https://api.openalex.org/'


#Getting all author ids through search of first and last name:
def get_authorIDs(name):
    """Return every OpenAlex author ID matching a free-text name search.

    Pages through the ``authors`` endpoint with ``search=<name>`` and strips
    the ``https://openalex.org/`` prefix from each result's ``id``.

    Args:
        name: Name to search for, e.g. ``"william pao"``.

    Returns:
        list[str]: Short author IDs (e.g. ``"A2250212419"``) in the order the
        API returned them. Empty when the search has no hits.
    """
    listofIDs = []
    page = 1
    while True:
        # Passing the query through ``params`` lets requests URL-encode the
        # name, so characters such as "&" or "#" cannot corrupt the query.
        response = requests.get(f'{base_url}authors',
                                params={'search': name, 'page': page})
        visualize_data = response.json()
        results = visualize_data['results']
        for result in results:
            listofIDs.append(result['id'].replace("https://openalex.org/", ""))
        # NOTE: filtering results on the 'Medicine' x_concept was sketched here
        # by the original author but is not enabled.

        meta = visualize_data['meta']
        per_page = meta['per_page']  # page size as reported by the API
        seen_everything = not results or page * per_page >= meta['count']
        # Basic (page-number) paging only reaches the first 10,000 results,
        # so never request a page that would start beyond that.
        next_page_out_of_range = (page + 1) * per_page > 10000
        if seen_everything or next_page_out_of_range:
            break
        page += 1

    print(f'There are {len(listofIDs)} author ids for {name}')
    return listofIDs

#get_authorIDs("William pao")


#Finds all work_ids with given authorId
def work_id(givenAuthorID):
    """Return the OpenAlex work IDs attributed to one author ID.

    Requests ``works?filter=author.id:<id>`` page by page and strips the
    ``https://openalex.org/`` prefix from each work's ``id``.

    Args:
        givenAuthorID: Short OpenAlex author ID such as ``"A2250212419"``.

    Returns:
        list[str]: Short work IDs in API order. Collection stops at the first
        page that is not full, or before a request would go past the
        10,000th result.
    """
    all_worksID = []
    page = 1
    while True:
        # Build the URL in one step; the previous two-stage template
        # (f-string producing "{}" and then .format) broke on braces in the ID.
        url = f'https://api.openalex.org/works?filter=author.id:{givenAuthorID}&page={page}'
        page_with_results = requests.get(url).json()

        results = page_with_results['results']
        for work in results:
            all_worksID.append(work['id'].replace("https://openalex.org/", ""))

        per_page = page_with_results['meta']['per_page']
        page += 1  # from here on, `page` is the page that would be requested next

        # A short page means there is nothing after it.
        has_more_pages = len(results) == per_page
        # Do not request a page that would reach past 10,000 results.
        fewer_than_10000_results = per_page * page <= 10000
        if not (has_more_pages and fewer_than_10000_results):
            break

    print(f'There are {len(all_worksID)} works for {givenAuthorID}')
    return all_worksID
#work_id('A2250212419')

#Small-scale check that matching ASCI/AAP names and reading their concepts works
#----------------------TESTER----------------- FOR THE findAAConcepts FUNCTION AFTER
def findIndvConcepts(names):
    """Look up the listed specializations for specific people in the ASCI/AAP file.

    Changes the working directory to the notebook's directory (with
    "open_alex_data" replaced by "asci_aap_data" in the path) and reads
    ``asci_aap_dataJSONUpdated.json`` from there.

    Args:
        names: Iterable of "First Last" strings; matching is case-insensitive.

    Returns:
        dict: Each given name mapped to the parsed "original specialization"
        value of the matching person, or to ``[]`` when nobody matches.
        A found / not-found line is printed per name.
    """
    notebook_dir = os.path.dirname(os.path.realpath("Open_AlexMerging.ipynb"))
    data_dir = notebook_dir.replace("open_alex_data", "asci_aap_data")
    os.chdir(data_dir)
    with open(r"asci_aap_dataJSONUpdated.json") as fileJson:
        people = json.load(fileJson)["people"]

    authorsConcepts = {}
    for name in names:
        wanted = name.lower()
        for person in people:
            given = person["first_name"].lower()
            family = person["last_name"].lower()
            if f"{given} {family}" == wanted:
                # The specialization is stored as the text of a Python literal.
                authorsConcepts[name] = ast.literal_eval(person["original specialization"])

        if name not in authorsConcepts:
            authorsConcepts[name] = []
            print(f'{name} not found')
        else:
            print(f'{name} successfully found')

    return authorsConcepts
# Smoke-run the lookup on three names; the commented-out names are further candidates.
findIndvConcepts([
    "William Pao",
    "Ashley You",
    "Kjersti Aagaard",
    # "E. Abel",
    # "Janis Abkowitz",
])


#Goes through asci/aap data and gets name and concepts
def findAAConcepts():
    """Map every ASCI/AAP person with a listed specialization to that specialization.

    Changes the working directory to the notebook's directory (with
    "open_alex_data" replaced by "asci_aap_data" in the path) and reads
    ``asci_aap_dataJSONUpdated.json`` from there.

    Returns:
        dict: lower-case "first last" mapped to the parsed
        "original specialization" value. People whose stored specialization
        text is exactly two characters long are left out.
    """
    data_dir = os.path.dirname(os.path.realpath("Open_AlexMerging.ipynb"))
    data_dir = data_dir.replace("open_alex_data", "asci_aap_data")
    os.chdir(data_dir)
    with open(r"asci_aap_dataJSONUpdated.json") as fileJson:
        allData = json.load(fileJson)["people"]
        print(f'There are {len(allData)} amount of people in ASCI/AAP json file')

    authorsConcepts = {}
    for person in allData:
        name = person["first_name"].lower() + " " + person["last_name"].lower()
        raw_specialization = person["original specialization"]
        if len(raw_specialization) == 2:
            # Two characters: presumably the literal "[]", i.e. nothing listed.
            continue
        authorsConcepts[name] = ast.literal_eval(raw_specialization)

    print(f'There are {len(authorsConcepts)} amount of people with specialites listed in ASCI/AAP json file')
    return authorsConcepts
#findAAConcepts()   

def findAANames():
    """List the lower-case "first last" name of every person in the ASCI/AAP file.

    Changes the working directory to the notebook's directory (with
    "open_alex_data" replaced by "asci_aap_data" in the path) and reads
    ``asci_aap_dataJSONUpdated.json`` from there.

    Returns:
        list[str]: One name per person, in file order (duplicates kept).
    """
    data_dir = os.path.dirname(os.path.realpath("Open_AlexMerging.ipynb"))
    data_dir = data_dir.replace("open_alex_data", "asci_aap_data")
    os.chdir(data_dir)
    with open(r"asci_aap_dataJSONUpdated.json") as fileJson:
        allData = json.load(fileJson)["people"]
        print(f'There are {len(allData)} amount of people in ASCI/AAP json file')

    return [
        person["first_name"].lower() + " " + person["last_name"].lower()
        for person in allData
    ]
#findAANames()

#finished
#finds author concepts given list of names, keeping Medicine and high-scoring concepts
def authorConcepts(people):
    """Collect the filtered concepts of every author ID found for each name.

    Args:
        people: Iterable of names passed one by one to ``get_authorIDs``.

    Returns:
        dict: name -> list of single-entry dicts ``{author_id: [concept, ...]}``.
        A concept's display name is kept when its score is at least 90 and its
        level at least 1, or when it is "Medicine".
    """
    authors = {}
    for name in people:
        concepts_per_id = []
        for author_id in get_authorIDs(name):
            record = requests.get(f'https://api.openalex.org/authors/{author_id}').json()
            kept = []
            for concept in record["x_concepts"]:
                strong_and_specific = (float(concept['score']) >= 90.0
                                       and float(concept['level']) >= 1)
                if strong_and_specific or concept['display_name'] == "Medicine":
                    kept.append(concept['display_name'])
            concepts_per_id.append({author_id: kept})
        authors[name] = concepts_per_id
    return authors
#authorConcepts(['Kjersti Aagaard'])


#finds all work details given work id
def findWork(workId):
    """Fetch one OpenAlex work record and drop its bulky fields.

    Args:
        workId: Short OpenAlex work ID such as ``"W2139236349"``.

    Returns:
        dict: The decoded JSON record without the "abstract_inverted_index",
        "related_works" and "ngrams_url" entries.
    """
    fullquery = base_url + 'works/' + workId
    visualize_data = requests.get(fullquery).json()
    # Remove with a default so a record that lacks one of these fields does
    # not raise KeyError; callers only need the remaining data.
    for bulky_field in ("abstract_inverted_index", "related_works", "ngrams_url"):
        visualize_data.pop(bulky_field, None)
    # TODO: clean the unicode (noted by the original author, not implemented)
    return visualize_data
#findWork('W2139236349')


#finds work concepts given work id
def workConcepts(workId):
    """Return the display names of a work's relevant concepts.

    A concept is kept when its score is at least 0.3 and its level at least 2,
    or when it is "Medicine".

    Args:
        workId: Short OpenAlex work ID, looked up via ``findWork``.

    Returns:
        list[str]: Concept display names in the order the record lists them.
    """
    def is_relevant(concept):
        specific_enough = (float(concept['score']) >= 0.3
                           and float(concept['level']) >= 2)
        return specific_enough or concept['display_name'] == "Medicine"

    return [
        concept['display_name']
        for concept in findWork(workId)['concepts']
        if is_relevant(concept)
    ]


# Smoke call left by the author; the result is discarded.
workConcepts("W2005052157")



#checks which concepts occur the most often in a work
def checkConcepts(conceptlist):
    """Return the (up to) three most frequent concepts, ignoring 'Medicine'.

    Args:
        conceptlist: Iterable of concept display names, possibly repeated.

    Returns:
        dict: concept -> number of occurrences, most frequent first. Ties are
        resolved in favour of the concept seen first. Fewer than three entries
        are returned when fewer distinct non-'Medicine' concepts exist.
    """
    count_dict = {}
    for element in conceptlist:
        if element != 'Medicine':
            count_dict[element] = count_dict.get(element, 0) + 1

    # Rank (concept, count) pairs together so every count stays attached to
    # its own concept; ranking the values alone and re-matching keys by
    # position paired counts with the wrong concepts.
    top_three = heapq.nlargest(3, count_dict.items(), key=lambda item: item[1])
    return dict(top_three)

# Sample concept list for trying out checkConcepts by hand.
# Note that 'Internal medicine' and 'Internal Medicine' are distinct strings.
testList = [
    'Medicine',
    'Medicine',
    'Medicine',
    'Eosinophilic esophagitis',
    'Budesonide',
    'Internal medicine',
    'Heartburn',
    'Eosinophilia',
    'Gastroenterology',
    'Nausea',
    'Vomiting',
    'Corticosteroid',
    'Adverse effect',
    'Esophagitis',
    'Eosinophilic esophagitis',
    'Internal Medicine',
    'Budesonide',
]
#checkConcepts(testList)

def findWorkConcepts(names): #keys(1. name, 2. authorId 3. workId)
    """Collect, per name and author ID, the concept lists of their Medicine works.

    Steps: search the name to get author IDs, list each author's work IDs,
    read each work's concepts, and keep the concept list of every work that
    includes 'Medicine'.

    Args:
        names: Iterable of names passed one by one to ``get_authorIDs``.

    Returns:
        dict: name -> list of single-entry dicts
        ``{author_id: [concept_list_of_work, ...]}``. The number of works
        without 'Medicine' among their concepts is printed at the end.
    """
    finalDict = {}
    count = 0
    for name in names:
        listofAuthors = []
        for author_id in get_authorIDs(name):
            tempWorkList = []
            for wID in work_id(author_id):
                # Fetch once and reuse. workConcepts returns a plain list, so
                # it must not be indexed by the work ID (that raised TypeError).
                concepts = workConcepts(wID)
                if 'Medicine' in concepts: #preliminary filter
                    tempWorkList.append(concepts)
                else:
                    count += 1
            listofAuthors.append({author_id: tempWorkList})
        finalDict[name] = listofAuthors
    print(f'{count} amount of workIds did not have Medicine in their concepts')
    return finalDict
#findWorkConcepts(["Seema Aceves"])

William Pao successfully found
Ashley You not found
Kjersti Aagaard successfully found


The following cell merges author IDs based on the concepts of their works.

In [7]:
#Goal: for each author id of a name, gather every concept seen across its works (tested with 'John Adamson')
#Returns: dictionary of ids as keys and a duplicate-free list of concepts from all of their works as value
def getConcepts(name):
    """Map each author ID found for ``name`` to the concepts of all its works.

    Only IDs whose combined concepts include 'Medicine' are kept.

    Args:
        name: Name passed to ``get_authorIDs``.

    Returns:
        dict: author_id -> list of distinct concept names (order not meaningful).
    """
    finalDict = {}
    for aID in get_authorIDs(name):
        allConcepts = []
        for wId in work_id(aID):
            # Union as sets so repeated concepts are stored only once.
            allConcepts = list(set(allConcepts) | set(workConcepts(wId)))
        if 'Medicine' in allConcepts:
            finalDict[aID] = allConcepts
    return finalDict
#getConcepts('John Adamson')

#Goal: set standard for how much concepts must overlap to be the same author
#Return: dictionary of ids as keys and list of ids as values that have more than 4 concepts that they share with the key
def identifyIds(name):
    """Find, for every author ID of ``name``, the other IDs with similar concepts.

    Two IDs count as similar when more than 4 of the first ID's concepts also
    appear in the second ID's concepts. The result is also written to
    ``authorIds_dict.json`` in the current working directory.

    Args:
        name: Name passed to ``getConcepts``.

    Returns:
        dict: author_id -> list of similar author IDs. If a
        ``json.JSONDecodeError`` is raised along the way, an error banner is
        printed and whatever was built so far is returned.
    """
    try:
        totalDict = {}
        dictConcepts = getConcepts(name)

        for aID, own_concepts in dictConcepts.items():
            totalDict[aID] = [
                other_id
                for other_id, other_concepts in dictConcepts.items()
                if other_id != aID
                and sum(1 for concept in own_concepts if concept in other_concepts) > 4
            ]

        with open("authorIds_dict.json", "w") as json_file:
            json.dump(totalDict, json_file)

    except json.JSONDecodeError:
        print("---------Error-------------")
    return totalDict

#identifyIds('John Adamson') 

#Goal: merge author ids
def mergeIds(name):
    """Collapse author IDs whose similar-ID lists are covered by another ID.

    Starting from ``identifyIds(name)``, an ID is merged away when its
    non-empty list of similar IDs is fully contained in another ID's list
    plus that ID itself.

    An ID that has already been merged away never absorbs others. Without
    that rule two mutually similar IDs would each absorb the other and the
    whole group would vanish from the result.

    Args:
        name: Name passed to ``identifyIds``.

    Returns:
        dict: The surviving author IDs mapped to their (unchanged) lists of
        similar IDs. Two summary lines are printed.
    """
    inputDict = identifyIds(name)
    removedKeys = {}

    for key, value in inputDict.items():
        # IDs with no similar IDs have nothing to absorb; merged IDs must not
        # absorb anyone (see docstring).
        if key in removedKeys or not value:
            continue
        group = set(value) | {key}
        for other_key, other_value in inputDict.items():
            if other_key == key or other_key in removedKeys or not other_value:
                continue
            if group.issuperset(other_value): #other_value is all in the key's group
                removedKeys[other_key] = other_value

    for indv in removedKeys:
        del inputDict[indv]

    print(f'{len(removedKeys)} number of author ids were merged into another id')
    print(f'There are {len(inputDict)} author Ids left for {name}')

    return inputDict
    
#mergeIds('John Adamson')

The following cells merge author IDs using institutions.

In [8]:
#Goal: finds authors institution affiliations for each authorId
#Return: dictionary with authorids as keys and the institution name (or None) as value
def authorInstitutions(name):
    """Look up the last known institution of every author ID found for ``name``.

    Args:
        name: Name passed to ``get_authorIDs``.

    Returns:
        dict: author_id -> institution display name, or ``None`` when the
        author record carries no last known institution.
    """
    totalInstitutions = {}
    for author_id in get_authorIDs(name):
        full_query = f'https://api.openalex.org/authors/{author_id}'
        visualize_data = requests.get(full_query).json()

        # .get() so a record without the field yields None instead of KeyError.
        institution = visualize_data.get("last_known_institution")
        if institution is None:
            # NOTE(review): the OpenAlex Author docs mark the singular field as
            # deprecated in favour of the list "last_known_institutions";
            # fall back to its first entry. Confirm against the current API.
            candidates = visualize_data.get("last_known_institutions") or []
            institution = candidates[0] if candidates else None

        if institution is None:
            totalInstitutions[author_id] = None
        else:
            totalInstitutions[author_id] = institution['display_name']
    return totalInstitutions
#authorInstitutions('John Adamson')


#Goal: Finds authorIds that share affiliation
#Return: dictionary with authorids as keys and the list of other authorids at the same institution as value

def identifyInstitutions(name):
    """Group the author IDs of ``name`` by identical institution name.

    Args:
        name: Name passed to ``authorInstitutions``.

    Returns:
        dict: author_id -> list of the other author IDs whose institution
        name equals this one's. IDs without an institution get ``[]``.
    """
    firstDict = authorInstitutions(name)
    totalDict = {}
    for aID, institution in firstDict.items():
        if institution is None:
            # No known institution: nothing to compare against.
            totalDict[aID] = []
            continue
        totalDict[aID] = [
            other_id
            for other_id, other_institution in firstDict.items()
            if other_id != aID and other_institution == institution
        ]
    return totalDict
#identifyInstitutions('John Adamson')




In [9]:
def mergeInstitutions(name):
    """Collapse author IDs of ``name`` that share an institution.

    Starts from ``identifyInstitutions(name)`` (author ID -> list of other IDs
    at the same institution), deletes every entry recorded in ``removeKeys``,
    prints two summary lines and returns the remaining dict.

    The loops rely on in-place mutation: ``tempList`` / ``tempList2`` are the
    very list objects stored in ``inputDict`` (no copies are made), each gets
    its own key appended for the duration of the comparison, and the key is
    removed again afterwards.
    """
    inputDict = identifyInstitutions(name)
    removeKeys = []  # IDs judged to be covered by another ID
    checker = []  # references to the (mutable) lists of keys that already absorbed an ID

    for key, value in inputDict.items():
        tempList = value  # alias of the dict's list, not a copy
        tempList.append(key)  # temporarily make the group include its own key
        for other_key, other_value in inputDict.items():
            tempList2 = other_value  # alias as well; the same object as tempList when other_key == key
            tempList2.append(other_key)
            # Both lists contain their own key at this point, so the two
            # len(...) > 0 tests are always true here.
            if key != other_key and (len(value) > 0) and (len(other_value) > 0): 
                # Once tempList has been appended to checker, `tempList not in checker`
                # is False for the rest of this inner loop, so each outer
                # iteration records at most one removal.
                if all(elem in tempList for elem in other_value) and (tempList not in checker) and (tempList2 not in checker): #other_value (including other_key) is all in tempList
                    removeKeys.append(other_key)
                    checker.append(tempList)
            tempList2.remove(other_key)  # undo the temporary append (removes the first occurrence)
        tempList.remove(key)  # undo the temporary append for the outer key

    # NOTE(review): del raises KeyError if an ID were recorded twice in
    # removeKeys; the guards above appear to prevent that - confirm.
    for indv in removeKeys:
        del(inputDict[indv])
        
    print(f'{len(removeKeys)} number of author IDs were merged into another ID')
    print(f'There are {len(list(inputDict.keys()))} author Ids left for {name}')

    return inputDict

# Notebook cell result; the trailing name is another candidate input.
mergeInstitutions('John Adamson') #Kjersti Aagaard


There are 35 author ids for John Adamson
24 number of author IDs were merged into another ID
There are 11 author Ids left for John Adamson


{'A4338988036': ['A4347349823',
  'A4345940772',
  'A4342530884',
  'A4340970251',
  'A4335235559',
  'A4349866829',
  'A4349785327',
  'A4331103553',
  'A4336431390',
  'A4342077751',
  'A4350165947',
  'A4346282386',
  'A4335240014',
  'A2406690590',
  'A4349232974',
  'A4343479288',
  'A4350492330',
  'A4323480629',
  'A4331630488',
  'A4337095612',
  'A4339183015',
  'A4350323753',
  'A4352456493'],
 'A4334433008': ['A2052798532'],
 'A2585988028': [],
 'A4304423741': [],
 'A3083458673': [],
 'A3198642075': [],
 'A3200964741': [],
 'A4318638943': [],
 'A4327582801': [],
 'A4346260474': [],
 'A4365775785': []}