In [3]:
import openalexapi
import requests
import json
import csv
import time
import math
# Root URL of the OpenAlex API; the helpers below start by looking up author ids from a first and last name.
base_url = 'https://api.openalex.org/'

In [4]:
# finds all author ids given name
#%%time
def get_authorIDs(name):
    """Collect every OpenAlex author id returned by a name search.

    Parameters
    ----------
    name : str
        Author name to search for, e.g. "Freddy Nguyen".

    Returns
    -------
    set of str
        Author ids with the "https://openalex.org/" prefix stripped.

    Raises
    ------
    requests.HTTPError
        If OpenAlex answers with an error status.
    """
    setofIDs = set()
    page = 1

    while True:
        # Let requests build the query string so that characters such as
        # '&', '#' or '+' inside the name cannot corrupt the URL.
        response = requests.get(
            'https://api.openalex.org/authors',
            params={'search': name, 'page': page},
        )
        # Fail loudly on an HTTP error instead of a confusing KeyError below.
        response.raise_for_status()
        payload = response.json()

        for result in payload['results']:
            setofIDs.add(result['id'].replace("https://openalex.org/", ""))

        # Use the page size reported by the API rather than assuming 25.
        meta = payload['meta']
        total_pages = math.ceil(meta['count'] / meta['per_page'])
        if page >= total_pages:
            break
        page += 1

    print(f'There are {len(setofIDs)} author ids for {name}')
    return setofIDs

#get_authorIDs("Freddy Nguyen")

In [5]:
#finds all work ids given author ids
def work_id(givenAuthorID):
    """Return the ids of all works OpenAlex lists for one author id.

    Pages are requested one after another until a page comes back with fewer
    results than the page size reported in its ``meta`` block.

    Parameters
    ----------
    givenAuthorID : str
        OpenAlex author id, e.g. 'A2118799503'.

    Returns
    -------
    list of str
        Work ids with the "https://openalex.org/" prefix stripped.
    """
    url_prefix = f'https://api.openalex.org/works?filter=author.id:{givenAuthorID}&page='
    collected = []
    page_number = 1
    more_pages = True

    while more_pages:
        payload = requests.get(url_prefix + str(page_number)).json()
        page_items = payload['results']

        for item in page_items:
            collected.append(item['id'].replace("https://openalex.org/", ""))

        # A full page means there may be another one after it.
        more_pages = len(page_items) == payload['meta']['per_page']
        page_number += 1

    return collected

#work_id('A2118799503')

In [6]:
#finds entire work summary given work id
def findWork(workId):
    """Fetch one work record from OpenAlex, minus a few bulky fields.

    Parameters
    ----------
    workId : str
        OpenAlex work id, e.g. 'W1986121817'.

    Returns
    -------
    dict
        The decoded JSON record without the "abstract_inverted_index",
        "related_works" and "ngrams_url" keys.
    """
    dropped = ("abstract_inverted_index", "related_works", "ngrams_url")
    record = requests.get(''.join((base_url, 'works/', workId))).json()

    trimmed = {}
    for field in record:
        if field in dropped:
            continue
        trimmed[field] = record[field]
    return trimmed
#findWork('W1986121817')

In [7]:
#finds all work titles given name
def findAllTitles(name):
    """Map each author id found for ``name`` to the titles of its works.

    A work is kept only when one of its authorships carries a display name
    equal to ``name`` (compared case-insensitively).

    Parameters
    ----------
    name : str
        Author name to search for.

    Returns
    -------
    dict
        ``{author_id: {work_id: title}}`` for every author id returned by
        ``get_authorIDs(name)``.
    """
    # Lower-case the target once instead of once per authorship.
    target = name.lower()
    final = {}

    for author_id in get_authorIDs(name):
        titles = {}
        for work_key in work_id(author_id):
            work = findWork(work_key)
            # One matching authorship is enough; stop scanning at the first hit.
            if any(
                authorship['author']['display_name'].lower() == target
                for authorship in work['authorships']
            ):
                titles[work_key] = work['title']
        final[author_id] = titles
    return final
#findAllTitles("Freddy Nguyen")     

In [8]:
#finds all work concepts given author id
def findWorkConcepts(aID):
    """Gather the lower-cased concept names attached to an author's works.

    Only concepts with a score of at least 0.3 and a level of at least 1 are
    kept.

    Parameters
    ----------
    aID : str
        OpenAlex author id.

    Returns
    -------
    set of str
        Lower-cased concept display names.
    """
    concept_names = set()
    for work_key in work_id(aID):
        work = findWork(work_key)
        concept_names.update(
            entry['display_name'].lower()
            for entry in work['concepts']
            if float(entry['score']) >= 0.3 and float(entry['level']) >= 1
        )
    return concept_names
#findWorkConcepts('A4344140327')

In [9]:
#find institution given author id
def findAuthorInstitutions(aID):
    """Return the display name of an author's last known institution.

    Parameters
    ----------
    aID : str
        OpenAlex author id, e.g. 'A4334433008'.

    Returns
    -------
    str or None
        The institution's display name, or None when the record has none.
    """
    response = requests.get(f'https://api.openalex.org/authors/{aID}')
    author = response.json()

    # .get() avoids a KeyError when the singular field is missing from the
    # record altogether.
    institution = author.get("last_known_institution")
    if institution is None:
        # NOTE(review): OpenAlex documents a list-valued
        # "last_known_institutions" field as the successor of the singular
        # one; fall back to its first entry. Confirm against the current
        # Author object schema.
        candidates = author.get("last_known_institutions") or []
        institution = candidates[0] if candidates else None

    return institution["display_name"] if institution is not None else None

#findAuthorInstitutions('A4334433008')  

In [10]:
#finds work institutions given author id and name
def findWorkInstitutions(i, name):
    """Collect the institutions listed next to ``name`` on an author's works.

    For every work of author id ``i``, each authorship whose display name
    matches ``name`` contributes its institutions. A three-part display name
    has its middle part dropped before comparing. The comparison ignores
    letter case, in line with the other name checks in this notebook.

    Parameters
    ----------
    i : str
        OpenAlex author id.
    name : str
        Author name to match, e.g. "Joon You".

    Returns
    -------
    set
        Institution display names found on the matching authorships.
    """
    wanted = name.lower()
    finalSet = set()

    for wID in work_id(i):
        work = findWork(wID)
        for authorship in work['authorships']:
            display_name = authorship['author']['display_name']
            parts = display_name.split()
            if len(parts) == 3:
                # Drop the middle token so "First M. Last" matches "First Last".
                del parts[1]
                display_name = " ".join(parts)
            if display_name.lower() == wanted:
                for institution in authorship["institutions"]:
                    finalSet.add(institution['display_name'])

    return finalSet
#findWorkInstitutions('A4366268973', "Joon You")

In [11]:
#finds coAuthors of all works given author id
#%%time
def findCoAuthors(aID):
    """Return the lower-cased names of everyone credited on an author's works.

    Three-part names have their middle part removed. The author's own name is
    included whenever it appears in the authorships.

    Parameters
    ----------
    aID : str
        OpenAlex author id.

    Returns
    -------
    set of str
        Lower-cased author display names.
    """
    collaborators = set()
    for work_key in work_id(aID):
        for authorship in findWork(work_key)['authorships']:
            lowered = authorship['author']['display_name'].lower()
            tokens = lowered.split()
            if len(tokens) == 3:
                # Keep first and last token only.
                lowered = " ".join([tokens[0], tokens[2]])
            collaborators.add(lowered)
    return collaborators

#findCoAuthors('A4350360547')       
    

In [209]:
def scores(value1, value2, listFilter):
    """Score how strongly two author profiles overlap, feature by feature.

    Parameters
    ----------
    value1, value2 : sequence
        Per-feature values of the two profiles, aligned with ``listFilter``.
        A position contributes only when both entries are sets that share at
        least one element.
    listFilter : sequence of str
        Feature name for each position: "institution", "concept" or
        "coauthor". Letter case is ignored, matching how ``combination``
        interprets the same list.

    Returns
    -------
    int, float or str
        Sum of the per-feature scores:
        * institution: 0.3 when more than one element is shared, 0.2 for
          exactly one;
        * concept / coauthor: ``shared/len(v1) + shared/len(v2)``, so two
          identical sets score 2.
        If anything raises, the exception message is returned as a string
        (legacy behaviour kept for backward compatibility; callers that do
        arithmetic on the result will fail on that string).
    """
    try:
        total = 0
        for v1, v2, v3 in zip(value1, value2, listFilter):
            if not (isinstance(v1, set) and isinstance(v2, set)):
                continue

            shared = len(v1 & v2)
            if shared == 0:
                continue

            # Normalise the filter name so "Institution" and "institution"
            # are scored the same way.
            kind = v3.lower() if isinstance(v3, str) else v3
            if kind == "institution":
                total += 0.3 if shared > 1 else 0.2
            elif kind == "concept" or kind == "coauthor":
                total += (shared / len(v1)) + (shared / len(v2))

        return total

    except Exception as e:
        return str(e)
    
# Sanity check: two shared institution names fall in the "more than one" tier.
scores([{'Massachusetts Institute of Technology', "MIT"}],
       [{'Massachusetts Institute of Technology', "MIT"}], ["institution"])

0.3

In [210]:
def combination(name, listFilter):
    """Build the per-author feature lists used for pairwise scoring.

    For every author id found for ``name``, one entry is produced per
    recognised filter in ``listFilter`` (matched case-insensitively):
    "institution", "concept" or "coauthor". When the "coauthor" filter is
    requested and ``name`` does not appear among an id's co-authors, that id
    is reported and left out of the result.

    Parameters
    ----------
    name : str
        Author name to search for.
    listFilter : list of str
        Feature names to collect, in the order they should appear.

    Returns
    -------
    dict
        ``{author_id: [feature_set, ...]}`` for the ids that were kept.
    """
    totalDict = {}
    removeIds = set()

    for i in get_authorIDs(name):
        features = []
        for requested in listFilter:
            kind = requested.lower()
            if kind == "institution":
                features.append(findWorkInstitutions(i, name))
            elif kind == "concept":
                features.append(findWorkConcepts(i))
            elif kind == "coauthor":
                peers = findCoAuthors(i)
                own_name = name.lower()
                if own_name in peers:
                    # The author is not their own co-author.
                    peers.discard(own_name)
                else:
                    print(f'THERE IS NO {name} IN SET FOR {i}')
                    removeIds.add(i)
                features.append(peers)
        totalDict[i] = features

    # Drop the ids whose works never credited the searched name.
    for rejected in removeIds:
        totalDict.pop(rejected)

    print(f'THERE ARE {len(totalDict.keys())} KEYS LEFT SINCE {len(removeIds)} IDS DID NOT HAVE {name} IN AUTHORSHIPS')
    return totalDict

#combination("Freddy Nguyen", ["institution", "coauthor","concept"])

In [212]:
%%time
def finalMergeTemp(name, listFilter, threshold=0.3):
    """Find, for each author id of ``name``, the other ids that look alike.

    Every pair of ids returned by ``combination`` is scored with ``scores``;
    a pair counts as a match when the score, rounded to one decimal, reaches
    ``threshold``.

    Parameters
    ----------
    name : str
        Author name to search for.
    listFilter : list of str
        Feature names passed on to ``combination`` and ``scores``.
    threshold : float, optional
        Minimum rounded score for two ids to be considered a match.
        Defaults to 0.3, the value previously hard-coded here.

    Returns
    -------
    dict
        ``{author_id: set_of_matching_author_ids}``; ids without any match
        map to an empty set.
    """
    dictGiven = combination(name, listFilter)
    print(f'Starting with {len(dictGiven)} ids')

    finalDict = {}
    for key, value in dictGiven.items():
        finalDict[key] = {
            key2
            for key2, value2 in dictGiven.items()
            if key2 != key
            and round(scores(value, value2, listFilter), ndigits=1) >= threshold
        }

    print(f'Ending with {len(finalDict)} ids')
    return finalDict
# Pairwise matching for one name; each candidate id triggers live OpenAlex requests.
finalMergeTemp("Freddy Nguyen", ["institution", "coauthor","concept"])

There are 10 author ids for Freddy Nguyen
THERE ARE 10 KEYS LEFT SINCE 0 IDS DID NOT HAVE Freddy Nguyen IN AUTHORSHIPS
Starting with 10 ids
Ending with 10 ids
Wall time: 1min 47s


{'A4340714383': {'A2142210918', 'A4344140327', 'A4348767648'},
 'A4308523569': {'A4344140327', 'A4370212430'},
 'A4376068906': {'A4308239957', 'A4344140327', 'A4344815999', 'A4370212430'},
 'A4308239957': {'A4376068906'},
 'A4348767648': {'A2142210918', 'A4340714383', 'A4344140327', 'A4344815999'},
 'A4344140327': {'A2142210918',
  'A4308523569',
  'A4336971941',
  'A4340714383',
  'A4344815999',
  'A4348767648',
  'A4370212430',
  'A4376068906'},
 'A4336971941': {'A4344140327', 'A4370212430'},
 'A4370212430': {'A4308523569', 'A4336971941', 'A4344140327', 'A4376068906'},
 'A2142210918': {'A4340714383', 'A4344140327', 'A4348767648'},
 'A4344815999': {'A4344140327', 'A4348767648', 'A4376068906'}}

In [214]:
# Same pairwise matching for a second name, using all three features.
finalMergeTemp("Joon You", ["institution", "coauthor","concept"])

There are 29 author ids for Joon You
THERE IS NO Joon You IN SET FOR A2229040901
THERE IS NO Joon You IN SET FOR A2790487895
THERE IS NO Joon You IN SET FOR A2395109633
THERE IS NO Joon You IN SET FOR A3174627998
THERE IS NO Joon You IN SET FOR A2698238978
THERE IS NO Joon You IN SET FOR A4319761361
THERE IS NO Joon You IN SET FOR A2483372419
THERE IS NO Joon You IN SET FOR A4346101035
THERE IS NO Joon You IN SET FOR A2149869665
THERE IS NO Joon You IN SET FOR A2224552268
THERE IS NO Joon You IN SET FOR A3176251314
THERE IS NO Joon You IN SET FOR A3045734672
THERE IS NO Joon You IN SET FOR A2338177814
THERE IS NO Joon You IN SET FOR A2716888932
THERE IS NO Joon You IN SET FOR A2982847546
THERE IS NO Joon You IN SET FOR A3141800122
THERE IS NO Joon You IN SET FOR A2156423010
THERE IS NO Joon You IN SET FOR A2645040244
THERE ARE 11 KEYS LEFT SINCE 18 IDS DID NOT HAVE Joon You IN AUTHORSHIPS
Starting with 11 ids
Ending with 11 ids


{'A2721069163': {'A4333091535', 'A4350360547', 'A4366268973'},
 'A4343486061': {'A2118799503'},
 'A4343655724': {'A2125109978', 'A4333091535'},
 'A4330509534': set(),
 'A4351687287': {'A4350360547'},
 'A4334580901': set(),
 'A4350360547': {'A2125109978',
  'A2721069163',
  'A4333091535',
  'A4351687287',
  'A4366268973'},
 'A4333091535': {'A2125109978',
  'A2721069163',
  'A4343655724',
  'A4350360547',
  'A4366268973'},
 'A4366268973': {'A2721069163', 'A4333091535', 'A4350360547'},
 'A2125109978': {'A4333091535', 'A4343655724', 'A4350360547'},
 'A2118799503': {'A4343486061'}}

In [215]:
def finalMerge3(inputDict):
    """Collapse pairwise id matches into one entry per connected group.

    ``inputDict`` maps an id to the set of ids it matched. Ids that are
    linked directly or through a chain of matches end up in the same group.
    Each group is keyed by whichever of its ids appears first in
    ``inputDict``; the value is the set of all other ids in the group. Ids
    without any match keep an empty set.

    The input dictionary and its sets are left untouched.

    Parameters
    ----------
    inputDict : dict
        ``{id: set_of_matching_ids}``.

    Returns
    -------
    dict
        ``{representative_id: set_of_other_ids_in_group}``.
    """
    # Build an undirected adjacency map on private copies so the caller's
    # sets are never mutated.
    neighbours = {key: set(value) for key, value in inputDict.items()}
    for key, value in inputDict.items():
        for other in value:
            neighbours.setdefault(other, set()).add(key)

    finalDict = {}
    assigned = set()
    for root in inputDict:
        if root in assigned:
            # Already absorbed into an earlier group.
            continue

        # Depth-first walk over everything reachable from this root.
        assigned.add(root)
        group = set()
        pending = [root]
        while pending:
            current = pending.pop()
            for other in neighbours[current]:
                if other not in assigned:
                    assigned.add(other)
                    group.add(other)
                    pending.append(other)

        finalDict[root] = group

    return finalDict
    
# Merge the pairwise matches recorded above for "Freddy Nguyen" (pasted in as a literal).
finalMerge3({'A4340714383': {'A2142210918', 'A4344140327', 'A4348767648'},
 'A4308523569': {'A4344140327', 'A4370212430'},
 'A4376068906': {'A4308239957', 'A4344140327', 'A4344815999', 'A4370212430'},
 'A4308239957': {'A4376068906'},
 'A4348767648': {'A2142210918', 'A4340714383', 'A4344140327', 'A4344815999'},
 'A4344140327': {'A2142210918',
  'A4308523569',
  'A4336971941',
  'A4340714383',
  'A4344815999',
  'A4348767648',
  'A4370212430',
  'A4376068906'},
 'A4336971941': {'A4344140327', 'A4370212430'},
 'A4370212430': {'A4308523569', 'A4336971941', 'A4344140327', 'A4376068906'},
 'A2142210918': {'A4340714383', 'A4344140327', 'A4348767648'},
 'A4344815999': {'A4344140327', 'A4348767648', 'A4376068906'}})

{'A4340714383': {'A2142210918',
  'A4308239957',
  'A4308523569',
  'A4336971941',
  'A4344140327',
  'A4344815999',
  'A4348767648',
  'A4370212430',
  'A4376068906'}}

In [216]:
# Merge the pairwise matches recorded above for "Joon You" (pasted in as a literal).
finalMerge3({'A2721069163': {'A4333091535', 'A4350360547', 'A4366268973'},
 'A4343486061': {'A2118799503'},
 'A4343655724': {'A2125109978', 'A4333091535'},
 'A4330509534': set(),
 'A4351687287': {'A4350360547'},
 'A4334580901': set(),
 'A4350360547': {'A2125109978',
  'A2721069163',
  'A4333091535',
  'A4351687287',
  'A4366268973'},
 'A4333091535': {'A2125109978',
  'A2721069163',
  'A4343655724',
  'A4350360547',
  'A4366268973'},
 'A4366268973': {'A2721069163', 'A4333091535', 'A4350360547'},
 'A2125109978': {'A4333091535', 'A4343655724', 'A4350360547'},
 'A2118799503': {'A4343486061'}})

{'A2721069163': {'A2125109978',
  'A4333091535',
  'A4343655724',
  'A4350360547',
  'A4351687287',
  'A4366268973'},
 'A4343486061': {'A2118799503'},
 'A4330509534': set(),
 'A4334580901': set()}