In [1]:
import requests
import json
from nameparser import HumanName

In [13]:
# Location of the JSON file that maps each display name to its OpenAlex author IDs
id_path = 'openAlex_ids.json'

# Start from an empty mapping; it stays empty if the file cannot be read or parsed
id_dictionary = {}

try:
    with open(id_path, 'r') as handle:
        # Read the whole file and parse it into the dictionary
        id_dictionary = json.loads(handle.read())
except json.JSONDecodeError:
    print("Error decoding JSON data.")
except FileNotFoundError:
    print("JSON file not found.")

In [14]:
def clean(author):
    """Reduce one OpenAlex authorship entry to the fields kept downstream.

    Parameters
    ----------
    author : dict
        An authorship entry. It must provide ``author["author"]["id"]``,
        ``author["author"]["display_name"]`` and ``author["author_position"]``.

    Returns
    -------
    dict
        ``id`` (the author ID without its URL prefix), ``name_first``,
        ``name_middle`` and ``name_last`` (the parts reported by ``HumanName``,
        or ``None`` when a part is empty), and ``position``.

    Raises
    ------
    KeyError
        If one of the required keys is missing.
    """
    author_info = author["author"]

    # Fall back to an empty string so the name parser always receives a str,
    # even when the record carries a null display name.
    name = HumanName(author_info["display_name"] or "")

    return {
        # Keep the last path segment instead of slicing a fixed number of
        # characters: "https://openalex.org/A123" -> "A123", and an ID that is
        # already short is left untouched.
        "id": author_info["id"].rsplit("/", 1)[-1],
        "name_first": name.first or None,
        "name_middle": name.middle or None,
        "name_last": name.last or None,
        "position": author["author_position"],
    }

def get_dictionary(work):
    """Trim and normalise one OpenAlex work record in place.

    The record is stripped of the fields that are not kept, every entry of
    ``work["authorships"]`` is replaced by the output of ``clean``, and a few
    keys are renamed.

    Parameters
    ----------
    work : dict
        A work record. It must contain ``id``, ``authorships``,
        ``referenced_works_count``, ``referenced_works``, ``publication_year``
        and ``publication_date``.

    Returns
    -------
    dict
        The same ``work`` object that was passed in (it is mutated, not copied).

    Raises
    ------
    KeyError
        If one of the required keys is missing.
    """
    dropped_keys = (
        "apc_list",
        "apc_paid",
        "is_retracted",
        "best_oa_location",
        "biblio",
        "corresponding_author_ids",
        "corresponding_institution_ids",
        "doi",
        "is_paratext",
        "locations",
        "locations_count",
        "open_access",
        "primary_location",
        "type_crossref",
        "sustainable_development_goals",
        "display_name",
        "institutions_distinct_count",
        "is_authors_truncated",
    )
    for key in dropped_keys:
        # A default makes the pop a no-op when the record lacks the key
        work.pop(key, None)

    # Slice assignment keeps the original list object, as the index-by-index
    # replacement did.
    work["authorships"][:] = [clean(authorship) for authorship in work["authorships"]]

    work["api"] = "openAlex"
    # Keep the last path segment rather than slicing a fixed number of
    # characters, so an ID that is already short is not emptied.
    work["id"] = work["id"].rsplit("/", 1)[-1]
    work["cited_count"] = work.pop("referenced_works_count")
    work["cited"] = work.pop("referenced_works")
    work["year"] = work.pop("publication_year")
    # NOTE: this overwrites any "created_date" value already on the record
    work["created_date"] = work.pop("publication_date")

    return work

In [16]:
# Harvest the works of every OpenAlex author ID in `id_dictionary`, following
# the API's cursor paging, and store each cleaned work once in `works_dictionary`.
OPENALEX_WORKS_URL = "https://api.openalex.org/works"
PER_PAGE = 100        # page size requested from the API
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled connection blocks forever

# Initialize a dictionary to store the cleaned works, keyed by work ID
works_dictionary = {}
# Initialize a set to store the failed queries as (author ID, cursor) pairs
failed_queries = set()

for display_name in id_dictionary:

    for individual_id in id_dictionary[display_name]:
        cursor = "*"
        # Also stop when the API reports no further cursor, so a null value is
        # never sent back as the literal string "None".
        while cursor:
            try:
                response = requests.get(
                    OPENALEX_WORKS_URL,
                    # Let requests build and escape the query string
                    params={
                        "filter": f"author.id:{individual_id}",
                        "per_page": PER_PAGE,
                        "cursor": cursor,
                    },
                    timeout=REQUEST_TIMEOUT,
                )
                # Turn any non-2xx answer into an exception so it is recorded
                # as a failed query instead of reusing the previous page's data.
                response.raise_for_status()

                page = response.json()  # parse the body once
                works = page["results"]

                for work in works:
                    get_dictionary(work)
                    work_id = work["id"]

                    # Keep the first copy of each work
                    works_dictionary.setdefault(work_id, work)

                    # Credit the work to this author ID even when another ID
                    # already stored it (co-authored works), without adding a
                    # duplicate if the cell is run again.
                    author_works = id_dictionary[display_name][individual_id]["works"]
                    if work_id not in author_works:
                        author_works.append(work_id)

                if not works:
                    break

                # Update the query parameters with the next `cursor` value
                cursor = page["meta"]["next_cursor"]

            except Exception as e:
                # Best effort: note where this author ID stopped and move on
                print(e)
                failed_queries.add((individual_id, cursor))
                break

        print(display_name + " " + individual_id + " done")

print(f"Failed queries: {failed_queries}")


Kjersti Aagaard A5055697102 done
Kjersti Aagaard A5010484685 done
Kjersti Aagaard A5064937434 done
Kjersti Aagaard A5008439203 done
Kjersti Aagaard A5020848526 done
Kjersti Aagaard A5046005597 done
Kjersti Aagaard A5052104610 done
Derek Abbott A5040978000 done


KeyboardInterrupt: 

In [17]:
# Persist the three results of the harvest: the cleaned works, the failed
# queries (a set is not JSON-serializable, hence the list) and the ID mapping.
for out_path, payload in (
    ("openAlex_final_works.json", works_dictionary),
    ("openAlex_failed_work_queries.json", list(failed_queries)),
    ("openAlex_final_ids.json", id_dictionary),
):
    with open(out_path, "w") as out_file:
        json.dump(payload, out_file, indent=4)
