In [1]:
import dimcli
from dimcli.utils import *
import re
import psycopg2
import requests
import os, sys, time, json
from tqdm.notebook import tqdm as progressbar
from nameparser import HumanName

import pandas as pd

import plotly.express as px
from plotly.offline import plot
if not 'google.colab' in sys.modules:
  # make js dependencies local / needed by html exports
    from plotly.offline import init_notebook_mode
    init_notebook_mode(connected=True)

print("==\nLogging in..")
# https://digital-science.github.io/dimcli/getting-started.html#authentication
ENDPOINT = "https://app.dimensions.ai"
if 'google.colab' in sys.modules:
    import getpass
    KEY = getpass.getpass(prompt='API Key: ')  
    dimcli.login(key=KEY, endpoint=ENDPOINT)
else:
    KEY = "9F8D648F0D7E437CB1736BEBDF007F02"
    dimcli.login(key=KEY, endpoint=ENDPOINT)
dsl = dimcli.Dsl()

==
Logging in..
[2mDimcli - Dimensions API Client (v1.2)[0m
[2mConnected to: <https://app.dimensions.ai/api/dsl> - DSL v2.10[0m
[2mMethod: manual login[0m
====
Heads up! The latest Dimcli version is  1.3
You have installed:  1.2
====
Please upgrade: `pip install dimcli -U`


In [2]:
# Specify the path to your JSON file
id_path = '/Users/alishali/Desktop/physician-scientists copy/openAlex_scraper/asci_aap_dataJSON.json'

# Initialize an empty dictionary
id_dictionary = {}

try:
    # Open the JSON file in read mode
    with open(id_path, 'r') as json_file:
        # Load the JSON data into the dictionary
        id_dictionary = json.load(json_file)
except FileNotFoundError:
    print("JSON file not found.")
except json.JSONDecodeError:
    print("Error decoding JSON data.")

print(id_dictionary)

{'people': [{'year': '2019', 'first_name': 'Kjersti', 'middle_name': 'M.', 'last_name': 'Aagaard', 'phone': '8012307893', 'email': 'aagaardt@bcm.edu', 'affiliation': "['Baylor College of Medicine']", 'original specialization': "['Obstetrics and Gynecology', 'Clinical research', 'Clinical trials']", 'modified specialization': "['Obstetrics and Gynecology']", 'unactive': 'False', 'organization': 'False', 'email_affiliation': "['ASCI']", 'umbrella_aff': "['bcm']", 'related_aff': '[]', 'umbrella_spec': '[]', 'related_spec': '[]', 'id_num': '[]', 'kumu_num': 'P1'}, {'year': '2016', 'first_name': 'Derek', 'middle_name': 'W.', 'last_name': 'Abbott', 'phone': '2163688564', 'email': 'dwa4@case.edu', 'affiliation': "['Case Western Reserve University School of Medicine']", 'original specialization': "['Molecular Biology', 'Immunology', 'Pathology']", 'modified specialization': "['Pathology']", 'unactive': 'False', 'organization': 'False', 'email_affiliation': "['ASCI']", 'umbrella_aff': "['case']

In [3]:
def execute_command(query):
    try:
        connection = psycopg2.connect(
            "postgresql://selina04_mit_edu:tcM97azb3HqLeOIlV6HGpA@livid-dibbler-6457.g8z.cockroachlabs.cloud:26257/livid-dibbler-6457.test?sslmode=verify-full"
        )

        # Create a cursor and execute the insert statement
        cursor = connection.cursor()
        # print('This is current command: ',query)
        cursor.execute(query)
        connection.commit()  # Commit the transaction
        # print("Data inserted successfully")
        cursor.close()
        connection.close()

    except psycopg2.Error as error:
        print(f"Error: {query}")
        print("------")
        print(error)


In [4]:
#DIMENSIONS HELPER FUNCTIONS
def researcherIds(name):
    try:
        res = dsl.query_iterative(f"""search researchers for "{name}" return researchers""")
        listDict = res.json['researchers']
        allIds = set()
        for idv in listDict:
            allIds.add(idv['id'])

        allIds = list(allIds)  # Convert set back to a list if needed
        
        if len(allIds) > 512:    
            allIds = allIds[0:512]
        print(f'Total of {len(allIds)} unique ids for {name} ')

        # Constructing the DSL query with the list of IDs
        query_ids = ', '.join([f'"{id_val}"' for id_val in allIds])
        query = f'search researchers where id in [{query_ids}] return researchers[id+obsolete+redirect]'

        # Execute the DSL query
        res2 = dsl.query_iterative(query)
        listDict2 = res2.json['researchers']
        allWorkingIDs = set()
        for person in listDict2:
            if person['obsolete'] == 0:  # Current Working Id(s)
                allWorkingIDs.add(person["id"])
            else:
                for ids in person['redirect']:
                    allWorkingIDs.add(ids)
        
        print(f'++++++++ FINAL {len(allWorkingIDs)} Working ID(s) +++++++++++')  
        print(allWorkingIDs)
    except:
        print(f'No IDs found for the {name}')
        return {}
    
    return allWorkingIDs
testIDs = researcherIds("Leslie Schoenfield")

#OPEN ALEX HELPER FUNCTIONS
base_url = 'https://api.openalex.org/'
def author_ids(author_name):

    # Initialize a dictionary to store the queried IDs for each physician-scientist
    ids_dictionary = {}

    # Initialize a set to store the failed queries
    failed_queries = set()

    cursor = "*"

    # Keep making queries until the request is successful or the maximum number of queries is reached
    while True:
        query_author = f'https://api.openalex.org/authors?search={author_name}&per_page=100&cursor={cursor}'
        
        try:
            # Make the API request
            response = requests.get(query_author)

            # Check if the request was successful
            if response.status_code == 200:
                # Get the list of inventors from the response
                authors = response.json()["results"]

                # Iterate over the author and add them to the dictionary
                for author in authors:
                    name = HumanName(author.pop("display_name"))
    
                    # remove unnecessary features
                    author.pop("display_name_alternatives")
                    author.pop("orcid")
                    author.pop("summary_stats")
                    author.pop("x_concepts")
                    author.pop("works_api_url")
                    
                    author["id"] = author["id"][21:]
                    author["name_first"] = name.first if len(name.first) > 1 else None
                    author["name_middle"] = name.middle if len(name.middle) > 1 else None
                    author["name_last"] = name.last if len(name.last) > 1 else None
                    author["works"] = []
                    
                    author["api"] = "openAlex"
                    
                    if author_name not in ids_dictionary:
                        ids_dictionary[author_name] = {}
                    
                    if author["id"] not in ids_dictionary[author_name]:
                        ids_dictionary[author_name][author["id"]] = {}

                    ids_dictionary[author_name][author["id"]] = author

            # Break out of the loop if there are no more authors
            if not authors:
                break

            # Update the query parameters with the next `cursor` value
            cursor = response.json()["meta"]["next_cursor"]
                
        # If the request is unsuccessful, add the query name and `after` value to the failed queries set
        except Exception as e:
            print(e)
            failed_queries.add((author_name, cursor))
            break

    print(author_name + " done")

    # Print the failed queries
    print(f"Failed queries: {failed_queries}")
    return ids_dictionary, failed_queries

#finds all work ids by author ids 
def work_id(givenAuthorID):
        page = 'page={}'
        filtered_works_url = f'https://api.openalex.org/works?filter=author.id:{givenAuthorID}&{page}'
        page = 1
        has_more_pages = True
        all_worksID = set()

        # loop through pages
        while has_more_pages:
            # set page value and request page from OpenAlex
            url = filtered_works_url.format(page)
            page_with_results = requests.get(url).json()
            #print("page_with_results ", page_with_results)

            # loop through partial list of results
            results = page_with_results['results']
            for i,work in enumerate(results):
                openalex_id = work['id'].replace("https://openalex.org/", "")
                all_worksID.add(openalex_id)
            # next page
            page += 1

            # end loop when either there are no more results on the requested page 
            # or the next request would exceed 15 results
            per_page = page_with_results['meta']['per_page']
            has_more_pages = len(results) == per_page
        return (all_worksID)
#list_output = work_id('A5044648110')
#print(f'{len(list_output)}')

#gets each publications by its workid

def findWork(workId):
    fullquery = base_url+'works/'+workId
    response = requests.get(fullquery)
    data = response.json()

    # Specify the keys you're interested in
    keys = [
        'authorships', 'best_oa_location', 'cited_by_api_url', 'cited_by_count', 'concepts',
        'counts_by_year', 'doi', 'grants', 'id', 'Ids', 'is_paratext', 'keywords', 'locations',
        'mesh', 'primary_location', 'publication_date', 'publication_year', 'referenced_works',
        'topics', 'title'
    ]

    # Create a new dictionary with only the specified keys
    visualize_data = {key: data.get(key, None) for key in keys}

    return visualize_data

def findAuthor(authorID):
    fullquery = base_url+'authors/'+authorID
    response = requests.get(fullquery)
    data = response.json()

    # Specify the keys you're interested in
    keys = [
        "affiliations",
        "cited_by_count",
        "display_name",
        "display_name_alternatives",
        "id",
        "ids",
        "last_known_institutions",
        "summary_stats",
        "works_api_url",
        "works_count",
        "topics"
    ]   

    # Create a new dictionary with only the specified keys
    visualize_data = {key: data.get(key, None) for key in keys}

    return visualize_data

Starting iteration with limit=1000 skip=0 ...[0m
0-2 / 2 (0.94s)[0m
===
Records extracted: 2[0m
Starting iteration with limit=1000 skip=0 ...[0m


Total of 2 unique ids for Leslie Schoenfield 


0-2 / 2 (0.49s)[0m
===
Records extracted: 2[0m


++++++++ FINAL 2 Working ID(s) +++++++++++
{'ur.014424615212.32', 'ur.0102031123.62'}


In [11]:
# Function to query Dimensions
def query_dimensions(ids):
    ids_str = ', '.join([f'"{id}"' for id in ids])
    q = f"""search researchers where id in [{ids_str}] return researchers[id+current_research_org+dimensions_url+first_grant_year+first_name+first_publication_year+last_grant_year+last_name+last_publication_year+nih_ppid+obsolete+orcid_id+redirect+research_orgs+score+total_grants+total_publications]"""
    result = dsl.query_iterative(q, show_results=None, limit=100, skip=0, pause=1.5, force=False, maxlimit=0, verbose=None, _tot_count_prev_query=0, _warnings_tot=None)
    if not "researchers" in result.data:
        return []
    return result.data["researchers"]

# Function to query OpenAlex
def query_openalex(author_name):
    result_dict = [] #Storing a list of dictionaries, each representing an author with the keys as the characteristics (i.e. id, title, pub_date)
    authorIds, failed = author_ids(author_name)
    print(author_name + ": "+ str(authorIds))
    for author_name, author_data in authorIds.items():
        for authorId_dict in author_data.values():
            authorId = authorId_dict['id']  # Extract the ID from the dictionary
            author_details = findAuthor(authorId)
            if 'id' in author_details:
                author_details['id'] = author_details['id'].replace('https://openalex.org/', '')
            if 'ids' in author_details and 'orcid' in author_details['ids']:
                author_details['ids']['orcid'] = author_details['ids']['orcid'].replace('https://orcid.org/', '')
            result_dict.append(author_details)
    print("openalex result dict: ", result_dict)
    return result_dict

In [7]:
#to be filled in next cell, for use in grant, patents, clinical trials querying
researcher_ids = []

In [13]:

# RESEARCHERS TABLE UPLOAD
execute_command("""DROP TABLE IF EXISTS Grants;""")
execute_command("""DROP TABLE IF EXISTS Patents;""")
execute_command("""DROP TABLE IF EXISTS Clinical_Trials;""")
execute_command("""DROP TABLE IF EXISTS Researchers;""")


execute_command("""CREATE TABLE Researchers (
    id STRING PRIMARY KEY,
    current_research_org STRING,
    dimensions_url STRING,
    first_grant_year STRING,
    first_name STRING,
    first_publication_year STRING,
    last_grant_year STRING,
    last_name STRING,
    last_publication_year STRING,
    nih_ppid STRING,
    obsolete STRING,
    orcid_id STRING,
    redirect STRING,
    research_orgs STRING,
    score STRING,
    total_grants STRING,
    total_publications STRING,
    affiliations STRING,
    cited_by_count STRING,
    display_name STRING,
    display_name_alternatives STRING,
    last_known_institutions STRING,
    summary_stats STRING,
    works_api_url STRING,
    works_count STRING,
    topics STRING,
    asci_year STRING
);
""")

print(id_dictionary["people"][2300:2310])
for author in id_dictionary["people"][2300:2310]:
    first_name = author["first_name"].strip(".")
    middle_name = author["middle_name"].strip(".")
    last_name = author["last_name"].strip(".")
    asci_year_value = author["year"]
    
    if author["middle_name"]: 
        author_name = first_name + " " + middle_name + " " + last_name
    else: 
        author_name = first_name + " " + last_name

    print(author_name)
    
    ids = list(researcherIds(author_name))
    researcher_ids += ids

    print(f"-------------querying for {author_name}----------------------")

    dimensions_data = query_dimensions(ids)
    openalex_data = query_openalex(author_name)
    
    combined_data = dimensions_data + openalex_data
    
    columns = [
        "id",
        "current_research_org",
        "dimensions_url",
        "first_grant_year",
        "first_publication_year",
        "first_name",
        "last_grant_year",
        "last_name",
        "last_publication_year",
        "nih_ppid",
        "obsolete",
        "orcid_id",
        "redirect",
        "research_orgs",
        "score",
        "total_grants",
        "total_publications",
        "affiliations",
        "cited_by_count",
        "display_name",
        "display_name_alternatives",
        "last_known_institutions",
        "summary_stats",
        "works_api_url",
        "works_count",
        "topics",
        "asci_year"
    ]
    
    for researcher in combined_data: 
        print("-----------researcher-------------", researcher)
        fields = []
        values = []
        update_fields = []
        for field, value in researcher.items():
            if not isinstance(value, str):
                value = json.dumps(value)
            value = value.replace("'", "''")
            if field in columns:
                fields.append(field)
                values.append(f"'{value}'")
                update_fields.append(f"{field} = EXCLUDED.{field}")

        columns_str = ', '.join(fields)
        values_str = ', '.join(values)
        update_str = ', '.join(update_fields)

        columns_str += ", asci_year"
        values_str += f", '{asci_year_value}'"
        update_str += f", asci_year = EXCLUDED.asci_year"

        # Insert or update the researcher
        insert_query = f"""INSERT INTO Researchers ({columns_str}) VALUES({values_str}) ON CONFLICT (id) DO UPDATE SET {update_str};"""
        execute_command(insert_query)

        # Handle ORCID IDs from Dimensions
        if 'orcid_id' in researcher and researcher['orcid_id']:
            orcid_id = str(researcher['orcid_id']).strip('["]').strip('\'"')
            print("-----------researcherid-------------", researcher['id'])
            print("-----------orcid_id-------------", orcid_id)
            # Check if the ORCID ID already exists
            existing_orcid_query = f"SELECT * FROM Researchers WHERE id = '{orcid_id}'"
            existing_orcid = execute_command(existing_orcid_query)
            if existing_orcid:
                # Merge the new data with the existing data
                existing_data = dict(zip(columns, existing_orcid[0]))
                merged_data = {**existing_data, **researcher}
                
                # Prepare the update query with merged data
                update_fields = []
                for field, value in merged_data.items():
                    if not isinstance(value, str):
                        value = json.dumps(value)
                    value = value.replace("'", "''")
                    if field in columns:
                        update_fields.append(f"{field} = '{value}'")
                
                update_str = ', '.join(update_fields)
                update_orcid_query = f"UPDATE Researchers SET {update_str} WHERE id = '{orcid_id}';"
                execute_command(update_orcid_query)
            else:
                # Insert a new row with the ORCID ID, duplicating other values
                orcid_insert_query = f"INSERT INTO Researchers ({columns_str}) VALUES({values_str.replace(researcher['id'], str(orcid_id))}) ON CONFLICT (id) DO UPDATE SET {update_str};"
                execute_command(orcid_insert_query)

        # Handle ORCID IDs from OpenAlex
        if 'ids' in researcher and 'orcid' in researcher['ids']:
            orcid_id = researcher['ids']['orcid']
            print("-----------researcherid-------------", researcher['id'])
            print("-----------orcid_id-------------", orcid_id)
            # Check if the ORCID ID already exists
            existing_orcid_query = f"SELECT * FROM Researchers WHERE id = '{orcid_id}'"
            existing_orcid = execute_command(existing_orcid_query)
            if existing_orcid:
                # Merge the new data with the existing data
                existing_data = dict(zip(columns, existing_orcid[0]))
                merged_data = {**existing_data, **researcher}
                
                # Prepare the update query with merged data
                update_fields = []
                for field, value in merged_data.items():
                    if not isinstance(value, str):
                        value = json.dumps(value)
                    value = value.replace("'", "''")
                    if field in columns:
                        update_fields.append(f"{field} = '{value}'")
                
                update_str = ', '.join(update_fields)
                update_orcid_query = f"UPDATE Researchers SET {update_str} WHERE id = '{orcid_id}';"
                execute_command(update_orcid_query)
            else:
                # Insert a new row with the ORCID ID, duplicating other values
                orcid_insert_query = f"INSERT INTO Researchers ({columns_str}) VALUES({values_str.replace(researcher['id'], str(orcid_id))}) ON CONFLICT (id) DO UPDATE SET {update_str};"
                execute_command(orcid_insert_query)

Starting iteration with limit=1000 skip=0 ...[0m


[{'year': '2005', 'first_name': 'Jonathan', 'middle_name': '', 'last_name': 'Lindner', 'phone': '5034948750', 'email': 'lindnerj@ohsu.edu', 'affiliation': "['Oregon Health Science University School of Medicine']", 'original specialization': "['Cardiovascular Disease']", 'modified specialization': "['Cardiovascular Disease']", 'unactive': 'False', 'organization': 'False', 'email_affiliation': "['ASCI']", 'umbrella_aff': "['ohsu']", 'related_aff': "['Oregon Health & Science Affiliates*']", 'umbrella_spec': "['Oregon Health & Science University', 'Oregon Health Science University School of Medicine']", 'related_spec': "['Internal Medicine']", 'id_num': "['Cardiovascular Disease', 'Informatics', 'Critical Care Medicine', 'Endocrinology', 'Diabetes', 'Metabolism', 'Gastroenterology', 'Geriatrics', 'Gerontology', 'Hematology', 'Medical Oncology', 'Breast cancer', 'Neuro-oncology', 'Pulmonology', 'Rheumatology', 'Allergy', 'Bone marrow transplantation', 'Cardiology', 'General Medicine', 'Hepa

0-7 / 7 (0.40s)[0m
===
Records extracted: 7[0m
Starting iteration with limit=1000 skip=0 ...[0m


Total of 7 unique ids for Jonathan Lindner 


0-7 / 7 (0.28s)[0m
===
Records extracted: 7[0m
Starting iteration with limit=100 skip=0 ...[0m


++++++++ FINAL 7 Working ID(s) +++++++++++
{'ur.01242700051.26', 'ur.01175006530.14', 'ur.011755311476.25', 'ur.0670046312.82', 'ur.010011051350.54', 'ur.016017457303.89', 'ur.014144320643.55'}
-------------querying for Jonathan Lindner----------------------


0-7 / 7 (2.75s)[0m
===
Records extracted: 7[0m


Jonathan Lindner done
Failed queries: set()
Jonathan Lindner: {'Jonathan Lindner': {'A5037936416': {'id': 'A5037936416', 'relevance_score': 22491.947, 'works_count': 444, 'cited_by_count': 25381, 'ids': {'openalex': 'https://openalex.org/A5037936416', 'orcid': 'https://orcid.org/0000-0003-2604-5277'}, 'affiliations': [{'institution': {'id': 'https://openalex.org/I165690674', 'ror': 'https://ror.org/009avj582', 'display_name': 'Oregon Health & Science University', 'country_code': 'US', 'type': 'education', 'lineage': ['https://openalex.org/I165690674']}, 'years': [2024, 2023, 2022, 2021, 2020, 2019, 2018, 2017, 2016, 2015]}, {'institution': {'id': 'https://openalex.org/I4210134211', 'ror': 'https://ror.org/046kb4y45', 'display_name': 'University of Virginia Medical Center', 'country_code': 'US', 'type': 'healthcare', 'lineage': ['https://openalex.org/I2799765794', 'https://openalex.org/I4210134211']}, 'years': [2024, 2023, 2022, 2005, 2004, 2002, 2000, 1998, 1997, 1996]}, {'institution'

Starting iteration with limit=1000 skip=0 ...[0m


Pamela Ling


0-1 / 1 (0.33s)[0m
===
Records extracted: 1[0m
Starting iteration with limit=1000 skip=0 ...[0m


Total of 1 unique ids for Pamela Ling 


0-1 / 1 (0.26s)[0m
===
Records extracted: 1[0m
Starting iteration with limit=100 skip=0 ...[0m


++++++++ FINAL 1 Working ID(s) +++++++++++
{'ur.0772067763.56'}
-------------querying for Pamela Ling----------------------


0-1 / 1 (0.42s)[0m
===
Records extracted: 1[0m


Pamela Ling done
Failed queries: set()
Pamela Ling: {'Pamela Ling': {'A5030374908': {'id': 'A5030374908', 'relevance_score': 10767.484, 'works_count': 4488, 'cited_by_count': 150313, 'ids': {'openalex': 'https://openalex.org/A5030374908', 'orcid': 'https://orcid.org/0009-0004-6334-051X'}, 'affiliations': [{'institution': {'id': 'https://openalex.org/I4210135723', 'ror': 'https://ror.org/02txedb84', 'display_name': 'Shanghai Institute of Technical Physics', 'country_code': 'CN', 'type': 'facility', 'lineage': ['https://openalex.org/I19820366', 'https://openalex.org/I4210135723']}, 'years': [2024]}, {'institution': {'id': 'https://openalex.org/I19820366', 'ror': 'https://ror.org/034t30j35', 'display_name': 'Chinese Academy of Sciences', 'country_code': 'CN', 'type': 'government', 'lineage': ['https://openalex.org/I19820366']}, 'years': [2024, 2022, 2020, 2018, 2017, 2014, 2013, 2012, 2011, 2010]}, {'institution': {'id': 'https://openalex.org/I136199984', 'ror': 'https://ror.org/03vek6s52

Starting iteration with limit=1000 skip=0 ...[0m


Vishwanath Rao Lingappa


===
Records extracted: 0[0m
Starting iteration with limit=1000 skip=0 ...[0m


Total of 0 unique ids for Vishwanath Rao Lingappa 
Query Error
1 QuerySyntaxError found

1 ParserError found
  * [Line 1:32] (']') no viable alternative at input '[]'



>>>[Dimcli tip] An error occurred with the batch '0-1000'. Consider using the 'limit' argument to retrieve fewer records per iteration, or use 'force=True' to ignore errors and continue the extraction.[0m
Starting iteration with limit=100 skip=0 ...[0m


No IDs found for the Vishwanath Rao Lingappa
-------------querying for Vishwanath Rao Lingappa----------------------
Query Error
1 QuerySyntaxError found

1 ParserError found
  * [Line 1:32] (']') no viable alternative at input '[]'



>>>[Dimcli tip] An error occurred with the batch '0-100'. Consider using the 'limit' argument to retrieve fewer records per iteration, or use 'force=True' to ignore errors and continue the extraction.[0m
Starting iteration with limit=1000 skip=0 ...[0m


Vishwanath Rao Lingappa done
Failed queries: set()
Vishwanath Rao Lingappa: {}
openalex result dict:  []
Daniel C Link


0-2 / 2 (0.28s)[0m
===
Records extracted: 2[0m
Starting iteration with limit=1000 skip=0 ...[0m


Total of 2 unique ids for Daniel C Link 


0-2 / 2 (2.72s)[0m
===
Records extracted: 2[0m
Starting iteration with limit=100 skip=0 ...[0m


++++++++ FINAL 2 Working ID(s) +++++++++++
{'ur.016375226437.33', 'ur.010633200331.89'}
-------------querying for Daniel C Link----------------------


0-2 / 2 (1.80s)[0m
===
Records extracted: 2[0m


Daniel C Link done
Failed queries: set()
Daniel C Link: {'Daniel C Link': {'A5049284725': {'id': 'A5049284725', 'relevance_score': 29324.01, 'works_count': 799, 'cited_by_count': 35254, 'ids': {'openalex': 'https://openalex.org/A5049284725', 'orcid': 'https://orcid.org/0000-0002-3170-7581'}, 'affiliations': [{'institution': {'id': 'https://openalex.org/I204465549', 'ror': 'https://ror.org/01yc7t268', 'display_name': 'Washington University in St. Louis', 'country_code': 'US', 'type': 'education', 'lineage': ['https://openalex.org/I204465549']}, 'years': [2024, 2023, 2022, 2021, 2020, 2019, 2018, 2017, 2016, 2015]}, {'institution': {'id': 'https://openalex.org/I4210135078', 'ror': 'https://ror.org/036c27j91', 'display_name': 'Washington University Medical Center', 'country_code': 'US', 'type': 'healthcare', 'lineage': ['https://openalex.org/I4210135078']}, 'years': [2019, 1992, 1991]}, {'institution': {'id': 'https://openalex.org/I4210119077', 'ror': 'https://ror.org/02kb97560', 'display

Starting iteration with limit=1000 skip=0 ...[0m


MacRae F Linton


0-1 / 1 (0.26s)[0m
===
Records extracted: 1[0m
Starting iteration with limit=1000 skip=0 ...[0m


Total of 1 unique ids for MacRae F Linton 


0-1 / 1 (0.26s)[0m
===
Records extracted: 1[0m
Starting iteration with limit=100 skip=0 ...[0m


++++++++ FINAL 1 Working ID(s) +++++++++++
{'ur.01326031522.43'}
-------------querying for MacRae F Linton----------------------


0-1 / 1 (0.33s)[0m
===
Records extracted: 1[0m


MacRae F Linton done
Failed queries: set()
MacRae F Linton: {'MacRae F Linton': {'A5089000198': {'id': 'A5089000198', 'relevance_score': 28507.861, 'works_count': 364, 'cited_by_count': 19533, 'ids': {'openalex': 'https://openalex.org/A5089000198', 'orcid': 'https://orcid.org/0000-0002-9277-0453'}, 'affiliations': [{'institution': {'id': 'https://openalex.org/I901861585', 'ror': 'https://ror.org/05dq2gs74', 'display_name': 'Vanderbilt University Medical Center', 'country_code': 'US', 'type': 'healthcare', 'lineage': ['https://openalex.org/I4210162197', 'https://openalex.org/I901861585']}, 'years': [2024, 2023, 2022, 2021, 2020, 2019, 2018, 2017, 2016, 2015]}, {'institution': {'id': 'https://openalex.org/I200719446', 'ror': 'https://ror.org/02vm5rt34', 'display_name': 'Vanderbilt University', 'country_code': 'US', 'type': 'education', 'lineage': ['https://openalex.org/I200719446']}, 'years': [2024, 2023, 2022, 2021, 2020, 2019, 2018, 2017, 2016, 2015]}, {'institution': {'id': 'https://o

Starting iteration with limit=1000 skip=0 ...[0m


Michail S Lionakis


0-1 / 1 (0.36s)[0m
===
Records extracted: 1[0m
Starting iteration with limit=1000 skip=0 ...[0m


Total of 1 unique ids for Michail S Lionakis 


0-1 / 1 (0.24s)[0m
===
Records extracted: 1[0m
Starting iteration with limit=100 skip=0 ...[0m


++++++++ FINAL 1 Working ID(s) +++++++++++
{'ur.01044511164.71'}
-------------querying for Michail S Lionakis----------------------


0-1 / 1 (0.33s)[0m
===
Records extracted: 1[0m


Michail S Lionakis done
Failed queries: set()
Michail S Lionakis: {'Michail S Lionakis': {'A5022799608': {'id': 'A5022799608', 'relevance_score': 28139.713, 'works_count': 445, 'cited_by_count': 17215, 'ids': {'openalex': 'https://openalex.org/A5022799608', 'orcid': 'https://orcid.org/0000-0003-4994-9500', 'scopus': 'http://www.scopus.com/inward/authorDetails.url?authorID=6507497145&partnerID=MN8TOARS'}, 'affiliations': [{'institution': {'id': 'https://openalex.org/I1299303238', 'ror': 'https://ror.org/01cwqze88', 'display_name': 'National Institutes of Health', 'country_code': 'US', 'type': 'government', 'lineage': ['https://openalex.org/I1299022934', 'https://openalex.org/I1299303238']}, 'years': [2024, 2023, 2022, 2021, 2020, 2019, 2018, 2017, 2016, 2015]}, {'institution': {'id': 'https://openalex.org/I4210134534', 'ror': 'https://ror.org/043z4tv69', 'display_name': 'National Institute of Allergy and Infectious Diseases', 'country_code': 'US', 'type': 'facility', 'lineage': ['https:

Starting iteration with limit=1000 skip=0 ...[0m


Lance A Liotta


0-1 / 1 (0.49s)[0m
===
Records extracted: 1[0m
Starting iteration with limit=1000 skip=0 ...[0m


Total of 1 unique ids for Lance A Liotta 


0-1 / 1 (0.27s)[0m
===
Records extracted: 1[0m
Starting iteration with limit=100 skip=0 ...[0m


++++++++ FINAL 1 Working ID(s) +++++++++++
{'ur.014136153365.29'}
-------------querying for Lance A Liotta----------------------


0-1 / 1 (0.26s)[0m
===
Records extracted: 1[0m


Lance A Liotta done
Failed queries: set()
Lance A Liotta: {'Lance A Liotta': {'A5086886026': {'id': 'A5086886026', 'relevance_score': 60821.297, 'works_count': 1063, 'cited_by_count': 93200, 'ids': {'openalex': 'https://openalex.org/A5086886026', 'orcid': 'https://orcid.org/0000-0001-5155-7907'}, 'affiliations': [{'institution': {'id': 'https://openalex.org/I162714631', 'ror': 'https://ror.org/02jqj7156', 'display_name': 'George Mason University', 'country_code': 'US', 'type': 'education', 'lineage': ['https://openalex.org/I162714631']}, 'years': [2024, 2023, 2022, 2021, 2020, 2019, 2018, 2017, 2016, 2015]}, {'institution': {'id': 'https://openalex.org/I79576946', 'ror': 'https://ror.org/00b30xv10', 'display_name': 'University of Pennsylvania', 'country_code': 'US', 'type': 'education', 'lineage': ['https://openalex.org/I79576946']}, 'years': [2019, 2012, 2007]}, {'institution': {'id': 'https://openalex.org/I32971472', 'ror': 'https://ror.org/03v76x132', 'display_name': 'Yale Universit

Starting iteration with limit=1000 skip=0 ...[0m


Steven M Lipkin


===
Records extracted: 0[0m
Starting iteration with limit=1000 skip=0 ...[0m


Total of 0 unique ids for Steven M Lipkin 
Query Error
1 QuerySyntaxError found

1 ParserError found
  * [Line 1:32] (']') no viable alternative at input '[]'



>>>[Dimcli tip] An error occurred with the batch '0-1000'. Consider using the 'limit' argument to retrieve fewer records per iteration, or use 'force=True' to ignore errors and continue the extraction.[0m
Starting iteration with limit=100 skip=0 ...[0m


No IDs found for the Steven M Lipkin
-------------querying for Steven M Lipkin----------------------
Query Error
1 QuerySyntaxError found

1 ParserError found
  * [Line 1:32] (']') no viable alternative at input '[]'



>>>[Dimcli tip] An error occurred with the batch '0-100'. Consider using the 'limit' argument to retrieve fewer records per iteration, or use 'force=True' to ignore errors and continue the extraction.[0m


Steven M Lipkin done
Failed queries: set()
Steven M Lipkin: {'Steven M Lipkin': {'A5040643638': {'id': 'A5040643638', 'relevance_score': 17096.488, 'works_count': 315, 'cited_by_count': 9094, 'ids': {'openalex': 'https://openalex.org/A5040643638', 'orcid': 'https://orcid.org/0000-0002-0603-9139'}, 'affiliations': [{'institution': {'id': 'https://openalex.org/I205783295', 'ror': 'https://ror.org/05bnh6r87', 'display_name': 'Cornell University', 'country_code': 'US', 'type': 'education', 'lineage': ['https://openalex.org/I205783295']}, 'years': [2024, 2023, 2022, 2021, 2020, 2019, 2018, 2017, 2016, 2015]}, {'institution': {'id': 'https://openalex.org/I4210130527', 'ror': 'https://ror.org/03aeycp46', 'display_name': 'Meyer (China)', 'country_code': 'CN', 'type': 'company', 'lineage': ['https://openalex.org/I4210130527']}, 'years': [2017]}, {'institution': {'id': 'https://openalex.org/I145220665', 'ror': 'https://ror.org/01wvxpc32', 'display_name': 'Cornell College', 'country_code': 'US', 

Starting iteration with limit=1000 skip=0 ...[0m


Marc E Lippman


0-1 / 1 (0.45s)[0m
===
Records extracted: 1[0m
Starting iteration with limit=1000 skip=0 ...[0m


Total of 1 unique ids for Marc E Lippman 


0-1 / 1 (0.21s)[0m
===
Records extracted: 1[0m
Starting iteration with limit=100 skip=0 ...[0m


++++++++ FINAL 1 Working ID(s) +++++++++++
{'ur.01242102471.41'}
-------------querying for Marc E Lippman----------------------


0-1 / 1 (2.61s)[0m
===
Records extracted: 1[0m


Marc E Lippman done
Failed queries: set()
Marc E Lippman: {'Marc E Lippman': {'A5028539104': {'id': 'A5028539104', 'relevance_score': 37309.984, 'works_count': 612, 'cited_by_count': 42632, 'ids': {'openalex': 'https://openalex.org/A5028539104', 'orcid': 'https://orcid.org/0000-0001-5280-4084'}, 'affiliations': [{'institution': {'id': 'https://openalex.org/I2799903593', 'ror': 'https://ror.org/00hjz7x27', 'display_name': 'Georgetown University Medical Center', 'country_code': 'US', 'type': 'healthcare', 'lineage': ['https://openalex.org/I2799903593']}, 'years': [2024, 2023, 2022, 2021, 2020, 2019, 2004, 2003, 2002, 2001]}, {'institution': {'id': 'https://openalex.org/I184565670', 'ror': 'https://ror.org/05vzafd60', 'display_name': 'Georgetown University', 'country_code': 'US', 'type': 'education', 'lineage': ['https://openalex.org/I184565670']}, 'years': [2024, 2023, 2022, 2021, 2020, 2019, 2017, 2005, 2004, 2003]}, {'institution': {'id': 'https://openalex.org/I145608581', 'ror': 'http

Starting iteration with limit=1000 skip=0 ...[0m


Howard L Lippton


===
Records extracted: 0[0m
Starting iteration with limit=1000 skip=0 ...[0m


Total of 0 unique ids for Howard L Lippton 
Query Error
1 QuerySyntaxError found

1 ParserError found
  * [Line 1:32] (']') no viable alternative at input '[]'



>>>[Dimcli tip] An error occurred with the batch '0-1000'. Consider using the 'limit' argument to retrieve fewer records per iteration, or use 'force=True' to ignore errors and continue the extraction.[0m
Starting iteration with limit=100 skip=0 ...[0m


No IDs found for the Howard L Lippton
-------------querying for Howard L Lippton----------------------
Query Error
1 QuerySyntaxError found

1 ParserError found
  * [Line 1:32] (']') no viable alternative at input '[]'



>>>[Dimcli tip] An error occurred with the batch '0-100'. Consider using the 'limit' argument to retrieve fewer records per iteration, or use 'force=True' to ignore errors and continue the extraction.[0m


Howard L Lippton done
Failed queries: set()
Howard L Lippton: {'Howard L Lippton': {'A5028376304': {'id': 'A5028376304', 'relevance_score': 9521.105, 'works_count': 112, 'cited_by_count': 3484, 'ids': {'openalex': 'https://openalex.org/A5028376304'}, 'affiliations': [{'institution': {'id': 'https://openalex.org/I121820613', 'ror': 'https://ror.org/05ect4e57', 'display_name': 'Louisiana State University', 'country_code': 'US', 'type': 'education', 'lineage': ['https://openalex.org/I121820613']}, 'years': [2001, 1998, 1997, 1995, 1994, 1993, 1992, 1991, 1990, 1989]}, {'institution': {'id': 'https://openalex.org/I75420490', 'ror': 'https://ror.org/01qv8fp92', 'display_name': 'Louisiana State University Health Sciences Center New Orleans', 'country_code': 'US', 'type': 'healthcare', 'lineage': ['https://openalex.org/I75420490']}, 'years': [2001, 1998, 1995]}, {'institution': {'id': 'https://openalex.org/I81020160', 'ror': 'https://ror.org/03151rh82', 'display_name': 'Louisiana State Univer

In [None]:
def query_dimensions(ids):
    ids_str = ', '.join([f'"{id}"' for id in ids])
    q = f"""search researchers where id in [{ids_str}] return researchers[id+current_research_org+dimensions_url+first_grant_year+first_publication_year+last_grant_year+last_name+last_publication_year+nih_ppid+obsolete+orcid_id+redirect+research_orgs+score+total_grants+total_publications]"""
    result = dsl.query_iterative(q, show_results=None, limit=100, skip=0, pause=1.5, force=False, maxlimit=0, verbose=None, _tot_count_prev_query=0, _warnings_tot=None)
    if not "researchers" in result.data:
        return []
    return result.data["researchers"]

In [14]:
#GRANTS TABLE UPLOAD
execute_command(f"""CREATE TABLE Grants (
    id STRING PRIMARY KEY,
    abstract STRING,
    active_year STRING, 
    concepts STRING, 
    concepts_scores STRING,
    category_rcdc STRING,
    date_inserted STRING, 
    dimensions_url STRING,
    end_date STRING, 
    funder_orgs STRING, 
    funding_USD STRING, 
    investigators STRING,
    keywords STRING,
    original_title STRING, 
    project_numbers STRING,
    research_orgs STRING,
    researchers STRING, 
    score STRING, 
    start_date STRING, 
    title STRING,
    researcher_id STRING,
    FOREIGN KEY (researcher_id) REFERENCES researchers(id)
    );""")

query_work2_template = """search grants where researchers = "{}" return grants [abstract + active_year + concepts + concepts_scores + date_inserted + dimensions_url + end_date + funder_orgs + funding_usd + investigators + keywords + original_title + research_orgs + researchers + score + start_date + title + id + category_rcdc + project_numbers]"""
for id in researcher_ids:
    q = query_work2_template.format(id)
    result = dsl.query_iterative(q, show_results=None, limit=100, skip=0, pause=1.5, force=False, maxlimit=0, verbose=None, _tot_count_prev_query=0, _warnings_tot=None)
    result_dict = result.data["grants"]
    print(q)
    print(result_dict)
    for grant in result_dict: 
        fields = []
        values = []
        update_fields = []
        for field, value in grant.items():
            if not isinstance(value, str):
                # Convert non-string values to JSON string
                value = json.dumps(value)
            value = value.replace("'", "''")
            fields.append(field)
            values.append(f"'{value}'")
            update_fields.append(f"{field} = EXCLUDED.{field}")

        # Join the lists to create the SQL query
        columns = ', '.join(fields)
        columns += ", researcher_id"
        values_str = ', '.join(values)
        values_str += f", '{id}'"
        update_str = ', '.join(update_fields)
        update_str += f", researcher_id = EXCLUDED.researcher_id"

        insert_query = insert_query = f"INSERT INTO Grants ({columns}) VALUES({values_str}) ON CONFLICT (id) DO UPDATE SET {update_str};"

        execute_command(insert_query)


Starting iteration with limit=100 skip=0 ...[0m
===
Records extracted: 0[0m
Starting iteration with limit=100 skip=0 ...[0m


search grants where researchers = "ur.01242700051.26" return grants [abstract + active_year + concepts + concepts_scores + date_inserted + dimensions_url + end_date + funder_orgs + funding_usd + investigators + keywords + original_title + research_orgs + researchers + score + start_date + title + id + category_rcdc + project_numbers]
[]


0-12 / 12 (0.69s)[0m
===
Records extracted: 12[0m


search grants where researchers = "ur.01175006530.14" return grants [abstract + active_year + concepts + concepts_scores + date_inserted + dimensions_url + end_date + funder_orgs + funding_usd + investigators + keywords + original_title + research_orgs + researchers + score + start_date + title + id + category_rcdc + project_numbers]
[{'abstract': 'SUMMARY Aortic stenosis (AS) is a serious condition that affects 2-4% of the elderly, and is responsible for U.S. healthcare expenditures of over $6 billion annually attributable mostly to valve replacement procedures. Frequently, AS is diagnosed by non-invasive imaging before it is severe or symptomatic. Yet there are no pharmacologic therapies to slow progression of disease. The pathobiology of AS involves the myofibroblastic and osteoblastic transformation of valvular interstitial cells (VICs) that mediate matrix remodeling and calcification. The plurality of events and signaling pathways that influence VICs is one reason for lack of effe

Starting iteration with limit=100 skip=0 ...[0m
===
Records extracted: 0[0m
Starting iteration with limit=100 skip=0 ...[0m


search grants where researchers = "ur.011755311476.25" return grants [abstract + active_year + concepts + concepts_scores + date_inserted + dimensions_url + end_date + funder_orgs + funding_usd + investigators + keywords + original_title + research_orgs + researchers + score + start_date + title + id + category_rcdc + project_numbers]
[]


===
Records extracted: 0[0m
Starting iteration with limit=100 skip=0 ...[0m


search grants where researchers = "ur.0670046312.82" return grants [abstract + active_year + concepts + concepts_scores + date_inserted + dimensions_url + end_date + funder_orgs + funding_usd + investigators + keywords + original_title + research_orgs + researchers + score + start_date + title + id + category_rcdc + project_numbers]
[]


0-1 / 1 (0.30s)[0m
===
Records extracted: 1[0m


search grants where researchers = "ur.010011051350.54" return grants [abstract + active_year + concepts + concepts_scores + date_inserted + dimensions_url + end_date + funder_orgs + funding_usd + investigators + keywords + original_title + research_orgs + researchers + score + start_date + title + id + category_rcdc + project_numbers]
[{'abstract': 'Inflammatory responses play a deterimental role in cardiac transplant rejection and reperfusion injury after myocardial ischemia. A simple and reliable method for imaging and quantifying tissue inflammatory responses in the clinical setting is not currently available. Such a technique would be useful for both the detection of cardiac allograft rejection and for assessing new treatment strategies for attenuating leukocyte activation and recruitment following coronary reperfusion that are being developed and tested. The central aim of this proposal is to develop and characterize a non-invasive method for assessing the spatial extent and sever

Starting iteration with limit=100 skip=0 ...[0m
0-1 / 1 (0.46s)[0m
===
Records extracted: 1[0m


search grants where researchers = "ur.016017457303.89" return grants [abstract + active_year + concepts + concepts_scores + date_inserted + dimensions_url + end_date + funder_orgs + funding_usd + investigators + keywords + original_title + research_orgs + researchers + score + start_date + title + id + category_rcdc + project_numbers]
[{'abstract': 'Since the majority of experienced astronauts are middle aged, they are at risk for developing serious cardiovascular events such as a myocardial infarction or sudden cardiac death, especially during high intensity exertion. Studies led to the current flight medicine practice of screening all astronaut candidates (and following all active crew members) with coronary artery calcium (CAC) scoring. However, atherosclerosis is a progressive process. The development of vascular calcification may be preceded by substantial non-calcified plaque, which may be most prone to rupture and cause an acute coronary syndrome. Radiation and inflammation may 

Starting iteration with limit=100 skip=0 ...[0m
===
Records extracted: 0[0m
Starting iteration with limit=100 skip=0 ...[0m


search grants where researchers = "ur.014144320643.55" return grants [abstract + active_year + concepts + concepts_scores + date_inserted + dimensions_url + end_date + funder_orgs + funding_usd + investigators + keywords + original_title + research_orgs + researchers + score + start_date + title + id + category_rcdc + project_numbers]
[]


0-13 / 13 (0.45s)[0m
===
Records extracted: 13[0m


search grants where researchers = "ur.0772067763.56" return grants [abstract + active_year + concepts + concepts_scores + date_inserted + dimensions_url + end_date + funder_orgs + funding_usd + investigators + keywords + original_title + research_orgs + researchers + score + start_date + title + id + category_rcdc + project_numbers]
[{'abstract': 'ABSTRACT We have one specific aim: Train postdoctoral fellows who will be qualified and well-positioned to become leaders in the development and implementation of substance use research in tobacco control, public health, policy, and clinical practice. Specifically, we propose a postdoctoral fellowship program that will attract individuals with a strong commitment to transdisciplinary research from a variety of medical, biological, social, behavioral, and policy sciences to help build the next generation of scientific leaders in tobacco control and related substance use. Tobacco remains the leading preventable cause of death the U.S. and dispr

Starting iteration with limit=100 skip=0 ...[0m
0-16 / 16 (0.46s)[0m
===
Records extracted: 16[0m


search grants where researchers = "ur.016375226437.33" return grants [abstract + active_year + concepts + concepts_scores + date_inserted + dimensions_url + end_date + funder_orgs + funding_usd + investigators + keywords + original_title + research_orgs + researchers + score + start_date + title + id + category_rcdc + project_numbers]
[{'abstract': 'The long-term goal of this of this research is to identify new genetic causes of congenital neutropenia and characterize their molecular mechanisms of disease pathogenesis. Severe congenital neutropenia (SCN) is an inborn disorder of granulopoiesis characterized by severe chronic neutropenia from birth, premature death secondary to infectious complications, and transformation to myeloid malignancy. Approximately one-third of cases do not have a known genetic cause. We performed whole exome sequencing of 85 cases of congenital neutropenia. Heterozygous missense mutations of CLPB, encoding caseinolytic peptidase B, were identified in 6 of 45 

Starting iteration with limit=100 skip=0 ...[0m
===
Records extracted: 0[0m
Starting iteration with limit=100 skip=0 ...[0m


search grants where researchers = "ur.010633200331.89" return grants [abstract + active_year + concepts + concepts_scores + date_inserted + dimensions_url + end_date + funder_orgs + funding_usd + investigators + keywords + original_title + research_orgs + researchers + score + start_date + title + id + category_rcdc + project_numbers]
[]


0-10 / 10 (0.39s)[0m
===
Records extracted: 10[0m


search grants where researchers = "ur.01326031522.43" return grants [abstract + active_year + concepts + concepts_scores + date_inserted + dimensions_url + end_date + funder_orgs + funding_usd + investigators + keywords + original_title + research_orgs + researchers + score + start_date + title + id + category_rcdc + project_numbers]
[{'abstract': 'DESCRIPTION (provided by applicant): Atherosclerosis, the underlying cause of heart attack and stroke, is a major cause of death and suffering worldwide. The scavenger receptor BI (SR-BI) plays crucial roles in preventing atherosclerosis both by serving as a hepatic receptor for HDL cholesterol and by regulating macrophage cellular cholesterol homeostasis and survival in the arterial plaque. Recent studies have implicated SR-BI in cell survival by preventing apoptosis. Interestingly, our preliminary studies implicate a critical role for SR-BI in regulating autophagy, another key mechanism for promoting cell survival. Furthermore, our data su

Starting iteration with limit=100 skip=0 ...[0m
0-4 / 4 (0.33s)[0m
===
Records extracted: 4[0m


search grants where researchers = "ur.01044511164.71" return grants [abstract + active_year + concepts + concepts_scores + date_inserted + dimensions_url + end_date + funder_orgs + funding_usd + investigators + keywords + original_title + research_orgs + researchers + score + start_date + title + id + category_rcdc + project_numbers]
[{'active_year': [2022], 'concepts': ['CARD9 signaling', 'experimental dermatophytosis', 'protective immunity', 'CARD9', 'dermatophytosis', 'signal', 'immunity', 'skin'], 'concepts_scores': [{'concept': 'CARD9 signaling', 'relevance': 0.326}, {'concept': 'experimental dermatophytosis', 'relevance': 0.301}, {'concept': 'protective immunity', 'relevance': 0.269}, {'concept': 'CARD9', 'relevance': 0.261}, {'concept': 'dermatophytosis', 'relevance': 0.234}, {'concept': 'signal', 'relevance': 0.21}, {'concept': 'immunity', 'relevance': 0.203}, {'concept': 'skin', 'relevance': 0.197}], 'date_inserted': '2023-11-30', 'dimensions_url': 'https://app.dimensions.ai/d

Starting iteration with limit=100 skip=0 ...[0m
0-2 / 2 (0.33s)[0m
===
Records extracted: 2[0m


search grants where researchers = "ur.014136153365.29" return grants [abstract + active_year + concepts + concepts_scores + date_inserted + dimensions_url + end_date + funder_orgs + funding_usd + investigators + keywords + original_title + research_orgs + researchers + score + start_date + title + id + category_rcdc + project_numbers]
[{'abstract': 'A major scientific challenge is to understand the molecular events that drive the evolution of premalignant lesions in actual tissue. Laser capture microdissection (LCM) was originated to provide a reliable method to procure pure populations of cells from specific microscopic regions of tissue sections; in one step, under direct visualization. The cells of interest are transferred to a polymer film that is activated by laser pulses. The exact morphology of the procured cells (with intact DNA, RNA and proteins) is retained and held on the transfer film. LCM technology has been successfully applied to DNA, and RNA analysis from frozen and fix

Starting iteration with limit=100 skip=0 ...[0m
0-20 / 20 (0.52s)[0m
===
Records extracted: 20[0m


search grants where researchers = "ur.01242102471.41" return grants [abstract + active_year + concepts + concepts_scores + date_inserted + dimensions_url + end_date + funder_orgs + funding_usd + investigators + keywords + original_title + research_orgs + researchers + score + start_date + title + id + category_rcdc + project_numbers]
[{'abstract': "Background:  Diabetes is associated not only with an increased risk of developing breast cancer (BC), but also BC metastasis and death.  While BC and diabetes share common risk factors, the biological mechanisms linking the two remain unclear.  We have shown that the Receptor for Advanced Glycation End-products (RAGE) and its ligands (AGEs and s100A8/9) are a critical pathway underlying diabetes and BC and may therefore represent a common link between these two highly prevalent and deadly disease states.  Our preliminary and published data show that RAGE drives the invasive phenotype in BC cells and in metastasis in animal models and that a 

Starting iteration with limit=100 skip=0 ...[0m
===
Records extracted: 0[0m
Starting iteration with limit=100 skip=0 ...[0m


search grants where researchers = "ur.01242700051.26" return grants [abstract + active_year + concepts + concepts_scores + date_inserted + dimensions_url + end_date + funder_orgs + funding_usd + investigators + keywords + original_title + research_orgs + researchers + score + start_date + title + id + category_rcdc + project_numbers]
[]


0-12 / 12 (0.66s)[0m
===
Records extracted: 12[0m


search grants where researchers = "ur.01175006530.14" return grants [abstract + active_year + concepts + concepts_scores + date_inserted + dimensions_url + end_date + funder_orgs + funding_usd + investigators + keywords + original_title + research_orgs + researchers + score + start_date + title + id + category_rcdc + project_numbers]
[{'abstract': 'SUMMARY Aortic stenosis (AS) is a serious condition that affects 2-4% of the elderly, and is responsible for U.S. healthcare expenditures of over $6 billion annually attributable mostly to valve replacement procedures. Frequently, AS is diagnosed by non-invasive imaging before it is severe or symptomatic. Yet there are no pharmacologic therapies to slow progression of disease. The pathobiology of AS involves the myofibroblastic and osteoblastic transformation of valvular interstitial cells (VICs) that mediate matrix remodeling and calcification. The plurality of events and signaling pathways that influence VICs is one reason for lack of effe

Starting iteration with limit=100 skip=0 ...[0m
===
Records extracted: 0[0m
Starting iteration with limit=100 skip=0 ...[0m


search grants where researchers = "ur.011755311476.25" return grants [abstract + active_year + concepts + concepts_scores + date_inserted + dimensions_url + end_date + funder_orgs + funding_usd + investigators + keywords + original_title + research_orgs + researchers + score + start_date + title + id + category_rcdc + project_numbers]
[]


===
Records extracted: 0[0m
Starting iteration with limit=100 skip=0 ...[0m


search grants where researchers = "ur.0670046312.82" return grants [abstract + active_year + concepts + concepts_scores + date_inserted + dimensions_url + end_date + funder_orgs + funding_usd + investigators + keywords + original_title + research_orgs + researchers + score + start_date + title + id + category_rcdc + project_numbers]
[]


0-1 / 1 (0.26s)[0m
===
Records extracted: 1[0m


search grants where researchers = "ur.010011051350.54" return grants [abstract + active_year + concepts + concepts_scores + date_inserted + dimensions_url + end_date + funder_orgs + funding_usd + investigators + keywords + original_title + research_orgs + researchers + score + start_date + title + id + category_rcdc + project_numbers]
[{'abstract': 'Inflammatory responses play a deterimental role in cardiac transplant rejection and reperfusion injury after myocardial ischemia. A simple and reliable method for imaging and quantifying tissue inflammatory responses in the clinical setting is not currently available. Such a technique would be useful for both the detection of cardiac allograft rejection and for assessing new treatment strategies for attenuating leukocyte activation and recruitment following coronary reperfusion that are being developed and tested. The central aim of this proposal is to develop and characterize a non-invasive method for assessing the spatial extent and sever

Starting iteration with limit=100 skip=0 ...[0m
0-1 / 1 (0.32s)[0m
===
Records extracted: 1[0m


search grants where researchers = "ur.016017457303.89" return grants [abstract + active_year + concepts + concepts_scores + date_inserted + dimensions_url + end_date + funder_orgs + funding_usd + investigators + keywords + original_title + research_orgs + researchers + score + start_date + title + id + category_rcdc + project_numbers]
[{'abstract': 'Since the majority of experienced astronauts are middle aged, they are at risk for developing serious cardiovascular events such as a myocardial infarction or sudden cardiac death, especially during high intensity exertion. Studies led to the current flight medicine practice of screening all astronaut candidates (and following all active crew members) with coronary artery calcium (CAC) scoring. However, atherosclerosis is a progressive process. The development of vascular calcification may be preceded by substantial non-calcified plaque, which may be most prone to rupture and cause an acute coronary syndrome. Radiation and inflammation may 

Starting iteration with limit=100 skip=0 ...[0m
===
Records extracted: 0[0m
Starting iteration with limit=100 skip=0 ...[0m


search grants where researchers = "ur.014144320643.55" return grants [abstract + active_year + concepts + concepts_scores + date_inserted + dimensions_url + end_date + funder_orgs + funding_usd + investigators + keywords + original_title + research_orgs + researchers + score + start_date + title + id + category_rcdc + project_numbers]
[]


0-13 / 13 (0.35s)[0m
===
Records extracted: 13[0m


search grants where researchers = "ur.0772067763.56" return grants [abstract + active_year + concepts + concepts_scores + date_inserted + dimensions_url + end_date + funder_orgs + funding_usd + investigators + keywords + original_title + research_orgs + researchers + score + start_date + title + id + category_rcdc + project_numbers]
[{'abstract': 'ABSTRACT We have one specific aim: Train postdoctoral fellows who will be qualified and well-positioned to become leaders in the development and implementation of substance use research in tobacco control, public health, policy, and clinical practice. Specifically, we propose a postdoctoral fellowship program that will attract individuals with a strong commitment to transdisciplinary research from a variety of medical, biological, social, behavioral, and policy sciences to help build the next generation of scientific leaders in tobacco control and related substance use. Tobacco remains the leading preventable cause of death the U.S. and dispr

Starting iteration with limit=100 skip=0 ...[0m
0-16 / 16 (0.48s)[0m
===
Records extracted: 16[0m


search grants where researchers = "ur.016375226437.33" return grants [abstract + active_year + concepts + concepts_scores + date_inserted + dimensions_url + end_date + funder_orgs + funding_usd + investigators + keywords + original_title + research_orgs + researchers + score + start_date + title + id + category_rcdc + project_numbers]
[{'abstract': 'The long-term goal of this of this research is to identify new genetic causes of congenital neutropenia and characterize their molecular mechanisms of disease pathogenesis. Severe congenital neutropenia (SCN) is an inborn disorder of granulopoiesis characterized by severe chronic neutropenia from birth, premature death secondary to infectious complications, and transformation to myeloid malignancy. Approximately one-third of cases do not have a known genetic cause. We performed whole exome sequencing of 85 cases of congenital neutropenia. Heterozygous missense mutations of CLPB, encoding caseinolytic peptidase B, were identified in 6 of 45 

Starting iteration with limit=100 skip=0 ...[0m
===
Records extracted: 0[0m
Starting iteration with limit=100 skip=0 ...[0m


search grants where researchers = "ur.010633200331.89" return grants [abstract + active_year + concepts + concepts_scores + date_inserted + dimensions_url + end_date + funder_orgs + funding_usd + investigators + keywords + original_title + research_orgs + researchers + score + start_date + title + id + category_rcdc + project_numbers]
[]


0-10 / 10 (0.31s)[0m
===
Records extracted: 10[0m


search grants where researchers = "ur.01326031522.43" return grants [abstract + active_year + concepts + concepts_scores + date_inserted + dimensions_url + end_date + funder_orgs + funding_usd + investigators + keywords + original_title + research_orgs + researchers + score + start_date + title + id + category_rcdc + project_numbers]
[{'abstract': 'DESCRIPTION (provided by applicant): Atherosclerosis, the underlying cause of heart attack and stroke, is a major cause of death and suffering worldwide. The scavenger receptor BI (SR-BI) plays crucial roles in preventing atherosclerosis both by serving as a hepatic receptor for HDL cholesterol and by regulating macrophage cellular cholesterol homeostasis and survival in the arterial plaque. Recent studies have implicated SR-BI in cell survival by preventing apoptosis. Interestingly, our preliminary studies implicate a critical role for SR-BI in regulating autophagy, another key mechanism for promoting cell survival. Furthermore, our data su

Starting iteration with limit=100 skip=0 ...[0m
0-4 / 4 (0.25s)[0m
===
Records extracted: 4[0m


search grants where researchers = "ur.01044511164.71" return grants [abstract + active_year + concepts + concepts_scores + date_inserted + dimensions_url + end_date + funder_orgs + funding_usd + investigators + keywords + original_title + research_orgs + researchers + score + start_date + title + id + category_rcdc + project_numbers]
[{'active_year': [2022], 'concepts': ['CARD9 signaling', 'experimental dermatophytosis', 'protective immunity', 'CARD9', 'dermatophytosis', 'signal', 'immunity', 'skin'], 'concepts_scores': [{'concept': 'CARD9 signaling', 'relevance': 0.326}, {'concept': 'experimental dermatophytosis', 'relevance': 0.301}, {'concept': 'protective immunity', 'relevance': 0.269}, {'concept': 'CARD9', 'relevance': 0.261}, {'concept': 'dermatophytosis', 'relevance': 0.234}, {'concept': 'signal', 'relevance': 0.21}, {'concept': 'immunity', 'relevance': 0.203}, {'concept': 'skin', 'relevance': 0.197}], 'date_inserted': '2023-11-30', 'dimensions_url': 'https://app.dimensions.ai/d

Starting iteration with limit=100 skip=0 ...[0m
0-2 / 2 (0.29s)[0m
===
Records extracted: 2[0m


search grants where researchers = "ur.014136153365.29" return grants [abstract + active_year + concepts + concepts_scores + date_inserted + dimensions_url + end_date + funder_orgs + funding_usd + investigators + keywords + original_title + research_orgs + researchers + score + start_date + title + id + category_rcdc + project_numbers]
[{'abstract': 'A major scientific challenge is to understand the molecular events that drive the evolution of premalignant lesions in actual tissue. Laser capture microdissection (LCM) was originated to provide a reliable method to procure pure populations of cells from specific microscopic regions of tissue sections; in one step, under direct visualization. The cells of interest are transferred to a polymer film that is activated by laser pulses. The exact morphology of the procured cells (with intact DNA, RNA and proteins) is retained and held on the transfer film. LCM technology has been successfully applied to DNA, and RNA analysis from frozen and fix

Starting iteration with limit=100 skip=0 ...[0m
0-20 / 20 (0.42s)[0m
===
Records extracted: 20[0m


search grants where researchers = "ur.01242102471.41" return grants [abstract + active_year + concepts + concepts_scores + date_inserted + dimensions_url + end_date + funder_orgs + funding_usd + investigators + keywords + original_title + research_orgs + researchers + score + start_date + title + id + category_rcdc + project_numbers]
[{'abstract': "Background:  Diabetes is associated not only with an increased risk of developing breast cancer (BC), but also BC metastasis and death.  While BC and diabetes share common risk factors, the biological mechanisms linking the two remain unclear.  We have shown that the Receptor for Advanced Glycation End-products (RAGE) and its ligands (AGEs and s100A8/9) are a critical pathway underlying diabetes and BC and may therefore represent a common link between these two highly prevalent and deadly disease states.  Our preliminary and published data show that RAGE drives the invasive phenotype in BC cells and in metastasis in animal models and that a 

In [16]:
#PATENTS TABLE UPLOAD
execute_command(f"""CREATE TABLE IF NOT EXISTS Patents (
    id STRING PRIMARY KEY,
    abstract STRING,
    application_number STRING,
    assignee_names STRING,
    assignees STRING,
    associated_grant_ids STRING,
    claims_amount STRING,
    current_assignee_names STRING,
    current_assignees STRING,
    category_rcdc STRING,
    date STRING,
    date_inserted STRING,
    dimensions_url STRING,
    expiration_date STRING,
    federal_support STRING,
    filing_date STRING,
    filing_status STRING,
    funders STRING,
    granted_date STRING,
    granted_year STRING,
    inventor_names STRING,
    inventors STRING,
    kind STRING,
    priority_date STRING,
    priority_year STRING,
    publication_date STRING,
    publication_ids STRING,
    publication_year STRING,
    publications STRING,
    reference_ids STRING,
    researchers STRING,
    score STRING,
    times_cited STRING,
    title STRING,
    year STRING,
    researcher_id STRING,
    FOREIGN KEY (researcher_id) REFERENCES researchers(id)
);""")

query_work2_template = """search patents where researchers = "{}" return patents [abstract + application_number + assignee_names + assignees + associated_grant_ids + cited_by_ids + claims_amount + current_assignee_names + current_assignees + date + date_inserted + dimensions_url + expiration_date + federal_support + filing_date + filing_status + funders + granted_date + granted_year + id + inventor_names + inventors + kind + priority_date + priority_year + publication_date + publication_ids + publication_year + publications + reference_ids + researchers + score + times_cited + title + year + category_rcdc]"""

for id in researcher_ids:

    q = query_work2_template.format(id)
    result = dsl.query_iterative(q, show_results=None, limit=500, skip=0, pause=1.5, force=False, maxlimit=0, verbose=None, _tot_count_prev_query=0, _warnings_tot=None)
    result_dict = result.data["patents"]

    for patent in result_dict: 
        fields = []
        values = []
        update_fields = []
        for field, value in patent.items():
            if not isinstance(value, str):
                # Convert non-string values to JSON string
                value = json.dumps(value)
            value = value.replace("'", "''")
            fields.append(field)
            values.append(f"'{value}'")  # Enclose string values in single quotes
            update_fields.append(f"{field} = EXCLUDED.{field}")

        # Join the lists to create the SQL query
        columns = ', '.join(fields)
        columns += ", researcher_id"
        values_str = ', '.join(values)
        values_str += f", '{id}'"
        update_str = ', '.join(update_fields)
        update_str += f", researcher_id = EXCLUDED.researcher_id"

        insert_query = insert_query = f"INSERT INTO Patents ({columns}) VALUES({values_str}) ON CONFLICT (id) DO UPDATE SET {update_str};"

        execute_command(insert_query)


Starting iteration with limit=500 skip=0 ...[0m
===
Records extracted: 0[0m
Starting iteration with limit=500 skip=0 ...[0m
0-8 / 8 (0.43s)[0m
===
Records extracted: 8[0m
Starting iteration with limit=500 skip=0 ...[0m
===
Records extracted: 0[0m
Starting iteration with limit=500 skip=0 ...[0m
===
Records extracted: 0[0m
Starting iteration with limit=500 skip=0 ...[0m
===
Records extracted: 0[0m
Starting iteration with limit=500 skip=0 ...[0m
===
Records extracted: 0[0m
Starting iteration with limit=500 skip=0 ...[0m
===
Records extracted: 0[0m
Starting iteration with limit=500 skip=0 ...[0m
===
Records extracted: 0[0m
Starting iteration with limit=500 skip=0 ...[0m
0-1 / 1 (0.32s)[0m
===
Records extracted: 1[0m
Starting iteration with limit=500 skip=0 ...[0m
===
Records extracted: 0[0m
Starting iteration with limit=500 skip=0 ...[0m
0-22 / 22 (0.99s)[0m
===
Records extracted: 22[0m
Starting iteration with limit=500 skip=0 ...[0m
0-2 / 2 (0.33s)[0m
===
Record

In [17]:
#CLINICAL TRIALS TABLE UPLOAD
execute_command(f"""CREATE TABLE IF NOT EXISTS Clinical_Trials (
    id STRING PRIMARY KEY,
    abstract STRING,
    active_years STRING, 
    associated_grant_ids STRING, 
    brief_title STRING,
    conditions STRING,
    date_inserted STRING, 
    dimensions_url STRING,
    funders STRING,
    end_date STRING, 
    investigators STRING,
    publications STRING,
    publication_ids STRING,
    research_orgs STRING,
    researchers STRING, 
    mesh_terms STRING,
    score STRING, 
    category_rcdc STRING,
    start_date STRING, 
    title STRING,
    researcher_id STRING,
    FOREIGN KEY (researcher_id) REFERENCES researchers(id)
);""")

query_work2_template = """search clinical_trials in investigators for "{}" return clinical_trials [abstract + end_date + active_years + associated_grant_ids + brief_title + conditions + date_inserted + dimensions_url + funders + id + investigators + publications + publication_ids + research_orgs + researchers + score + start_date + title + category_rcdc + mesh_terms]"""

for id in researcher_ids:

    q = query_work2_template.format(id)
    
    result = dsl.query_iterative(q, show_results=None, limit=100, skip=0, pause=1.5, force=False, maxlimit=0, verbose=None, _tot_count_prev_query=0, _warnings_tot=None)
    result_dict = result.data["clinical_trials"]
    
    for ct in result_dict: 
        fields = []
        values = []
        update_fields = []
        for field, value in ct.items():
            if not isinstance(value, str):
                # Convert non-string values to JSON string
                value = json.dumps(value)
            value = value.replace("'", "''")
            fields.append(field)
            values.append(f"'{value}'")  # Enclose string values in single quotes
            update_fields.append(f"{field} = EXCLUDED.{field}")

        # Join the lists to create the SQL query
        columns = ', '.join(fields)
        columns += ", researcher_id"
        values_str = ', '.join(values)
        values_str += f", '{id}'"
        update_str = ', '.join(update_fields)
        update_str += f", researcher_id = EXCLUDED.researcher_id"

        insert_query = insert_query = f"INSERT INTO Clinical_Trials ({columns}) VALUES({values_str}) ON CONFLICT (id) DO UPDATE SET {update_str};"

        execute_command(insert_query)


Starting iteration with limit=100 skip=0 ...[0m
===
Records extracted: 0[0m
Starting iteration with limit=100 skip=0 ...[0m
0-15 / 15 (0.39s)[0m
===
Records extracted: 15[0m
Starting iteration with limit=100 skip=0 ...[0m
===
Records extracted: 0[0m
Starting iteration with limit=100 skip=0 ...[0m
===
Records extracted: 0[0m
Starting iteration with limit=100 skip=0 ...[0m
===
Records extracted: 0[0m
Starting iteration with limit=100 skip=0 ...[0m
===
Records extracted: 0[0m
Starting iteration with limit=100 skip=0 ...[0m
===
Records extracted: 0[0m
Starting iteration with limit=100 skip=0 ...[0m
0-2 / 2 (6.13s)[0m
===
Records extracted: 2[0m
Starting iteration with limit=100 skip=0 ...[0m
===
Records extracted: 0[0m
Starting iteration with limit=100 skip=0 ...[0m
===
Records extracted: 0[0m
Starting iteration with limit=100 skip=0 ...[0m
0-1 / 1 (1.17s)[0m
===
Records extracted: 1[0m
Starting iteration with limit=100 skip=0 ...[0m
0-5 / 5 (3.12s)[0m
===
Record

In [21]:

#PUBLICATIONS TABLE UPLOAD

alexResearcher_ids = []
i = 0 #to get specific researcher_id
execute_command("DROP TABLE IF EXISTS Publications;")
execute_command(f"""CREATE TABLE Publications (
    authorships STRING,
    best_oa_location STRING,
    cited_by_api_url STRING,
    cited_by_count STRING,
    concepts STRING,
    counts_by_year STRING,
    doi STRING,
    grants STRING,
    id STRING PRIMARY KEY,
    ids STRING,
    is_paratext STRING,
    keywords STRING,
    locations STRING,
    mesh STRING,
    primary_location STRING,
    publication_date STRING,
    publication_year STRING,
    referenced_works STRING,
    topics STRING,
    title STRING,
    researcher_id STRING,
    FOREIGN KEY (researcher_id) REFERENCES researchers(id)
);
""")

for author in id_dictionary["people"][2303:2310]:
    #Get the person's first and last name
    first_name = author["first_name"].strip(".")
    middle_name = author["middle_name"].strip(".")
    last_name = author["last_name"].strip(".")
    asci_year_value = author["year"]
    

    #Create a query name by combining the person's first and last name
    if author["middle_name"]: 
        author_name = first_name + " " + middle_name + " " + last_name
    else: 
        author_name = first_name + " " + last_name

    result_dict = [] #Storing a list of dictionaries, each representing a publication with the keys as the characteristics (i.e. id, title, pub_date)
    authorIds, failed = author_ids(author_name)
    print(author_name + ": "+ str(authorIds))
    for author_name, author_data in authorIds.items():
        for authorId_dict in author_data.values():
            authorId = authorId_dict['id']  # Extract the ID from the dictionary
            print("this is the authorid: " + str(authorId))
            workIds = work_id(authorId)
            for workId in workIds:
                alexResearcher_ids.append(authorId)
                work_details = findWork(workId)
                result_dict.append(work_details)

    
    for publication in result_dict: 
        fields = []
        values = []
        update_fields = []
        
        for field, value in publication.items():
            if not isinstance(value, str):
                # Convert non-string values to JSON string
                value = json.dumps(value)
            value = value.replace("'", "''")
            fields.append(field)
            values.append(f"'{value}'")  # Enclose string values in single quotes
            update_fields.append(f"{field} = EXCLUDED.{field}")

        # Join the lists to create the SQL query
        columns = ', '.join(fields)
        columns += ", researcher_id"
        values_str = ', '.join(values)
        values_str += f", '{alexResearcher_ids[i]}'"
        update_str = ', '.join(update_fields)
        update_str += f", researcher_id = EXCLUDED.researcher_id"
        i += 1

        insert_query = f"INSERT INTO Publications ({columns}) VALUES({values_str}) ON CONFLICT (id) DO UPDATE SET {update_str};"

        execute_command(insert_query)

        




Daniel C Link done
Failed queries: set()
Daniel C Link: {'Daniel C Link': {'A5049284725': {'id': 'A5049284725', 'relevance_score': 29321.25, 'works_count': 799, 'cited_by_count': 35254, 'ids': {'openalex': 'https://openalex.org/A5049284725', 'orcid': 'https://orcid.org/0000-0002-3170-7581'}, 'affiliations': [{'institution': {'id': 'https://openalex.org/I204465549', 'ror': 'https://ror.org/01yc7t268', 'display_name': 'Washington University in St. Louis', 'country_code': 'US', 'type': 'education', 'lineage': ['https://openalex.org/I204465549']}, 'years': [2024, 2023, 2022, 2021, 2020, 2019, 2018, 2017, 2016, 2015]}, {'institution': {'id': 'https://openalex.org/I4210135078', 'ror': 'https://ror.org/036c27j91', 'display_name': 'Washington University Medical Center', 'country_code': 'US', 'type': 'healthcare', 'lineage': ['https://openalex.org/I4210135078']}, 'years': [2019, 1992, 1991]}, {'institution': {'id': 'https://openalex.org/I4210119077', 'ror': 'https://ror.org/02kb97560', 'display

In [19]:
for publication in result_dict: 
        fields = []
        values = []
        update_fields = []
        
        for field, value in publication.items():
            if not isinstance(value, str):
                # Convert non-string values to JSON string
                value = json.dumps(value)
            value = value.replace("'", "''")
            fields.append(field)
            values.append(f"'{value}'")  # Enclose string values in single quotes
            update_fields.append(f"{field} = EXCLUDED.{field}")

        # Join the lists to create the SQL query
        columns = ', '.join(fields)
        columns += ", researcher_id"
        values_str = ', '.join(values)
        values_str += f", '{alexResearcher_ids[i]}'"
        update_str = ', '.join(update_fields)
        update_str += f", researcher_id = EXCLUDED.researcher_id"
        i += 1

        insert_query = f"INSERT INTO Publications ({columns}) VALUES({values_str}) ON CONFLICT (id) DO UPDATE SET {update_str};"

        execute_command(insert_query)
