In [100]:
import dimcli
from dimcli.utils import *
import re
import mysql.connector

import os, sys, time, json
from tqdm.notebook import tqdm as progressbar

import pandas as pd

import plotly.express as px
from plotly.offline import plot
if not 'google.colab' in sys.modules:
  # make js dependecies local / needed by html exports
    from plotly.offline import init_notebook_mode
    init_notebook_mode(connected=True)

print("==\nLogging in..")
# https://digital-science.github.io/dimcli/getting-started.html#authentication
ENDPOINT = "https://app.dimensions.ai"
if 'google.colab' in sys.modules:
    import getpass
    KEY = getpass.getpass(prompt='API Key: ')  
    dimcli.login(key=KEY, endpoint=ENDPOINT)
else:
    KEY = "9F8D648F0D7E437CB1736BEBDF007F02"
    dimcli.login(key=KEY, endpoint=ENDPOINT)
dsl = dimcli.Dsl()

==
Logging in..
[2mDimcli - Dimensions API Client (v1.1)[0m
[2mConnected to: <https://app.dimensions.ai/api/dsl> - DSL v2.8[0m
[2mMethod: manual login[0m


In [68]:
# Specify the path to your JSON file
id_path = 'asci_aap_dataJSON.json'

# Initialize an empty dictionary
id_dictionary = {}

try:
    # Open the JSON file in read mode
    with open(id_path, 'r') as json_file:
        # Load the JSON data into the dictionary
        id_dictionary = json.load(json_file)
except FileNotFoundError:
    print("JSON file not found.")
except json.JSONDecodeError:
    print("Error decoding JSON data.")

In [60]:
q_template = """search researchers for "{}" where obsolete="0" return researchers[basics + extras]"""

query_work_template = """search patents where researchers = "{}" return patents[basics + extras]"""
authors = {}
patents = {}


for author in id_dictionary["people"][0:100]:
    # Get the person's first and last name
    first_name = author["first_name"].strip(".")
    last_name = author["last_name"].strip(".")
    
    # Create a query name by combining the person's first and last name
    author_name = first_name + " " + last_name
    authors[author_name] = {}
    
    print(f"querying for {author_name}")
    q = q_template.format(author_name)
    result = dsl.query_iterative(q)
    result_dict = result.data["researchers"]
    
    for unique in result_dict: 
        authors[author_name][unique["id"]] = unique
        print(f"querying for {unique['id']}'s works'")
        
        authors[author_name]["patents"] = []
        query_works = query_work_template.format(unique["id"])
        res = dsl.query_iterative(query_works)
        
        
        res_dict = res.data["patents"]
        
        if res_dict:
            for patent in res_dict:
                authors[author_name]["patents"].append(patent["id"])
                patents[patent["id"]] = patent


Starting iteration with limit=1000 skip=0 ...[0m


querying for Kjersti Aagaard


0-2 / 2 (0.95s)[0m
===
Records extracted: 2[0m
Starting iteration with limit=1000 skip=0 ...[0m


querying for ur.015111712357.02's works'


===
Records extracted: 0[0m
Starting iteration with limit=1000 skip=0 ...[0m


querying for ur.01331772327.01's works'


===
Records extracted: 0[0m
Starting iteration with limit=1000 skip=0 ...[0m


querying for Derek Abbott


0-12 / 12 (1.73s)[0m
===
Records extracted: 12[0m
Starting iteration with limit=1000 skip=0 ...[0m


querying for ur.0744074052.68's works'


===
Records extracted: 0[0m
Starting iteration with limit=1000 skip=0 ...[0m


querying for ur.0737254642.04's works'


===
Records extracted: 0[0m
Starting iteration with limit=1000 skip=0 ...[0m


querying for ur.0722255331.20's works'


0-2 / 2 (0.96s)[0m
===
Records extracted: 2[0m
Starting iteration with limit=1000 skip=0 ...[0m


querying for ur.0715354260.11's works'


KeyboardInterrupt: 

In [121]:
def execute_command(query):
    try:
        connection = mysql.connector.connect(host='sql.mit.edu',
                                            database="mit-ps+DimensionsPatents",
                                            user='mit-ps',
                                            passwd='cut18vuk',
                                            charset='utf8')

        # Check if the connection is established
        if connection.is_connected():
            # print("Connection to database established")

            # Create a cursor and execute the insert statement
            cursor = connection.cursor()
            # print('This is current command: ',query)
            cursor.execute(query)
            connection.commit()  # Commit the transaction
            # print("Data inserted successfully")

    except mysql.connector.Error as error:
        print(f"Error: {query}")
        print("------")
        print(error)

    finally:
        # Close the connection
        if connection.is_connected():
            cursor.close()
            connection.close()
            # print("MySQL connection is closed")
            
def clean_string(value):
    if isinstance(value, str):
        # Define a regular expression pattern for invalid characters
        invalid_chars_pattern = re.compile(r'[^\w\s\-\.]')

        # Replace invalid characters with an empty string
        cleaned_value = re.sub(invalid_chars_pattern, '', value)
        return cleaned_value
    else:
        return value

def clean_dict(input_dict):
    return {key: clean_string(value) for key, value in input_dict.items()}

def clean_list(input_list):
    return [clean_string(value) for value in input_list]

In [144]:
query_work2_template = """search patents for "{}" return patents[basics + extras]"""

for author in id_dictionary["people"][0:100]:

    
    #Get the person's first and last name
    first_name = author["first_name"].strip(".")
    middle_name = author["middle_name"].strip(".")
    last_name = author["last_name"].strip(".")
    

    #Create a query name by combining the person's first and last name
    
    if author["middle_name"]: 
        author_name = first_name + " " + middle_name + " " + last_name
    else: 
        author_name = first_name + " " + last_name
    
    cleaned_name = re.sub(r'[^a-zA-Z0-9_]', '_', author_name)
    cleaned_name = re.sub(r'^\d', '_', cleaned_name)
    
    execute_command(f"""CREATE TABLE {cleaned_name}(
    id VARCHAR(255) PRIMARY KEY,
    abstract TEXT,
    additional_filters TEXT,
    application_number VARCHAR(255),
    assignee_cities TEXT,
    assignee_countries TEXT,
    assignee_names TEXT,
    assignee_state_codes TEXT,
    assignees TEXT,
    associated_grant_ids TEXT,
    category_bra TEXT,
    category_for TEXT,
    category_for_2020 TEXT,
    category_hra TEXT,
    category_hrcs_hc TEXT,
    category_hrcs_rac TEXT,
    category_icrp_cso TEXT,
    category_icrp_ct TEXT,
    category_rcdc TEXT,
    claims_amount INT,
    cpc VARCHAR(255),
    current_assignee_names TEXT,
    current_assignees TEXT,
    date TEXT,
    date_inserted TEXT,
    dimensions_url VARCHAR(255),
    expiration_date TEXT,
    family_count INT,
    family_id INT,
    federal_support VARCHAR(255),
    filing_date DATE,
    filing_status VARCHAR(255),
    funder_countries TEXT,
    funders TEXT,
    granted_date TEXT,
    granted_year INT,
    inventor_names TEXT,
    inventors TEXT,
    ipcr VARCHAR(255),
    jurisdiction VARCHAR(255),
    kind VARCHAR(255),
    legal_status VARCHAR(255),
    orange_book TEXT,
    original_assignee_names TEXT,
    original_assignees TEXT,
    priority_date TEXT,
    priority_year INT,
    publication_date TEXT,
    publication_ids TEXT,
    publication_year INT,
    publications TEXT,
    reference_ids TEXT,
    researchers TEXT,
    score FLOAT,
    times_cited INT,
    title TEXT,
    year INT
);""")
            
    
    
    print(f"querying for {author_name}")
    q = query_work2_template.format(author_name)
    result = dsl.query_iterative(q)
    result_dict = result.data["patents"]
    
    for patent in result_dict: 
        fields = []
        values = []

        for field, value in patent.items():
            if not isinstance(value, str):
                # Convert non-string values to JSON string
                value = json.dumps(value)
            value = clean_string(value)
            fields.append(field)
            values.append(f"'{value}'")  # Enclose string values in single quotes

        # Join the lists to create the SQL query
        columns = ', '.join(fields)
        values_str = ', '.join(values)

        insert_query = insert_query = f"INSERT INTO `{cleaned_name}` ({columns}) VALUES ({values_str});"

        execute_command(insert_query)




Starting iteration with limit=1000 skip=0 ...[0m


querying for Kjersti M Aagaard


0-7 / 7 (0.57s)[0m
===
Records extracted: 7[0m
Starting iteration with limit=1000 skip=0 ...[0m


querying for Derek W Abbott


0-822 / 822 (1.10s)[0m
===
Records extracted: 822[0m
Starting iteration with limit=1000 skip=0 ...[0m


querying for Francois Abboud


0-31 / 31 (0.52s)[0m
===
Records extracted: 31[0m
Starting iteration with limit=1000 skip=0 ...[0m


querying for Hanna E Abboud


0-76 / 76 (1.71s)[0m
===
Records extracted: 76[0m
Starting iteration with limit=1000 skip=0 ...[0m


querying for Omar Abdel-Wahab


0-59 / 59 (0.40s)[0m
===
Records extracted: 59[0m
Starting iteration with limit=1000 skip=0 ...[0m


querying for Sarki A Abdulkadir


0-20 / 20 (0.54s)[0m
===
Records extracted: 20[0m
Starting iteration with limit=1000 skip=0 ...[0m


querying for E Dale Abel


0-651 / 651 (0.77s)[0m
===
Records extracted: 651[0m
Starting iteration with limit=1000 skip=0 ...[0m


querying for Evan Dale Abel


0-281 / 281 (0.85s)[0m
===
Records extracted: 281[0m
Starting iteration with limit=1000 skip=0 ...[0m


querying for John Abel


0-1000 / 8803 (1.27s)[0m
1000-2000 / 8803 (1.26s)[0m
2000-3000 / 8803 (4.58s)[0m
3000-4000 / 8803 (1.22s)[0m
4000-5000 / 8803 (3.60s)[0m
5000-6000 / 8803 (5.69s)[0m
6000-7000 / 8803 (1.32s)[0m
7000-8000 / 8803 (3.32s)[0m
8000-8803 / 8803 (1.32s)[0m
===
Records extracted: 8803[0m
Starting iteration with limit=1000 skip=0 ...[0m


querying for Walter Abelmann


0-20 / 20 (0.51s)[0m
===
Records extracted: 20[0m
Starting iteration with limit=1000 skip=0 ...[0m


querying for Amy P Abernethy


0-9 / 9 (0.61s)[0m
===
Records extracted: 9[0m
Starting iteration with limit=1000 skip=0 ...[0m


querying for J Abildskov


0-75 / 75 (0.45s)[0m
===
Records extracted: 75[0m
Starting iteration with limit=1000 skip=0 ...[0m


querying for Janis Abkowitz


0-2 / 2 (0.40s)[0m
===
Records extracted: 2[0m
Starting iteration with limit=1000 skip=0 ...[0m


querying for Abdul B Abou-Samra


0-16 / 16 (0.74s)[0m
===
Records extracted: 16[0m
Starting iteration with limit=1000 skip=0 ...[0m


querying for Clara Abraham


0-1000 / 2145 (3.82s)[0m
1000-2000 / 2145 (1.17s)[0m
2000-2145 / 2145 (2.06s)[0m
===
Records extracted: 2145[0m
Starting iteration with limit=1000 skip=0 ...[0m


querying for Edward Abraham


0-1000 / 5550 (2.30s)[0m
1000-2000 / 5550 (1.17s)[0m
2000-3000 / 5550 (4.48s)[0m
3000-4000 / 5550 (1.38s)[0m
4000-5000 / 5550 (3.22s)[0m
5000-5550 / 5550 (5.74s)[0m
===
Records extracted: 5550[0m

"is" with a literal. Did you mean "=="?


"is" with a literal. Did you mean "=="?


"is" with a literal. Did you mean "=="?


"is" with a literal. Did you mean "=="?


"is" with a literal. Did you mean "=="?



UnboundLocalError: local variable 'connection' referenced before assignment

In [None]:
with open("patents_inventors.json", "w") as f:
    json.dump(authors, f, indent=4)

with open("patents.json", "w") as f:
    json.dump(patents, f, indent=4)


In [141]:
execute_command("DROP TABLE E_Dale_Abel")