In [None]:
!pip install torch==1.12.0
!pip install torch-geometric==2.3.0
!pip install pyhealth==1.1.2
!pip install scikit-learn==1.2.1
!pip install openai==0.27.
!pip install numpy==1.22.0

In [1]:
import csv

condition_mapping_file = "../../resources/CCSCM.csv"
procedure_mapping_file = "../../resources/CCSPROC.csv"
drug_file = "../../resources/ATC.csv"


def _load_code_names(path, level=None):
    """Read a CSV with ``code``/``name`` columns into ``{code: lowercased name}``.

    When ``level`` is given, only rows whose ``level`` column equals that
    string are kept.
    """
    with open(path, newline='') as handle:
        return {
            record['code']: record['name'].lower()
            for record in csv.DictReader(handle)
            if level is None or record['level'] == level
        }


# Files are read in the same order as before: conditions, procedures, drugs.
condition_dict = _load_code_names(condition_mapping_file)
procedure_dict = _load_code_names(procedure_mapping_file)
# Only the rows marked level '3' of the drug file are kept.
drug_dict = _load_code_names(drug_file, level='3')

print(drug_dict)

{'A01A': 'stomatological preparations', 'A02A': 'antacids', 'A02B': 'drugs for peptic ulcer and gastro-oesophageal reflux disease (gord)', 'A02X': 'other drugs for acid related disorders in atc', 'A03A': 'drugs for functional gastrointestinal disorders', 'A03B': 'belladonna and derivatives containing drugs, plain for functional gastrointestinal disorders', 'A03C': 'antispasmodics in combination with psycholeptics', 'A03D': 'antispasmodics in combination with analgesics', 'A03E': 'antispasmodics and anticholinergics in combination with other drugs', 'A03F': 'propulsives', 'A04A': 'antiemetics and antinauseants', 'A05A': 'bile therapy drugs', 'A05B': 'liver therapy, lipotropics', 'A05C': 'drugs for bile therapy and lipotropics in combination', 'A06A': 'drugs for constipation', 'A07A': 'intestinal antiinfectives', 'A07B': 'intestinal adsorbents', 'A07C': 'electrolytes with carbohydrates, antidiarrheals, intestinal antiinflammatory/antiinfective agents', 'A07D': 'antipropulsives', 'A07E': 

In [2]:
import re 
from ChatGPT import ChatGPT
import json

def extract_data_in_brackets(input_string):
    """Return the text found between each ``[`` and the nearest following ``]``.

    Matching is non-greedy and never spans a newline. With nested brackets
    such as ``[[a], [b]]`` the first capture keeps its inner ``[`` (``"[a"``),
    so callers that want clean text strip brackets afterwards.
    """
    return [found.group(1) for found in re.finditer(r"\[(.*?)\]", input_string)]

def divide_text(long_text, max_len=800):
    """Split ``long_text`` into consecutive chunks of at most ``max_len`` characters.

    Args:
        long_text: The text to split.
        max_len: Maximum length of each chunk; must be a positive integer.

    Returns:
        The chunks in their original order. The last chunk may be shorter
        than ``max_len``; an empty input yields an empty list.

    Raises:
        ValueError: If ``max_len`` is not positive. A non-positive value
            would otherwise never advance through the text.
    """
    if max_len <= 0:
        raise ValueError(f"max_len must be a positive integer, got {max_len!r}")
    return [
        long_text[start:start + max_len]
        for start in range(0, len(long_text), max_len)
    ]

def filter_triples(triples):
    """Ask ChatGPT to shortlist ``triples`` and return the bracketed items of its reply.

    The prompt is sent through a new ``ChatGPT`` instance. The response is
    converted to a string, parsed as JSON, and its ``'content'`` field is
    scanned with ``extract_data_in_brackets``.
    """
    reply = ChatGPT().chat(
        f"""
            I have a list of triples. I want to select 50 most important triples from the list.
            The importance of a triple is based on how you think it will help imrpove healthcare prediction tasks (e.g., drug recommendation, mortality prediction, readmission prediction …).
            If you think a triple is important, please keep it. Otherwise, please remove it.
            You can also add triples from your background knowledge.
            The total size of the updated list should be below 50.

            triples: {triples}
            updates:
        """
        )
    payload = json.loads(str(reply))
    return extract_data_in_brackets(payload['content'])


In [3]:
from ChatGPT import ChatGPT
import json

def graph_gen(term: str, mode: str):
    """Generate relationship triples for a medical term via ChatGPT.

    Args:
        term: Name of the condition, procedure or drug to expand.
        mode: Which few-shot example to include in the prompt; one of
            ``"condition"``, ``"procedure"`` or ``"drug"``.

    Returns:
        A string with one triple per line, its elements separated by tabs.
        It is empty when the reply contains no bracketed items.

    Raises:
        ValueError: If ``mode`` is not one of the supported values. This is
            checked before any ChatGPT client is created.
    """
    if mode == "condition":
        example = \
        """
        Example:
        prompt: systemic lupus erythematosus
        updates: [[systemic lupus erythematosus, is an, autoimmune condition], [systemic lupus erythematosus, may cause, nephritis], [anti-nuclear antigen, is a test for, systemic lupus erythematosus], [systemic lupus erythematosus, is treated with, steroids], [methylprednisolone, is a, steroid]]
        """
    elif mode == "procedure":
        example = \
        """
        Example:
        prompt: endoscopy
        updates: [[endoscopy, is a, medical procedure], [endoscopy, used for, diagnosis], [endoscopic biopsy, is a type of, endoscopy], [endoscopic biopsy, can detect, ulcers]]
        """
    elif mode == "drug":
        example = \
        """
        Example:
        prompt: iobenzamic acid
        updates: [[iobenzamic acid, is a, drug], [iobenzamic acid, may have, side effects], [side effects, can include, nausea], [iobenzamic acid, used as, X-ray contrast agent], [iobenzamic acid, formula, C16H13I3N2O3]]
        """
    else:
        # Without this branch `example` would be unbound and the failure would
        # only surface later as an UnboundLocalError while building the prompt.
        raise ValueError(
            f"unsupported mode {mode!r}; expected 'condition', 'procedure' or 'drug'"
        )
    chatgpt = ChatGPT()
    response = chatgpt.chat(
        f"""
            Given a prompt (a medical condition/procedure/drug), extrapolate as many relationships as possible of it and provide a list of updates.
            The relationships should be helpful for healthcare prediction (e.g., drug recommendation, mortality prediction, readmission prediction …)
            Each update should be exactly in format of [ENTITY 1, RELATIONSHIP, ENTITY 2]. The relationship is directed, so the order matters.
            Both ENTITY 1 and ENTITY 2 should be noun.
            Any element in [ENTITY 1, RELATIONSHIP, ENTITY 2] should be conclusive, make it as short as possible.
            Do this in both breadth and depth. Expand [ENTITY 1, RELATIONSHIP, ENTITY 2] until the size reaches 100.

            {example}

            prompt: {term}
            updates:
        """
        )
    # The response is stringified and parsed as JSON; the reply text is read
    # from its 'content' field.
    json_string = str(response)
    json_data = json.loads(json_string)

    triples = extract_data_in_brackets(json_data['content'])
    # Strip leftover brackets (nested lists leave a leading '[') and turn the
    # ", " separators into tabs; join once instead of growing a string in a loop.
    return "".join(
        triple.replace('[', '').replace(']', '').replace(', ', '\t') + '\n'
        for triple in triples
    )

In [4]:
from ChatGPT import ChatGPT
import json

def graph_gen_note(note_str: str):
    """Extract relationship triples from a clinical note via ChatGPT.

    The note is cut into pieces with ``divide_text``. Each piece is sent in
    its own prompt through a new ``ChatGPT`` instance, and the bracketed items
    of every reply are collected in order.

    Returns:
        A string with one triple per line, its elements separated by tabs.
        It is empty when nothing was extracted.
    """
    triples_tot = []
    for note in divide_text(note_str):
        reply = ChatGPT().chat(
            f"""
                Given a prompt (a clinical note from MIMIC-III dataset), extrapolate relationships of it and provide a list of updates.
                The relationships should be helpful for healthcare prediction (e.g., drug recommendation, mortality prediction, readmission prediction …)
                Each update should be exactly in format of [ENTITY 1, RELATIONSHIP, ENTITY 2]. The relationship is directed, so the order matters.
                Both ENTITY 1 and ENTITY 2 should be noun.
                Any element in [ENTITY 1, RELATIONSHIP, ENTITY 2] should be conclusive, make it as short as possible.
                Do this in both breadth and depth. Expand [ENTITY 1, RELATIONSHIP, ENTITY 2].

                prompt: {note}
                updates:
            """
            )
        # The reply text sits in the 'content' field of the JSON-encoded response.
        content = json.loads(str(reply))['content']
        triples_tot.extend(extract_data_in_brackets(content))

    # Drop leftover brackets and turn ", " separators into tabs, one triple per line.
    formatted = (
        item.replace('[', '').replace(']', '').replace(', ', '\t') + '\n'
        for item in triples_tot
    )
    return "".join(formatted)

In [7]:
import json

# Load the per-subject clinical notes. The encoding is stated explicitly so
# decoding does not depend on the platform default, matching the utf-8 used
# for every other text file in this notebook.
# NOTE(review): presumably maps subject ids to note text - confirm against the file.
with open('../../clinical_notes/subject_text_dict.json', 'r', encoding='utf-8') as f:
    subject_text_dict = json.load(f)

FileNotFoundError: [Errno 2] No such file or directory: '../../clinical_notes/subject_text_dict.json'

In [8]:
from tqdm import tqdm
import os

# Write one triple file per subject, skipping subjects that already have one
# so an interrupted run can be resumed without repeating API calls.
for key in tqdm(subject_text_dict.keys()):
    file = f'../../graphs/notes/{key}.txt'
    if os.path.exists(file):
        continue

    notes = subject_text_dict[key]
    outstr = graph_gen_note(notes)
    # The context manager closes the handle after each write, so every file
    # (including the last one of the loop) is flushed to disk.
    with open(file=file, mode='w', encoding='utf-8') as outfile:
        outfile.write(outstr)

NameError: name 'subject_text_dict' is not defined

In [6]:
from tqdm import tqdm
import os

# Build (or top up) the triple file of every condition code.
for key in tqdm(condition_dict.keys()):
    file = f'../../graphs/condition/CCSCM/{key}.txt'
    if os.path.exists(file):
        with open(file=file, mode="r", encoding='utf-8') as f:
            prev_triples = f.read()
        # A file that already holds 100 or more lines is left untouched.
        if len(prev_triples.split('\n')) >= 100:
            continue
    else:
        prev_triples = ""

    # Newly generated triples are appended after whatever was already there.
    outstr = prev_triples + graph_gen(term=condition_dict[key], mode="condition")
    # The context manager closes the handle after each write, so every file
    # (including the last one of the loop) is flushed to disk.
    with open(file=file, mode='w', encoding='utf-8') as outfile:
        outfile.write(outstr)

  0%|          | 0/285 [00:00<?, ?it/s]

In [12]:
from tqdm import tqdm
import os

# Build (or top up) the triple file of every procedure code.
for key in tqdm(procedure_dict.keys()):
    file = f'../../graphs/procedure/CCSPROC/{key}.txt'
    if os.path.exists(file):
        with open(file=file, mode="r", encoding='utf-8') as f:
            prev_triples = f.read()
        # A file that already holds 150 or more lines is left untouched.
        if len(prev_triples.split('\n')) >= 150:
            continue
    else:
        prev_triples = ""

    # Newly generated triples are appended after whatever was already there.
    outstr = prev_triples + graph_gen(term=procedure_dict[key], mode="procedure")
    # The context manager closes the handle after each write, so every file
    # (including the last one of the loop) is flushed to disk.
    with open(file=file, mode='w', encoding='utf-8') as outfile:
        outfile.write(outstr)

100%|██████████| 231/231 [3:12:27<00:00, 49.99s/it]   


In [None]:
import importlib
import openai

# Re-import the openai module first. Reloading re-executes the module's
# top-level code, which can overwrite attributes assigned beforehand (such as
# api_key), so the key is set only after the reload.
importlib.reload(openai)

# A forward slash avoids the invalid "\o" escape sequence and resolves on
# every platform, including Windows.
with open("resources/openai.key", 'r') as f:
    key = f.readline().strip()
openai.api_key = key

# Confirm the key was read without echoing the secret into the saved output.
print(f"OpenAI API key loaded ({len(key)} characters)")

In [7]:
from tqdm import tqdm
import os

# Write one triple file per drug code, skipping codes that already have one
# so an interrupted run can be resumed without repeating API calls.
for key in tqdm(drug_dict.keys()):
    file = f'../../graphs/drug/ATC3/{key}.txt'
    if os.path.exists(file):
        continue

    outstr = graph_gen(term=drug_dict[key], mode="drug")
    # The context manager closes the handle after each write, so every file
    # (including the last one of the loop) is flushed to disk.
    with open(file=file, mode='w', encoding='utf-8') as outfile:
        outfile.write(outstr)

  0%|          | 0/269 [00:00<?, ?it/s]

100%|██████████| 269/269 [11:27<00:00,  2.56s/it]


In [1]:
import requests
import json

# Set the API endpoint
url = "https://api.openai.com/v1/embeddings"

# Set the headers.
# NOTE(review): "theapikey" is a placeholder; supply the real key at run time
# and keep it out of the saved notebook.
headers = {
    "Content-Type": "application/json",
    "Authorization": "Bearer theapikey"
}

# Set the request data
data = {
    "input": "multiple myeloma is a cancer",
    "model": "text-embedding-ada-002"
}

# Convert the data to JSON format
json_data = json.dumps(data)

# Send the POST request to the API endpoint. The timeout (in seconds) keeps
# the cell from blocking indefinitely if the server never answers.
response = requests.post(url, headers=headers, data=json_data, timeout=30)

# Print the response content
print(response.json())


{'object': 'list', 'data': [{'object': 'embedding', 'index': 0, 'embedding': [-0.019362058, -0.0075340266, -0.016889753, -0.040025327, -0.031151053, 0.033597335, -0.014599616, -0.01603095, -0.026857048, 0.011613331, 0.013610694, 0.027533678, -0.0003102581, 0.008210657, -0.028834892, -0.0051820828, 0.040441718, 0.00865307, 0.011164412, -0.003363637, -0.021535084, 0.015367332, -9.5913674e-05, -0.022836298, 0.02046809, 0.01437841, -0.00062092283, -0.032608412, 0.003851592, -0.004528223, 0.012895026, -0.010852121, -0.01191261, -0.022745213, -0.006629683, 0.0017956747, 0.019791458, -0.0009368738, 0.0032237566, -0.010253563, 0.0036173738, -0.0014923293, -0.0013589549, -0.012511169, -0.03021418, -0.00047331644, 0.00012259872, -0.013688766, -0.01901073, -0.002774838, 0.046479348, -0.0016183844, 0.0027309218, 0.009707053, -0.023682086, -0.015744684, 0.009908741, 0.023174614, -0.00051113294, -0.0008612407, 0.010058381, -0.009316689, -0.016356254, -0.0033002028, -0.0046681035, -0.0017566383, 0.00