In [13]:
PROJECT_ID = "refsides"
REGION = "us-central1"
ACCESS_TOKEN='ya29.a0Ad52N38wju22GOuDprnLu9IkW87bjgrSft2ml1sXfk68p29ttl8r4IPk_x46NsAG5E8GPx0hLkRigWNmI-ybN-8fGAgMqfnXeOVV-VRZdz5zMe66fl4isRuw7VOcM-HMO_Nq_yZPE6WPOMQ-xWRLdAz9X5YVOt4je5F3KAfrBAaCgYKASwSARMSFQHGX2MioE-hpcWsIo7RwArOcd3MXg0177'

In [14]:
import polars as pl
import json
import pandas as pd
import concurrent
from tqdm import tqdm
import numpy as np
from sklearn.linear_model import LogisticRegression


# import
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings
)
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import CharacterTextSplitter
from openai import OpenAI, AzureOpenAI
from langchain.chat_models import ChatOpenAI
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser
from typing import List, Optional

from vertexai.language_models import TextEmbeddingInput, TextEmbeddingModel
from google.cloud import aiplatform

## Set Up RAG and OpenAI connection

In [15]:
config = json.load(open('./config.json'))

organization = ""

api_source = 'OpenAI'

api_key = config[api_source]['openai_api_key'] #constants.AZURE_OPENAI_KEY
api_endpoint = config[api_source]['openai_api_endpoint']  

In [16]:
## saving embeddings
# text_splitter = CharacterTextSplitter()
# chunks = text_splitter.split_documents(documents)
# db = Chroma.from_documents(chunks, OpenAIEmbeddings(openai_api_key=api_key), persist_directory="../Chroma")
# db.persist()

## loading them
db = Chroma(persist_directory="../Chroma", embedding_function=OpenAIEmbeddings(openai_api_key=api_key))

In [17]:
retriever = db.as_retriever()

In [18]:
# no context
client = OpenAI(api_key=api_key)
chat_completion = client.chat.completions.create(
    messages=[
        {"role": "system", "content": """As a professional summarizer, create a concise and comprehensive summary
        of the known knowledge for the causal assocation between a drug and an adverse event. 
        Craft a summary that is detailed, thorough, in-depth, and complex, while maintaining clarity and 
        conciseness.
        Incorporate the additional context provided, main ideas and essential information, 
        eliminating extraneous language and focusing on critical aspects."""},
        {
            "role": "user",
            "content": """
            Summarize the evidence, if any, for the association between the use of Lisinopril and Acute Kidney Injury?"""
        }
    ],
    model='gpt-4-1106-preview',
    temperature=0,
)
term = chat_completion.choices[0].message.content
term

'The association between Lisinopril, an angiotensin-converting enzyme (ACE) inhibitor, and acute kidney injury (AKI) is a recognized clinical concern, supported by both mechanistic insights and epidemiological data.\n\nMechanistically, Lisinopril reduces the production of angiotensin II, a potent vasoconstrictor, leading to dilation of the efferent arterioles in the kidneys. While this can be beneficial in lowering blood pressure and decreasing proteinuria, it can also reduce the glomerular filtration rate (GFR). In certain circumstances, such as in patients with pre-existing renal impairment, renal artery stenosis, or volume depletion, the reduction in GFR can be significant enough to cause AKI.\n\nEpidemiological evidence for the association comes from observational studies, case reports, and clinical trials. These studies have documented instances of AKI in patients taking Lisinopril, particularly in high-risk groups such as the elderly, those with pre-existing kidney disease, conge

## Get summary including additional informatoin

In [11]:
from langchain.prompts import ChatPromptTemplate

template = """
You are an expert in summarizing current information. Create a concise and comprehensive summary
of the known knowledge for the causal assocation between a drug and an adverse event. 
Craft a summary that is detailed, thorough, in-depth, and complex, while maintaining clarity and 
conciseness. Incorporate the retrieved context, and previous medical literature.
If you don't know the answer just say I don't know.
Keep your summarize to less than 3 sentences.
Question: {question} 
Context: {context} 
Answer:
"""
prompt = ChatPromptTemplate.from_template(template)

print(prompt)

input_variables=['context', 'question'] messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template="\nYou are an expert in summarizing current information. Create a concise and comprehensive summary\nof the known knowledge for the causal assocation between a drug and an adverse event. \nCraft a summary that is detailed, thorough, in-depth, and complex, while maintaining clarity and \nconciseness. Incorporate the retrieved context, and previous medical literature.\nIf you don't know the answer just say I don't know.\nKeep your summarize to less than 3 sentences.\nQuestion: {question} \nContext: {context} \nAnswer:\n"))]


In [12]:
llm = ChatOpenAI(api_key=api_key, model_name="gpt-4-1106-preview", temperature=0)

rag_chain = (
    {"context": retriever,  "question": RunnablePassthrough()} 
    | prompt 
    | llm
    | StrOutputParser() 
)

query = "What is the evidence, if any, supporting the causal association between lisinopril and acute kidney injury?"
rag_chain.invoke(query)

  warn_deprecated(


'Evidence suggests that lisinopril, an ACE inhibitor, can cause acute kidney injury, particularly in patients with risk factors such as heart failure with low systolic blood pressure, ischemic heart disease, cerebrovascular disease, high dose diuretic therapy, renal dialysis, or severe volume depletion. This association is supported by observations of symptomatic hypotension leading to oliguria, progressive azotemia, and increased serum creatinine levels, which are sometimes reversible upon discontinuation of the drug.'

In [49]:
query = "What is the evidence, if any, supporting the causal association between adenosine and acute kidney injury?"
rag_chain.invoke(query)

'There is no direct evidence provided in the provided documents linking adenosine to acute kidney injury. The adverse reactions listed for adenosine in controlled U.S. clinical trials and post-marketing experience primarily involve cardiovascular, respiratory, central nervous system, and gastrointestinal effects, with no mention of renal or kidney-related adverse events. Therefore, based on the available information, a causal association between adenosine and acute kidney injury cannot be established.'

## Apply to drugs in reference set

In [23]:
reference_set = pd.read_csv('reference_set_labels.csv')
reference_set.columns

Index(['cohort_id', 'condition_name', 'drug_concept_id', 'drug_name', 'affect',
       'set_id', 'label_id', 'spl_version', 'title', 'sections'],
      dtype='object')

In [91]:
def run_iteration(row):
    try:
        query = f"""What is the evidence, if any, supporting the causal association between 
        {row.drug_name} and {row.condition_name}?"""
        summary = rag_chain.invoke(query)
        return [row.condition_name, row.drug_name, row.affect, summary]
    except Exception as err:
        print(f"Encountered an exception for row: {row.drug_name} {row.condition_name}. Error message below:")
        print(err)
        return None

In [None]:
rows_to_run = [row for _,row in reference_set[['condition_name', 'drug_name', 'affect']].drop_duplicates().iterrows()]
results = list()

In [92]:
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as exec:
    results.extend(list(tqdm(
        exec.map(run_iteration, rows_to_run), 
        total=len(rows_to_run)
    )))

summary_evidence = pd.DataFrame(
    [r for r in results if r is not None],
    columns=['condition_name', 'drug_name', 'affect', 'summary']
)

100%|████████████████████████████████████████████| 344/344 [03:30<00:00,  1.63it/s]


In [94]:
summary_evidence.head(20)

Unnamed: 0,condition_name,drug_name,affect,summary
0,Acute liver injury,lamotrigine,1,The provided documents do not contain direct e...
1,Acute liver injury,nefazodone,1,Evidence suggests a causal association between...
2,Acute liver injury,tolcapone,1,The evidence supporting the causal association...
3,Acute liver injury,Neostigmine,0,Current evidence does not directly link neosti...
4,Acute liver injury,Nortriptyline,1,I don't know.
5,Acute liver injury,Interferon beta-1a,1,Evidence from postmarketing reports indicates ...
6,Acute liver injury,Phentermine,0,I don't know.
7,Acute liver injury,Disulfiram,1,I don't know.
8,Acute liver injury,Droperidol,0,The provided documents do not contain direct e...
9,Acute liver injury,Carbamazepine,1,Carbamazepine has been associated with hepatic...


In [95]:
summary_evidence.to_csv('baselineprompt_rag_3sent_summaries.csv')

## Embed the summaries

In [107]:
from sentence_transformers import SentenceTransformer

In [108]:
embed_model_name = 'llmrails/ember-v1'
embed_model = SentenceTransformer(embed_model_name)
man_embeds = embed_model.encode(summary_evidence['summary'].tolist())
summary_evidence['embeds'] = list(man_embeds)

  return torch._C._cuda_getDeviceCount() > 0
  warn("The installed version of bitsandbytes was compiled without GPU support. "


/home/uog2000/miniconda3/envs/llm_cpus/lib/python3.11/site-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cadam32bit_grad_fp32


In [121]:
summary_evidence.head()

Unnamed: 0,condition_name,drug_name,affect,summary,embeds
0,Acute liver injury,lamotrigine,1,The provided documents do not contain direct e...,"[0.15283781, -0.05386316, -0.19746718, 0.63484..."
1,Acute liver injury,nefazodone,1,Evidence suggests a causal association between...,"[-0.06405382, -0.30085707, 0.7590669, 0.546655..."
2,Acute liver injury,tolcapone,1,The evidence supporting the causal association...,"[-0.13962302, -0.36845908, 0.3567645, 0.789921..."
3,Acute liver injury,Neostigmine,0,Current evidence does not directly link neosti...,"[-0.011001076, 0.02761679, -0.059737656, 0.866..."
4,Acute liver injury,Nortriptyline,1,I don't know.,"[-0.0059796553, -0.3500437, -0.94098, -0.17054..."


In [136]:
summary_evidence.to_csv('baselineprompt_rag_3sent_summaries_embeds.csv')

In [142]:
embed_col_names = [f'embed_{i}' for i in range(1024)]

In [143]:
summary_evidence[embed_col_names] = pd.DataFrame(summary_evidence.embeds.tolist(), index= summary_evidence.index)

  summary_evidence[embed_col_names] = pd.DataFrame(summary_evidence.embeds.tolist(), index= summary_evidence.index)
  summary_evidence[embed_col_names] = pd.DataFrame(summary_evidence.embeds.tolist(), index= summary_evidence.index)
  summary_evidence[embed_col_names] = pd.DataFrame(summary_evidence.embeds.tolist(), index= summary_evidence.index)
  summary_evidence[embed_col_names] = pd.DataFrame(summary_evidence.embeds.tolist(), index= summary_evidence.index)
  summary_evidence[embed_col_names] = pd.DataFrame(summary_evidence.embeds.tolist(), index= summary_evidence.index)
  summary_evidence[embed_col_names] = pd.DataFrame(summary_evidence.embeds.tolist(), index= summary_evidence.index)
  summary_evidence[embed_col_names] = pd.DataFrame(summary_evidence.embeds.tolist(), index= summary_evidence.index)
  summary_evidence[embed_col_names] = pd.DataFrame(summary_evidence.embeds.tolist(), index= summary_evidence.index)
  summary_evidence[embed_col_names] = pd.DataFrame(summary_evidence.embe

  summary_evidence[embed_col_names] = pd.DataFrame(summary_evidence.embeds.tolist(), index= summary_evidence.index)
  summary_evidence[embed_col_names] = pd.DataFrame(summary_evidence.embeds.tolist(), index= summary_evidence.index)
  summary_evidence[embed_col_names] = pd.DataFrame(summary_evidence.embeds.tolist(), index= summary_evidence.index)
  summary_evidence[embed_col_names] = pd.DataFrame(summary_evidence.embeds.tolist(), index= summary_evidence.index)
  summary_evidence[embed_col_names] = pd.DataFrame(summary_evidence.embeds.tolist(), index= summary_evidence.index)
  summary_evidence[embed_col_names] = pd.DataFrame(summary_evidence.embeds.tolist(), index= summary_evidence.index)
  summary_evidence[embed_col_names] = pd.DataFrame(summary_evidence.embeds.tolist(), index= summary_evidence.index)
  summary_evidence[embed_col_names] = pd.DataFrame(summary_evidence.embeds.tolist(), index= summary_evidence.index)
  summary_evidence[embed_col_names] = pd.DataFrame(summary_evidence.embe

In [145]:
summary_evidence.to_csv('baselineprompt_rag_3sent_summaries_embedsspread.csv')

## Logistic Regression Classifier

In [148]:
!pip install pyspark

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting py4j==0.10.9.7 (from pyspark)
  Downloading py4j-0.10.9.7-py2.py3-none-any.whl.metadata (1.5 kB)
Downloading py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.5/200.5 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25ldone
[?25h  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=91a0841b78755f066aa33842a53b02e0331dacfa893870d4b9591091503b4ae5
  Stored in directory: /home/uog2000/.cache/pip/wheels/95/13/41/f7f135ee114175605fb4f0a89e7389f3742aa6c1e1a5bcb657
Successfully built pyspark
Installing collected packages: py4j,

In [149]:
y_actual = summary_evidence.affect
X = summary_evidence[embed_col_names]
X.shape

(344, 1024)

In [157]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y_actual, test_size=0.25, random_state=100)

In [158]:
# instantiate the model (using the default parameters)
logreg = LogisticRegression(random_state=100)

# fit the model with data
logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_test)

In [159]:
from sklearn import metrics

cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
cnf_matrix

array([[43,  6],
       [ 6, 31]])

In [160]:
from sklearn.metrics import classification_report
target_names = ['negative control', 'positive control']
print(classification_report(y_test, y_pred, target_names=target_names))


                  precision    recall  f1-score   support

negative control       0.88      0.88      0.88        49
positive control       0.84      0.84      0.84        37

        accuracy                           0.86        86
       macro avg       0.86      0.86      0.86        86
    weighted avg       0.86      0.86      0.86        86

