## Installing Dependencies

In [6]:
! pip install langchain openai tiktoken docx2txt chromadb python-docx

Collecting langchain
  Downloading langchain-0.0.240-py3-none-any.whl (1.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting openai
  Downloading openai-0.27.8-py3-none-any.whl (73 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.6/73.6 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tiktoken
  Downloading tiktoken-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m70.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docx2txt
  Downloading docx2txt-0.8.tar.gz (2.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting chromadb
  Downloading chromadb-0.4.2-py3-none-any.whl (399 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m399.3/399.3 kB[0m [31m39.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting python-docx
  Downloading py

## Importing Dependencies and Setting Relevant Environment Variables

In [7]:
import getpass
import os, docx

import numpy as np
import pandas as pd

import langchain
from chromadb.config import Settings
from langchain import llm_cache
from langchain.cache import InMemoryCache
from langchain.chains import RetrievalQA
from langchain.chains.summarize import load_summarize_chain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import Docx2txtLoader, CSVLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

# Cache LLM calls (faster when repeating queries and prompts).
# The cache is installed by assigning the `langchain.llm_cache` module attribute;
# rebinding a name obtained via `from langchain import llm_cache` only changes the
# notebook-local variable and leaves the library without a cache.
langchain.llm_cache = InMemoryCache()
llm_cache = langchain.llm_cache  # keep the notebook-level name pointing at the same cache

# Never commit credentials to a notebook. Reuse a key already present in the
# environment, otherwise prompt for it without echoing. Any key that was ever
# stored in this file must be treated as compromised and revoked.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

os.environ["PRETRAINED_SUMMARY_MODEL_NAME"]='gpt-4'
# NOTE(review): OpenAI has retired text-davinci-003 — confirm the retrieval model is
# still served before re-running the QA cells.
os.environ["PRETRAINED_RETRIEVAL_MODEL_NAME"]='text-davinci-003'
os.environ["PRETRAINED_EMBEDDINGS_MODEL"] = "text-embedding-ada-002"
os.environ["VECTOR_STORE_FOLDER"] = "vectordb"

## Define Unstructured Clinical Texts and Convert to LangChain Document

In [1]:
import json

# Location of the JSON file holding the full trial records.
data_path = "/content/All_67_Trials.json"

# JSON text is UTF-8; pass the encoding explicitly so decoding does not depend on
# the platform's default locale encoding.
with open(data_path, encoding="utf-8") as f:
    data = json.load(f)
data

{'FullStudiesResponse': {'APIVrs': '1.01.05',
  'DataVrs': '2023:07:19 23:55:51.044',
  'Expression': 'NCT03394027 OR NCT04200482 OR NCT03384316 OR NCT02186015 OR NCT02957968 OR NCT02179515 OR NCT02111850 OR NCT02897375 OR NCT03934905 OR NCT03317405 OR NCT00127205 OR NCT02314156 OR NCT04024254 OR NCT04692103 OR NCT00978250 OR NCT03501979 OR NCT01554371 OR NCT01988571 OR NCT03691493 OR NCT00256217 OR NCT02509156 OR NCT04822597 OR NCT02401347 OR NCT00194714 OR NCT02312622 OR NCT01349322 OR NCT03858322 OR NCT02626507 OR NCT03716583 OR NCT03364348 OR NCT01596751 OR NCT01417286 OR NCT03971019 OR NCT02942355 OR NCT05693766 OR NCT00796978 OR NCT00088413 OR NCT01869764 OR NCT03180294 OR NCT02400476 OR NCT03219476 OR NCT02526498 OR NCT04366713 OR NCT03958721 OR NCT04965688 OR NCT00608972 OR NCT01466972 OR NCT01573442 OR NCT03633331 OR NCT01989546 OR NCT04750473 OR NCT02455453 OR NCT00496795 OR NCT02822573 OR NCT01730118 OR NCT02953860 OR NCT01967823 OR NCT01670877 OR NCT01855828 OR NCT02764541 

In [2]:
# Drill into the loaded JSON to the per-study records. Despite the name, the value is
# indexed by integer position below, with each element exposing a 'Study' entry.
sub_dic = data['FullStudiesResponse']['FullStudies']

In [3]:
# Collect the eligibility-criteria text of every study in `sub_dic`, preserving order.
# Iterating over the records themselves (instead of a hard-coded range(67)) keeps this
# cell correct when the export holds a different number of studies.
criteria_list = [
    study['Study']['ProtocolSection']['EligibilityModule']['EligibilityCriteria']
    for study in sub_dic
]

In [4]:
import pandas as pd
import numpy as np

# Tabular export of the same trials; one row per study.
data_path = "/content/Copy of ctg-studies (3).csv"
df = pd.read_csv(data_path)

# The criteria texts are attached by row position, so the two sources must at least
# agree on the number of studies. Fail with an explicit message if they do not.
# NOTE(review): this also assumes the CSV rows and the JSON studies are in the same
# order — confirm, or join on the NCT number instead.
if len(criteria_list) != len(df):
    raise ValueError(
        f"Expected one eligibility-criteria entry per CSV row, got "
        f"{len(criteria_list)} entries for {len(df)} rows"
    )
df['Eligibility Criteria'] = np.array(criteria_list)

# Show the resulting column names (now including 'Eligibility Criteria').
df.columns.values

array(['NCT Number', 'Study Title', 'Study URL', 'Acronym',
       'Study Status', 'Brief Summary', 'Study Results', 'Conditions',
       'Interventions', 'Primary Outcome Measures',
       'Secondary Outcome Measures', 'Other Outcome Measures', 'Sponsor',
       'Collaborators', 'Sex', 'Age', 'Phases', 'Enrollment',
       'Funder Type', 'Study Type', 'Study Design', 'Other IDs',
       'Start Date', 'Primary Completion Date', 'Completion Date',
       'First Posted', 'Results First Posted', 'Last Update Posted',
       'Locations', 'Study Documents', 'Eligibility Criteria'],
      dtype=object)

In [8]:
from langchain.schema import Document

In [12]:
# Render the first trial (row 0 of `df`) as one "column name: value" line per field,
# preceded by a single leading newline.
# `df` already carries the 'Eligibility Criteria' column, so the criteria text is
# emitted once by this pass; appending criteria_list[0] again would duplicate it in
# the text that is later sent to the summarisation model.
first_trial = df.iloc[0]
isolated_trial_1 = "\n" + "".join(
    f"{column_name}: {str(field_value)}\n"
    for column_name, field_value in first_trial.items()
)
isolated_trial_1

'\nNCT Number: NCT03934905\nStudy Title: Protective Effects of the Nutritional Supplement Sulforaphane on Doxorubicin-Associated Cardiac Dysfunction\nStudy URL: https://beta.clinicaltrials.gov/study/NCT03934905\nAcronym: nan\nStudy Status: RECRUITING\nBrief Summary: Cardiomyopathy is a major complication of doxorubicin (DOX) chemotherapy, and 10-21% of breast cancer patients receiving DOX experience compromised cardiac function. Recent advancements have increased cancer survivorship but it remains clinically challenging to mitigate the cardiotoxic side effects. Although there are several strategies used to reduce the occurrence and severity of DOX-induced cardiotoxicity, they are not particularly effective. Hence, there is an urgent need to develop new strategies that prevent the cardiotoxic effects of DOX but maintain its potency as a cancer therapy. Because the cellular events responsible for the antitumor activity of DOX and DOX-induced cardiotoxicity are distinctly different, it ma

## Define LLM and Summarisation Chain and Summarise Each Unstructured Clinical Text

In [15]:
# Wrap the trial text in a LangChain Document under a separate name. Rebinding
# `isolated_trial_1` itself would make this cell non-idempotent: a second run would
# try to build a Document whose page_content is already a Document.
isolated_trial_1_doc = Document(page_content=isolated_trial_1)

# Summarisation chain driven by the chat model named in PRETRAINED_SUMMARY_MODEL_NAME.
chain = load_summarize_chain(
    ChatOpenAI(model_name=os.environ["PRETRAINED_SUMMARY_MODEL_NAME"], temperature=.1),
    chain_type="map_reduce"
)

# The chain takes a list of documents; here it is the single trial.
summary = chain.run([isolated_trial_1_doc])
print(f"Summary of document {1}:")
print(summary, "\n")


Summary of document 1:
Texas Tech University Health Sciences Center is conducting a clinical trial to examine if the nutritional supplement sulforaphane can protect against heart damage caused by the chemotherapy drug doxorubicin in breast cancer patients. The aim is to reduce the drug's cardiotoxic side effects without affecting its effectiveness against cancer. The trial, which is currently recruiting participants, is expected to conclude by June 2026. 



## Load Structured File and Summarise

In [None]:
docx_file = "data/Tablular Data w Inputs_Outputs.docx"
csv_file = "data/clinical_doc_table.csv"

doc = docx.Document(docx_file)

# Flatten every row of every table in the Word document into a list of cell texts,
# with embedded newlines removed and surrounding whitespace stripped.
row_data = [
    [cell.text.replace("\n", "").strip() for cell in row.cells]
    for table in doc.tables
    for row in table.rows
]

# Use a dedicated name so the trial DataFrame `df` built earlier is not overwritten;
# the row lists go straight to pandas without a NumPy round-trip.
table_df = pd.DataFrame(row_data, columns=["keys", "descriptions"])
table_df.to_csv(csv_file, index=False)

# Re-read the CSV through LangChain's loader and report how many documents it produced.
loader = CSVLoader(csv_file)
full_docs = loader.load()
print(f"Number of document lines: {len(full_docs)}")

Number of document lines: 26


In [None]:
# Run the summarisation chain over the documents loaded from the CSV and display the
# result; this rebinds `summary`.
summary = chain.run(full_docs)
summary

'The study, titled "Integrating Gene Signatures to Guide HR+MBC Therapy in a Diverse Cohort (INSIGHT)", is an open-label, multicenter Phase II clinical trial focusing on using gene signatures to guide hormone receptor-positive metastatic breast cancer (HR+MBC) therapy. The primary key tracks progression-free survival up to 3 years, while secondary keys include survival rates, clinical benefit rate, and treatment toxicity. The study involves two arms: one using endocrine-based therapy and the other using the drug Capecitabine. The study is currently recruiting with an estimated enrollment of 64. The study is set to start on June 30, 2023, and expected to be completed by May 30, 2037. Eligibility criteria include subjects who are 18 years or older with stage IV invasive mammary carcinoma or unresectable locoregional recurrence. The study does not accept healthy volunteers and is based in the United States.'

## Define an Embedding Model and a ChromaDB Vector Store for Embedding and Persisting Embedded Vectors to Disk

In [None]:
embedding = OpenAIEmbeddings(model=os.environ["PRETRAINED_EMBEDDINGS_MODEL"])

persist_dir = os.environ["VECTOR_STORE_FOLDER"]

# Opt out of Chroma telemetry through its environment variable rather than a
# hand-built Settings object: the legacy `chroma_db_impl="duckdb+parquet"` setting is
# rejected by chromadb >= 0.4, and building the client the same way on both paths
# below avoids two clients with conflicting settings for one directory.
os.environ["ANONYMIZED_TELEMETRY"] = "False"

if not os.path.isdir(persist_dir) or len(os.listdir(persist_dir)) == 0:
    # No store on disk yet: embed the documents and write them to the persist folder.
    docsearch_db = Chroma.from_documents(
        documents=full_docs,
        persist_directory=persist_dir,
        embedding=embedding)

    # persist embeddings to disk
    docsearch_db.persist()
else:
    # A store already exists: reopen it instead of re-embedding the documents.
    docsearch_db = Chroma(
        persist_directory=persist_dir,
        embedding_function=embedding)

retriever = docsearch_db.as_retriever()

## Define QA Prompt and Retrieval Chain

In [None]:
# Completion model used to answer questions, selected via the environment.
retrieval_llm = OpenAI(
    model_name=os.environ["PRETRAINED_RETRIEVAL_MODEL_NAME"],
    temperature=0.1,
)

# Template used to phrase each question before it is handed to the QA chain.
# It is not passed to the chain itself.
prompt_str = "Answer this question: {question}"
qa_prompt = PromptTemplate.from_template(prompt_str)

# Retrieval-augmented QA over the vector-store retriever; source documents are
# returned alongside each answer.
qa = RetrievalQA.from_chain_type(
    llm=retrieval_llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
)

## Ask Questions Pertaining to the Clinical Document and Print Responses

In [None]:
queries = [
    "What are the eligibility criteria?",
    "What is the detailed description?",
    "What is the primary objective?"
]

# Ask each question in turn: format it with the QA prompt, print a numbered header,
# then print the chain's answer followed by a blank separator.
for number, question_text in enumerate(queries, start=1):
    formatted = qa_prompt.format_prompt(question=question_text)
    print(f"answer to question {number}")
    answer = qa(formatted.text)
    print(answer["result"])
    print("\n")

answer to question 1
 Inclusion Criteria: Signed and dated written informed consent. Subjects ≥ 18 years of age. Eastern Cooperative Oncology Group (ECOG) performance status 0 or 1. Clinical stage IV invasive mammary carcinoma or unresectable locoregional recurrence of invasive mammary carcinoma that is: ER/PR-positive (> 1% cells) by IHC and HER2 negative (by IHC or FISH) Previously progressed on an aromatase inhibitor (AI) or a selective estrogen receptor modulator/ downregulator (SERM; SERD) + a CDK4/6 inhibitor Appropriate candidates for chemotherapy Measurable disease as defined by Response Evaluation Criteria in Solid Tumors (RECIST) version 1.1 criteria that has not been previously irradiated and which can be followed by CT or MRI. Adequate organ function including: Absolute neutrophil count (ANC) ≥ 1.5 × 10^9/L Platelets ≥ 100 × 10^9/L Hemoglobin ≥ 9/g/dL (may have been transfused) Total serum bilirubin ≤ 1.5 times upper limit of normal (ULN) Aspartate aminotransferase (AST/SGO

In [None]:
# Ask a single question through the same prompt + retrieval chain and print the answer.
question = "What are the eligibility criteria?"
query = qa_prompt.format_prompt(question=question)
response = qa(query.text)
print(response["result"])
print("\n")

 Inclusion Criteria: Signed and dated written informed consent. Subjects ≥ 18 years of age. Eastern Cooperative Oncology Group (ECOG) performance status 0 or 1. Clinical stage IV invasive mammary carcinoma or unresectable locoregional recurrence of invasive mammary carcinoma that is: ER/PR-positive (> 1% cells) by IHC and HER2 negative (by IHC or FISH) Previously progressed on an aromatase inhibitor (AI) or a selective estrogen receptor modulator/ downregulator (SERM; SERD) + a CDK4/6 inhibitor Appropriate candidates for chemotherapy Measurable disease as defined by Response Evaluation Criteria in Solid Tumors (RECIST) version 1.1 criteria that has not been previously irradiated and which can be followed by CT or MRI. Adequate organ function including: Absolute neutrophil count (ANC) ≥ 1.5 × 10^9/L Platelets ≥ 100 × 10^9/L Hemoglobin ≥ 9/g/dL (may have been transfused) Total serum bilirubin ≤ 1.5 times upper limit of normal (ULN) Aspartate aminotransferase (AST/SGOT) and


