<a href="https://colab.research.google.com/github/tascheidt/jdmgpt/blob/trulens_tests/jdmgpt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
  # Mount Google Drive at /content/drive; the /content/drive/MyDrive/... paths used below live under this mount point.
  from google.colab import drive
  drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
%pip install utils
%pip install requests
%pip install selenium
%pip install streamlit
%pip install openai
%pip install cohere
%pip install anthropic
%pip install llama-index
%pip install nltk
%pip install pypdf
%pip install chromadb
%pip install numpy
%pip install requests
%pip install trulens_eval
%pip install alembic
%pip install datasets
%pip install langchain
%pip install pydantic
%pip install python-dotenv
%pip install transformers
%pip install sentence-transformers



In [None]:
import os

# `openai` must be imported in this cell: on a fresh kernel nothing earlier
# brings the name into scope, so the assignment below raised NameError.
import openai

# Read the key from the environment (KeyError if it is not set) rather than hard-coding it.
openai.api_key = os.environ["OPENAI_API_KEY"]

In [None]:
import os

import chromadb

# WARNING: destructive. reset() is called on the persistent store under chroma_db,
# so run this cell only when the index should be rebuilt from scratch.
settings = chromadb.Settings(allow_reset=True)
chroma_client = chromadb.PersistentClient(path="/content/drive/MyDrive/Code/jdmgpt/chroma_db",settings=settings)
chroma_client.reset()

# setup_vector_store() skips every file listed in indexed_files.json. If the
# record survives the reset, nothing is re-indexed and the store stays empty,
# so the record is removed together with the store contents.
try:
    os.remove("/content/drive/MyDrive/Code/jdmgpt/indexed_files.json")
except FileNotFoundError:
    pass

In [None]:
from llama_index import VectorStoreIndex, ServiceContext
from llama_index.llms import OpenAI
import openai
import os
import json
import chromadb
from llama_index.vector_stores import ChromaVectorStore
from llama_index import SimpleDirectoryReader, StorageContext

# Read the OpenAI key from the environment; raises KeyError if OPENAI_API_KEY is not set.
openai.api_key = os.environ["OPENAI_API_KEY"]

# Read back the JSON record of files that have already been indexed
def load_indexed_files_record(record_file):
    """Return the parsed JSON content of ``record_file``.

    Args:
        record_file: Path of the JSON record file.

    Returns:
        Whatever ``json.load`` yields for the file, or an empty dict when the
        file does not exist.
    """
    try:
        handle = open(record_file, 'r')
    except FileNotFoundError:
        # No record yet: nothing has been indexed.
        return {}
    with handle:
        return json.load(handle)

# Function to update indexed files record
def update_indexed_files_record(record_file, indexed_files):
    """Overwrite ``record_file`` with ``indexed_files`` serialised as JSON.

    Args:
        record_file: Path of the JSON record file to (re)write.
        indexed_files: JSON-serialisable object to store.

    Raises:
        TypeError: If ``indexed_files`` is not JSON-serialisable. The existing
            file is left untouched in that case.
    """
    # Serialise before opening: mode 'w' truncates on open, so a failure while
    # dumping straight into the file would leave a corrupt, half-written record.
    payload = json.dumps(indexed_files)
    with open(record_file, 'w') as file:
        file.write(payload)

def setup_vector_store():
    """Open the Chroma-backed vector index and insert article files not yet recorded as indexed.

    Files in the sample_articles directory whose base name is absent from
    indexed_files.json are loaded and inserted into the "gptbot" collection.
    The record file is rewritten whenever there were new files to process,
    including when an insert fails part-way, so that it keeps matching what
    has already been written to the persistent store.

    Returns:
        The VectorStoreIndex, or None when a ValueError was raised before the
        index could be opened (the error is printed, not re-raised).
    """
    record_file = "/content/drive/MyDrive/Code/jdmgpt/indexed_files.json"
    indexed_files = load_indexed_files_record(record_file)

    file_directory = "/content/drive/MyDrive/Code/jdmgpt/sample_articles"
    all_file_paths = [os.path.join(file_directory, f) for f in os.listdir(file_directory) if os.path.isfile(os.path.join(file_directory, f))]
    # "New" means the base name is not a key of the record yet.
    new_file_paths = [f for f in all_file_paths if os.path.basename(f) not in indexed_files]

    chroma_client = chromadb.PersistentClient(path="/content/drive/MyDrive/Code/jdmgpt/chroma_db", settings=chromadb.Settings(allow_reset=True))
    chroma_collection = chroma_client.get_or_create_collection("gptbot")

    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    service_context = ServiceContext.from_defaults(llm=OpenAI(model="gpt-3.5-turbo", temperature=0.1))

    index = None

    try:
        new_docs = []
        if new_file_paths:
            reader = SimpleDirectoryReader(input_files=new_file_paths, recursive=True)
            new_docs = reader.load_data()

        # The index is opened the same way whether or not there are new files.
        index = VectorStoreIndex.from_vector_store(vector_store=vector_store, storage_context=storage_context, service_context=service_context)

        # Group documents by source file so a file is only marked as indexed
        # once every document loaded from it has been inserted.
        docs_by_file = {}
        for doc in new_docs:
            file_name = doc.metadata.get('file_name')
            if file_name is None:
                print("File missing 'file_name' metadata:", doc)
                continue
            docs_by_file.setdefault(file_name, []).append(doc)

        try:
            for file_name, docs in docs_by_file.items():
                for doc in docs:
                    index.insert(doc)
                indexed_files[file_name] = True
                print("New File Indexed:", file_name)
        finally:
            # Persist progress even if an insert failed: completed files are
            # already in the persistent store and must not be inserted again
            # on the next run.
            if new_file_paths:
                update_indexed_files_record(record_file, indexed_files)

    except ValueError as e:
        print(f"Error setting up vector store: {e}")

    return index


In [None]:
# Build/open the index. setup_vector_store() can return None after printing a ValueError, so check the output before continuing.
index = setup_vector_store()

In [None]:
# Query engine over the index, created with the library defaults (no arguments passed).
query_engine = index.as_query_engine()

In [None]:
# Smoke test: send one domain question through the query engine and print the answer.
response = query_engine.query(
    "What is CMAS and how is it used to evaluate JDM patients?"
)
print(str(response))

In [None]:
# Load the evaluation questions from a text file, one question per line.

eval_questions = []
with open('/content/drive/MyDrive/Code/jdmgpt/questions.txt', 'r') as file:
    for line in file:
        # Strip the trailing newline and any surrounding whitespace.
        item = line.strip()
        if not item:
            # Skip blank lines; they would otherwise be sent to the query
            # engine as empty questions during the eval runs.
            continue
        print(item)
        eval_questions.append(item)

In [None]:
# Print the loaded eval questions as a list.
print(eval_questions)

In [None]:
# Print heartbeat.
# NOTE(review): this creates a new default chromadb.Client(), not the PersistentClient
# that holds the "gptbot" collection — confirm which client this check is meant to cover.
client = chromadb.Client()

# Test if the service is up and running
print(client.heartbeat())

In [None]:
# Some checks to see what's going on in the Chroma store.
# source: https://github.com/neo-con/chromadb-tutorial/tree/main
import chromadb

# Initialize a default ChromaDB client (used only for the throwaway "trinity" collection below)
client = chromadb.Client()

# Fetch the existing "gptbot" collection. Note this goes through `chroma_client`
# (the PersistentClient bound in an earlier cell), not through `client` above.
gpt_collection = chroma_client.get_collection(name="gptbot") # Get a collection object from an existing collection, by name. Will raise an exception if it's not found.

# Inspecting a collection
print(gpt_collection)

# Counting items in a collection
item_count = gpt_collection.count()
print(f"Count of items in collection: {item_count}")

# Get or create a new collection on `client`, changing the distance function via metadata
trinity_collection = client.get_or_create_collection(
    name="trinity", metadata={"hnsw:space": "cosine"}
)
print(trinity_collection)

# Deleting the throwaway collection again
try:
    client.delete_collection(name="trinity")
    print("Trinity collection deleted.")
except ValueError as e:
    print(f"Error: {e}")

# Counting items in the "gptbot" collection again (it was not touched by the "trinity" steps above)
item_count = gpt_collection.count()
print(f"Count of items in collection: {item_count}")

# Get items from the collection (get() is called without a filter or limit, so this output can be large)
items = gpt_collection.get()
print(items)

# Or we can use the peek method
gpt_collection.peek(limit=5)

In [None]:
# Re-open the persistent store and look at the "gptbot" collection.
chroma_client = chromadb.PersistentClient(path="/content/drive/MyDrive/Code/jdmgpt/chroma_db",settings=chromadb.Settings(allow_reset=True))

chroma_collection = chroma_client.get_collection(name="gptbot") # Get a collection object from an existing collection, by name. Will raise an exception if it's not found.
#collection = client.get_or_create_collection(name="test") # Get a collection object from an existing collection, by name. If it doesn't exist, create it.
#client.delete_collection(name="my_collection") # Delete a collection and all associated embeddings, documents, and metadata. ⚠️ This is destructive and not reversible

# A cell only displays its last expression, so the peek() result was silently
# discarded before; print it explicitly and leave count() as the cell output.
print(chroma_collection.peek()) # returns a list of the first 10 items in the collection
chroma_collection.count() # returns the number of items in the collection
#collection.modify(name="new_name") # Rename the collection


## TruLens Evaluation

# Existing index - sentence window

In [None]:
from trulens_eval import Tru

def run_evals(eval_questions, tru_recorder, query_engine):
    """Send every evaluation question through ``query_engine`` under ``tru_recorder``.

    Args:
        eval_questions: Iterable of questions to ask.
        tru_recorder: Context manager entered once per question, so each
            query runs inside its own recording context.
        query_engine: Object exposing ``query(question)``.

    Returns:
        None. The responses are not collected here; the recorder context is
        the only thing observing them.
    """
    for question in eval_questions:
        # One recorder context per question; the context object and the
        # response were bound to names but never used, so they are not kept.
        with tru_recorder:
            query_engine.query(question)

In [None]:
import sys
# Make modules in the project folder on Drive importable (presumably where trulens_utils.py lives — confirm).
sys.path.append('/content/drive/MyDrive/Code/jdmgpt')

In [None]:
from trulens_utils import get_prebuilt_trulens_recorder
from trulens_eval import Tru

# NOTE(review): presumably clears previously recorded TruLens results so the leaderboards below start clean — destructive, confirm before re-running.
Tru().reset_database()

In [None]:
# Debug aid: show the concrete class returned by index.as_query_engine().
print(type(query_engine))

In [None]:
from trulens_utils import get_sentence_window_query_engine
# Baseline engine built on `index`, the Chroma-backed index from setup_vector_store().
# NOTE(review): `index` was not created by build_sentence_window_index — confirm the
# sentence-window query engine works as intended on it.
sentence_window_engine_1 = get_sentence_window_query_engine(index)

In [None]:
# Recorder for the baseline engine; its results are stored under app id 'jdmgpt_base'.
tru_recorder_1 = get_prebuilt_trulens_recorder(
    sentence_window_engine_1,
    app_id='jdmgpt_base'
)

In [None]:
# Baseline: run every eval question through the baseline engine inside tru_recorder_1.
run_evals(eval_questions, tru_recorder_1, sentence_window_engine_1)

In [None]:
# Leaderboard restricted to the baseline app id.
Tru().get_leaderboard(app_ids=["jdmgpt_base"])

One-time load of the articles to create the documents used by the scenarios below

In [None]:
from llama_index import SimpleDirectoryReader

# Load the files under sample_articles into `documents`; every scenario cell below passes it to build_sentence_window_index.
documents = SimpleDirectoryReader("/content/drive/MyDrive/Code/jdmgpt/sample_articles/").load_data()

Scenario 2 - gpt3.5 with bge small embedding, sentence index 1

In [None]:
# Scenario 2: gpt-3.5-turbo, local BAAI/bge-small-en-v1.5 embeddings, sentence window size 1.
from trulens_utils import build_sentence_window_index, get_sentence_window_query_engine, get_prebuilt_trulens_recorder

app_id = "35_bgesmall_sw1"

sentence_index_1 = build_sentence_window_index(
    documents,
    llm=OpenAI(model="gpt-3.5-turbo", temperature=0.1),
    embed_model="local:BAAI/bge-small-en-v1.5",
    sentence_window_size=1,
    # Was the literal "3.5_bgesmall_sw1", which did not match app_id; every
    # other scenario passes save_dir=app_id, so this one now does too.
    save_dir=app_id,
)
# Note: this rebinds sentence_window_engine_1 / tru_recorder_1, which the
# baseline cells also use; run the baseline evals before this cell.
sentence_window_engine_1 = get_sentence_window_query_engine(
    sentence_index_1
)

tru_recorder_1 = get_prebuilt_trulens_recorder(
    sentence_window_engine_1,
    app_id=app_id
)


In [None]:
# Scenario 2: run every eval question through sentence_window_engine_1 inside tru_recorder_1.
run_evals(eval_questions, tru_recorder_1, sentence_window_engine_1)

In [None]:
# Leaderboard after scenario 2. NOTE(review): app_ids=[] presumably means "no filter" (all recorded apps) — confirm.
Tru().get_leaderboard(app_ids=[])

Scenario 3 - gpt3.5 with bge small embedding, sentence index 3

In [None]:
# Scenario 3: gpt-3.5-turbo, local BAAI/bge-small-en-v1.5 embeddings, sentence window size 3.
from trulens_utils import build_sentence_window_index, get_sentence_window_query_engine, get_prebuilt_trulens_recorder

# app_id is used both as save_dir for the index and as the recorder's app id.
app_id = "35_bgesmall_sw3"

sentence_index_3 = build_sentence_window_index(
    documents,
    llm=OpenAI(model="gpt-3.5-turbo", temperature=0.1),
    embed_model="local:BAAI/bge-small-en-v1.5",
    sentence_window_size=3,
    save_dir=app_id,
)
sentence_window_engine_3 = get_sentence_window_query_engine(
    sentence_index_3
)

tru_recorder_3 = get_prebuilt_trulens_recorder(
    sentence_window_engine_3,
    app_id=app_id
)


In [None]:
# Scenario 3: run every eval question through sentence_window_engine_3 inside tru_recorder_3.
run_evals(eval_questions, tru_recorder_3, sentence_window_engine_3)

In [None]:
# Leaderboard after scenario 3. NOTE(review): app_ids=[] presumably means "no filter" (all recorded apps) — confirm.
Tru().get_leaderboard(app_ids=[])

Scenario 4 - gpt4 with bge small embedding, sentence index 3

In [None]:
# Scenario 4: gpt-4, local BAAI/bge-small-en-v1.5 embeddings, sentence window size 3.
from trulens_utils import build_sentence_window_index, get_sentence_window_query_engine, get_prebuilt_trulens_recorder

# app_id is used both as save_dir for the index and as the recorder's app id.
app_id = "4_bgesmall_sw3"

sentence_index_3_4 = build_sentence_window_index(
    documents,
    llm=OpenAI(model="gpt-4", temperature=0.1),
    embed_model="local:BAAI/bge-small-en-v1.5",
    sentence_window_size=3,
    save_dir=app_id,
)
sentence_window_engine_3_4 = get_sentence_window_query_engine(
    sentence_index_3_4
)

tru_recorder_3_4 = get_prebuilt_trulens_recorder(
    sentence_window_engine_3_4,
    app_id=app_id
)


In [None]:
# Scenario 4: run every eval question through sentence_window_engine_3_4 inside tru_recorder_3_4.
run_evals(eval_questions, tru_recorder_3_4, sentence_window_engine_3_4)

In [None]:
# Leaderboard after scenario 4. NOTE(review): app_ids=[] presumably means "no filter" (all recorded apps) — confirm.
Tru().get_leaderboard(app_ids=[])

Scenario 5 - gpt4 with bge small embedding, sentence index 1

In [None]:
# Scenario 5: gpt-4, local BAAI/bge-small-en-v1.5 embeddings, sentence window size 1.
from trulens_utils import build_sentence_window_index, get_sentence_window_query_engine, get_prebuilt_trulens_recorder

# app_id is used both as save_dir for the index and as the recorder's app id.
app_id = "4_bgesmall_sw1"

sentence_index_1_4 = build_sentence_window_index(
    documents,
    llm=OpenAI(model="gpt-4", temperature=0.1),
    embed_model="local:BAAI/bge-small-en-v1.5",
    sentence_window_size=1,
    save_dir=app_id,
)
sentence_window_engine_1_4 = get_sentence_window_query_engine(
    sentence_index_1_4
)

tru_recorder_1_4 = get_prebuilt_trulens_recorder(
    sentence_window_engine_1_4,
    app_id=app_id
)


In [None]:
# Scenario 5: run every eval question through sentence_window_engine_1_4 inside tru_recorder_1_4.
run_evals(eval_questions, tru_recorder_1_4, sentence_window_engine_1_4)

In [None]:
# Leaderboard after scenario 5. NOTE(review): app_ids=[] presumably means "no filter" (all recorded apps) — confirm.
Tru().get_leaderboard(app_ids=[])

Scenario 6 - gpt4 with bge large embedding, sentence index 3

In [None]:
# Scenario 6: gpt-4, local BAAI/bge-large-en-v1.5 embeddings, sentence window size 3.
from trulens_utils import build_sentence_window_index, get_sentence_window_query_engine, get_prebuilt_trulens_recorder

# app_id is used both as save_dir for the index and as the recorder's app id.
app_id = "4_bgelarge_sw3"

sentence_index_L3_4 = build_sentence_window_index(
    documents,
    llm=OpenAI(model="gpt-4", temperature=0.1),
    embed_model="local:BAAI/bge-large-en-v1.5",
    sentence_window_size=3,
    save_dir=app_id,
)
sentence_window_engine_L3_4 = get_sentence_window_query_engine(
    sentence_index_L3_4
)

tru_recorder_L3_4 = get_prebuilt_trulens_recorder(
    sentence_window_engine_L3_4,
    app_id=app_id
)


In [None]:
# Scenario 6: run every eval question through sentence_window_engine_L3_4 inside tru_recorder_L3_4.
run_evals(eval_questions, tru_recorder_L3_4, sentence_window_engine_L3_4)

In [None]:
# Leaderboard after scenario 6. NOTE(review): app_ids=[] presumably means "no filter" (all recorded apps) — confirm.
Tru().get_leaderboard(app_ids=[])

Scenario 7 - gpt4 with bge large embedding, sentence index 1

In [None]:
# Scenario 7: gpt-4, local BAAI/bge-large-en-v1.5 embeddings, sentence window size 1.
from trulens_utils import build_sentence_window_index, get_sentence_window_query_engine, get_prebuilt_trulens_recorder

# app_id is used both as save_dir for the index and as the recorder's app id.
app_id = "4_bgelarge_sw1"

sentence_index_L1_4 = build_sentence_window_index(
    documents,
    llm=OpenAI(model="gpt-4", temperature=0.1),
    embed_model="local:BAAI/bge-large-en-v1.5",
    sentence_window_size=1,
    save_dir=app_id,
)
sentence_window_engine_L1_4 = get_sentence_window_query_engine(
    sentence_index_L1_4
)

tru_recorder_L1_4 = get_prebuilt_trulens_recorder(
    sentence_window_engine_L1_4,
    app_id=app_id
)


In [None]:
# Scenario 7: run every eval question through sentence_window_engine_L1_4 inside tru_recorder_L1_4.
run_evals(eval_questions, tru_recorder_L1_4, sentence_window_engine_L1_4)

In [None]:
# Leaderboard after scenario 7. NOTE(review): app_ids=[] presumably means "no filter" (all recorded apps) — confirm.
Tru().get_leaderboard(app_ids=[])

Scenario 8 - gpt-4-1106-preview with bge large embedding, sentence index 1

In [None]:
# Scenario 8: gpt-4-1106-preview, local BAAI/bge-large-en-v1.5 embeddings, sentence window size 1.
from trulens_utils import build_sentence_window_index, get_sentence_window_query_engine, get_prebuilt_trulens_recorder

# app_id is used both as save_dir for the index and as the recorder's app id.
app_id = "4_1106_bgelarge_sw1"

sentence_index_L1_4_1106 = build_sentence_window_index(
    documents,
    llm=OpenAI(model="gpt-4-1106-preview", temperature=0.1),
    embed_model="local:BAAI/bge-large-en-v1.5",
    sentence_window_size=1,
    save_dir=app_id,
)
sentence_window_engine_L1_4_1106 = get_sentence_window_query_engine(
    sentence_index_L1_4_1106
)

tru_recorder_L1_4_1106 = get_prebuilt_trulens_recorder(
    sentence_window_engine_L1_4_1106,
    app_id=app_id
)


In [None]:
# Scenario 8: use the gpt-4-1106-preview recorder and engine built in the cell above.
# The previous call passed the scenario 7 objects (tru_recorder_L1_4 /
# sentence_window_engine_L1_4), so scenario 8 was never evaluated and
# scenario 7 was recorded a second time.
run_evals(eval_questions, tru_recorder_L1_4_1106, sentence_window_engine_L1_4_1106)

In [None]:
# Leaderboard after scenario 8. NOTE(review): app_ids=[] presumably means "no filter" (all recorded apps) — confirm.
Tru().get_leaderboard(app_ids=[])

In [None]:
# Kept for reference: uncomment to stop a dashboard started earlier in this session.
# NOTE(review): confirm these attributes exist in the installed trulens_eval version.
#Tru.dashboard_proc.kill()
#Tru.dashboard_proc = None

# Launch the TruLens dashboard to browse the recorded evaluations.
Tru().run_dashboard()