In [20]:
import os
import glob
# from dotenv import load_dotenv
import gradio as gr

from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_openai import ChatOpenAI
from langchain.embeddings import OllamaEmbeddings, HuggingFaceEmbeddings
from langchain_chroma import Chroma
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import numpy as np
import plotly.graph_objects as go
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

Note: This code is adapted from Ed Donner's course on AI Engineering. It may need to be refactored later.

In [2]:
# Notebook-wide settings
MODEL = "llama3.1"     # model name; same value as the model_name given to ChatOpenAI further down
db_name = "vector_db"  # path checked with os.path.exists and passed to Chroma as persist_directory

In [10]:
# Folder whose Markdown notes get indexed.
# Set the OBSIDIAN_NOTES_DIR environment variable to point the notebook at a
# different folder without editing this cell; when it is unset (or empty) the
# original local path below is used, so existing behavior is unchanged.
target_dir = os.environ.get("OBSIDIAN_NOTES_DIR") or (
    r"C:/Users/ldmag/Documents/GitHub/Obsidian-Notes/Obsidian Vault/Research/GAI and student learning"
)

#folders = glob.glob(os.path.join(target_dir, "*"))
#folders = [f for f in folders if os.path.isdir(f)]

def add_metadata(doc, doc_type):
    """Tag a document with its type and hand the same object back.

    Stores `doc_type` under the "doc_type" key of `doc.metadata` (mutating
    the document in place) and returns `doc` itself, not a copy.
    """
    doc.metadata.update({"doc_type": doc_type})
    return doc

# utf-8 is passed to every TextLoader; this fix is credited to CG and Jon R,
# students on the course, and was needed for some users.
text_loader_kwargs = {'encoding': 'utf-8'}
# Fallback that some Windows users might need in place of the line above:
# text_loader_kwargs={'autodetect_encoding': True}

# The folder's own name doubles as the doc_type label for every file loaded from it.
doc_type = os.path.basename(target_dir)
loader = DirectoryLoader(
    target_dir,
    glob="**/*.md",
    loader_cls=TextLoader,
    loader_kwargs=text_loader_kwargs,
)
folder_docs = loader.load()
documents = [add_metadata(doc, doc_type) for doc in folder_docs]

# (An earlier variant looped over a `folders` list instead, loading each folder
# separately and using that folder's basename as its doc_type.)

# Split the loaded files into overlapping chunks for embedding.
text_splitter = CharacterTextSplitter(chunk_size=6500, chunk_overlap=3700)
chunks = text_splitter.split_documents(documents)

# NOTE(review): the "**/*.md" glob also matches files in nested sub-folders, so
# the "target directory only" wording in the last message may not hold — confirm intent.
doc_types_seen = {doc.metadata['doc_type'] for doc in documents}
summary_lines = [
    f"Total number of chunks: {len(chunks)}",
    f"Document types found: {doc_types_seen}",
    f"Loading from: {target_dir}",
    f"Files found: {len(documents)} .md files in the target directory only",
]
print("\n".join(summary_lines))

Created a chunk of size 6661, which is longer than the specified 6500


Total number of chunks: 47
Document types found: {'GAI and student learning'}
Loading from: C:/Users/ldmag/Documents/GitHub/Obsidian-Notes/Obsidian Vault/Research/GAI and student learning
Files found: 13 .md files in the target directory only


In [11]:
# Embedding function used both to open any existing store and to build the new one.
embeddings = OllamaEmbeddings(model="nomic-embed-text")

# If the persist directory is already on disk, drop its collection first so the
# store is rebuilt from the current chunks.
if os.path.exists(db_name):
    Chroma(
        persist_directory=db_name,
        embedding_function=embeddings,
    ).delete_collection()

# Embed every chunk and persist the result under db_name.
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory=db_name,
)
print(f"Vectorstore created with {vectorstore._collection.count()} documents")

  embeddings = OllamaEmbeddings(model="nomic-embed-text")


Vectorstore created with 47 documents


In [12]:
# Report how many vectors the store holds and how wide each one is.
collection = vectorstore._collection
count = collection.count()

# A single stored embedding is enough to read off the dimensionality.
first_record = collection.get(limit=1, include=["embeddings"])
sample_embedding = first_record["embeddings"][0]
dimensions = len(sample_embedding)

print(f"There are {count:,} vectors with {dimensions:,} dimensions in the vector store")

There are 47 vectors with 768 dimensions in the vector store


### Visualizing the Vector Store

In [14]:
# Pull everything out of the collection for plotting.
result = collection.get(include=['embeddings', 'documents', 'metadatas'])

vectors = np.array(result['embeddings'])
# NOTE: this rebinds `documents`, which the loading cell used for the list of
# loaded Document objects; from here on it holds the values stored in the
# collection (presumably plain text — the plots slice them with d[:100]).
documents = result['documents']
metadatas = result['metadatas']
doc_types = [entry['doc_type'] for entry in metadatas]
# The course's fixed colour lookup for 'products' / 'employees' / 'contracts' /
# 'company' is left out here; the plots below do not set a marker colour.

In [15]:
# Reduce the stored embeddings to two t-SNE components for plotting.
# random_state is pinned to 42 so the same seed is used on every run.
tsne = TSNE(n_components=2, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# Create the 2D scatter plot; hovering a marker shows its doc type and the
# first 100 characters of the stored text.
fig = go.Figure(data=[go.Scatter(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    mode='markers',
    marker=dict(size=5, opacity=0.8),
    text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)],
    hoverinfo='text'
)])

# Axis titles of a 2D figure live on layout.xaxis / layout.yaxis. `scene` only
# configures 3D subplots, so titles passed through it never show up on a
# go.Scatter plot.
fig.update_layout(
    title='2D Chroma Vector Store Visualization',
    xaxis_title='x',
    yaxis_title='y',
    width=800,
    height=600,
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show()

In [16]:
# Same embeddings, this time reduced to three t-SNE components.
tsne = TSNE(n_components=3, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# One hover label per point: doc type plus the first 100 characters of its text.
hover_labels = [
    f"Type: {t}<br>Text: {d[:100]}..."
    for t, d in zip(doc_types, documents)
]

# Create the 3D scatter plot
points_3d = go.Scatter3d(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    z=reduced_vectors[:, 2],
    mode='markers',
    marker=dict(size=5, opacity=0.8),
    text=hover_labels,
    hoverinfo='text',
)
fig = go.Figure(data=[points_3d])

# `scene` holds the axis settings of a 3D subplot.
axis_titles = dict(xaxis_title='x', yaxis_title='y', zaxis_title='z')
fig.update_layout(
    title='3D Chroma Vector Store Visualization',
    scene=axis_titles,
    width=900,
    height=700,
    margin=dict(r=20, b=10, l=10, t=40),
)

fig.show()

## Back to the knowledge worker

In [17]:
# Chat model served by a local Ollama instance through its OpenAI-compatible
# endpoint. The model name comes from the MODEL constant so it is defined in
# one place; api_key is the literal string 'ollama' rather than a secret.
llm = ChatOpenAI(temperature=0.7, model_name=MODEL, base_url='http://localhost:11434/v1', api_key='ollama')

# set up the conversation memory for the chat
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

# the retriever is an abstraction over the VectorStore that will be used during RAG
retriever = vectorstore.as_retriever()

# set up the conversation chain with the LLM, the vector store and memory
conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)


Please see the migration guide at: https://python.langchain.com/docs/versions/migrating_memory/



In [18]:
def chat(question, history):
    """Answer one user message through the conversation chain.

    Sends `question` to `conversation_chain` and returns the "answer" entry
    of what comes back. `history` is accepted but not read here.
    """
    return conversation_chain.invoke({"question": question})["answer"]

In [21]:
# Wrap chat() in a Gradio chat UI and start it, requesting a browser tab via
# inbrowser=True. `view` keeps whatever launch() returns.
chat_ui = gr.ChatInterface(chat, type="messages")
view = chat_ui.launch(inbrowser=True)

* Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.
