In [1]:
from openai import AzureOpenAI
import anthropic
import ollama
import requests
import json
from dotenv import load_dotenv
import os
from IPython.display import display, update_display, Markdown
from bs4 import BeautifulSoup
import re

In [2]:
# Read settings from the local .env file before looking up any variable.
load_dotenv(override=True)

# Chat model: API key and endpoint
azure_ai_foundary_api_key = os.environ.get("AZURE_AI_FOUNDARY_API_KEY")
azure_oai_endpoint = os.environ.get("AZURE_OAI_ENDPOINT")

# Embedding model: API key and endpoint
azure_embed_api_key = os.environ.get("AZURE_EMBED_KEY")
azure_embed_endpoint = os.environ.get("AZURE_EMBED_ENDPOINT")

# Audio model: API key and endpoint
azure_audio_api_key = os.environ.get("AZURE_AUDIO_API_KEY")
azure_audio_endpoint = os.environ.get("AZURE_AUDIO_ENDPOINT")

In [3]:
# Azure OpenAI client configured with the endpoint and key loaded from the environment.
azure_ai_foundary = AzureOpenAI(
    api_version="2025-01-01-preview",
    azure_endpoint=azure_oai_endpoint,
    api_key=azure_ai_foundary_api_key,
)


## Langchain RAG

In [86]:
import numpy as np
import glob

from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document

# vector embeddings for vector db
from langchain_chroma import Chroma
from langchain_azure_ai.chat_models import AzureAIChatCompletionsModel
from langchain_openai import AzureOpenAIEmbeddings, AzureChatOpenAI

# Debugging intermediate steps of langchain
from langchain_core.callbacks import StdOutCallbackHandler

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import plotly.graph_objects as go

from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.embeddings import HuggingFaceEmbeddings

### Data loading
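Load all the Markdown documents under the `data` folder so they can be chunked and vectorized. Each sub-folder name is stored as the document's `doc_type`.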

In [4]:
# Every sub-folder of `data` is one document type; its name becomes the
# `doc_type` metadata attached to each document loaded from it.
data_path = os.path.join('data', '*')

text_loader_kwargs = {'encoding': 'utf-8'}

documents = []
# Keep directories only, so a stray file directly under `data` is never handed
# to DirectoryLoader, and sort so the load order is the same on every run.
folders = sorted(path for path in glob.glob(data_path) if os.path.isdir(path))

for folder in folders:
    doc_type = os.path.basename(folder)

    print(f'Folder {folder}, doc_type: {doc_type}')
    loader = DirectoryLoader(folder,
                glob="**/*.md",
                loader_cls=TextLoader,
                loader_kwargs=text_loader_kwargs)

    docs = loader.load()
    print(f'Found {len(docs)} docs of doc_type: {doc_type}')
    # Tag each document with its folder name so it can be grouped later.
    for doc in docs:
        doc.metadata['doc_type'] = doc_type
    documents.extend(docs)

print(f'Total docs: {len(documents)}')

Folder data\business, doc_type: business
Found 5 docs of doc_type: business
Folder data\developers, doc_type: developers
Found 4 docs of doc_type: developers
Folder data\projects, doc_type: projects
Found 5 docs of doc_type: projects
Total docs: 14


In [6]:
# Break the loaded documents into chunks. Both the chunk size and the overlap
# between neighbouring chunks can affect the quality of the retrieved context.
text_splitter = CharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=200,
)
chunks = text_splitter.split_documents(documents)

# Show the metadata of every chunk produced by the splitter.
print('List of documents')
for piece in chunks:
    print(piece.metadata)

List of documents
{'source': 'data\\business\\cloudsync_impact.md', 'doc_type': 'business'}
{'source': 'data\\business\\dataviz_impact.md', 'doc_type': 'business'}
{'source': 'data\\business\\fraud_detection_impact.md', 'doc_type': 'business'}
{'source': 'data\\business\\healthtrack_impact.md', 'doc_type': 'business'}
{'source': 'data\\business\\microservices_impact.md', 'doc_type': 'business'}
{'source': 'data\\developers\\alex_chen.md', 'doc_type': 'developers'}
{'source': 'data\\developers\\john_smith.md', 'doc_type': 'developers'}
{'source': 'data\\developers\\raj_patel.md', 'doc_type': 'developers'}
{'source': 'data\\developers\\sarah_johnson.md', 'doc_type': 'developers'}
{'source': 'data\\projects\\cloudsync.md', 'doc_type': 'projects'}
{'source': 'data\\projects\\dataviz.md', 'doc_type': 'projects'}
{'source': 'data\\projects\\fraud_detection.md', 'doc_type': 'projects'}
{'source': 'data\\projects\\healthtrack.md', 'doc_type': 'projects'}
{'source': 'data\\projects\\microservic

### Direct Azure OpenAI usage for embedding

In [7]:
from openai import AzureOpenAI

# Client that talks directly to the Azure OpenAI embedding endpoint.
embed_model = AzureOpenAI(
    azure_endpoint=azure_embed_endpoint,
    api_key=azure_embed_api_key,
    api_version="2024-12-01-preview",
)

deployment = "text-embedding-3-large"
response = embed_model.embeddings.create(
    model=deployment,
    input=["first phrase","second phrase","third phrase"],
)

# For each returned embedding, print its length plus its first two and last
# two components.
for item in response.data:
    vec = item.embedding
    head = f"{vec[0]}, {vec[1]}"
    tail = f"{vec[-2]}, {vec[-1]}"
    print(f"data[{item.index}]: length={len(vec)}, [{head}, ..., {tail}]")

print(response.usage)

data[0]: length=3072, [0.022330209612846375, -0.002088305074721575, ..., -0.014379994943737984, 0.006100048776715994]
data[1]: length=3072, [0.011640272103250027, 0.005252661183476448, ..., -0.028720801696181297, -0.0025770869106054306]
data[2]: length=3072, [0.016326788812875748, -0.0018455119570717216, ..., -0.005349587649106979, 0.006049444433301687]
Usage(prompt_tokens=6, total_tokens=6)


### Embeddings using LangChain

In [51]:
def initialize_llm(env_file: str = None):
    """Create the Azure chat model and the Azure embedding model.

    Args:
        env_file: Optional path to a .env file. When given, it is loaded with
            ``override=True`` before any setting is read, so its values win
            over variables already present in the environment. When omitted,
            the current process environment is used as-is.

    Returns:
        A ``(llm, embeddings)`` tuple: an ``AzureChatOpenAI`` chat model and an
        ``AzureOpenAIEmbeddings`` embedding model.

    Raises:
        KeyError: if ``AZURE_EMBED_ENDPOINT`` is not set in the environment.
    """
    # Honour the caller-supplied env file (this parameter used to be ignored).
    if env_file:
        load_dotenv(env_file, override=True)

    # NOTE(review): no api_key is passed here, so the chat model presumably
    # picks its key up from the environment - confirm which variable it reads.
    llm = AzureChatOpenAI(
        azure_endpoint=os.getenv("AZURE_OAI_ENDPOINT"),
        azure_deployment="gpt-4o-mini",
        openai_api_version="2025-01-01-preview",
    )

    embed_model_name = "text-embedding-3-large"
    embed_api_version = "2024-12-01-preview"
    # Read both embedding settings from the environment so the endpoint and
    # the key always come from the same source.
    embed_endpoint = os.environ['AZURE_EMBED_ENDPOINT']
    embed_api_key = os.getenv("AZURE_EMBED_KEY")

    embeddings = AzureOpenAIEmbeddings(
        model=embed_model_name,
        azure_deployment=embed_model_name,
        azure_endpoint=embed_endpoint,
        openai_api_version=embed_api_version,
        api_key=embed_api_key)

    return llm, embeddings

llm, azure_embeddings = initialize_llm()

In [21]:
# Folder where Chroma persists the vector store.
CHROMA_DB = os.path.join("notebooks", "vector_store", "chroma_rag_db")
print("Chroma db path", CHROMA_DB)

# Start from a clean slate: drop the collection left by a previous run.
if os.path.exists(CHROMA_DB):
    stale_store = Chroma(
        persist_directory=CHROMA_DB,
        embedding_function=azure_embeddings,
    )
    stale_store.delete_collection()
    print('db cleared')

# Embed every chunk and persist the result.
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=azure_embeddings,
    persist_directory=CHROMA_DB,
)
stored_count = vectorstore._collection.count()
print(f"Vectorstore created with {stored_count} documents")


Chroma db path notebooks\vector_store\chroma_rag_db
db cleared
Vectorstore created with 14 documents


In [31]:
# Peek at a single stored record to check the embedding dimensionality.
collection = vectorstore._collection
sample_vector_embed = collection.get(include=['embeddings'], limit=1)
print(sample_vector_embed)

first_embedding = sample_vector_embed['embeddings'][0]
print('Dimensions: ', len(first_embedding))

{'ids': ['0233e4dd-4ab5-4fcd-b616-9ec75396a7ce'], 'embeddings': array([[ 0.00643417, -0.01768304, -0.01689518, ..., -0.01734747,
        -0.00097479, -0.01140935]], shape=(1, 3072)), 'documents': None, 'uris': None, 'data': None, 'metadatas': None, 'included': [<IncludeEnum.embeddings: 'embeddings'>]}
Dimensions:  3072


### Visualizing Vectors

In [45]:
# Pull every stored embedding together with its text and metadata.
result = collection.get(include=['embeddings', 'documents', 'metadatas'])
vectors = np.array(result['embeddings'])

print('Embedding shape', vectors.shape)
# NOTE(review): this rebinds `documents` (the list built in the data-loading
# cell) to a list of plain strings. The plotting cell below reads this name,
# so it is kept; re-run the data-loading cell before re-running the splitter.
documents = result['documents']
doc_types = [metadata['doc_type'] for metadata in result['metadatas']]

colors_list = ['red', 'blue', 'green', 'orange', 'purple', 'pink', 'brown', 'gray']
# Sort the distinct doc types so each type gets the same colour on every run
# (set iteration order is not stable), and wrap around the palette so having
# more doc types than colours cannot raise an IndexError.
doc_color_mapping = {
    doc_type: colors_list[i % len(colors_list)]
    for i, doc_type in enumerate(sorted(set(doc_types)))
}

colors = [doc_color_mapping[doc_type] for doc_type in doc_types]

Embedding shape (14, 3072)


In [49]:
# Project the embeddings down to 2D with t-SNE; the fixed random_state keeps
# the layout reproducible between runs.
tsne = TSNE(n_components=2, random_state=42, perplexity=10)
reduced_vectors = tsne.fit_transform(vectors)

# One marker per stored chunk, coloured by doc type; hovering shows the type
# and the first 100 characters of the text.
fig = go.Figure(
    data=[go.Scatter(
        x=reduced_vectors[:, 0],
        y=reduced_vectors[:, 1],
        mode='markers',
        marker=dict(size=5, color=colors),
        text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)],
        hoverinfo='text'
    )]
)

# `scene` only configures 3D plots; for this 2D scatter the axis titles must
# be set on the layout's own x/y axes, otherwise they are never shown.
fig.update_layout(
    title='2D Chroma Vector Store Visualization',
    xaxis_title='x',
    yaxis_title='y',
    width=800,
    height=600,
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show()

### Putting together: RAG using LangChain

In [52]:
# The chain needs three parts: the LLM (`llm`, created earlier), a retriever
# and a memory object.

# Retriever: fetches the documents related to the question from the vector store.
retriever = vectorstore.as_retriever()

# Memory: keeps track of the conversation history.
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True,
)

# Chain: ties the LLM, the retriever and the memory together.
rag_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
)


Please see the migration guide at: https://python.langchain.com/docs/versions/migrating_memory/



In [53]:
# Ask a sample question through the RAG chain and show the answer.
query = "What's the best project in terms of revenue?"
result = rag_chain.invoke(dict(question=query))

for label, value in (('Question:', query), ('Answer:', result['answer'])):
    print(label, value)

Question: What's the best project in terms of revenue?
Answer: The project with the best revenue is DataViz, which has an annual recurring revenue of $7.2 million.


### Gradio with RAG

In [87]:
import gradio as gr

# The Gradio chat gets its own fresh memory so it does not inherit the history
# of the earlier test query. The stdout callback prints the chain's
# intermediate steps.
chat_memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True,
)
chat_rag_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=chat_memory,
    callbacks=[StdOutCallbackHandler()],
)


def chat(message, history):
    """Answer one chat message through the RAG chain.

    `history` is supplied by Gradio but is not used here: the chain's memory
    object already records the conversation.
    """
    return chat_rag_chain.invoke({"question": message})['answer']


view = gr.ChatInterface(chat, type="messages").launch()

* Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.




[1m> Entering new ConversationalRetrievalChain chain...[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the user's question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
# Business Impact: DataViz

## Executive Summary
DataViz has revolutionized how our enterprise clients derive insights from their data, resulting in measurable business value through improved decision-making, operational efficiency, and strategic planning capabilities.

## Key Performance Indicators
- **Client Acquisition**: 42 enterprise clients in first year
- **Revenue**: $7.2M annual recurring revenue
- **Contract Renewal Rate**: 94%
- **Expansion Revenue**: 40% increase from existing clients

## Business Value
- **Decision Latency**: Reduced from weeks to hours
- **Data Utilization**: Increased by 145%
- 

### RAG: Open source LLMs and Embeddings

Instead of relying on paid LLMs for RAG, we can use open-source LLMs and embedding models.

#### Ollama + FAISS (vectorstore)

In [60]:
from langchain_ollama import OllamaEmbeddings, ChatOllama
from langchain.vectorstores import FAISS

In [68]:
# Use the open-source "nomic-embed-text" embedding model, available from the
# Ollama models hub.
ollama_embedding = OllamaEmbeddings(model="nomic-embed-text")

# NOTE: despite the ".pkl" suffix, this path is used as a *folder*: the store
# is saved into it and loaded back from it. The name is kept unchanged so a
# store saved by an earlier run is still found.
faiss_db_path = os.path.join("notebooks", "vector_store", "faiss_rag_db", "faiss.pkl")
print("FAISS db path: ", faiss_db_path)

# Files written by FAISS.save_local with its default index name ("index").
faiss_index_files = [os.path.join(faiss_db_path, name) for name in ("index.faiss", "index.pkl")]

faiss_vectorstore = None

# Look for the saved index files rather than just the folder: an empty folder
# left behind by an interrupted run must not be treated as a valid store.
if all(os.path.isfile(path) for path in faiss_index_files):
    # SECURITY: loading unpickles data from disk. This is acceptable only
    # because the files were written by this notebook; never point this at an
    # untrusted location.
    faiss_vectorstore = FAISS.load_local(faiss_db_path, ollama_embedding, allow_dangerous_deserialization=True)
    print('FAISS db file found, loaded from the file')
else:
    print('FAISS db file not found, creating a new one')
    faiss_vectorstore = FAISS.from_documents(documents=chunks, embedding=ollama_embedding)
    print("FAISS vectorstore created in memory")

    # Create the folder only after the index was built, so a failed build does
    # not leave an empty folder behind.
    os.makedirs(faiss_db_path, exist_ok=True)
    faiss_vectorstore.save_local(faiss_db_path)
    print(f"FAISS vectorstore saved to path: {faiss_db_path}")

FAISS db path:  notebooks\vector_store\faiss_rag_db\faiss.pkl
FAISS db file not found, creating a new one
FAISS vectorstore created in memory
FAISS vectorstore saved to path: notebooks\vector_store\faiss_rag_db\faiss.pkl


In [80]:
# Reach into the docstore's internal `_dict` attribute, which maps document
# ids to the stored Document objects, then show its size and contents.
faiss_collection = vars(faiss_vectorstore.docstore)['_dict']
print(len(faiss_collection))

print(faiss_collection)

14
{'3113b1fd-24cc-438b-ace5-2943b767915d': Document(id='3113b1fd-24cc-438b-ace5-2943b767915d', metadata={'source': 'data\\business\\cloudsync_impact.md', 'doc_type': 'business'}, page_content='# Business Impact: CloudSync\n\n## Executive Summary\nCloudSync has transformed internal collaboration workflows and enhanced security posture, resulting in significant productivity gains and cost reductions across all business units.\n\n## Key Performance Indicators\n- **ROI**: 287% over 18 months\n- **Annual Cost Savings**: $1.2M\n- **Productivity Increase**: 23%\n- **Security Incidents**: Reduced by 62%\n\n## Business Value\n- **Collaboration Enhancement**: Cross-team document sharing increased by 78%\n- **Process Acceleration**: Approval workflows reduced from 5 days to 6 hours\n- **Compliance Adherence**: 100% compliance with industry regulations\n- **Risk Mitigation**: Data loss risk reduced by estimated 85%\n\n## Customer Testimonials\n> "CloudSync has fundamentally changed how our teams 

In [69]:
# Build a RAG chain from the local Ollama chat model and the FAISS store.
ollama_llm = ChatOllama(model="llama3.2")

# Retriever over the FAISS store plus a fresh conversation memory.
ollama_retriever = faiss_vectorstore.as_retriever()
ollama_memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True,
)
ollama_rag_chain = ConversationalRetrievalChain.from_llm(
    llm=ollama_llm,
    retriever=ollama_retriever,
    memory=ollama_memory,
)

# Try the open-source chain with a sample question.
query = "What are the different projects mentioned in the documents?"
result = ollama_rag_chain.invoke(dict(question=query))

for label, value in (('Question:', query), ('Answer:', result['answer'])):
    print(label, value)

Question: What are the different projects mentioned in the documents?
Answer: The documents mention three separate projects:

1. **Project: MicroServices Migration**: A strategic initiative to decompose the company's monolithic legacy application into a modern microservices architecture.
2. **Project: FraudDetection**: An AI-powered system that identifies and prevents fraudulent transactions in real-time, using machine learning models and a Lambda architecture.
3. **Business Impact: CloudSync**: A project that transformed internal collaboration workflows and enhanced security posture, resulting in significant productivity gains and cost reductions.

Additionally, there are two separate business impact documents:

1. **Business Impact: MicroServices Migration**
2. **Business Impact: CloudSync**

Lastly, there is a generic document titled "Future Business Opportunities" which mentions opportunities for the three projects mentioned earlier.


#### RAG without LangChain: Ollama + SentenceTransformer Embedding
Instead of relying on LangChain to build the RAG chain, we can build it ourselves with a small amount of code that handles the following:
1. Interacting with the vector store (ChromaDB in this case).
2. Filling in the context of the user prompt by running a semantic search in ChromaDB for the user's query.
3. Sending the augmented prompt (the user query plus the retrieved context) to the plain LLM chat API to get the chat completion response.