In [None]:
#!pip install gradio

In [10]:
import gradio as gr
import plotly.graph_objects as go
from langchain_chroma import Chroma
from langchain_openai import ChatOpenAI
import openai
import os
from dotenv import load_dotenv

# imports for langchain and Chroma and plotly
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
import numpy as np
from sklearn.manifold import TSNE
import plotly.graph_objects as go
import openai

In [11]:
# Chat model name handed to ChatOpenAI when answering questions.
MODEL = "gpt-4o-mini"
# Directory used as Chroma's persist_directory for the vector store.
db_name = "vector_db"

In [12]:
# Load environment variables (including OPENAI_API_KEY) from a local .env file.
env_path = 'C:/Users/MichaelJWirickJr/keys.env'  # machine-specific; adjust when running elsewhere
load_dotenv(env_path)

# Copy the key onto the openai module as well.
openai.api_key = os.getenv('OPENAI_API_KEY')

# Report whether a key was found; the key itself is never printed.
print(
    "✅ OpenAI API Key loaded successfully."
    if openai.api_key
    else "❌ Error: OpenAI API Key not found. Please check your .env file."
)


✅ OpenAI API Key loaded successfully.


In [13]:
# Source document: the Markdown summaries file that is chunked and embedded below.
# NOTE(review): absolute, machine-specific path — update it when running on another machine.
input_file_path = r'C:/Users/MichaelJWirickJr/Tintinallis_Emergency_Medicine/Section 1_summaries.md'  # file that is loaded as input

print(f"📄 Input File: {input_file_path}")

# Load the Markdown file with LangChain's TextLoader (autodetect_encoding=True).
loader = TextLoader(input_file_path, autodetect_encoding=True)
documents = loader.load()

print(f"✅ Loaded {len(documents)} document(s) from {input_file_path}.")

# Split the loaded document into overlapping chunks; `chunks` is what gets embedded
# into the vector store in a later cell.
text_splitter = CharacterTextSplitter(chunk_size=3000, chunk_overlap=400)
chunks = text_splitter.split_documents(documents)

print(f"✅ Split document into {len(chunks)} chunks.")



📄 Input File: C:/Users/MichaelJWirickJr/Tintinallis_Emergency_Medicine/Section 1_summaries.md
✅ Loaded 1 document(s) from C:/Users/MichaelJWirickJr/Tintinallis_Emergency_Medicine/Section 1_summaries.md.
✅ Split document into 46 chunks.


In [15]:
# Embedding function handed to Chroma whenever the vector store is opened or built.
embeddings = OpenAIEmbeddings()


In [16]:
# If a persisted store already exists at `db_name`, drop its collection first —
# presumably so re-running the notebook does not accumulate duplicate chunks.
if os.path.exists(db_name):
    Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()

In [17]:
# Embed every chunk and build the Chroma vector store, persisted under `db_name`.

vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=db_name)
# NOTE(review): `_collection` is an underscore-prefixed (private) attribute of the
# wrapper; it is used here only to report how many entries were stored.
print(f"Vectorstore created with {vectorstore._collection.count()} documents")

Vectorstore created with 46 documents


In [18]:
# Fetch a single stored embedding and report how many dimensions it has.

# `collection` is reused by the following cell to pull all embeddings for plotting.
collection = vectorstore._collection
sample_embedding = collection.get(limit=1, include=["embeddings"])["embeddings"][0]
dimensions = len(sample_embedding)
print(f"The vectors have {dimensions:,} dimensions")

The vectors have 1,536 dimensions


In [19]:
# Prework: pull embeddings, stored texts and metadata out of the collection in one
# call so the plotting cells below can work from plain Python/NumPy objects.

result = collection.get(include=['embeddings', 'documents', 'metadatas'])
vectors = np.array(result['embeddings'])
# NOTE: this rebinds `documents` (previously the output of loader.load()) to the
# collection's stored documents; the hover labels in display_3d_graph slice these.
documents = result['documents']

In [None]:
# Label every chunk with its `doc_type` metadata value; chunks whose metadata has
# no such key fall back to the section title.
doc_types = [
    metadata.get(
        'doc_type',
        'Section 1: Resuscitative Problems and Techniques'
        ' Advanced Airway Support',
    )
    for metadata in result['metadatas']
]

print(f"✅ Extracted doc_types for {len(doc_types)} chunks.")
doc_types

In [24]:
# In[19]:
import matplotlib.pyplot as plt
import numpy as np

# Distinct doc_type labels, sorted so the label -> colour assignment is stable.
unique_doc_types = sorted({*doc_types})
print(f"Unique doc_types found: {unique_doc_types}")

# Colours come from matplotlib's 'tab20' colormap.
color_palette = plt.get_cmap('tab20')

# doc_type -> colour, keyed by the label's position in the sorted list.
color_map = dict(
    (label, color_palette(position))
    for position, label in enumerate(unique_doc_types)
)

# One colour per chunk, in the same order as doc_types.
colors = [color_map[label] for label in doc_types]

# Plotly takes CSS-style colour strings, so convert float RGBA colours to text.
def rgba_to_hex(rgba):
    """Format a float RGBA colour as a CSS ``rgba(r, g, b, a)`` string.

    Despite the (kept for compatibility) name, the result is not hexadecimal.

    Args:
        rgba: Sequence of four numbers (red, green, blue, alpha). The colour
            channels are scaled by 255; alpha is passed through unchanged.

    Returns:
        A string such as ``'rgba(31, 119, 180, 1.0)'``.
    """
    # Round rather than truncate: a channel stored as k/255 can come back as
    # k - 1e-15 after the multiplication, and int() alone would then yield k - 1.
    red, green, blue = (int(round(channel * 255)) for channel in rgba[:3])
    return f'rgba({red}, {green}, {blue}, {rgba[3]})'

# Plotly-ready colour string for every chunk (same order as doc_types).
# The palette, colour map and per-chunk colours are already built above, so they
# are not recomputed here.
colors_hex = [rgba_to_hex(color) for color in colors]

# Project the embeddings down to 3-D with t-SNE for plotting.
# scikit-learn requires perplexity < n_samples; cap it so small corpora (30 or
# fewer chunks) do not raise, while larger ones keep the library default of 30.
tsne = TSNE(
    n_components=3,
    random_state=42,
    perplexity=min(30.0, max(1.0, len(vectors) - 1)),
)
reduced_vectors = tsne.fit_transform(vectors)


Unique doc_types found: ['Section 1: Resuscitative Problems and Techniques Advanced Airway Support']


In [25]:
# Build the interactive 3-D scatter plot of the reduced embeddings.
def display_3d_graph():
    """Return a Plotly figure with one marker per chunk.

    Reads the module-level ``reduced_vectors``, ``colors_hex``, ``doc_types``
    and ``documents``; takes no arguments.

    Returns:
        plotly.graph_objects.Figure sized 800x600.
    """
    # Hover text: the chunk's doc_type plus the first 100 characters of its text.
    hover_labels = [
        f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)
    ]

    points = go.Scatter3d(
        x=reduced_vectors[:, 0],
        y=reduced_vectors[:, 1],
        z=reduced_vectors[:, 2],
        mode='markers',
        marker={'size': 5, 'color': colors_hex, 'opacity': 0.8},
        text=hover_labels,
        hoverinfo='text',
    )

    fig = go.Figure(data=[points])
    fig.update_layout(
        title='3D Chroma Vector Store Visualization',
        scene={
            'xaxis_title': 'Dimension 1',
            'yaxis_title': 'Dimension 2',
            'zaxis_title': 'Dimension 3',
        },
        width=800,
        height=600,
        margin={'r': 20, 'b': 10, 'l': 10, 't': 40},
    )
    return fig

In [64]:
# In[19]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from datetime import datetime
import csv
import os

# Distinct doc_type labels, sorted so the label -> colour assignment is stable.
unique_doc_types = sorted({*doc_types})
print(f"Unique doc_types found: {unique_doc_types}")

# Colours come from matplotlib's 'tab20' colormap.
color_palette = plt.get_cmap('tab20')

# doc_type -> colour, keyed by the label's position in the sorted list.
color_map = dict(
    (label, color_palette(position))
    for position, label in enumerate(unique_doc_types)
)

# One colour per chunk, in the same order as doc_types.
colors = [color_map[label] for label in doc_types]

# Plotly takes CSS-style colour strings, so convert float RGBA colours to text.
def rgba_to_hex(rgba):
    """Format a float RGBA colour as a CSS ``rgba(r, g, b, a)`` string.

    Despite the (kept for compatibility) name, the result is not hexadecimal.

    Args:
        rgba: Sequence of four numbers (red, green, blue, alpha). The colour
            channels are scaled by 255; alpha is passed through unchanged.

    Returns:
        A string such as ``'rgba(31, 119, 180, 1.0)'``.
    """
    # Round rather than truncate: a channel stored as k/255 can come back as
    # k - 1e-15 after the multiplication, and int() alone would then yield k - 1.
    red, green, blue = (int(round(channel * 255)) for channel in rgba[:3])
    return f'rgba({red}, {green}, {blue}, {rgba[3]})'

# Plotly-ready colour string for every chunk (same order as doc_types).
colors_hex = [rgba_to_hex(color) for color in colors]

# Module-level slot filled in by compute_reduced_vectors() and read by display_3d_graph().
reduced_vectors = None

def compute_reduced_vectors():
    """Project the module-level ``vectors`` to 3-D with t-SNE.

    Stores the projection in the global ``reduced_vectors`` and also returns it,
    so callers can use the value directly instead of relying on the global.

    Returns:
        The array produced by ``TSNE.fit_transform(vectors)``.
    """
    global reduced_vectors
    # scikit-learn requires perplexity < n_samples; cap it so small corpora (30
    # or fewer chunks) do not raise, while larger ones keep the default of 30.
    perplexity = min(30.0, max(1.0, len(vectors) - 1))
    tsne = TSNE(n_components=3, random_state=42, perplexity=perplexity)
    reduced_vectors = tsne.fit_transform(vectors)
    return reduced_vectors

# Compute the projection once, before the display function is defined.
compute_reduced_vectors()

# Build the interactive 3-D scatter plot of the reduced embeddings.
def display_3d_graph():
    """Return a Plotly figure with one marker per chunk.

    Reads the module-level ``reduced_vectors``, ``colors_hex``, ``doc_types``
    and ``documents``; takes no arguments. No explicit width/height is set.

    Returns:
        plotly.graph_objects.Figure
    """
    # Hover text: the chunk's doc_type plus the first 100 characters of its text.
    hover_labels = [
        f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)
    ]

    points = go.Scatter3d(
        x=reduced_vectors[:, 0],
        y=reduced_vectors[:, 1],
        z=reduced_vectors[:, 2],
        mode='markers',
        marker={'size': 5, 'color': colors_hex, 'opacity': 0.8},
        text=hover_labels,
        hoverinfo='text',
    )

    fig = go.Figure(data=[points])
    fig.update_layout(
        title='3D Chroma Vector Store Visualization',
        scene={
            'xaxis_title': 'Dimension 1',
            'yaxis_title': 'Dimension 2',
            'zaxis_title': 'Dimension 3',
        },
    )
    return fig
    
# Define a function to save conversation to CSV
def save_to_csv(query, response):
    """Append one question/answer exchange to ``conversation_history.csv``.

    The file lives in the current working directory. A header row
    (``Timestamp, Query, Response``) is written first whenever the file is
    missing or empty, so the log always starts with column names.

    Args:
        query: The user's question.
        response: The answer text that was returned to the user.

    Returns:
        None. The only effect is the appended row(s).
    """
    filename = 'conversation_history.csv'
    timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    # Treat an existing zero-byte file like a missing one; otherwise the header
    # would never be written for it.
    needs_header = not os.path.exists(filename) or os.path.getsize(filename) == 0

    # Single append-mode open: creates the file if needed and never truncates.
    # newline='' lets the csv module control line endings (multi-line answers
    # are quoted correctly).
    with open(filename, 'a', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        if needs_header:
            writer.writerow(['Timestamp', 'Query', 'Response'])
        writer.writerow([timestamp, query, response])

# Define a function to handle Q&A via RAG
def answer_query(query):
    """Answer a question using retrieval-augmented generation.

    Retrieves the chunks most similar to the question from ``vectorstore``,
    sends them together with the question to the chat model named by
    ``MODEL``, logs the exchange via ``save_to_csv`` and returns the answer.

    Args:
        query: The user's question. Any value is coerced to ``str``; ``None``
            and whitespace-only input are treated as "no question".

    Returns:
        The model's answer text, or a short prompt asking for a question when
        the input is empty (in that case nothing is retrieved, sent or logged).
    """
    # Normalise the input to a single trimmed string.
    query = "" if query is None else str(query).strip()

    # Skip the embedding and chat API calls, and the CSV row, for an empty
    # submission (e.g. clicking "Ask" with a blank textbox).
    if not query:
        return "Please enter a question."

    # Perform similarity search directly on the query text
    search_results = vectorstore.similarity_search(query)

    # Concatenate search results for context
    context = " ".join(doc.page_content for doc in search_results)

    # Generate a response with the chat model, prefixing the retrieved context
    chat = ChatOpenAI(model_name=MODEL)
    response = chat.invoke(context + "\n\nUser Query: " + query)

    # Save conversation to CSV
    save_to_csv(query, response.content)

    return response.content

        # Define the Gradio interface
# Two-tab Gradio app: a chat tab wired to answer_query and a tab showing the 3-D plot.
with gr.Blocks() as app:
    gr.Markdown("# 3D Chroma Vector Store Visualization with Chat Interface")

    # Chat tab: the textbox value is passed to answer_query when "Ask" is clicked,
    # and the returned string is shown in the read-only response box.
    with gr.Tab("Chat"):
        query_input = gr.Textbox(label="Enter your question:")
        response_output = gr.Textbox(label="Response", interactive=False)
        query_button = gr.Button("Ask")
        query_button.click(fn=answer_query, inputs=query_input, outputs=response_output)

    # Graph tab: the function object itself (not a figure) is handed to gr.Plot —
    # presumably Gradio calls it to produce the figure when the page loads; confirm.
    with gr.Tab("3D Graph"):
        gr.Plot(display_3d_graph)
        


Unique doc_types found: ['Section 1: Resuscitative Problems and Techniques Advanced Airway Support']


In [66]:
# NOTE(review): share=True publishes the app on a public gradio.live URL (see the
# captured output). Anyone with the link can send questions that are answered with
# this notebook's OpenAI key — confirm that public sharing is intended.
app.launch(share=True)


Rerunning server... use `close()` to stop if you need to change `launch()` parameters.
----
* Running on public URL: https://14fd2b2167e4d6b62f.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [60]:
# Shut the running Gradio server down when finished.
app.close()

Closing server running on port: 7873


NameError: name 'save_to_csv' is not defined