In [7]:
import pandas as pd
import os
import openai
from openai import OpenAI
import numpy as np
import json

# Define a Document class to mimic LangChain's Document structure
class SimpleDocument:
    """Lightweight text container: a chunk of content plus a metadata dict."""

    def __init__(self, page_content, metadata=None):
        # Falsy metadata (None or an empty mapping) is replaced by a fresh
        # dict, so instances never share a default.
        self.page_content, self.metadata = page_content, (metadata if metadata else {})

# 1. Load your processed chunks from CSV
# Read the chunk table and wrap every row in a SimpleDocument: the row's
# 'chunk_text' becomes the content, and 'week', 'source' and 'slide_number'
# are carried along as metadata.
df = pd.read_csv("/Users/arjunasudani/Desktop/dsudata/chatbot_data.csv")
documents = [
    SimpleDocument(
        page_content=record['chunk_text'],
        metadata={key: record[key] for key in ('week', 'source', 'slide_number')},
    )
    for _, record in df.iterrows()
]
print(f"Loaded {len(documents)} documents from CSV.")

# 2. Setup OpenAI client. The API key is read from the OPENAI_API_KEY
#    environment variable so that no secret is stored in the source file;
#    export it in the shell (or notebook environment) before running.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# 3. Function to create embeddings using OpenAI
def get_embedding(text, model="text-embedding-ada-002"):
    """Request an embedding for ``text`` from the OpenAI embeddings endpoint.

    Args:
        text: The input passed to the embeddings API.
        model: Name of the embedding model to use.

    Returns:
        The ``embedding`` attribute of the first entry in the response data.
    """
    return client.embeddings.create(input=text, model=model).data[0].embedding

# 4. Load or create embeddings store
embeddings_path = "/Users/arjunasudani/Desktop/dsudata/embeddings_store.json"
embeddings_store = []
if not os.path.exists(embeddings_path):
    # No cache on disk yet: embed every document, then persist the result.
    print("Creating embeddings for all documents...")
    total = len(documents)
    for position, doc in enumerate(documents, start=1):
        # Progress line for the 1st, 11th, 21st, ... document.
        if position % 10 == 1:
            print(f"Processing document {position}/{total}...")
        embeddings_store.append({
            "document": doc,
            "embedding": get_embedding(doc.page_content),
        })
    # Save embeddings to disk for future runs. SimpleDocument objects are
    # flattened to plain dicts so the payload is JSON-serialisable.
    serializable = [
        {
            "page_content": entry["document"].page_content,
            "metadata": entry["document"].metadata,
            "embedding": entry["embedding"],
        }
        for entry in embeddings_store
    ]
    with open(embeddings_path, 'w') as f:
        json.dump(serializable, f)
    print(f"Embeddings saved to {embeddings_path}")
else:
    # Cache present: rebuild SimpleDocument objects from the stored dicts.
    print("Loading embeddings from disk...")
    with open(embeddings_path, 'r') as f:
        cached = json.load(f)
        embeddings_store.extend(
            {
                "document": SimpleDocument(
                    page_content=entry["page_content"],
                    metadata=entry["metadata"],
                ),
                "embedding": entry["embedding"],
            }
            for entry in cached
        )

# 5. Cosine similarity function for comparing embeddings
def cosine_similarity(a, b):
    """Return the cosine similarity between two vectors.

    Args:
        a: First vector (a sequence of numbers or a NumPy array).
        b: Second vector, of the same length as ``a``.

    Returns:
        ``dot(a, b) / (|a| * |b|)`` as a float, or ``0.0`` when either
        vector has zero magnitude.
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        # A zero-length vector has no direction. Dividing would give NaN
        # (with a RuntimeWarning), which breaks sorting and the ">"
        # comparisons used by the search functions.
        return 0.0
    return float(np.dot(a, b) / denominator)

# 6. Basic similarity search (kept for reference)
def similarity_search(query, k=3):
    """Return the ``k`` stored documents most similar to ``query``.

    The query is embedded with ``get_embedding`` and compared by cosine
    similarity against every entry in ``embeddings_store``.

    Args:
        query: Text to search for.
        k: Number of documents to keep from the top of the ranking.

    Returns:
        A list of documents ordered from most to least similar.
    """
    query_vector = get_embedding(query)
    scored = [
        (cosine_similarity(query_vector, entry["embedding"]), entry["document"])
        for entry in embeddings_store
    ]
    # Stable descending sort on the score only; ties keep store order.
    scored.sort(key=lambda pair: pair[0], reverse=True)
    return [document for _, document in scored[:k]]

# 7. MMR search for optimal and diverse retrieval
def mmr_search(query, k=5, lambda_param=0.5, fetch_k=10):
    """Retrieve documents for ``query`` using Maximal Marginal Relevance.

    The ``fetch_k`` entries of ``embeddings_store`` most similar to the query
    form the candidate pool. The most similar candidate is taken first; each
    following pick maximises

        lambda_param * sim(candidate, query)
            - (1 - lambda_param) * max(sim(candidate, already_selected))

    so that later picks trade relevance against redundancy.

    Args:
        query: Text to search for.
        k: Maximum number of documents to return. Values <= 0 return an
            empty list without calling the embeddings API.
        lambda_param: Weight of query relevance versus diversity; 1.0 scores
            by relevance only, 0.0 by dissimilarity to earlier picks only.
        fetch_k: Size of the candidate pool taken from the top of the
            similarity ranking.

    Returns:
        A list of at most ``k`` documents in selection order.
    """
    if k <= 0:
        # Nothing was requested. Previously the seed pick below ignored k and
        # one document was returned anyway.
        return []

    query_embedding = get_embedding(query)
    scored = [
        {
            "document": item["document"],
            "embedding": item["embedding"],
            "similarity": cosine_similarity(query_embedding, item["embedding"]),
        }
        for item in embeddings_store
    ]
    scored.sort(key=lambda entry: entry["similarity"], reverse=True)
    candidates = scored[:fetch_k]
    if not candidates:
        return []

    # Seed the result with the single most relevant candidate.
    seed = candidates.pop(0)
    selected = [seed["document"]]
    selected_embeddings = [seed["embedding"]]

    while len(selected) < k and candidates:
        best_score = -np.inf
        best_idx = -1

        for idx, candidate in enumerate(candidates):
            # Redundancy = similarity to the closest already-selected chunk.
            redundancy = max(
                cosine_similarity(candidate["embedding"], chosen_emb)
                for chosen_emb in selected_embeddings
            )
            score = (
                lambda_param * candidate["similarity"]
                - (1 - lambda_param) * redundancy
            )
            if score > best_score:
                best_score = score
                best_idx = idx

        if best_idx == -1:
            # No candidate beat -inf (e.g. every score compared as NaN);
            # stop rather than loop forever.
            break

        chosen = candidates.pop(best_idx)
        selected.append(chosen["document"])
        selected_embeddings.append(chosen["embedding"])

    return selected

# 8. RAG Chatbot implementation using MMR search
def rag_chatbot(query):
    """Answer ``query`` with gpt-3.5-turbo using chunks retrieved by MMR search.

    Args:
        query: The user's question as plain text.

    Returns:
        The model's reply with surrounding whitespace stripped, or an empty
        string if the response message carries no text content.
    """
    # Retrieve more documents (k=5) using MMR search for better context
    docs = mmr_search(query, k=5, lambda_param=0.5, fetch_k=10)
    # Each chunk is prefixed with its 'week' metadata value in brackets.
    context = "\n\n".join(f"[{doc.metadata['week']}] {doc.page_content}" for doc in docs)

    # Revised prompt instructs the assistant to use all available DSU information
    # NOTE(review): the prompt asks for an answer based on the context but does
    # not say what to do when the context lacks the answer - consider adding
    # an explicit fallback instruction.
    prompt = f"""
You are a helpful assistant for the UCLA Data Science Union (DSU).
Answer the following question based on the context provided. Use all available DSU information to provide a complete answer.

Context:
{context}

Question: {query}

Answer:
"""
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a helpful assistant for the UCLA Data Science Union."},
            {"role": "user", "content": prompt}
        ],
        temperature=0.5
    )
    # The message content is optional in the API response; fall back to an
    # empty string so .strip() cannot raise AttributeError on None.
    content = response.choices[0].message.content
    return (content or "").strip()

# 9. Main execution: Ask 10 testing questions and output only the question and the chatbot's answer.
if __name__ == "__main__":
    # Fixed set of sample questions; each is printed followed by the
    # chatbot's answer and a blank line.
    test_questions = [
        "When is the Friendsgiving event?",
        "What sports activities are available?",
        "Tell me about the DSU coffee chats.",
        "When is the next general meeting?",
        "What guest speakers are scheduled for this semester?",
        "What workshops will DSU host?",
        "Are there any upcoming networking events?",
        "How can I join DSU activities?",
        "What are the DSU membership benefits?",
        "Where can I find DSU event details online?",
    ]

    for sample in test_questions:
        print(f"QUESTION: {sample}")
        # The question is echoed before the (slow) API round-trip happens.
        print(f"ANSWER: {rag_chatbot(sample)}\n")


Loaded 68 documents from CSV.
Loading embeddings from disk...
QUESTION: When is the Friendsgiving event?
ANSWER: The Friendsgiving event typically takes place during the Thanksgiving holiday season, which falls on the fourth Thursday of November each year in the United States. In 2021, Thanksgiving falls on Thursday, November 25th. The Friendsgiving event may be scheduled around this time to allow people to come together and celebrate with friends before the official Thanksgiving day. It is a time for friends to gather, share a meal, and express gratitude for each other's company.

QUESTION: What sports activities are available?
ANSWER: At the UCLA Data Science Union (DSU), there are various sports activities available for members to participate in. Some of the sports activities that may be offered include:

1. Basketball
2. Soccer
3. Volleyball
4. Tennis
5. Ultimate Frisbee
6. Running or jogging groups
7. Yoga or Pilates classes
8. Hiking or outdoor adventure trips

Participating in s