In [2]:
# Core LangChain components
!pip install langchain langchain-community faiss-cpu openai tiktoken pandas

# Vector store
!pip install faiss-cpu

# Embeddings + LLM
!pip install openai tiktoken

# Data handling
!pip install pandas



In [None]:
import pandas as pd
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [None]:
import os
from getpass import getpass

# Make the OpenAI API key available through the OPENAI_API_KEY environment
# variable without ever writing the secret into the notebook.
# A key that is already exported is reused as-is; the prompt only appears when
# the variable is unset, empty, or still holds the old placeholder text.
if os.environ.get("OPENAI_API_KEY", "") in ("", "YOUR_API_KEY_HERE"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [None]:
# Read both CSV inputs (paths are relative to the working directory):
# first the email data, then the thread summaries.
emails_df, summaries_df = (
    pd.read_csv(csv_name) for csv_name in ("emails.csv", "summaries.csv")
)

# Left join on thread_id: every row of emails_df is kept, and the summary
# columns are attached wherever summaries_df has a matching thread_id.
merged_df = emails_df.merge(summaries_df, how="left", on="thread_id")

In [None]:
# Build one LangChain Document per row of merged_df. The text block carries
# everything that should be searchable; the metadata carries the fields that
# are reported back as sources.
documents = []

for _, row in merged_df.iterrows():
    # merged_df comes from a left merge, so a thread without a summary row has
    # NaN in `summary` (the key exists, so the `.get` default never applies).
    # Normalise NaN to an empty string so the text does not contain "nan".
    summary = row.get('summary', '')
    if pd.isna(summary):
        summary = ''

    # Combine all relevant information into a single text block
    content = f"""
    Thread ID: {row['thread_id']}
    Subject: {row['subject']}
    From: {row['from']}
    To: {row['to']}
    Timestamp: {row['timestamp']}
    Summary: {summary}
    
    Email Body:
    {row['body']}
    """

    # Create a LangChain Document with metadata
    doc = Document(
        page_content=content,
        metadata={
            "thread_id": row["thread_id"],
            "subject": row["subject"],
            "from": row["from"],
            "timestamp": row["timestamp"]
        }
    )

    documents.append(doc)

In [None]:
# Chunking settings: target chunk size of 800 with an overlap of 100 between
# neighbouring chunks (units are whatever the splitter's length function
# measures — presumably characters; confirm against the splitter docs).
splitter_options = {"chunk_size": 800, "chunk_overlap": 100}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Break every document into smaller chunks before embedding.
split_docs = text_splitter.split_documents(documents)

In [None]:
# Embedding model, constructed with its default settings.
embeddings = OpenAIEmbeddings()

# Embed the chunks and index them in a FAISS vector store.
vectorstore = FAISS.from_documents(documents=split_docs, embedding=embeddings)

In [None]:
# Retriever configuration: similarity search with k=4.
retriever_options = {
    "search_type": "similarity",
    "search_kwargs": {"k": 4},
}

# Expose the vector store as a retriever for the QA chain.
retriever = vectorstore.as_retriever(**retriever_options)

In [None]:
# Language model used to write the answer, with temperature fixed at 0.
llm = OpenAI(temperature=0)

# RetrievalQA chain wiring the LLM to the retriever. The "stuff" chain type is
# used, and source documents are returned alongside the answer so they can be
# listed afterwards.
chain_options = {
    "chain_type": "stuff",
    "retriever": retriever,
    "return_source_documents": True,
}
qa_chain = RetrievalQA.from_chain_type(llm=llm, **chain_options)

In [None]:
# Question to answer from the indexed email threads
query = "What decisions were made regarding budget approval?"

# Run the question through the RAG pipeline.
# `invoke` replaces calling the chain object directly (`qa_chain(query)`),
# which LangChain has deprecated. RetrievalQA takes its input under the
# "query" key; the returned dict has the answer under "result" and, because
# the chain was built with return_source_documents=True, the retrieved chunks
# under "source_documents".
result = qa_chain.invoke({"query": query})

# Print the generated answer
print("Answer:")
print(result["result"])

In [None]:
# List the metadata of every chunk the chain returned as a source, one
# labelled line per field, with a 50-dash rule after each chunk.
source_fields = (
    ("Thread ID", "thread_id"),
    ("Subject", "subject"),
    ("From", "from"),
    ("Timestamp", "timestamp"),
)
divider = "-" * 50

print("\nSources:\n")

for source in result["source_documents"]:
    for label, key in source_fields:
        print(f"{label}: {source.metadata[key]}")
    print(divider)