# Email Search AI – Generative Search System

This notebook implements a robust generative search system for organizational email data.
The system follows a three-layer architecture:
1. Embedding Layer
2. Search Layer
3. Generation Layer

The goal is to retrieve and summarize decisions, strategies, and timelines from large email corpora.


In [None]:
import os
import re
import hashlib
from typing import Dict

import pandas as pd
import chromadb
from chromadb.config import Settings

from sentence_transformers import SentenceTransformer, CrossEncoder
from openai import OpenAI

In [None]:
# Read the raw email export and discard rows that have no body text, since
# there is nothing to clean, chunk or embed for them.
emails_df = pd.read_csv("email_thread_details.csv").dropna(subset=["body"])

print("Emails loaded:", len(emails_df))
emails_df.head()

In [None]:
# Read the per-thread summaries. Rows without a summary are dropped, mirroring
# how emails without a body are handled, so that no missing value is later
# passed to the embedding model or stored as a document.
summary_df = pd.read_csv("email_thread_summaries.csv")
summary_df = summary_df.dropna(subset=["summary"])
print("Thread summaries loaded:", len(summary_df))
summary_df.head()

In [None]:
# Reply attribution such as "On Tue, Jan 2, 2024 Bob <b@x.com> wrote:".
# Anchored to the start of a line (optionally behind ">" quote markers) and
# non-greedy, so it cannot swallow ordinary sentences that precede it.
_ATTRIBUTION_LINE = re.compile(r"^[ \t>]*On\b.*?\bwrote:", re.MULTILINE)

# Quoted header lines ("From: ...", "Sent: ...", "To: ...", "Subject: ...").
# Anchored to the start of a line so that a phrase like "escalate To: management"
# in the middle of a sentence is left alone.
_HEADER_LINE = re.compile(r"^[ \t>]*(From|Sent|To|Subject):.*$", re.MULTILINE)

_WHITESPACE_RUN = re.compile(r"\s+")


def clean_email(text):
    """Strip quoted-reply boilerplate from an email body and normalise spacing.

    Removes, in this order:
      1. reply attribution lines ("On <date> <sender> wrote:"),
      2. quoted header lines starting with From:, Sent:, To: or Subject:,
    then collapses every run of whitespace (including newlines) into a single
    space and trims the ends.

    Args:
        text: Raw email body as a string.

    Returns:
        The cleaned, single-line body.
    """
    text = _ATTRIBUTION_LINE.sub("", text)
    text = _HEADER_LINE.sub("", text)
    text = _WHITESPACE_RUN.sub(" ", text)
    return text.strip()

# Store the cleaned text next to the raw body; the chunking step reads this column.
emails_df["clean_body"] = emails_df["body"].map(clean_email)

In [None]:
def chunk_email(text, chunk_size=300, overlap=50):
    """Split ``text`` into overlapping chunks of whitespace-separated words.

    Consecutive chunks share ``overlap`` words. Chunking stops as soon as a
    chunk reaches the end of the text, so no trailing chunk consisting solely
    of words already present in the previous chunk is produced.

    Args:
        text: Text to split.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words repeated between neighbouring chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        List of chunk strings (empty when ``text`` contains no words).

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range. Without
            this check a non-positive step would make the loop never finish.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        end = start + chunk_size
        chunks.append(" ".join(words[start:end]))
        if end >= len(words):
            # This chunk already covers the tail; another window would only
            # repeat the overlap words.
            break
    return chunks

In [None]:
# Flatten every email into word chunks. `documents` and `metadatas` are kept
# index-aligned: metadatas[k] describes the email that documents[k] came from.
documents, metadatas = [], []

for _, row in emails_df.iterrows():
    # Every chunk of one email carries the same metadata, so assemble it once
    # per row and give each chunk its own copy.
    email_meta = {
        "thread_id": row["thread_id"],
        "subject": row["subject"],
        "timestamp": row["timestamp"],
        "from": row["from"],
        "to": row["to"],
        "doc_type": "email",
    }
    for chunk in chunk_email(row["clean_body"]):
        documents.append(chunk)
        metadatas.append(dict(email_meta))

In [None]:
# Load the sentence-embedding model and encode every email chunk in
# `documents`. The same model object is reused further down to embed the
# thread summaries and the search queries.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
email_embeddings = embedding_model.encode(documents, show_progress_bar=True)

In [None]:
# Thread summaries are indexed alongside the email chunks. Their metadata is
# tagged with doc_type "thread_summary" so they can be told apart from emails.
summary_docs = list(summary_df["summary"])
summary_metas = [
    dict(thread_id=thread, doc_type="thread_summary")
    for thread in summary_df["thread_id"].tolist()
]

summary_embeddings = embedding_model.encode(summary_docs, show_progress_bar=True)

In [None]:
# Open an on-disk Chroma database under ./email_db. PersistentClient is used
# because passing only `persist_directory` through Settings does not by itself
# switch the client to persistent storage.
client = chromadb.PersistentClient(path="./email_db")

collection = client.get_or_create_collection("email_search_ai")

# Email chunks come first, thread summaries after; ids follow that order.
all_documents = documents + summary_docs
all_metadatas = metadatas + summary_metas
# Hand Chroma plain lists of floats rather than NumPy rows.
# NOTE(review): assumes encode() returned NumPy arrays (its default) - confirm.
all_embeddings = email_embeddings.tolist() + summary_embeddings.tolist()
all_ids = [f"doc_{i}" for i in range(len(all_documents))]

# upsert (rather than add) keeps re-runs against the persisted collection from
# colliding with ids written by an earlier run. Records are written in slices
# so that a large corpus does not exceed the per-call batch limit.
WRITE_BATCH_SIZE = 1000
for batch_start in range(0, len(all_documents), WRITE_BATCH_SIZE):
    batch = slice(batch_start, batch_start + WRITE_BATCH_SIZE)
    collection.upsert(
        ids=all_ids[batch],
        documents=all_documents[batch],
        embeddings=all_embeddings[batch],
        metadatas=all_metadatas[batch],
    )

In [None]:
# In-memory memo of search results, keyed by the digest produced by cache_key().
cache: Dict[str, tuple] = dict()


def cache_key(q):
    """Return the hexadecimal MD5 digest of the string ``q``."""
    digest = hashlib.md5()
    digest.update(q.encode())
    return digest.hexdigest()

def search(query, top_k=10):
    """Return the ``top_k`` nearest documents for ``query`` from the collection.

    Results are memoised in the module-level ``cache``. The cache key covers
    both the query text and ``top_k``, so asking for a different number of
    results never returns a list of the wrong length from an earlier call.

    Args:
        query: Natural-language search query.
        top_k: Number of nearest neighbours to request.

    Returns:
        Tuple ``(documents, metadatas)`` - two parallel lists for the hits.
    """
    key = cache_key(f"{top_k}:{query}")
    if key in cache:
        return cache[key]

    # Pass the embedding as a plain list of floats rather than a NumPy row.
    # NOTE(review): assumes encode() returns a NumPy array (its default) - confirm.
    query_vector = embedding_model.encode([query])[0].tolist()
    results = collection.query(query_embeddings=[query_vector], n_results=top_k)
    # Only one query was sent, so take the first (and only) result list.
    cache[key] = (results["documents"][0], results["metadatas"][0])
    return cache[key]

In [None]:
# Cross-encoder used by rerank() to score (query, document) pairs.
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

def rerank(query, docs, metas, top_n=3):
    """Re-score retrieved documents against ``query`` and keep the best ones.

    Args:
        query: The search query.
        docs: Candidate document texts.
        metas: Metadata entries parallel to ``docs``.
        top_n: Number of highest-scoring candidates to return.

    Returns:
        List of ``(document, metadata, score)`` tuples sorted by descending
        score, truncated to ``top_n``. Empty when there are no candidates.
    """
    if not docs:
        # Nothing was retrieved; skip the model call instead of handing it an
        # empty batch.
        return []

    scores = reranker.predict([(query, doc) for doc in docs])
    ranked = sorted(
        zip(docs, metas, scores),
        key=lambda candidate: candidate[2],
        reverse=True,
    )
    return ranked[:top_n]

In [None]:
# OpenAI API client used by generate_answer(). The key is read from the
# OPENAI_API_KEY environment variable (os.getenv yields None when it is unset).
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

def build_prompt(query, ranked):
    """Assemble the LLM prompt from the reranked hits.

    Each item of ``ranked`` is a ``(document, metadata, score)`` triple; the
    score is not used. Hits whose metadata ``doc_type`` equals
    "thread_summary" are listed verbatim under THREAD SUMMARIES. Every other
    hit is listed under EMAIL EVIDENCE, prefixed with its timestamp, sender
    and recipient.
    """
    summary_lines = [
        text for text, info, _score in ranked
        if info["doc_type"] == "thread_summary"
    ]
    evidence_lines = [
        f"{info['timestamp']} | {info['from']} → {info['to']}: {text}"
        for text, info, _score in ranked
        if info["doc_type"] != "thread_summary"
    ]
    summaries_block = "\n".join(summary_lines)
    evidence_block = "\n".join(evidence_lines)

    return f"""
You are an enterprise email analysis assistant.

THREAD SUMMARIES:
{summaries_block}

EMAIL EVIDENCE:
{evidence_block}

QUESTION:
{query}

ANSWER (fact-based, concise):
"""

def generate_answer(query, ranked):
    """Ask the chat model to answer ``query`` using the reranked evidence.

    The prompt comes from build_prompt() and is sent as a single user message
    to "gpt-4o-mini" with temperature 0. Returns the text content of the
    first choice in the response.
    """
    user_message = {"role": "user", "content": build_prompt(query, ranked)}
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[user_message],
        temperature=0,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [None]:
# Sample questions run through the full search -> rerank -> generate pipeline
# in the cell that follows.
queries = [
    "What decisions were made about Q3 marketing strategy?",
    "Was budget approval discussed for Project Atlas?",
    "What timelines were agreed upon for product launch?"
]

In [None]:
# Run each sample question end to end: retrieve candidates, rerank them,
# print the kept hits with their scores, then print the generated answer.
for question in queries:
    retrieved_docs, retrieved_metas = search(question)
    top_hits = rerank(question, retrieved_docs, retrieved_metas)

    print("\nQUERY:", question)
    print("\nTop 3 Search Results:")
    for position, hit in enumerate(top_hits, start=1):
        _, hit_meta, hit_score = hit
        print(f"[{position}] ({hit_meta['doc_type']}) Score: {hit_score:.2f}")

    print("\nFinal Answer:")
    print(generate_answer(question, top_hits))