In [1]:
!pip install sentence-transformers faiss-cpu --quiet



[notice] A new release of pip is available: 24.3.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Load the email export and build a single searchable text column from
# subject + body (missing values become empty strings). Only the combined
# text and the tag are kept; the tag is reported back by `search`.
df = (
    pd.read_csv("../data/large_dataset.csv.txt")
    .assign(
        email_text=lambda frame: frame["subject"].fillna("") + " " + frame["body"].fillna("")
    )
    .loc[:, ["email_text", "tag"]]
)

df.head()


Unnamed: 0,email_text,tag
0,Unable to access shared mailbox I am getting a...,access_issue
1,Rule not triggering Our auto-assignment rule i...,workflow_issue
2,Email threads not merging Two replies from the...,threading_issue
3,Tag suggestions incorrect Tag suggestions are ...,tagging_accuracy
4,Drafts disappearing Draft replies disappear wh...,ui_bug


In [5]:
# Instantiate the sentence-embedding model used for both corpus and queries.
model = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2")
print("Embedding model loaded!")


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Embedding model loaded!


In [6]:
# Embed every email once up front. Embeddings are normalised so that the
# inner-product index built from them scores by cosine similarity.
corpus = df["email_text"].tolist()
corpus_embeddings = model.encode(
    corpus,
    convert_to_numpy=True,
    normalize_embeddings=True,
)

# Displayed as (number of emails, embedding width).
corpus_embeddings.shape


(60, 384)

In [7]:
# Build an inner-product index sized to the embedding width; with normalised
# vectors the inner product equals cosine similarity.
dimension = corpus_embeddings.shape[-1]
index = faiss.IndexFlatIP(dimension)
index.add(corpus_embeddings)
print("FAISS index created!")


FAISS index created!


In [8]:
def search(query, top_k=3):
    """Return the indexed emails most similar to ``query``.

    The query is embedded with the notebook-level ``model`` (normalised, so the
    inner-product scores returned by ``index`` are cosine similarities) and
    each hit is mapped back to ``corpus`` / ``df`` by row position.

    Args:
        query: Free-text search string.
        top_k: Maximum number of hits to return.

    Returns:
        A list of dicts with keys ``"email"``, ``"score"`` and ``"tag"``, in
        the order returned by the index. The list may be shorter than
        ``top_k`` and is empty when ``top_k <= 0`` or the index holds no
        vectors.
    """
    # Never request more neighbours than the index holds: FAISS pads missing
    # results with label -1, which would silently alias corpus[-1].
    k = min(int(top_k), index.ntotal)
    if k <= 0:
        return []

    query_embedding = model.encode([query], convert_to_numpy=True, normalize_embeddings=True)
    scores, indices = index.search(query_embedding, k)

    results = []
    for idx, score in zip(indices[0], scores[0]):
        idx = int(idx)
        if idx < 0:
            # Defensive: skip any padding label the index may still return.
            continue
        results.append({
            "email": corpus[idx],
            "score": float(score),
            "tag": df.iloc[idx]["tag"]
        })
    return results


In [9]:
# Smoke-test retrieval with a sample support query and show the top three hits.
query = "automation rule not working"
results = search(query=query, top_k=3)
results


[{'email': 'Rule not triggering Our auto-assignment rule is no longer firing for emails with the word Refund.',
  'score': 0.4507540166378021,
  'tag': 'workflow_issue'},
 {'email': "Rules not saving Workflow rules don't save after clicking Submit.",
  'score': 0.434370756149292,
  'tag': 'workflow_bug'},
 {'email': 'Automation delay Our automation to mark emails as pending is taking 2–3 minutes to run.',
  'score': 0.40733572840690613,
  'tag': 'automation_delay'}]

In [10]:
def rag_answer(query, top_k=3):
    """Summarise the best retrieval hit for ``query``.

    Args:
        query: Free-text search string passed to ``search``.
        top_k: Number of matches to retrieve; the first is the top result and
            the remainder are reported as alternates.

    Returns:
        A dict with keys ``"query"``, ``"top_result"``, ``"confidence"``,
        ``"reasoning"`` and ``"alternates"``. When nothing is retrieved,
        ``"top_result"`` is ``None``, ``"confidence"`` is ``0.0`` and
        ``"alternates"`` is empty.
    """
    max_snippet_chars = 80

    matches = search(query, top_k=top_k)

    # Guard against an empty result set instead of failing on matches[0].
    if not matches:
        return {
            "query": query,
            "top_result": None,
            "confidence": 0.0,
            "reasoning": "No matching emails were found for this query.",
            "alternates": []
        }

    top = matches[0]
    # Map a cosine score in [-1, 1] onto [0, 1]; clamp so floating-point
    # overshoot in the score cannot push the confidence out of range.
    confidence = min(1.0, max(0.0, (top["score"] + 1) / 2))

    # Only add an ellipsis when the email text was actually truncated.
    snippet = top["email"][:max_snippet_chars]
    if len(top["email"]) > max_snippet_chars:
        snippet += "..."
    reasoning = f"The top matched email mentions similar context: '{snippet}'"

    return {
        "query": query,
        "top_result": top,
        "confidence": round(confidence, 3),
        "reasoning": reasoning,
        "alternates": matches[1:]
    }

# Example: display the structured answer for a sample query.
rag_answer(query="unable to assign emails automatically")


{'query': 'unable to assign emails automatically',
 'top_result': {'email': 'Auto-assign slow Incoming emails remain unassigned for up to 2 minutes.',
  'score': 0.6926428079605103,
  'tag': 'automation_delay'},
 'confidence': 0.846,
 'reasoning': "The top matched email mentions similar context: 'Auto-assign slow Incoming emails remain unassigned for up to 2 minutes....'",
 'alternates': [{'email': 'Assignee reset Emails revert to unassigned randomly.',
   'score': 0.665044367313385,
   'tag': 'assignment_issue'},
  {'email': 'Incorrect user assignments Emails are being assigned to the wrong agent.',
   'score': 0.6207638382911682,
   'tag': 'assignment_bug'}]}