In [10]:
import pandas as pd
import numpy as np

In [4]:
df=pd.read_csv(r"level_2_fixed_3rd_jan.csv")

In [11]:
df.head()
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

In [12]:
# ---- Basic sanity checks ----
expected_columns = {
    "from",
    "sender_domain",
    "clean_text",
    "deadline_date",
    "label_source",
    "label_topic",
    "label_urgency",
}

In [13]:

missing_cols = expected_columns - set(df.columns)
if missing_cols:
    raise ValueError(f"Missing expected columns: {missing_cols}")


In [14]:
if "timestamp" in df.columns:
    df = df.sort_values("timestamp")

df = df.reset_index(drop=True)


In [15]:

# ---- Explicitly split labeled vs unlabeled ----
df_labeled = df[df["label_topic"].notna()].copy()
df_unlabeled = df[df["label_topic"].isna()].copy()

# ---- Declare scope exclusions ----
# label_urgency is intentionally ignored in this notebook
IGNORED_COLUMNS = ["label_urgency"]

print("Dataset loaded successfully")
print(f"Total emails     : {len(df)}")
print(f"Labeled emails   : {len(df_labeled)}")
print(f"Unlabeled emails : {len(df_unlabeled)}")

Dataset loaded successfully
Total emails     : 1536
Labeled emails   : 510
Unlabeled emails : 1026


In [16]:
# ============================================
# TF-IDF Vectorisation (Baseline)
# - Fit ONLY on labeled data
# - This defines the known semantic space
# ============================================

from sklearn.feature_extraction.text import TfidfVectorizer

# ---- TF-IDF configuration ----
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),     # captures short academic phrases
    max_df=0.9,             # ignore overly common terms
    min_df=3,               # ignore rare noise terms
    stop_words="english",
    sublinear_tf=True       # stabilizes term frequency
)

# ---- Fit on labeled data only ----
X_labeled = tfidf_vectorizer.fit_transform(df_labeled["clean_text"])

# ---- Basic vector sanity checks ----
print("TF-IDF vectorisation complete")
print(f"Number of labeled emails : {X_labeled.shape[0]}")
print(f"Vocabulary size          : {X_labeled.shape[1]}")


TF-IDF vectorisation complete
Number of labeled emails : 510
Vocabulary size          : 8411


In [19]:
# ============================================
# Topic-Restricted Similarity Retrieval
# - Cosine similarity
# - Search only within same label_topic
# - Pure, deterministic function
# ============================================

from sklearn.metrics.pairwise import cosine_similarity

def retrieve_similar_emails(
    query_text: str,
    query_topic: str,
    df_labeled: pd.DataFrame,
    X_labeled,
    vectorizer,
    k: int = 5
):
    """
    Retrieve top-k similar emails within the same label_topic.
    Returns a DataFrame with similarity scores.
    """

    # Vectorize query using existing TF-IDF space
    query_vec = vectorizer.transform([query_text])

    # Restrict search space to same topic
    topic_mask = df_labeled["label_topic"] == query_topic

    if topic_mask.sum() == 0:
        return pd.DataFrame()  # no comparable emails

    X_subset = X_labeled[topic_mask.values]
    df_subset = df_labeled[topic_mask]

    # Compute cosine similarity
    similarities = cosine_similarity(query_vec, X_subset).flatten()

    # Get top-k indices
    top_indices = similarities.argsort()[::-1][:k]

    # Return results with similarity score
    results = df_subset.iloc[top_indices].copy()
    results["similarity_score"] = similarities[top_indices]

    return results


In [20]:
# ---- Manual sanity test ----

sample_idx = 10  # pick any valid index
sample_row = df_labeled.iloc[sample_idx]

print("QUERY EMAIL")
print("Topic :", sample_row.label_topic)
print(sample_row.clean_text[:300], "...\n")

results = retrieve_similar_emails(
    query_text=sample_row.clean_text,
    query_topic=sample_row.label_topic,
    df_labeled=df_labeled,
    X_labeled=X_labeled,
    vectorizer=tfidf_vectorizer,
    k=5
)

print("TOP SIMILAR EMAILS")
results[["label_source", "label_topic", "similarity_score"]]


QUERY EMAIL
Topic : General Information / Misc
recommended: ai for everyone nan ...

TOP SIMILAR EMAILS


Unnamed: 0,label_source,label_topic,similarity_score
465,Misc / External,General Information / Misc,1.0
287,Misc / External,General Information / Misc,1.0
300,Misc / External,General Information / Misc,1.0
352,Misc / External,General Information / Misc,1.0
325,Misc / External,General Information / Misc,1.0


In [21]:
# ============================================
# Vectorise Unlabeled Emails (NO fitting)
# - Uses existing TF-IDF space
# - Does not affect semantic model
# ============================================

X_unlabeled = tfidf_vectorizer.transform(df_unlabeled["clean_text"])

print("Unlabeled emails vectorized")
print(f"Unlabeled vectors shape: {X_unlabeled.shape}")


Unlabeled emails vectorized
Unlabeled vectors shape: (1026, 8411)


In [26]:
# ============================================
# Simulated Retrieval for Unlabeled Email
# (Topic is manually injected for testing)
# ============================================

# Pick an unlabeled email
sample_idx = 0
sample_row = df_unlabeled.iloc[sample_idx]

# TEMPORARY: manually assumed topic
ASSUMED_TOPIC = "General Information / Misc"  # change freely while testing

print("UNLABELED EMAIL (SIMULATION)")
print("Assumed topic:", ASSUMED_TOPIC)
print(sample_row.clean_text[:300], "...\n")

results = retrieve_similar_emails(
    query_text=sample_row.clean_text,
    query_topic=ASSUMED_TOPIC,
    df_labeled=df_labeled,
    X_labeled=X_labeled,
    vectorizer=tfidf_vectorizer,
    k=5
)

results[["label_source", "label_topic", "similarity_score"]]


UNLABELED EMAIL (SIMULATION)
Assumed topic: General Information / Misc
fwd: regarding the collection of marksheet (odd term, ay:2024-25) dear students, greetings for the day!!! kindly note that the marksheets for 3rd and 5th semesters are available at pa to principal office, ground floor, depstar building. please collect the same between 11:00 am to 12:00 pm on or befo ...



Unnamed: 0,label_source,label_topic,similarity_score
421,Faculty / Academic Staff,General Information / Misc,0.227507
350,Administration / Office,General Information / Misc,0.190951
431,Administration / Office,General Information / Misc,0.184834
379,Faculty / Academic Staff,General Information / Misc,0.163236
203,Faculty / Academic Staff,General Information / Misc,0.158966
