In [1]:
import pandas as pd

In [11]:
df_path = "C:\\Users\\Vansh Malani\\Documents\\GitHub\\Uni-Dash_Reborn\\data"
df = pd.read_csv(df_path + "\\23DCS023_all_emails.csv")
# df=pd.read_csv("labeled_dataset.csv")

In [23]:
import re
import pandas as pd

# ---------------------------------------------------
# 1. REMOVE CHARUSAT DISCLAIMER
# ---------------------------------------------------
def remove_charusat_disclaimer(text):
    # Remove only the long legal footer, not message body
    pattern = r"(?is)disclaimer[:].*"
    return re.sub(pattern, "", str(text)).strip()


# ---------------------------------------------------
# 2. CLEAN TEXT
# ---------------------------------------------------
def clean_text(text):
    text = str(text)

    text = remove_charusat_disclaimer(text)

    # Remove generic signatures (keep forwarded content intact)
    text = re.sub(
        r"(?is)(thanks[,.]?$|regards[,.]?$|warm regards[,.]?$|best wishes[,.]?$)",
        "",
        text
    )

    # Remove long signature blocks if they are at the end (200 chars max)
    text = re.sub(r"(?is)(thanks|regards)[\s\S]{0,200}$", "", text)

    # Remove URLs only
    text = re.sub(r"http\S+|www\.\S+", "", text)

    # Collapse extra whitespace
    text = re.sub(r"\s+", " ", text).strip()

    return text.lower()


# ---------------------------------------------------
# 3. EMAIL + DOMAIN EXTRACTION
# ---------------------------------------------------
def extract_email(text):
    match = re.search(r'[\w\.-]+@[\w\.-]+', str(text))
    return match.group(0).lower() if match else ""

def extract_domain(email):
    return email.split('@')[-1] if "@" in email else ""


# ---------------------------------------------------
# 4. FLAGS
# ---------------------------------------------------
def contains_submit_word(text):
    return bool(re.search(r"\b(submit|upload|last date|due|fill)\b", text))

def contains_exam_word(text):
    return bool(re.search(r"\b(exam|test|cie|hall ticket|seating)\b", text))

def contains_event_word(text):
    return bool(re.search(r"\b(hackathon|event|bootcamp|seminar|workshop|competition)\b", text))

def contains_form_link(text):
    return bool(re.search(r"(docs\.google\.com/forms|forms\.gle)", text))


# ---------------------------------------------------
# 5. DEADLINE EXTRACTION
# ---------------------------------------------------
def extract_deadline(text):
    pattern = r"(due|deadline|submit by|last date)[:\- ]*(\d{1,2}[/-]\d{1,2}[/-]\d{2,4}|\d{1,2}\s+\w+\s+\d{4})"
    m = re.search(pattern, text, re.IGNORECASE)
    return m.group(2) if m else ""


# ---------------------------------------------------
# 6. PIPELINE
# ---------------------------------------------------
cols_to_drop = ["Unnamed: 0", "id", "threadId", "labelIds", "snippet"]
df = df.drop(columns=[c for c in cols_to_drop if c in df.columns], errors="ignore")

df["raw_text"] = df["subject"].astype(str) + " " + df["content"].astype(str)
df["clean_text"] = df["raw_text"].apply(clean_text)

df["sender_email"] = df["from"].apply(extract_email)
df["sender_domain"] = df["sender_email"].apply(extract_domain)

df["contains_submit_word"] = df["clean_text"].apply(contains_submit_word)
df["contains_exam_word"] = df["clean_text"].apply(contains_exam_word)
df["contains_event_word"] = df["clean_text"].apply(contains_event_word)
df["contains_form_link"] = df["clean_text"].apply(contains_form_link)

df["deadline_date"] = df["clean_text"].apply(extract_deadline)
df["contains_deadline"] = df["deadline_date"].apply(lambda x: bool(x))

df["label_source"] = ""
df["label_topic"] = ""
df["label_urgency"] = ""


# ---------------------------------------------------
# 7. FILTERING RULES
# ---------------------------------------------------

# Delete ONLY pure system messages (not real emails)
df = df[~df["from"].str.contains(r"^no-reply@classroom\.google\.com$", na=False, case=False)]

# Keep short tiny urgent mails
df = df[df["clean_text"].str.len() > 5].reset_index(drop=True)

df.head()


Unnamed: 0,from,subject,date,content,raw_text,clean_text,sender_email,sender_domain,contains_submit_word,contains_exam_word,contains_event_word,contains_form_link,deadline_date,contains_deadline,label_source,label_topic,label_urgency
0,23DCS023 JAMES DHANDHUKIYA <23dcs023@charusat....,Request for Email Data Contribution – Research...,2025-06-25 20:36:16+05:30,Dear Student\r\n\r\nThank you so much for your...,Request for Email Data Contribution – Research...,request for email data contribution – research...,23dcs023@charusat.edu.in,charusat.edu.in,False,False,False,False,,False,,,
1,23DCS023 JAMES DHANDHUKIYA <23dcs023@charusat....,Re: Request for Email Data Contribution – Rese...,2025-06-25 20:31:44+05:30,I hope you're doing well.\r\n\r\nThis is a fin...,Re: Request for Email Data Contribution – Rese...,re: request for email data contribution – rese...,23dcs023@charusat.edu.in,charusat.edu.in,True,False,False,False,,False,,,
2,NPTEL <onlinecourses@nptel.iitm.ac.in>,NPTEL Newsletter: IIT Madras CODE Certificate ...,2025-06-25 15:52:01+05:30,<https://code.iitm.ac.in/foundations-to-ml>\r\...,NPTEL Newsletter: IIT Madras CODE Certificate ...,nptel newsletter: iit madras code certificate ...,onlinecourses@nptel.iitm.ac.in,nptel.iitm.ac.in,False,False,False,False,,False,,,
3,Team at Read <support@e.read.ai>,Exclusive Education Pricing Just for You!,2025-06-25 09:33:48+00:00,"Read AI for $5 Per Month\r\nHi 23DCS023 , as a...",Exclusive Education Pricing Just for You! Read...,exclusive education pricing just for you! read...,support@e.read.ai,e.read.ai,False,False,False,False,,False,,,
4,Mohini Darji <mohinidarji.dcs@charusat.ac.in>,Reg. Submission of SGP- Weekly reports,2025-06-25 13:36:47+05:30,"Dear Students,\r\n\r\nKindly find the attached...",Reg. Submission of SGP- Weekly reports Dear St...,reg. submission of sgp- weekly reports dear st...,mohinidarji.dcs@charusat.ac.in,charusat.ac.in,True,False,False,False,,False,,,


In [24]:
# Remove emails from classroom.google.com domain
df = df[~df["sender_domain"].str.contains(r"classroom\.google\.com$", na=False, case=False)]

In [25]:
df.columns

Index(['from', 'subject', 'date', 'content', 'raw_text', 'clean_text',
       'sender_email', 'sender_domain', 'contains_submit_word',
       'contains_exam_word', 'contains_event_word', 'contains_form_link',
       'deadline_date', 'contains_deadline', 'label_source', 'label_topic',
       'label_urgency'],
      dtype='object')

In [26]:
# Find rows where clean_text contains the keyword 'fees' (case-insensitive) and display them
fees_rows = df[df['clean_text'].str.contains('fees', case=False, na=False)]
fees_rows

Unnamed: 0,from,subject,date,content,raw_text,clean_text,sender_email,sender_domain,contains_submit_word,contains_exam_word,contains_event_word,contains_form_link,deadline_date,contains_deadline,label_source,label_topic,label_urgency
26,Gaurang Patel <gaurangpatel.dcs@charusat.ac.in>,Semester 5 Fee Payment Schedule - B.Tech Progr...,2025-06-19 15:57:51+05:30,"Dear Students,\r\n\r\nThis is an important rem...",Semester 5 Fee Payment Schedule - B.Tech Progr...,semester 5 fee payment schedule - b.tech progr...,gaurangpatel.dcs@charusat.ac.in,charusat.ac.in,False,False,False,False,,False,,,
87,NPTEL <onlinecourses@nptel.iitm.ac.in>,NPTEL Newsletter: June 2025: Vol 6: Week 2!,2025-06-10 09:41:22+05:30,NPTEL NEW\r\n\r\n<https://nptel.ac.in/>\r\n­\...,NPTEL Newsletter: June 2025: Vol 6: Week 2! N...,nptel newsletter: june 2025: vol 6: week 2! np...,onlinecourses@nptel.iitm.ac.in,nptel.iitm.ac.in,False,False,False,False,,False,,,
129,Read Assistant <executiveassistant@e.read.ai>,"Weekly Kickoff | Mon, May 26th | read.ai",2025-06-02 16:58:00+00:00,"Need a recap? Review Meeting notes, personaliz...","Weekly Kickoff | Mon, May 26th | read.ai Need ...","weekly kickoff | mon, may 26th | read.ai need ...",executiveassistant@e.read.ai,e.read.ai,True,False,False,False,,False,,,
158,Radhika Patel <radhikapatel.it@charusat.ac.in>,Fwd: Proposal for Students to Participate in t...,2025-05-28 21:40:30+05:30,"Regards,\r\nRadhika H. Patel,\r\nAssistant Pro...",Fwd: Proposal for Students to Participate in t...,fwd: proposal for students to participate in t...,radhikapatel.it@charusat.ac.in,charusat.ac.in,False,False,True,False,,False,,,
159,Naina Parmar <nainaparmar.dcs@charusat.ac.in>,Fwd: Proposal for Students to Participate in t...,2025-05-28 20:38:11+05:30,---------- Forwarded message ---------\r\nFrom...,Fwd: Proposal for Students to Participate in t...,fwd: proposal for students to participate in t...,nainaparmar.dcs@charusat.ac.in,charusat.ac.in,False,False,True,False,,False,,,
160,Naina Parmar <nainaparmar.dcs@charusat.ac.in>,Fwd: Proposal for Students to Participate in t...,2025-05-28 20:23:08+05:30,---------- Forwarded message ---------\r\nFrom...,Fwd: Proposal for Students to Participate in t...,fwd: proposal for students to participate in t...,nainaparmar.dcs@charusat.ac.in,charusat.ac.in,False,False,True,False,,False,,,
171,Read Assistant <executiveassistant@e.read.ai>,"Weekly Kickoff | Mon, May 19th | read.ai",2025-05-26 16:58:00+00:00,"Need a recap? Review Meeting notes, personaliz...","Weekly Kickoff | Mon, May 19th | read.ai Need ...","weekly kickoff | mon, may 19th | read.ai need ...",executiveassistant@e.read.ai,e.read.ai,False,False,False,False,,False,,,
289,Disha Panchal <dishapanchal.dcs@charusat.ac.in>,Re: University Theory and Practical Examinatio...,2025-04-15 14:01:47+05:30,"Dear Students,\r\nPlease find attached herewit...",Re: University Theory and Practical Examinatio...,re: university theory and practical examinatio...,dishapanchal.dcs@charusat.ac.in,charusat.ac.in,False,True,False,False,,False,,,
293,radhika@krutanic.net,Accenture Accredited Internship Program - Regi...,2025-04-08 19:42:10+00:00,I've invited you to fill out the following for...,Accenture Accredited Internship Program - Regi...,accenture accredited internship program - regi...,radhika@krutanic.net,krutanic.net,True,False,False,False,,False,,,
310,Sanket Suthar <sanketsuthar.it@charusat.ac.in>,Re: Registration fees for Cloud Practitioner N...,2025-04-05 11:55:25+05:30,Dear all\r\n\r\nNow you can go to Admin depart...,Re: Registration fees for Cloud Practitioner N...,re: registration fees for cloud practitioner n...,sanketsuthar.it@charusat.ac.in,charusat.ac.in,False,False,False,False,,False,,,


In [27]:
label_df = df[[
    "from",
    "sender_domain",
    "clean_text",
    "deadline_date",
    "label_source",
    "label_topic",
    "label_urgency"
]]


In [28]:
label_df.head()

Unnamed: 0,from,sender_domain,clean_text,deadline_date,label_source,label_topic,label_urgency
0,23DCS023 JAMES DHANDHUKIYA <23dcs023@charusat....,charusat.edu.in,request for email data contribution – research...,,,,
1,23DCS023 JAMES DHANDHUKIYA <23dcs023@charusat....,charusat.edu.in,re: request for email data contribution – rese...,,,,
2,NPTEL <onlinecourses@nptel.iitm.ac.in>,nptel.iitm.ac.in,nptel newsletter: iit madras code certificate ...,,,,
3,Team at Read <support@e.read.ai>,e.read.ai,exclusive education pricing just for you! read...,,,,
4,Mohini Darji <mohinidarji.dcs@charusat.ac.in>,charusat.ac.in,reg. submission of sgp- weekly reports dear st...,,,,


In [29]:
# Find rows where clean_text contains the keyword 'fees' (case-insensitive) and display them
fees_rows = label_df[df['clean_text'].str.contains('fees', case=False, na=False)]
# fees_rows[['clean_text', 'label_topic_model', 'label_topic_rule']]

In [30]:
fees_rows

Unnamed: 0,from,sender_domain,clean_text,deadline_date,label_source,label_topic,label_urgency
26,Gaurang Patel <gaurangpatel.dcs@charusat.ac.in>,charusat.ac.in,semester 5 fee payment schedule - b.tech progr...,,,,
87,NPTEL <onlinecourses@nptel.iitm.ac.in>,nptel.iitm.ac.in,nptel newsletter: june 2025: vol 6: week 2! np...,,,,
129,Read Assistant <executiveassistant@e.read.ai>,e.read.ai,"weekly kickoff | mon, may 26th | read.ai need ...",,,,
158,Radhika Patel <radhikapatel.it@charusat.ac.in>,charusat.ac.in,fwd: proposal for students to participate in t...,,,,
159,Naina Parmar <nainaparmar.dcs@charusat.ac.in>,charusat.ac.in,fwd: proposal for students to participate in t...,,,,
160,Naina Parmar <nainaparmar.dcs@charusat.ac.in>,charusat.ac.in,fwd: proposal for students to participate in t...,,,,
171,Read Assistant <executiveassistant@e.read.ai>,e.read.ai,"weekly kickoff | mon, may 19th | read.ai need ...",,,,
289,Disha Panchal <dishapanchal.dcs@charusat.ac.in>,charusat.ac.in,re: university theory and practical examinatio...,,,,
293,radhika@krutanic.net,krutanic.net,accenture accredited internship program - regi...,,,,
310,Sanket Suthar <sanketsuthar.it@charusat.ac.in>,charusat.ac.in,re: registration fees for cloud practitioner n...,,,,


In [31]:
label_df.to_csv("unlabeled_dataset.csv",index=False)

In [9]:
label_df = interactive_level1_labeler(label_df)
label_df.to_csv("labeled_level1.csv", index=False)


NameError: name 'interactive_level1_labeler' is not defined