In [1]:
import pandas as pd

In [2]:
# Folder that holds the exported mailbox CSV.
# NOTE(review): this is an absolute, user-specific Windows path, so it will
# not resolve on another machine without editing.
df_path = "C:\\Users\\Vansh Malani\\Documents\\GitHub\\Uni-Dash_Reborn\\data"
emails_csv = df_path + "\\23DCS023_all_emails.csv"
df = pd.read_csv(emails_csv)
# Alternative input left disabled by the author:
# df=pd.read_csv("labeled_dataset.csv")

In [3]:
import re
import pandas as pd

# ---------------------------------------------------
# 1. REMOVE CHARUSAT DISCLAIMER (must run first)
# ---------------------------------------------------
def remove_charusat_disclaimer(text):
    """Cut the trailing disclaimer off an email body.

    Everything from the first case-insensitive occurrence of
    "-- or entity to which it is addressed" up to the end of the text is
    dropped (DOTALL lets the match run across line breaks). The remainder is
    returned with leading/trailing whitespace trimmed. Non-string input is
    converted with ``str()`` before matching.
    """
    disclaimer_re = re.compile(
        r"-- or entity to which it is addressed.*",
        flags=re.DOTALL | re.IGNORECASE,
    )
    body = str(text)
    return disclaimer_re.sub("", body).strip()


# ---------------------------------------------------
# 2. CLEAN TEXT (unified function)
# ---------------------------------------------------
def clean_text(text):
    """Normalize one email's text for downstream feature extraction / ML.

    Steps, in order:
      1. convert the input to ``str``;
      2. drop the trailing disclaimer via ``remove_charusat_disclaimer``;
      3. remove common sign-off phrases ("regards", "thanks", "thanks &",
         "sincerely", "yours truly", "best wishes") and any
         "sent from my ..." fragment up to the end of its line;
      4. remove URLs that start with ``http`` or ``www.``;
      5. collapse repeated newlines, then any run of 2+ whitespace
         characters into a single space;
      6. lowercase and strip.

    Returns the cleaned string (possibly empty).
    """
    text = str(text)

    # Remove disclaimer FIRST so its boilerplate never reaches later steps.
    text = remove_charusat_disclaimer(text)

    # Remove signatures / greetings (mild version).
    # Each phrase is anchored with \b so only whole words are removed; without
    # the boundaries "thanksgiving" became "giving" and "disregards" became
    # "dis".
    text = re.sub(
        r"\bsent from my.*"
        r"|\b(?:regards\b,?|thanks\b(?: &)?|sincerely\b|yours truly\b|best wishes\b)",
        "",
        text,
        flags=re.IGNORECASE,
    )

    # Remove URLs
    text = re.sub(r"http\S+|www\.\S+", "", text)

    # Remove excess whitespace: blank-line runs first, then any remaining
    # run of two or more whitespace characters.
    text = re.sub(r"\n{2,}", "\n", text)
    text = re.sub(r"\s{2,}", " ", text)

    # Lowercase for ML
    text = text.lower().strip()
    return text


# ---------------------------------------------------
# 3. EXTRACT EMAIL + DOMAIN
# ---------------------------------------------------
def extract_email(text):
    """Return the first email-like token in ``text``, lowercased.

    The input is converted with ``str()`` first. An empty string is returned
    when nothing of the form ``<chars>@<chars>`` (word characters, dots and
    hyphens on both sides) is present.
    """
    address_re = re.compile(r'[\w\.-]+@[\w\.-]+')
    hit = address_re.search(str(text))
    if hit is None:
        return ""
    return hit.group(0).lower()

def extract_domain(email):
    """Return the part of ``email`` after its last "@", or "" if there is none."""
    if "@" not in email:
        return ""
    _, _, domain = email.rpartition("@")
    return domain


# ---------------------------------------------------
# 4. KEYWORD FLAGS
# ---------------------------------------------------
def contains_submit_word(text):
    """True when ``text`` has "submit", "upload", "last date" or "due" as a whole word/phrase (any case)."""
    hit = re.search(r"\b(submit|upload|last date|due)\b", text, flags=re.IGNORECASE)
    return hit is not None

def contains_exam_word(text):
    """True when ``text`` has "exam", "test" or "cie" as a whole word (any case)."""
    hit = re.search(r"\b(exam|test|cie)\b", text, flags=re.IGNORECASE)
    return hit is not None

def contains_event_word(text):
    """True when ``text`` has "hackathon", "event", "bootcamp", "seminar" or "workshop" as a whole word (any case)."""
    hit = re.search(
        r"\b(hackathon|event|bootcamp|seminar|workshop)\b",
        text,
        flags=re.IGNORECASE,
    )
    return hit is not None

def contains_form_link(text):
    """True when ``text`` contains "docs.google.com/forms" or "forms.gle" (case-sensitive)."""
    hit = re.search(r"(docs\.google\.com/forms|forms\.gle)", text)
    return hit is not None


# ---------------------------------------------------
# 5. DEADLINE EXTRACTION
# ---------------------------------------------------
def extract_deadline(text):
    """Return the date string that follows a deadline cue, or "" if none is found.

    A cue is one of "due", "deadline", "submit by" or "last date" (any case),
    optionally followed by ":" or "-". The date is either numeric
    (``d/m/y`` or ``d-m-y`` with a 2-4 digit year) or of the form
    ``<1-2 digits> <word> <4 digits>``. Only the first match is returned.
    """
    # Simple date pattern (expand later)
    deadline_re = re.compile(
        r"(due|deadline|submit by|last date)\s*[:\-]?\s*(\d{1,2}[/-]\d{1,2}[/-]\d{2,4}|\d{1,2}\s+\w+\s+\d{4})",
        re.IGNORECASE,
    )
    found = deadline_re.search(text)
    if not found:
        return ""
    # Group 1 is the cue word; group 2 is the date itself.
    return found.group(2)


# ---------------------------------------------------
# 6. APPLY CLEANING + METADATA PIPELINE
# ---------------------------------------------------
# Drop unused columns; errors="ignore" skips any that this export lacks
cols_to_drop = ["Unnamed: 0", "id", "threadId", "labelIds", "snippet"]
df = df.drop(columns=cols_to_drop, errors="ignore")

# Create raw_text. Missing subject/content become "" first: astype(str) alone
# would turn NaN into the literal token "nan" inside the text.
df["raw_text"] = (
    df["subject"].fillna("").astype(str)
    + " "
    + df["content"].fillna("").astype(str)
)

# Clean text
df["clean_text"] = df["raw_text"].apply(clean_text)

# Extract email metadata
df["sender_email"] = df["from"].apply(extract_email)
df["sender_domain"] = df["sender_email"].apply(extract_domain)

# Keyword flags
df["contains_submit_word"] = df["clean_text"].apply(contains_submit_word)
df["contains_exam_word"] = df["clean_text"].apply(contains_exam_word)
df["contains_event_word"] = df["clean_text"].apply(contains_event_word)
# Form links are detected on raw_text, not clean_text: clean_text() removes
# every http/www URL, so a Google Forms link is already gone from clean_text
# and the flag would come out False for it.
df["contains_form_link"] = df["raw_text"].apply(contains_form_link)

# Deadlines ("" from extract_deadline means no deadline was found)
df["deadline_date"] = df["clean_text"].apply(extract_deadline)
df["contains_deadline"] = df["deadline_date"].ne("")

# Label placeholders
df["label_source"] = ""
df["label_topic"] = ""
df["label_urgency"] = ""

# Remove Classroom mails (sender name or address mentions Classroom)
pattern = r'(?i)Classroom|no-reply@classroom\.google\.com'
is_classroom = df["from"].astype(str).str.contains(pattern, regex=True, na=False)
df = df[~is_classroom].reset_index(drop=True)

# Remove useless texts (optional): keep rows whose cleaned text is longer
# than 20 characters
df = df[df["clean_text"].str.len() > 20].reset_index(drop=True)

df.head()


Unnamed: 0,from,subject,date,content,raw_text,clean_text,sender_email,sender_domain,contains_submit_word,contains_exam_word,contains_event_word,contains_form_link,deadline_date,contains_deadline,label_source,label_topic,label_urgency
0,23DCS023 JAMES DHANDHUKIYA <23dcs023@charusat....,Request for Email Data Contribution – Research...,2025-06-25 20:36:16+05:30,Dear Student\r\n\r\nThank you so much for your...,Request for Email Data Contribution – Research...,request for email data contribution – research...,23dcs023@charusat.edu.in,charusat.edu.in,False,False,False,False,,False,,,
1,23DCS023 JAMES DHANDHUKIYA <23dcs023@charusat....,Re: Request for Email Data Contribution – Rese...,2025-06-25 20:31:44+05:30,I hope you're doing well.\r\n\r\nThis is a fin...,Re: Request for Email Data Contribution – Rese...,re: request for email data contribution – rese...,23dcs023@charusat.edu.in,charusat.edu.in,True,False,False,False,,False,,,
2,NPTEL <onlinecourses@nptel.iitm.ac.in>,NPTEL Newsletter: IIT Madras CODE Certificate ...,2025-06-25 15:52:01+05:30,<https://code.iitm.ac.in/foundations-to-ml>\r\...,NPTEL Newsletter: IIT Madras CODE Certificate ...,nptel newsletter: iit madras code certificate ...,onlinecourses@nptel.iitm.ac.in,nptel.iitm.ac.in,False,False,False,False,,False,,,
3,Team at Read <support@e.read.ai>,Exclusive Education Pricing Just for You!,2025-06-25 09:33:48+00:00,"Read AI for $5 Per Month\r\nHi 23DCS023 , as a...",Exclusive Education Pricing Just for You! Read...,exclusive education pricing just for you! read...,support@e.read.ai,e.read.ai,False,False,False,False,,False,,,
4,Mohini Darji <mohinidarji.dcs@charusat.ac.in>,Reg. Submission of SGP- Weekly reports,2025-06-25 13:36:47+05:30,"Dear Students,\r\n\r\nKindly find the attached...",Reg. Submission of SGP- Weekly reports Dear St...,reg. submission of sgp- weekly reports dear st...,mohinidarji.dcs@charusat.ac.in,charusat.ac.in,True,False,False,False,,False,,,


In [4]:
# Show the column names of df as it stands after the cleaning pipeline.
df.columns

Index(['from', 'subject', 'date', 'content', 'raw_text', 'clean_text',
       'sender_email', 'sender_domain', 'contains_submit_word',
       'contains_exam_word', 'contains_event_word', 'contains_form_link',
       'deadline_date', 'contains_deadline', 'label_source', 'label_topic',
       'label_urgency'],
      dtype='object')

In [5]:
# Columns needed for manual labeling: sender info, cleaned text, extracted
# deadline and the three (still empty) label placeholders.
# .copy() makes label_df an independent frame, so filling the label columns
# later neither raises SettingWithCopyWarning nor depends on df.
label_df = df[[
    "from",
    "sender_domain",
    "clean_text",
    "deadline_date",
    "label_source",
    "label_topic",
    "label_urgency"
]].copy()


In [6]:
# Preview the first rows of the labeling frame.
label_df.head()

Unnamed: 0,from,sender_domain,clean_text,deadline_date,label_source,label_topic,label_urgency
0,23DCS023 JAMES DHANDHUKIYA <23dcs023@charusat....,charusat.edu.in,request for email data contribution – research...,,,,
1,23DCS023 JAMES DHANDHUKIYA <23dcs023@charusat....,charusat.edu.in,re: request for email data contribution – rese...,,,,
2,NPTEL <onlinecourses@nptel.iitm.ac.in>,nptel.iitm.ac.in,nptel newsletter: iit madras code certificate ...,,,,
3,Team at Read <support@e.read.ai>,e.read.ai,exclusive education pricing just for you! read...,,,,
4,Mohini Darji <mohinidarji.dcs@charusat.ac.in>,charusat.ac.in,reg. submission of sgp- weekly reports dear st...,,,,


In [8]:
# Write the not-yet-labeled frame to unlabeled_dataset.csv in the working
# directory; index=False leaves the row index out of the file.
label_df.to_csv("unlabeled_dataset.csv",index=False)

In [9]:
# Pass label_df through the level-1 labeler and save the result to
# labeled_level1.csv (row index omitted).
# NOTE(review): interactive_level1_labeler is not defined in any cell shown in
# this notebook and the recorded output of this cell is a NameError. The
# execution counter also skips In [7], so the definition presumably lived in
# a cell that was removed. Restore or import it before this cell.
label_df = interactive_level1_labeler(label_df)
label_df.to_csv("labeled_level1.csv", index=False)


NameError: name 'interactive_level1_labeler' is not defined