In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MultiLabelBinarizer
import ast

In [14]:
data = pd.read_csv("../data/data.csv")
data.head()

Unnamed: 0,id,sentences,personality,tone,special_words
0,1,"Good morning, I am writing to inquire about th...",2,1,"application, status, inquire"
1,2,"Hey, what's up? I'm just chilling, watching so...",7,9,"chilling, TV"
2,3,I'm extremely disappointed with the service I ...,8,4,"disappointed, unacceptable, service"
3,4,Please find attached the document you requeste...,3,2,"document, requested, attached"
4,5,"OMG, that movie was totally awesome! I loved it!",9,10,"awesome, movie, OMG"


In [15]:
data.dropna(inplace=True)
data.head()

Unnamed: 0,id,sentences,personality,tone,special_words
0,1,"Good morning, I am writing to inquire about th...",2,1,"application, status, inquire"
1,2,"Hey, what's up? I'm just chilling, watching so...",7,9,"chilling, TV"
2,3,I'm extremely disappointed with the service I ...,8,4,"disappointed, unacceptable, service"
3,4,Please find attached the document you requeste...,3,2,"document, requested, attached"
4,5,"OMG, that movie was totally awesome! I loved it!",9,10,"awesome, movie, OMG"


In [16]:
data["special_words"]

0                           application, status, inquire
1                                           chilling, TV
2                    disappointed, unacceptable, service
3                          document, requested, attached
4                                    awesome, movie, OMG
                             ...                        
368                   dessert, preferences, requests, yo
369           interruptions, important task, concentrate
370    scientific experiment, new element, unique pro...
371    humidity, unbearable, melting, popsicle, sun, ugh
372         request, proposed changes, contract, meeting
Name: special_words, Length: 373, dtype: object

In [None]:
import json

def safe_parse_list(x):
    if isinstance(x, str):
        try:
            return ast.literal_eval(x)
        except (ValueError, SyntaxError):
            return []
    return [] 

In [10]:
data["special_words"] = data["special_words"].apply(safe_parse_list)
data.head()

Unnamed: 0,id,sentences,personality,tone,special_words
0,1,"Good morning, I am writing to inquire about th...",2,1,[]
1,2,"Hey, what's up? I'm just chilling, watching so...",7,9,[]
2,3,I'm extremely disappointed with the service I ...,8,4,[]
3,4,Please find attached the document you requeste...,3,2,[]
4,5,"OMG, that movie was totally awesome! I loved it!",9,10,[]


In [18]:
vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(data['sentences'])

In [19]:
y_personality = data['personality']
y_tone = data['tone']
y = np.column_stack((y_personality, y_tone))

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [21]:
base_model = RandomForestClassifier(n_estimators=100, random_state=42)
model = MultiOutputClassifier(base_model)
model.fit(X_train, y_train)

In [22]:
def predict_text(text):
    X_input = vectorizer.transform([text])
    pred = model.predict(X_input)
    personality, tone = pred[0][0], pred[0][1]
    return {'personality' : personality, 'tone' : tone}

In [None]:
example_text = "How is it that you can enter in here"
print(predict_text(example_text))

{'personality': np.int64(2), 'tone': np.int64(1)}


In [25]:
import joblib
joblib.dump(model, 'model.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')

['vectorizer.pkl']

In [1]:
import spacy

In [2]:
nlp = spacy.load('en_core_web_sm')

In [5]:
doc = nlp("My name is binit")

In [6]:
for item in doc.ents:
    print(item.text)

In [45]:
import pandas as pd
df = pd.read_csv("../data/unread_messages.csv")

In [46]:
df.dropna(inplace=True)

In [47]:
df.reset_index(drop=True, inplace=True)

In [48]:
df.head(40)

Unnamed: 0.1,Unnamed: 0,Subject,Received Time,Sender Name,Body,Priority
0,0.0,Invitation to the Inauguration Ceremony of Sta...,28-03-2025 23:37,"Innovation and Entrepreneurship Cell, AIT","Dear AITans,\r\n\r\nGreetings from the Innovat...",2.0
1,1.0,About PPL lecture on every Friday .,28-03-2025 10:10,Pradnya Tapkir,"Dear Students,\r\nFrom 28 March 2025 to till y...",0.0
2,2.0,Regarding Unit Test-II,27-03-2025 10:30,Mr Umakant Dhatrak,"\r\n\r\nDear student,\r\nI am inform your Unit...",0.0
3,3.0,Session on Life is all Positive and Negativit...,26-03-2025 13:26,Yogita T Hambir,"Dear Students,\r\n\r\nDepartment of Computer E...",0.0
4,4.0,DSA Extra Lecture,26-03-2025 08:52,Mangesh Hajare,"Dear All Students,\r\nDSA extra lecture has be...",0.0
5,5.0,Regarding Microprocessor Assignment,24-03-2025 11:14,Mr Umakant Dhatrak,"Dear student,\r\n\r\nI am inform you. I am sen...",0.0
6,6.0,Regarding Unit Test-II,24-03-2025 09:44,Mr Umakant Dhatrak,"Dear student,\r\nI am informing you .Your Unit...",0.0
7,7.0,Fw: Regarding examination forms of April/May 2025,21-03-2025 11:19,Dr Sunil Dhore,\r\n________________________________\r\n\r\nFr...,0.0
8,8.0,Solving of Previous END SEM question paper,20-03-2025 10:00,Pradnya Tapkir,"Dear Students, \r\nYou all have to solve prev...",0.0
9,9.0,Fw: Orientation by SPIC MACAY,19-03-2025 16:28,Dr Sunil Dhore,\r\n________________________________\r\n\r\nFr...,0.0


In [19]:
from nltk.corpus import stopwords
import nltk
import re

In [49]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
def clean_text(text):
    text = text.lower()  # Lowercasing
    text = re.sub(r'\r\n', ' ', text)  # Remove newlines
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters
    text = ' '.join(word for word in text.split() if word not in stop_words)  # Remove stopwords
    return text

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Binit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [50]:
df['Subject'] = df['Subject'].apply(clean_text)
df['Sender Name'] = df['Sender Name'].apply(clean_text)
df['Body'] = df['Body'].apply(clean_text)

In [53]:
df

Unnamed: 0.1,Unnamed: 0,Subject,Received Time,Sender Name,Body,Priority
0,0.0,invitation inauguration ceremony startup saga,28-03-2025 23:37,innovation entrepreneurship cell ait,dear aitans greetings innovation entrepreneurs...,2.0
1,1.0,ppl lecture every friday,28-03-2025 10:10,pradnya tapkir,dear students march till end sem ii ppl lectur...,0.0
2,2.0,regarding unit testii,27-03-2025 10:30,mr umakant dhatrak,dear student inform unit testii postponed wedn...,0.0
3,3.0,session life positive negativity doesnt exist,26-03-2025 13:26,yogita hambir,dear students department computer engineering ...,0.0
4,4.0,dsa extra lecture,26-03-2025 08:52,mangesh hajare,dear students dsa extra lecture scheduled toda...,0.0
...,...,...,...,...,...,...
204,204.0,fw invitation complete schedule icsis converge...,23-04-2024 10:13,mridula chandola,people received message dont often get email m...,2.0
205,205.0,invitation speaker session startup saga,13-04-2024 10:43,innovation entrepreneurship cell ait,greetings everyone great respect honor extend ...,2.0
206,206.0,unavailability reading hall students th th april,13-04-2024 00:29,innovation entrepreneurship cell ait,greetings ie cell light ongoing event startup ...,2.0
207,207.0,invitation standup show startup saga,12-04-2024 15:10,innovation entrepreneurship cell ait,greetings ie cell ie cell elated invite standu...,2.0


In [65]:
result_df = df.groupby("Sender Name").agg(
    Body=("Body", " ".join),
    Total_Priority=("Priority", "sum"),
    Average_Priority=("Priority", "mean")
).reset_index()

In [58]:
result_df

Unnamed: 0,Sender Name,Body,Total_Priority,Average_Priority
0,aditya singh rawat,hey binit hope good mail regarding meeting sch...,3.0,3.0
1,ait college exam officer,dear students pfa circular came sppu savitriba...,14.0,2.0
2,ait google development students club,dear students final chance part enliven thrill...,2.0,2.0
3,ait oss club,greetings oss club great pride enthusiasm refl...,0.0,0.0
4,ait sports,dear happy blessed morning behalf sports club ...,2.0,2.0
5,anup kadam,kiran nathgosavi kirannathgosaviritindiaedu se...,3.0,3.0
6,dean rnd ait,dear students pfa interested students join tha...,1.0,0.333333
7,director ait,want say team rocks great teamwork amazing pla...,2.0,2.0
8,dr seema tiwari,congratulations oss club thanks regards dr see...,2.0,2.0
9,dr sharayu lokhande,dear students please find attached timetable s...,2.0,2.0


In [66]:
result_df["Body_Length"] = result_df["Body"].apply(len)

In [67]:
result_df

Unnamed: 0,Sender Name,Body,Total_Priority,Average_Priority,Body_Length
0,aditya singh rawat,hey binit hope good mail regarding meeting sch...,3.0,3.0,126
1,ait college exam officer,dear students pfa circular came sppu savitriba...,14.0,2.0,6091
2,ait google development students club,dear students final chance part enliven thrill...,2.0,2.0,1006
3,ait oss club,greetings oss club great pride enthusiasm refl...,0.0,0.0,7025
4,ait sports,dear happy blessed morning behalf sports club ...,2.0,2.0,815
5,anup kadam,kiran nathgosavi kirannathgosaviritindiaedu se...,3.0,3.0,11894
6,dean rnd ait,dear students pfa interested students join tha...,1.0,0.333333,4173
7,director ait,want say team rocks great teamwork amazing pla...,2.0,2.0,5610
8,dr seema tiwari,congratulations oss club thanks regards dr see...,2.0,2.0,9134
9,dr sharayu lokhande,dear students please find attached timetable s...,2.0,2.0,392


In [None]:
import numpy as np

def estimate_priority(new_body: str, new_sender: str, reference_df: pd.DataFrame):
    """
    Estimate the priority of a new mail based on past data.
    :param new_body: The body of the new email.
    :param new_sender: The sender of the new email.
    :param reference_df: The dataframe containing previous emails with priorities.
    :return: Estimated priority.
    """
    
    # Compute reference values
    min_length = reference_df["Body_Length"].min()
    max_length = reference_df["Body_Length"].max()
    min_priority = reference_df["Total_Priority"].min()
    max_priority = reference_df["Total_Priority"].max()
    
    # Get length of new mail body
    new_body_length = len(new_body)
    
    # Normalize body length between 0 and 1
    norm_length = (new_body_length - min_length) / (max_length - min_length)
    
    # Check if sender exists in reference data
    sender_avg_priority = reference_df[reference_df["Sender Name"] == new_sender]["Average_Priority"].mean()
    if np.isnan(sender_avg_priority):
        sender_avg_priority = reference_df["Average_Priority"].mean()  # Default to overall average
    
    # Weighting factors
    weight_body = 0.7  # More weight on body length
    weight_sender = 0.3  # Some influence from sender's historical priority
    
    # Map normalized length to priority range (higher length -> lower priority)
    estimated_priority = (
        (max_priority - norm_length * (max_priority - min_priority)) * weight_body +
        sender_avg_priority * weight_sender
    )
    
    return round(estimated_priority, 2)  # Round for better readability

# Example Usage
new_email_body = "Dear students, please note that there will be an extra class tomorrow."
new_email_sender = "sushma shirke"
new_priority = estimate_priority(new_email_body, new_email_sender, result_df)
print("Estimated Priority:", new_priority/10)

Estimated Priority: 1.421


In [None]:
senders_with_priority_0_count = df[df['Priority'] == 0]['Sender Name'].value_counts()
senders_with_priority_0_filtered = senders_with_priority_0_count[senders_with_priority_0_count > 4].index.tolist()
print(senders_with_priority_0_filtered)

['dr sunil dhore', 'mr umakant dhatrak', 'jyoti ait', 'yogita hambir', 'r verma', 'pradnya tapkir', 'mangesh hajare', 'sushma shirke']
