In [12]:
import numpy as np
import pandas as pd
import csv
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression


In [6]:
# Load the e-mail export. index_col=False tells pandas not to promote the first
# CSV column to the index, so every column stays available as data.
csv_file = 'labeled_emails.csv'
data = pd.read_csv(csv_file, index_col=False)

# Label the rows 1..len(data) so row labels start at one rather than zero.
row_labels = np.arange(1, len(data) + 1)
data.index = row_labels

data.iloc[2]  # preview the third row by position (its row label is 3)

# problematic_emails

from          Radhika Patel <radhikapatel.it@charusat.ac.in>
to         23dcse@charusat.edu.in, 23dce@charusat.edu.in,...
subject    Fwd: Join AI master series by SkillDzire in as...
date                         Fri, 19 Jul 2024 12:41:16 +0530
labels                Inbox,Important,Opened,Category Forums
content    ---------- Forwarded message --------- From: S...
label                                               optional
Name: 3, dtype: object

In [7]:
# Fail fast when the export lacks the columns the later cells read.
required_columns = ('from', 'content')
if not all(name in data.columns for name in required_columns):
    raise ValueError("CSV file must contain 'from' and 'content' columns.")

In [8]:
# Keep the row/column counts in module-level names; later cells read `rows`.
rows, column = data.shape
print(f"the number of rows is {rows} and the number of columns is {column} in dataframe")

the number of rows is 485 and the number of columns is 7 in dataframe


In [None]:
# Partition the mailbox into messages sent from `myid` and messages received.
# Rows that cannot be processed (for example, non-string content) are written to
# error_log.txt and collected in `problematic_emails` instead of stopping the loop.
myid = "23dcs056@charusat.edu.in"  # address used to recognise the owner's own messages
total_emails = len(data)

# Collect rows in plain lists and build each DataFrame once after the loop;
# calling pd.concat for every row re-copies the growing frame on each iteration.
sent_rows, recieved_rows, problematic_rows = [], [], []

with open('error_log.txt', 'w', encoding='utf-8') as error_log:
    for index, row in data.iterrows():
        try:
            print(f"\nEmail {index} of {total_emails}")
            print("From:", row['from'])

            # Anything other than a string is routed to the error handler below.
            if not isinstance(row['content'], str):
                raise TypeError('Content is not a string')

            # Preview only the first 500 characters; the encode/decode round trip
            # drops characters that cannot be encoded as UTF-8.
            content = row['content'][:500].encode('utf-8', errors='ignore').decode('utf-8', errors='ignore')
            print("Content:", content)

            print("-" * 80)
            if myid in row['from']:
                sent_rows.append(row)
                print("email was from your inbox")
            else:
                recieved_rows.append(row)
                print("you recieved this email")

        except Exception as e:
            # Report the same row label that the progress line printed, so the
            # console output and the log refer to the e-mail by one number.
            print(f"Error processing email {index}: {e}")
            error_log.write(f"Error with email {index}:\nFrom: {row['from']}\nContent: {row['content'] if isinstance(row['content'], str) else ''}\nError: {e}\n\n")

            # Keep the row for later inspection.
            problematic_rows.append(row)

# Build the frames once. reset_index(drop=True) gives the same 0..n-1 labels that
# concat(..., ignore_index=True) produced, which the labelling cell relies on.
sent = pd.DataFrame(sent_rows, columns=data.columns).reset_index(drop=True)
recieved = pd.DataFrame(recieved_rows, columns=data.columns).reset_index(drop=True)
problematic_emails = pd.DataFrame(problematic_rows, columns=data.columns).reset_index(drop=True)



In [None]:
sample_size = 50  # number of received e-mails to label by hand; adjust as needed
print("Please label the following sample emails:")

# Accept the full words shown in the prompt as well as their one-letter shortcuts.
label_choices = {
    'p': 'priority',
    'priority': 'priority',
    'o': 'optional',
    'optional': 'optional',
}

# The error handler appends to `problematic_emails`; create it here when no
# earlier cell has done so, otherwise the handler itself raises NameError.
if 'problematic_emails' not in globals():
    problematic_emails = pd.DataFrame(columns=recieved.columns)

emails_to_label = recieved.iloc[:sample_size]
total_to_label = len(emails_to_label)  # may be smaller than sample_size
unlabelable_rows = []

# Open in append mode so entries already written to the log are not truncated.
# (Re-running this cell therefore adds to the log rather than replacing it.)
with open('error_log.txt', 'a', encoding='utf-8') as error_log:
    for position, (index, row) in enumerate(emails_to_label.iterrows(), start=1):
        try:
            # `position` counts from 1; `index` is the row label inside `recieved`.
            print(f"\nEmail {position} of {total_to_label}")
            print("From:", row['from'])

            # Anything other than a string is routed to the error handler below.
            if not isinstance(row['content'], str):
                raise TypeError('Content is not a string')

            # Preview only the first 500 characters; the encode/decode round trip
            # drops characters that cannot be encoded as UTF-8.
            content = row['content'][:500].encode('utf-8', errors='ignore').decode('utf-8', errors='ignore')
            print("Content:", content)

            print("-" * 80)

            # Ask until the answer is one of the accepted spellings.
            label = input("Enter label for this email (priority/optional): ").strip().lower()
            while label not in label_choices:
                print("Invalid input. Please enter 'priority' or 'optional'.")
                label = input("Enter label for this email (priority/optional): ").strip().lower()

            # Store the canonical label on the matching row of `recieved`.
            recieved.at[index, 'label'] = label_choices[label]

        except Exception as e:
            print(f"Error processing email {index}: {e}")
            # Log the problematic email for further investigation
            error_log.write(f"Error with email {index}:\nFrom: {row['from']}\nContent: {row['content'] if isinstance(row['content'], str) else ''}\nError: {e}\n\n")

            unlabelable_rows.append(row)

# Add this cell's problem rows to the shared frame in a single concat.
if unlabelable_rows:
    problematic_emails = pd.concat(
        [problematic_emails, pd.DataFrame(unlabelable_rows)],
        ignore_index=True,
    )


In [9]:
# Split the data into training rows (label present) and rows still to be labelled
# (label is NaN).
df = data
has_label = df['label'].notna()

# Take explicit copies: a later cell writes predicted labels into `test_data`, and
# writing into a boolean-filtered slice triggers pandas' SettingWithCopyWarning.
train_data = df[has_label].copy()
test_data = df[~has_label].copy()


In [10]:
# Feature extraction using TF-IDF.
# TfidfVectorizer only accepts text documents, so missing or non-string bodies are
# coerced to strings first (a missing body becomes the empty string) instead of
# letting a single NaN abort vectorisation.
train_text = train_data['content'].fillna('').astype(str)
test_text = test_data['content'].fillna('').astype(str)

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_text)  # learn the vocabulary on labelled rows only
X_test = vectorizer.transform(test_text)        # reuse that vocabulary for unlabelled rows


In [13]:

# Fit a logistic-regression classifier on the TF-IDF features of the labelled rows.
y_train = train_data['label']
model = LogisticRegression()
model.fit(X_train, y_train)

In [16]:
# Predict labels for the rows whose label was NaN.
predicted_labels = model.predict(X_test)

# assign() returns a new frame with the predictions in the 'label' column, so no
# write goes into a filtered slice (no SettingWithCopyWarning) and re-running the
# cell gives the same result.
test_data = test_data.assign(label=predicted_labels)


In [17]:
# Stitch the two partitions back together, then sort by row label so the rows
# return to their original order.
labeled_parts = [train_data, test_data]
result = pd.concat(labeled_parts).sort_index()


In [18]:
# Display the combined frame: rows that already had a label plus rows whose label
# was filled in from the model's predictions.
result

Unnamed: 0,from,to,subject,date,labels,content,label
1,NDLI Account Activation <ndl-support@iitkgp.ac...,Vansh <23dcs056@charusat.edu.in>,NDLI user account verification,"Thu, 18 Jul 2024 16:04:06 +0000 (UTC)","Inbox,Important,Opened,Category Updates","NDLI account verification *Dear Vansh ,* Tha...",optional
2,Radhika Patel <radhikapatel.it@charusat.ac.in>,"21dcse@charusat.edu.in, 22dcse@charusat.edu.in...",Regarding the collection of Marksheet (Odd ter...,"Thu, 18 Jul 2024 15:28:13 +0530","Inbox,Important,Opened,Category Forums","Dear Students, Greetings!!! Kindly note that...",priority
3,Radhika Patel <radhikapatel.it@charusat.ac.in>,"23dcse@charusat.edu.in, 23dce@charusat.edu.in,...",Fwd: Join AI master series by SkillDzire in as...,"Fri, 19 Jul 2024 12:41:16 +0530","Inbox,Important,Opened,Category Forums",---------- Forwarded message --------- From: S...,optional
4,"""Gaurang Patel (Classroom)"" <no-reply@classroo...",23dcs056@charusat.edu.in,=?UTF-8?Q?New_announcement=3A_=22Dear_students...,"Mon, 15 Jul 2024 03:39:42 -0700","Inbox,Category Updates,Unread",Notification settings CSE202: Microprocessor a...,priority
5,LinkedIn <messages-noreply@linkedin.com>,Vansh Malani <23dcs056@charusat.edu.in>,Kshitish Bhatt commented on your post,"Wed, 17 Jul 2024 13:25:09 +0000 (UTC)","Inbox,Category Social,Unread",View Kshitish’s profile: https://www.linkedin....,optional
...,...,...,...,...,...,...,...
481,Bhargav Shobhana <bhargavshobhana.cv@charusat....,,Important- Online Quiz of Civil Part ME145,"Thu, 11 Apr 2024 10:44:37 +0530","Inbox,Important,Opened,Category Personal","Dear Students, An online quiz is planned for ...",priority
482,Radhika Patel <radhikapatel.it@charusat.ac.in>,"23dcse@charusat.edu.in, 23dce@charusat.edu.in,...",Fwd: Engineering Physics - 2 (PY143) || Lab Ma...,"Fri, 26 Apr 2024 09:56:41 +0530","Inbox,Important,Opened,Category Forums","Regards, Radhika H. Patel, Assistant Professor...",priority
483,"""Coursera"" <Coursera@m.learn.coursera.org>",23dcs056@charusat.edu.in,NEW! AI content added to Professional Certific...,"Tue, 16 Jul 2024 15:43:58 +0000","Inbox,Category Promotions,Unread","Plus, get a special offer from Google /...",optional
484,Devpost <support@devpost.com>,23dcs056@charusat.edu.in,Level Up with Devpost's Discord Leaderboard Re...,"Thu, 11 Jul 2024 19:17:12 +0000","Trash,Category Updates,Unread",****************************************** Tak...,optional


In [20]:
# Write the fully labelled frame to disk as UTF-8; index=False leaves the row
# labels out of the file.
labeled_csv_file = 'labeled_emails_active_learning.csv'
result.to_csv(
    labeled_csv_file,
    index=False,
    encoding='utf-8',
)