In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Expanded dataset to balance classes
data = {
    'email_text': [
        "Can you send me the syllabus for my course?",
        "I need to discuss a research partnership between our institutions.",
        "I'd like to know about internship opportunities at your department.",
        "I need help with accessing the course materials.",
        "Our company would like to offer placement opportunities.",
        "I'm reaching out to discuss a potential academic cooperation.",
        "I'm having trouble enrolling in the online course.",
        "We'd like to collaborate on research data sharing.",
        "Is there a way to apply for internships?",
        "Please help me with my course grades.",
        "I want to request a transcript.",
        "Can you help me with exam dates?",
        "We'd like to discuss a research project.",
        "We are interested in a joint academic conference.",
        "Could you send me details on internship programs?",
        "We need a student internship program proposal.",
        "How can I collaborate on research in your lab?",
        "Can I use your research facilities?"
    ],
    'label': [
        'student_inquiry', 'academic_collaboration', 'corporate_inquiry',
        'student_inquiry', 'corporate_inquiry', 'academic_collaboration',
        'student_inquiry', 'academic_collaboration', 'corporate_inquiry',
        'student_inquiry', 'student_inquiry', 'student_inquiry',
        'academic_collaboration', 'academic_collaboration', 'corporate_inquiry',
        'corporate_inquiry', 'academic_collaboration', 'academic_collaboration'
    ]
}

# Create a DataFrame
df = pd.DataFrame(data)

# Features and labels
X = df['email_text']
y = df['label']

# Use stratified split to preserve class distribution
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# TF-IDF Vectorization
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Initialize the Naive Bayes classifier
model = MultinomialNB()

# Train the model
model.fit(X_train_tfidf, y_train)

# Predict on test data
y_pred = model.predict(X_test_tfidf)

# Display the classification report
print(classification_report(y_test, y_pred))


                        precision    recall  f1-score   support

academic_collaboration       0.67      1.00      0.80         2
     corporate_inquiry       0.00      0.00      0.00         1
       student_inquiry       1.00      1.00      1.00         1

              accuracy                           0.75         4
             macro avg       0.56      0.67      0.60         4
          weighted avg       0.58      0.75      0.65         4



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Expanded dataset to balance classes
data = {
    'email_text': [
        "Can you send me the syllabus for my course?",
        "I need to discuss a research partnership between our institutions.",
        "I'd like to know about internship opportunities at your department.",
        "I need help with accessing the course materials.",
        "Our company would like to offer placement opportunities.",
        "I'm reaching out to discuss a potential academic cooperation.",
        "I'm having trouble enrolling in the online course.",
        "We'd like to collaborate on research data sharing.",
        "Is there a way to apply for internships?",
        "Please help me with my course grades.",
        "I want to request a transcript.",
        "Can you help me with exam dates?",
        "We'd like to discuss a research project.",
        "We are interested in a joint academic conference.",
        "Could you send me details on internship programs?",
        "We need a student internship program proposal.",
        "How can I collaborate on research in your lab?",
        "Can I use your research facilities?",
        "This is a confidential partnership proposal.",
        "We would like to explore legal aspects of our collaboration."
    ],
    'label': [
        'student_inquiry', 'academic_collaboration', 'corporate_inquiry',
        'student_inquiry', 'corporate_inquiry', 'academic_collaboration',
        'student_inquiry', 'academic_collaboration', 'corporate_inquiry',
        'student_inquiry', 'student_inquiry', 'student_inquiry',
        'academic_collaboration', 'academic_collaboration', 'corporate_inquiry',
        'corporate_inquiry', 'academic_collaboration', 'academic_collaboration',
        'corporate_inquiry', 'corporate_inquiry'
    ]
}

# Create a DataFrame
df = pd.DataFrame(data)

# Features and labels
X = df['email_text']
y = df['label']

# Use stratified split to preserve class distribution
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# TF-IDF Vectorization
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Initialize the Naive Bayes classifier
model = MultinomialNB()

# Train the model
model.fit(X_train_tfidf, y_train)

# Predict on test data
y_pred = model.predict(X_test_tfidf)

# Display the classification report
print(classification_report(y_test, y_pred))

# Email Routing and Handling
sensitive_keywords = ['confidential', 'partnership', 'legal', 'agreement']

def handle_email(email_text, classification):
    """
    Routes and handles emails based on their classification and content.
    """
    # Check if the email contains sensitive keywords
    if classification == 'corporate_inquiry' or any(word in email_text.lower() for word in sensitive_keywords):
        return f"[Escalated for manual response] {email_text}"
    else:
        return f"[Auto Response Sent] {email_text}"

# Simulate routing of test emails
print("\nEmail Routing & Handling:\n")
for email, label in zip(X_test, y_pred):
    result = handle_email(email, label)
    print(result)


                        precision    recall  f1-score   support

academic_collaboration       1.00      1.00      1.00         2
     corporate_inquiry       0.50      1.00      0.67         1
       student_inquiry       0.00      0.00      0.00         1

              accuracy                           0.75         4
             macro avg       0.50      0.67      0.56         4
          weighted avg       0.62      0.75      0.67         4


Email Routing & Handling:

[Auto Response Sent] We'd like to discuss a research project.
[Auto Response Sent] We are interested in a joint academic conference.
[Escalated for manual response] I want to request a transcript.
[Escalated for manual response] We would like to explore legal aspects of our collaboration.


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Sample data
data = {
    'email_text': [
        "Can you send me the syllabus for my course?",
        "I need to discuss a research partnership between our institutions.",
        "I'd like to know about internship opportunities at your department.",
        "I need help with accessing the course materials.",
        "Our company would like to offer placement opportunities.",
        "I'm reaching out to discuss a potential academic cooperation.",
        "I'm having trouble enrolling in the online course.",
        "We'd like to collaborate on research data sharing.",
        "Is there a way to apply for internships?",
        "Please help me with my course grades.",
        "I want to request a transcript.",
        "Can you help me with exam dates?",
        "We'd like to discuss a research project.",
        "We are interested in a joint academic conference.",
        "Could you send me details on internship programs?",
        "We need a student internship program proposal.",
        "How can I collaborate on research in your lab?",
        "Can I use your research facilities?",
        "This is a confidential partnership proposal.",
        "We would like to explore legal aspects of our collaboration."
    ],
    'label': [
        'student_inquiry', 'academic_collaboration', 'corporate_inquiry',
        'student_inquiry', 'corporate_inquiry', 'academic_collaboration',
        'student_inquiry', 'academic_collaboration', 'corporate_inquiry',
        'student_inquiry', 'student_inquiry', 'student_inquiry',
        'academic_collaboration', 'academic_collaboration', 'corporate_inquiry',
        'corporate_inquiry', 'academic_collaboration', 'academic_collaboration',
        'corporate_inquiry', 'corporate_inquiry'
    ]
}

# Create a DataFrame
df = pd.DataFrame(data)

# Features and labels
X = df['email_text']
y = df['label']

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# TF-IDF Vectorization
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Naive Bayes Classifier
model = MultinomialNB()
model.fit(X_train_tfidf, y_train)

# Predict on test data
y_pred = model.predict(X_test_tfidf)

# Sensitive keywords for corporate inquiries
sensitive_keywords = ['confidential', 'partnership', 'legal', 'agreement', 'contract']

# Improved auto-response generator function
def generate_response(email_text, classification):
    """
    Generates a context-specific auto-response based on email classification.
    """
    if classification == 'student_inquiry':
        if 'syllabus' in email_text.lower():
            return "Attached is the syllabus for your course."
        elif 'course materials' in email_text.lower():
            return "You can access the course materials from the department portal."
        elif 'grades' in email_text.lower():
            return "Please check the student portal for your latest grades."
        elif 'transcript' in email_text.lower():
            return "You can request your transcript through the university's online system."
        elif 'exam' in email_text.lower():
            return "The exam schedule is available on the university's website."
        else:
            return "Your inquiry has been received. We'll get back to you shortly."

    elif classification == 'academic_collaboration':
        if 'partnership' in email_text.lower():
            return "We would be happy to discuss a potential research partnership. Please provide more details."
        elif 'conference' in email_text.lower():
            return "We are open to discussing joint academic conferences. Please share more information."
        else:
            return "Thank you for reaching out. We are interested in discussing academic collaboration further."

    elif classification == 'corporate_inquiry':
        return "Thank you for your corporate inquiry. Your request has been forwarded to the relevant department for review."

    return "Thank you for your email. We will get back to you shortly."

# Function to handle email routing and response
def handle_email(email_text, classification):
    """
    Routes and handles emails based on their classification and content.
    """
    # Check for sensitive keywords in corporate inquiries
    if classification == 'corporate_inquiry' or any(word in email_text.lower() for word in sensitive_keywords):
        return f"[Escalated for manual handling] {email_text}"
    else:
        # Generate an auto-response based on classification
        response = generate_response(email_text, classification)
        return f"[Auto Response] {response}"

# Simulate email routing and handling for the test emails
print("Email Routing & Handling:\n")
for email, label in zip(X_test, y_pred):
    result = handle_email(email, label)
    print(result)


Email Routing & Handling:

[Auto Response] Thank you for reaching out. We are interested in discussing academic collaboration further.
[Auto Response] We are open to discussing joint academic conferences. Please share more information.
[Escalated for manual handling] I want to request a transcript.
[Escalated for manual handling] We would like to explore legal aspects of our collaboration.


In [5]:
import random
import nltk
from nltk.corpus import wordnet
import numpy as np

nltk.download('wordnet')

# Example templates for email generation
student_templates = [
    "Dear Professor, I wanted to ask about {topic}. I missed the last class and would appreciate it if you could share the materials. Thank you!",
    "Respected Sir/Madam, Could you please confirm the {topic} deadline? I’m unsure if it is due by this {time_frame}. Thank you!",
]

academic_collab_templates = [
    "Dear Dr. {name}, I am working on research related to {topic} and saw your recent publication. I am interested in exploring a collaboration opportunity. Can we schedule a meeting?",
    "Hello Professor, I am a researcher from {institution} and would like to inquire about using your department’s {resource}. What would be the procedure to request access?"
]

corporate_templates = [
    "Dear Hiring Manager, I’m writing to ask about {topic} at your department for the summer of {year}. Could you provide details about the application process?",
    "Dear Head of Department, We are interested in organizing a {event} at your university for recruitment. Please let us know the steps to proceed."
]

# Selection of variables
topics = ["course material", "assignment", "exam schedule", "research collaboration", "research facilities"]
time_frames = ["Friday", "next Monday", "the upcoming weekend"]
names = ["Johnson", "Robertson", "Smith"]
institutions = ["XYZ University", "ABC Institute"]
resources = ["lab facilities", "computing resources", "research equipment"]
events = ["placement drive", "recruitment seminar"]
years = ["2024", "2025"]

# Synonym Replacement Function
def synonym_replacement(sentence, n=1):
    words = sentence.split()
    new_words = words.copy()
    random_word_list = list(set([word for word in words if wordnet.synsets(word)]))

    if len(random_word_list) == 0:
        return sentence

    random.shuffle(random_word_list)
    num_replaced = 0
    for random_word in random_word_list:
        synonyms = wordnet.synsets(random_word)
        synonym_words = [syn.lemmas()[0].name() for syn in synonyms if syn.lemmas()]
        if len(synonym_words) > 0:
            synonym = random.choice(synonym_words)
            new_words = [synonym if word == random_word else word for word in new_words]
            num_replaced += 1
        if num_replaced >= n:
            break
    return ' '.join(new_words)

# Random Deletion Function
def random_deletion(sentence, p=0.1):
    words = sentence.split()
    if len(words) == 1:  # Only one word in sentence, can't delete
        return sentence
    remaining_words = [word for word in words if random.uniform(0, 1) > p]
    if len(remaining_words) == 0:  # Avoid empty sentence
        return random.choice(words)
    return ' '.join(remaining_words)

# Sentence Shuffling
def sentence_shuffling(text):
    sentences = text.split('. ')
    random.shuffle(sentences)
    return '. '.join(sentences)

# Generate Random Emails
def generate_email(category):
    if category == 'student':
        template = random.choice(student_templates)
        email = template.format(
            topic=random.choice(topics),
            time_frame=random.choice(time_frames)
        )
    elif category == 'academic_collab':
        template = random.choice(academic_collab_templates)
        email = template.format(
            topic=random.choice(topics),
            name=random.choice(names),
            institution=random.choice(institutions),
            resource=random.choice(resources)
        )
    elif category == 'corporate':
        template = random.choice(corporate_templates)
        email = template.format(
            topic=random.choice(topics),
            event=random.choice(events),
            year=random.choice(years)
        )
    return email

# Apply augmentations
def augment_email(email):
    # Apply synonym replacement
    email = synonym_replacement(email, n=2)
    # Apply random deletion
    email = random_deletion(email, p=0.1)
    # Shuffle sentences
    email = sentence_shuffling(email)
    return email

# Generate and augment emails for each category
def generate_dataset(n_emails=10):
    dataset = []
    categories = ['student', 'academic_collab', 'corporate']

    for _ in range(n_emails):
        category = random.choice(categories)
        email = generate_email(category)
        augmented_email = augment_email(email)
        dataset.append({'email': augmented_email, 'category': category})

    return dataset

# Generate and print a sample dataset
sample_dataset = generate_dataset(n_emails=10)
for email in sample_dataset:
    print(f"Email: {email['email']}")
    print(f"Category: {email['category']}\n")


[nltk_data] Downloading package wordnet to /root/nltk_data...


Email: Thank you!. Dear Professor, I ask about inquiry collaboration. the last class and would appreciate it you could share materials
Category: student

Email: Dear Hiring Manager, I’m writing to ask about inquiry collaboration at your department the summer of 2025. Could you details about the application process?
Category: corporate

Email: Head Department, We are interest in organizing a recruitment seminar astatine your university for recruitment. Please let us know the steps to proceed.
Category: corporate

Email: Robertson, I am working on research related to course material and saw your recent publication. Can we schedule a meeting?. I am interested in exploring a collaboration opportunity. Dear Dr
Category: academic_collab

Email: Dear Hiring Manager, I’m writing to ask about research facilities astatine your department for the summer of 2024. Could you provide details the application process?
Category: corporate

Email: I missed the last class and would appreciate it if you sh

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer

# Sample dataset from the earlier generation step
sample_dataset = generate_dataset(n_emails=100)

# Convert the dataset to a pandas DataFrame
df = pd.DataFrame(sample_dataset)

# Encode labels to numeric format
label_mapping = {'student': 0, 'academic_collab': 1, 'corporate': 2}
df['label'] = df['category'].map(label_mapping)

# Split into train and test sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the emails and prepare inputs for BERT
def tokenize_data(data, tokenizer, max_length=128):
    return tokenizer(
        list(data['email']),
        max_length=max_length,
        padding=True,
        truncation=True,
        return_tensors='pt'
    )

# Tokenize train and test data
train_encodings = tokenize_data(train_df, tokenizer)
test_encodings = tokenize_data(test_df, tokenizer)

train_labels = list(train_df['label'])
test_labels = list(test_df['label'])


import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertForSequenceClassification, AdamW
from tqdm import tqdm

class EmailDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

# Create dataset and dataloader
train_dataset = EmailDataset(train_encodings, train_labels)
test_dataset = EmailDataset(test_encodings, test_labels)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# Load BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

# Set up optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Move model to GPU if available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# Training loop
model.train()
epochs = 10
for epoch in range(epochs):
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        # Move batch data to GPU
        batch = {k: v.to(device) for k, v in batch.items()}

        # Forward pass
        outputs = model(**batch)
        loss = outputs.loss

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Update progress bar
        loop.set_description(f'Epoch {epoch+1}')
        loop.set_postfix(loss=loss.item())


from sklearn.metrics import classification_report

# Evaluation function
def evaluate(model, dataloader):
    model.eval()
    true_labels = []
    predictions = []

    with torch.no_grad():
        for batch in dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            logits = outputs.logits
            predictions.extend(torch.argmax(logits, dim=1).cpu().numpy())
            true_labels.extend(batch['labels'].cpu().numpy())

    return true_labels, predictions

# Evaluate on test data
true_labels, predictions = evaluate(model, test_loader)

# Print classification report
# print(classification_report(true_labels, predictions, target_names=label_mapping.keys()))


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 1: 100%|██████████| 10/10 [01:04<00:00,  6.41s/it, loss=1.09]
Epoch 2: 100%|██████████| 10/10 [00:38<00:00,  3.89s/it, loss=0.64]
Epoch 3: 100%|██████████| 10/10 [00:41<00:00,  4.15s/it, loss=0.293]
Epoch 4: 100%|██████████| 10/10 [00:39<00:00,  3.94s/it, loss=0.0852]
Epoch 5: 100%|██████████| 10/10 [00:41<00:00,  4.12s/it, loss=0.0518]
Epoch 6: 100%|██████████| 10/10 [00:39<00:00,  3.98s/it, loss=0.0346]
Epoch 7: 100%|██████████| 10/10 [00:39<00:00,  3.93s/it, loss=0.0162]
Epoch 8: 100%|██████████| 10/10 [00:37<00:00,  3.78s/it, loss=0.013]
Epoch 9: 100%|██████████| 10/10 [00:39<00:00,  3.93s/it, loss=0.00617]
Epoc