In [4]:
import pandas as pd

# Load the CSV file
data_path = 'C:/Users/pooja/OneDrive/Documents/emails.csv'
email_df = pd.read_csv(data_path)

# Sample a subset of the data if needed
n_samples = 1000
if len(email_df) > n_samples:
    email_df_subset1 = email_df.sample(n=n_samples, random_state=42)
else:
    email_df_subset1 = email_df

# Add numeric and descriptive 'label' columns
def label_email(message):
    # Placeholder logic for labeling (e.g., check for keywords)
    if 'urgent' in message.lower() or 'important' in message.lower():
        return 'Important'  # Example label for important emails
    else:
        return 'Non-Important'  # Example label for non-important emails

email_df_subset1['labels'] = email_df_subset1['message'].apply(label_email)

# Display the first few rows of the sampled subset with the 'labels' column
print("First few rows of the sampled subset:")
print(email_df_subset1.head())

# Define the file path where you want to save the dataset
save_path = 'C:/Users/pooja/OneDrive/Documents/sample_emails_labeled.csv'

# Save the DataFrame to a CSV file
email_df_subset1.to_csv(save_path, index=False)

print("Dataset saved successfully at:", save_path)


First few rows of the sampled subset:
                                              file  \
427616                     shackleton-s/sent/1912.   
108773                    farmer-d/logistics/1066.   
355471                  parks-j/deleted_items/202.   
457837  stokley-c/chris_stokley/iso/client_rep/41.   
124910               germany-c/all_documents/1174.   

                                                  message         labels  
427616  Message-ID: <21013688.1075844564560.JavaMail.e...  Non-Important  
108773  Message-ID: <22688499.1075854130303.JavaMail.e...  Non-Important  
355471  Message-ID: <27817771.1075841359502.JavaMail.e...  Non-Important  
457837  Message-ID: <10695160.1075858510449.JavaMail.e...  Non-Important  
124910  Message-ID: <27819143.1075853689038.JavaMail.e...  Non-Important  
Dataset saved successfully at: C:/Users/pooja/OneDrive/Documents/sample_emails_labeled.csv


In [2]:
import pandas as pd
import re

# Load the dataset
data_path = 'C:/Users/pooja/OneDrive/Documents/sample_emails_labeled.csv'
email_df = pd.read_csv(data_path)

# Feature Engineering
# Extract sender email domain
email_df['sender_domain'] = email_df['file'].apply(lambda x: x.split('/')[-1].split('@')[-1])

# Extract time of receipt (hour of the day) from message content
def extract_time_of_receipt(message):
    time_regex = r'Date:.*?(\d{2}:\d{2}:\d{2})'
    match = re.search(time_regex, message, re.IGNORECASE)
    if match:
        time_str = match.group(1)
        return pd.to_datetime(time_str).hour
    else:
        return None

email_df['time_of_receipt'] = email_df['message'].apply(extract_time_of_receipt)

# Extract email subject length from message content
def extract_subject_length(message):
    subject_regex = r'Subject:(.*?)(?=\n)'
    match = re.search(subject_regex, message, re.IGNORECASE)
    if match:
        subject = match.group(1).strip()
        return len(subject)
    else:
        return None

email_df['subject_length'] = email_df['message'].apply(extract_subject_length)

# Display the updated DataFrame
print("Updated DataFrame with additional features:")
print(email_df.head())


Updated DataFrame with additional features:
                                         file  \
0                     shackleton-s/sent/1912.   
1                    farmer-d/logistics/1066.   
2                  parks-j/deleted_items/202.   
3  stokley-c/chris_stokley/iso/client_rep/41.   
4               germany-c/all_documents/1174.   

                                             message         labels  \
0  Message-ID: <21013688.1075844564560.JavaMail.e...  Non-Important   
1  Message-ID: <22688499.1075854130303.JavaMail.e...  Non-Important   
2  Message-ID: <27817771.1075841359502.JavaMail.e...  Non-Important   
3  Message-ID: <10695160.1075858510449.JavaMail.e...  Non-Important   
4  Message-ID: <27819143.1075853689038.JavaMail.e...  Non-Important   

  sender_domain  time_of_receipt  subject_length  
0         1912.                1              22  
1         1066.                5              25  
2          202.                4              20  
3           41.               

In [3]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, precision_score

# Define features and target variable
X = email_df[['sender_domain', 'time_of_receipt', 'subject_length']]
y = email_df['labels']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define parameter grid for tuning
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

# Initialize Random Forest classifier
rf_model = RandomForestClassifier(random_state=42)

# Perform GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Print the best parameters found
print("Best Parameters:", grid_search.best_params_)

# Use the best model for prediction
best_rf_model = grid_search.best_estimator_
y_pred = best_rf_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average=None)
print("\nAccuracy:", accuracy)
print("\nPrecision:", precision)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Best Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}

Accuracy: 0.935

Precision: [0.    0.935]

Classification Report:
               precision    recall  f1-score   support

    Important       0.00      0.00      0.00        13
Non-Important       0.94      1.00      0.97       187

     accuracy                           0.94       200
    macro avg       0.47      0.50      0.48       200
 weighted avg       0.87      0.94      0.90       200



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [4]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score, confusion_matrix
import pandas as pd

# Load or define your email_df with features and labels
# Example: email_df = pd.read_csv('your_dataset.csv')

# Define features and target variable
X = email_df[['sender_domain', 'time_of_receipt', 'subject_length']]
y = email_df['labels']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define parameter grid for tuning
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

# Initialize Random Forest classifier
rf_model = RandomForestClassifier(random_state=42)

# Perform GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Print the best parameters found
print("Best Parameters:", grid_search.best_params_)

# Use the best model for prediction
best_rf_model = grid_search.best_estimator_
y_pred = best_rf_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average=None)
recall = recall_score(y_test, y_pred, average=None)
f1 = f1_score(y_test, y_pred, average=None)
conf_matrix = confusion_matrix(y_test, y_pred)

print("\nAccuracy:", accuracy)
print("\nPrecision:", precision)
print("\nRecall:", recall)
print("\nF1-score:", f1)
print("\nConfusion Matrix:")
print(conf_matrix)

# Print Classification Report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))



Best Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}

Accuracy: 0.935

Precision: [0.    0.935]

Recall: [0. 1.]

F1-score: [0.         0.96640827]

Confusion Matrix:
[[  0  13]
 [  0 187]]

Classification Report:
               precision    recall  f1-score   support

    Important       0.00      0.00      0.00        13
Non-Important       0.94      1.00      0.97       187

     accuracy                           0.94       200
    macro avg       0.47      0.50      0.48       200
 weighted avg       0.87      0.94      0.90       200



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the CSV file
data_path = 'sample_emails_labeled.csv'
email_df = pd.read_csv(data_path)

# Feature Engineering

# Extract sender domain from the 'file' column
def extract_sender_domain(file):
    try:
        return file.split('/')[-1].split('@')[-1]
    except AttributeError:
        return 'unknown'

email_df['sender_domain'] = email_df['file'].apply(extract_sender_domain)

# Dummy sender reputation score based on sender domain length
def dummy_sender_reputation(sender_domain):
    if sender_domain == 'unknown':
        return 0
    # Simplified logic: shorter domains might be considered more trustworthy
    return len(sender_domain)

email_df['sender_reputation'] = email_df['sender_domain'].apply(dummy_sender_reputation)

# Extract time of receipt (for example purposes, let's use a fixed dummy value)
email_df['time_of_receipt'] = 12  # Assuming all emails are received at noon (12 PM)

# Extract email subject length (if subject is present in 'message' column)
def calculate_subject_length(message):
    if isinstance(message, str):
        subject = message.split('\n')[0]  # Assuming the subject is the first line of the message
        return len(subject)
    return 0

email_df['subject_length'] = email_df['message'].apply(calculate_subject_length)

# Ensure all features are numeric
email_df['sender_reputation'] = pd.to_numeric(email_df['sender_reputation'], errors='coerce').fillna(0)
email_df['time_of_receipt'] = pd.to_numeric(email_df['time_of_receipt'], errors='coerce').fillna(0)
email_df['subject_length'] = pd.to_numeric(email_df['subject_length'], errors='coerce').fillna(0)

# Define features and target variable
X = email_df[['sender_reputation', 'time_of_receipt', 'subject_length']]
y = email_df['labels']

# Ensure target variable is numeric and handle any missing values
y = pd.to_numeric(y, errors='coerce').fillna(0).astype(int)

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model Tuning

# Define parameter grid for tuning
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

# Initialize Random Forest classifier
rf_model = RandomForestClassifier(random_state=42)

# Perform GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Print the best parameters found
print("Best Parameters:", grid_search.best_params_)

# Use the best model for prediction
best_rf_model = grid_search.best_estimator_
y_pred = best_rf_model.predict(X_test)

# Evaluation Metrics

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("\nAccuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Save the sender domains and their reputation scores to a new CSV file
sender_reputation_df = email_df[['sender_domain', 'sender_reputation']].drop_duplicates()
sender_reputation_df.to_csv('sender_reputation_scores.csv', index=False)

print("Sender reputation scores saved to sender_reputation_scores.csv")




Best Parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 50}

Accuracy: 0.9975062344139651

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       400
           2       0.00      0.00      0.00         1

    accuracy                           1.00       401
   macro avg       0.50      0.50      0.50       401
weighted avg       1.00      1.00      1.00       401

Sender reputation scores saved to sender_reputation_scores.csv


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
