In [2]:
import pandas as pd

# Load the CSV file
data_path = 'C:/Users/pooja/OneDrive/Documents/emails.csv'
email_df = pd.read_csv(data_path)

# Sample a subset of the data if needed
n_samples = 1000
if len(email_df) > n_samples:
    email_df_subset = email_df.sample(n=n_samples, random_state=42)
else:
    email_df_subset = email_df

# Add numeric and descriptive 'label' columns
def label_email(message):
    # Placeholder logic for labeling (e.g., check for keywords)
    if 'urgent' in message.lower() or 'important' in message.lower():
        return 'Important'  # Example label for important emails
    else:
        return 'Non-Important'  # Example label for non-important emails

email_df_subset['labels'] = email_df_subset['message'].apply(label_email)

# Display the first few rows of the sampled subset with the 'labels' column
print("First few rows of the sampled subset:")
print(email_df_subset.head())





First few rows of the sampled subset:
                                              file  \
427616                     shackleton-s/sent/1912.   
108773                    farmer-d/logistics/1066.   
355471                  parks-j/deleted_items/202.   
457837  stokley-c/chris_stokley/iso/client_rep/41.   
124910               germany-c/all_documents/1174.   

                                                  message         labels  
427616  Message-ID: <21013688.1075844564560.JavaMail.e...  Non-Important  
108773  Message-ID: <22688499.1075854130303.JavaMail.e...  Non-Important  
355471  Message-ID: <27817771.1075841359502.JavaMail.e...  Non-Important  
457837  Message-ID: <10695160.1075858510449.JavaMail.e...  Non-Important  
124910  Message-ID: <27819143.1075853689038.JavaMail.e...  Non-Important  


In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np
import joblib

# Function to simulate adversarial attacks
def simulate_adversarial_attacks(X):
    # dd random noise to the input features
    noisy_X = X.toarray() + np.random.normal(0, 0.007, size=X.shape)
    return noisy_X

# Extract features and labels
X = email_df_subset['message']
y = email_df_subset['labels']

# Split the data into training, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)
X_test_tfidf = vectorizer.transform(X_test)

# Initialize and train the Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_tfidf, y_train)

# Evaluate the model on the validation set
print("\nValidation results (before adversarial attacks):")
y_pred_val = model.predict(X_val_tfidf)
print(f'Validation Accuracy: {accuracy_score(y_val, y_pred_val)}')
print(f'Validation Precision: {precision_score(y_val, y_pred_val, pos_label="Important")}')
print(f'Validation Recall: {recall_score(y_val, y_pred_val, pos_label="Important")}')
print(f'Validation F1 Score: {f1_score(y_val, y_pred_val, pos_label="Important")}')

# Simulate adversarial attacks on the validation set
X_val_adversarial = simulate_adversarial_attacks(X_val_tfidf)

# Evaluate the model on the perturbed validation set
print("\nValidation results (after adversarial attacks):")
y_pred_val_adv = model.predict(X_val_adversarial)
print(f'Validation Accuracy: {accuracy_score(y_val, y_pred_val_adv)}')
print(f'Validation Precision: {precision_score(y_val, y_pred_val_adv, pos_label="Important")}')
print(f'Validation Recall: {recall_score(y_val, y_pred_val_adv, pos_label="Important")}')
print(f'Validation F1 Score: {f1_score(y_val, y_pred_val_adv, pos_label="Important")}')

# Simulate adversarial attacks on the test set
X_test_adversarial = simulate_adversarial_attacks(X_test_tfidf)

# Evaluate the model on the perturbed test set
print("\nTest results (after adversarial attacks):")
y_pred_test_adv = model.predict(X_test_adversarial)
print(f'Test Accuracy: {accuracy_score(y_test, y_pred_test_adv)}')
print(f'Test Precision: {precision_score(y_test, y_pred_test_adv, pos_label="Important")}')
print(f'Test Recall: {recall_score(y_test, y_pred_test_adv, pos_label="Important")}')
print(f'Test F1 Score: {f1_score(y_test, y_pred_test_adv, pos_label="Important")}')

# Save the model
joblib.dump(model, 'email_detection_model.pkl')



Validation results (before adversarial attacks):
Validation Accuracy: 0.9466666666666667
Validation Precision: 0.0
Validation Recall: 0.0
Validation F1 Score: 0.0

Validation results (after adversarial attacks):


  _warn_prf(average, modifier, msg_start, len(result))


Validation Accuracy: 0.8533333333333334
Validation Precision: 0.20833333333333334
Validation Recall: 0.625
Validation F1 Score: 0.3125

Test results (after adversarial attacks):
Test Accuracy: 0.84
Test Precision: 0.24
Test Recall: 0.5454545454545454
Test F1 Score: 0.3333333333333333


['email_detection_model.pkl']

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np
import joblib

# Function to simulate adversarial attacks
def simulate_adversarial_attacks(X):
    # Placeholder function to simulate adversarial attacks (replace with actual implementation)
    # For demonstration purposes, let's add random noise to the input features
    noisy_X = X.toarray() + np.random.normal(0, 0.007, size=X.shape)
    return noisy_X

# Extract features and labels
X = email_df_subset['message']
y = email_df_subset['labels']

# Split the data into training, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)
X_test_tfidf = vectorizer.transform(X_test)

# Initialize the base models
rf = RandomForestClassifier(n_estimators=100, random_state=42)
lr = LogisticRegression(max_iter=1000, random_state=42)
gb = GradientBoostingClassifier(n_estimators=100, random_state=42)

# Create an ensemble model using VotingClassifier
ensemble_model = VotingClassifier(estimators=[
    ('random_forest', rf),
    ('logistic_regression', lr),
    ('gradient_boosting', gb)
], voting='hard')

# Train the ensemble model
ensemble_model.fit(X_train_tfidf, y_train)

# Evaluate the model on the validation set
print("\nValidation results (before adversarial attacks):")
y_pred_val = ensemble_model.predict(X_val_tfidf)
print(f'Validation Accuracy: {accuracy_score(y_val, y_pred_val)}')
print(f'Validation Precision: {precision_score(y_val, y_pred_val, pos_label="Important")}')
print(f'Validation Recall: {recall_score(y_val, y_pred_val, pos_label="Important")}')
print(f'Validation F1 Score: {f1_score(y_val, y_pred_val, pos_label="Important")}')

# Simulate adversarial attacks on the validation set
X_val_adversarial = simulate_adversarial_attacks(X_val_tfidf)

# Evaluate the model on the perturbed validation set
print("\nValidation results (after adversarial attacks):")
y_pred_val_adv = ensemble_model.predict(X_val_adversarial)
print(f'Validation Accuracy: {accuracy_score(y_val, y_pred_val_adv)}')
print(f'Validation Precision: {precision_score(y_val, y_pred_val_adv, pos_label="Important")}')
print(f'Validation Recall: {recall_score(y_val, y_pred_val_adv, pos_label="Important")}')
print(f'Validation F1 Score: {f1_score(y_val, y_pred_val_adv, pos_label="Important")}')

# Simulate adversarial attacks on the test set
X_test_adversarial = simulate_adversarial_attacks(X_test_tfidf)

# Evaluate the model on the perturbed test set
print("\nTest results (after adversarial attacks):")
y_pred_test_adv = ensemble_model.predict(X_test_adversarial)
print(f'Test Accuracy: {accuracy_score(y_test, y_pred_test_adv)}')
print(f'Test Precision: {precision_score(y_test, y_pred_test_adv, pos_label="Important")}')
print(f'Test Recall: {recall_score(y_test, y_pred_test_adv, pos_label="Important")}')
print(f'Test F1 Score: {f1_score(y_test, y_pred_test_adv, pos_label="Important")}')

# Save the ensemble model
joblib.dump(ensemble_model, 'email_detection_ensemble_model.pkl')



Validation results (before adversarial attacks):
Validation Accuracy: 0.9466666666666667
Validation Precision: 0.0
Validation Recall: 0.0
Validation F1 Score: 0.0

Validation results (after adversarial attacks):


  _warn_prf(average, modifier, msg_start, len(result))


Validation Accuracy: 0.9133333333333333
Validation Precision: 0.3333333333333333
Validation Recall: 0.625
Validation F1 Score: 0.43478260869565216

Test results (after adversarial attacks):
Test Accuracy: 0.9
Test Precision: 0.35714285714285715
Test Recall: 0.45454545454545453
Test F1 Score: 0.4


['email_detection_ensemble_model.pkl']

In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np
import joblib

def simulate_adversarial_attacks(X):
    # Placeholder function to simulate adversarial attacks (replace with actual implementation)
    # For demonstration purposes, let's add random noise to the input features
    noise = np.random.normal(0, 0.007, size=X.shape)
    noisy_X = X.toarray() + noise  # Convert X to array before adding noise
    return noisy_X

# Extract features and labels
X = email_df_subset['message']
y = email_df_subset['labels']

# Split the data into training, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)
X_test_tfidf = vectorizer.transform(X_test)

# Adversarial Training
# Generate adversarial examples
X_train_adversarial = simulate_adversarial_attacks(X_train_tfidf)
# Augment training data with adversarial examples
X_train_augmented = np.vstack([X_train_tfidf.toarray(), X_train_adversarial])
y_train_augmented = np.hstack([y_train, y_train])
# Train the model with augmented data
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_augmented, y_train_augmented)

# Regularization
# Add regularization to the model
model = RandomForestClassifier(n_estimators=100, random_state=42, ccp_alpha=0.1)

# Model Architecture
# Experiment with a different model architecture
model = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)

# Data Augmentation
# Augment training data with noise addition
X_train_noisy = X_train_tfidf.toarray() + np.random.normal(0, 0.01, size=X_train_tfidf.shape)
X_train_augmented = np.vstack([X_train_tfidf.toarray(), X_train_noisy])
y_train_augmented = np.hstack([y_train, y_train])

# Ensemble Methods
# Train an ensemble of models with different architectures
model1 = RandomForestClassifier(n_estimators=100, random_state=42)
model2 = RandomForestClassifier(n_estimators=200, random_state=42)
models = [model1, model2]
for model in models:
    model.fit(X_train_tfidf, y_train)

# Evaluation on Adversarial Examples
# Simulate adversarial attacks on the test set
X_test_adversarial = simulate_adversarial_attacks(X_test_tfidf)
# Evaluate the model on the perturbed test set
y_pred_test_adv = model.predict(X_test_adversarial)
print(f'Test Accuracy: {accuracy_score(y_test, y_pred_test_adv)}')
print(f'Test Precision: {precision_score(y_test, y_pred_test_adv, pos_label="Important")}')
print(f'Test Recall: {recall_score(y_test, y_pred_test_adv, pos_label="Important")}')
print(f'Test F1 Score: {f1_score(y_test, y_pred_test_adv, pos_label="Important")}')

# Save the model
joblib.dump(model, 'email_detection_model.pkl')




Test Accuracy: 0.9066666666666666
Test Precision: 0.4117647058823529
Test Recall: 0.6363636363636364
Test F1 Score: 0.5


['email_detection_model.pkl']

KeyboardInterrupt: Interrupted by user