In [2]:
import os
import re
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D, Bidirectional, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import pickle

# Set a random seed for reproducibility
# Both NumPy's and TensorFlow's global generators are seeded with the same
# fixed value before any data splitting or model construction happens.
np.random.seed(42)
tf.random.set_seed(42)

# Define the data folders
# Root of the first corpus. The loading loop walks it recursively and labels a
# file non-spam (0) when its directory path contains 'nonspam', spam (1) otherwise.
lr_nn_data_folder = 'C:\\Users\\kunalrajput\\Downloads\\SpamFilterMachineLearning-master\\SpamFilterMachineLearning-master\\data\\'
# Folders of the second corpus whose '.txt' files are all labelled non-spam (0).
nb_nonspam_folders = [
    'D:\\spam test for pligun\\main test\\easy_ham',
    'D:\\spam test for pligun\\main test\\easy_ham_2',
    'D:\\spam test for pligun\\main test\\hard_ham'
]
# Folders of the second corpus whose '.txt' files are all labelled spam (1).
nb_spam_folders = [
    'D:\\spam test for pligun\\main test\\spam',
    'D:\\spam test for pligun\\main test\\spam_2'
]

# Function to process and clean a single file
def process_file(file_path):
    """Return the text of one e-mail file with short angle-bracket tags removed.

    The file is decoded as latin-1. Every ``<x>`` / ``<xy>`` token whose
    inside is one or two word characters (for example ``<p>`` or ``<br>``) is
    deleted; closing tags such as ``</p>`` and longer tags such as ``<div>``
    do not match the pattern and are kept.

    Args:
        file_path: Path of the file to read.

    Returns:
        The cleaned file content as a single string.
    """
    with open(file_path, mode='r', encoding='latin-1') as handle:
        raw_text = handle.read()
    # Substitution happens after the handle is closed; the result is the same.
    return re.sub(r'\<\w{1,2}\>', '', raw_text)

# Gather every e-mail body together with its label (0 = non-spam, 1 = spam).
all_texts = []
all_labels = []

# First corpus: every file under lr_nn_data_folder is read, whatever its
# extension. A directory whose path contains 'nonspam' yields label 0;
# any other directory yields label 1.
for dir_path, _subdirs, file_names in os.walk(lr_nn_data_folder):
    for file_name in file_names:
        all_texts.append(process_file(os.path.join(dir_path, file_name)))
        all_labels.append(0 if 'nonspam' in dir_path else 1)

# Second corpus: the label is fixed per folder group, and only file names
# ending in '.txt' (or '.txt.txt') are read.
for folder_group, group_label in ((nb_nonspam_folders, 0), (nb_spam_folders, 1)):
    for top_folder in folder_group:
        for dir_path, _subdirs, file_names in os.walk(top_folder):
            for file_name in file_names:
                if not file_name.endswith(('.txt', '.txt.txt')):
                    continue
                all_texts.append(process_file(os.path.join(dir_path, file_name)))
                all_labels.append(group_label)

# Sanity check: both lists must have the same length.
print(f"Number of texts: {len(all_texts)}")
print(f"Number of labels: {len(all_labels)}")

# Hold out 20% of the e-mails for testing; the fixed random_state keeps the
# split repeatable between runs.
X_train_texts, X_test_texts, y_train, y_test = train_test_split(
    all_texts,
    all_labels,
    test_size=0.2,
    random_state=42,
)

# Turn both label sequences into NumPy arrays.
y_train, y_test = np.array(y_train), np.array(y_test)

# TF-IDF features capped at 5000 terms. The vocabulary is learned from the
# training texts only and then applied unchanged to the test texts.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train_texts)
X_test_tfidf = vectorizer.transform(X_test_texts)

# Persist the fitted vectorizer.
with open('tfidf_vectorizer.pkl', 'wb') as vectorizer_out:
    pickle.dump(vectorizer, vectorizer_out)

# Multinomial Naive Bayes on the TF-IDF matrix (fit() returns the estimator).
nb_model = MultinomialNB().fit(X_train_tfidf, y_train)

# Persist the fitted Naive Bayes model.
with open('nb_model.pkl', 'wb') as nb_out:
    pickle.dump(nb_model, nb_out)

# Integer-sequence inputs shared by the neural network and the BiLSTM.
max_seq_length = 100  # every sequence is padded/truncated to this length

# Word index limited to 5000 entries, learned from the training texts only.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train_texts)

# Texts -> lists of word indices (train first, then test).
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(split_texts)
    for split_texts in (X_train_texts, X_test_texts)
)

# Index lists -> fixed-width matrices; padding='post' pads at the end.
X_train_pad, X_test_pad = (
    pad_sequences(split_seq, maxlen=max_seq_length, padding='post')
    for split_seq in (X_train_seq, X_test_seq)
)

# Persist the fitted tokenizer.
with open('tokenizer.pkl', 'wb') as tokenizer_out:
    pickle.dump(tokenizer, tokenizer_out)

# Clear any previous TensorFlow/Keras session state
tf.keras.backend.clear_session()

# Create and train the Neural Network model
# Layers, in order: 64-dimensional embedding, global average pooling,
# 64-unit ReLU dense layer, single sigmoid output unit.
nn_model = Sequential()
# input_dim=5000 mirrors Tokenizer(num_words=5000); input_length is the padded width.
nn_model.add(Embedding(input_dim=5000, output_dim=64, input_length=max_seq_length))
nn_model.add(GlobalAveragePooling1D())
nn_model.add(Dense(64, activation='relu'))
nn_model.add(Dense(1, activation='sigmoid'))

# Binary cross-entropy matches the 0/1 labels and the sigmoid output.
nn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# No validation data is passed, so only training-set metrics are tracked.
nn_model.fit(X_train_pad, y_train, epochs=10, batch_size=32, verbose=1)

# Save the Neural Network model
# The architecture goes to nn_model.json and the weights to nn_model.h5,
# both relative to the current working directory.
nn_model_json = nn_model.to_json()
with open('nn_model.json', 'w') as json_file:
    json_file.write(nn_model_json)
nn_model.save_weights('nn_model.h5')

# Create and train the BiLSTM model
# Layers, in order: 64-dimensional embedding, bidirectional LSTM with 64 units
# per direction, 64-unit ReLU dense layer, single sigmoid output unit.
bilstm_model = Sequential()
# Same vocabulary size and padded width as the neural network above.
bilstm_model.add(Embedding(input_dim=5000, output_dim=64, input_length=max_seq_length))
bilstm_model.add(Bidirectional(LSTM(64)))
bilstm_model.add(Dense(64, activation='relu'))
bilstm_model.add(Dense(1, activation='sigmoid'))

bilstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# Trained on the same padded sequences and labels; no validation data is passed.
bilstm_model.fit(X_train_pad, y_train, epochs=10, batch_size=32, verbose=1)

# Save the BiLSTM model
# The architecture goes to bilstm_model.json and the weights to
# bilstm_model.h5, both relative to the current working directory.
bilstm_model_json = bilstm_model.to_json()
with open('bilstm_model.json', 'w') as json_file:
    json_file.write(bilstm_model_json)
bilstm_model.save_weights('bilstm_model.h5')

# Meta-features for the training split: the dense TF-IDF matrix with the
# neural network's and the BiLSTM's predictions appended as two extra columns.
# NOTE(review): these predictions are made on the same rows the two networks
# were trained on -- confirm that in-sample stacking is intended.
nn_train_predictions, bilstm_train_predictions = (
    base_model.predict(X_train_pad) for base_model in (nn_model, bilstm_model)
)
X_train_meta = np.hstack(
    (X_train_tfidf.toarray(), nn_train_predictions, bilstm_train_predictions)
)

# Logistic Regression is the meta-model fitted on those stacked features
# (fit() returns the estimator itself).
lr_model = LogisticRegression(max_iter=1000).fit(X_train_meta, y_train)

# Persist the fitted meta-model.
with open('lr_model.pkl', 'wb') as lr_out:
    pickle.dump(lr_model, lr_out)

# Same feature layout for the held-out split.
# NOTE(review): X_test_meta is not referenced again in the visible code.
nn_test_predictions, bilstm_test_predictions = (
    base_model.predict(X_test_pad) for base_model in (nn_model, bilstm_model)
)
X_test_meta = np.hstack(
    (X_test_tfidf.toarray(), nn_test_predictions, bilstm_test_predictions)
)

# Function to predict if a given email text is spam or not
def predict_email(email_text):
    """Classify one e-mail with all four models and print a weighted consensus.

    Relies on the module-level ``vectorizer``, ``tokenizer``, ``nb_model``,
    ``nn_model``, ``bilstm_model``, ``lr_model`` and ``max_seq_length``.
    The text is featurised exactly as the training data was: TF-IDF for
    Naive Bayes, a padded index sequence for the two networks, and TF-IDF
    plus both network outputs for the Logistic Regression meta-model.

    Args:
        email_text: Raw e-mail text to classify.

    Returns:
        None. The individual predictions and the consensus are printed.
    """
    sample = [email_text]

    # Naive Bayes works on the TF-IDF row.
    tfidf_row = vectorizer.transform(sample)
    nb_prediction = nb_model.predict(tfidf_row)

    # Both networks share one padded index sequence.
    padded_row = pad_sequences(
        tokenizer.texts_to_sequences(sample),
        maxlen=max_seq_length,
        padding='post',
    )
    nn_prediction = nn_model.predict(padded_row)
    bilstm_prediction = bilstm_model.predict(padded_row)

    # Meta-model input: TF-IDF row followed by the two network outputs.
    meta_row = np.hstack((tfidf_row.toarray(), nn_prediction, bilstm_prediction))
    lr_prediction = lr_model.predict(meta_row)

    # Report each model's raw output.
    for title, value in (
        ("Logistic Regression Prediction:", lr_prediction),
        ("Neural Network Prediction:", nn_prediction),
        ("BiLSTM Prediction:", bilstm_prediction),
        ("Naive Bayes Prediction:", nb_prediction),
    ):
        print(title, value)

    # Weighted vote: the meta-model counts double. The two network terms are
    # sigmoid outputs, while the LR and NB terms come from .predict().
    weights = {'lr': 0.4, 'nn': 0.2, 'bilstm': 0.2, 'nb': 0.2}
    final_score = (
        (lr_prediction * weights['lr'])
        + (nn_prediction * weights['nn'])
        + (bilstm_prediction * weights['bilstm'])
        + (nb_prediction * weights['nb'])
    )

    # A score of at least 0.5 is reported as spam.
    if final_score >= 0.5:
        print("Consensus: The email is predicted to be spam.")
    else:
        print("Consensus: The email is not predicted to be spam.")

# Ask the user to enter an email
# One line is read from standard input and classified; predict_email prints
# each model's prediction followed by the consensus.
email_text = input("Enter the email text: ")
predict_email(email_text)


Number of texts: 7473
Number of labels: 7473
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


Enter the email text:  this is kunal and i want help


Logistic Regression Prediction: [1]
Neural Network Prediction: [[0.8087951]]
BiLSTM Prediction: [[0.99984396]]
Naive Bayes Prediction: [0]
Consensus: The email is predicted to be spam.


In [3]:
import os
import re
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D, Bidirectional, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import pickle

# Set a random seed for reproducibility
# Both NumPy's and TensorFlow's global generators are seeded with the same
# fixed value before any data splitting or model construction happens.
np.random.seed(42)
tf.random.set_seed(42)

# Define the data folders
# Root of the first corpus. The loading loop walks it recursively and labels a
# file non-spam (0) when its directory path contains 'nonspam', spam (1) otherwise.
lr_nn_data_folder = 'C:\\Users\\kunalrajput\\Downloads\\SpamFilterMachineLearning-master\\SpamFilterMachineLearning-master\\data\\'
# Folders of the second corpus whose '.txt' files are all labelled non-spam (0).
nb_nonspam_folders = [
    'D:\\spam test for pligun\\main test\\easy_ham',
    'D:\\spam test for pligun\\main test\\easy_ham_2',
    'D:\\spam test for pligun\\main test\\hard_ham'
]
# Folders of the second corpus whose '.txt' files are all labelled spam (1).
nb_spam_folders = [
    'D:\\spam test for pligun\\main test\\spam',
    'D:\\spam test for pligun\\main test\\spam_2'
]

# Function to process and clean a single file
def process_file(file_path):
    """Load an e-mail file and drop its one- or two-letter angle-bracket tags.

    The file is read with latin-1 decoding. Tokens such as ``<p>`` or ``<br>``
    (one or two word characters between ``<`` and ``>``) are removed; anything
    else, including ``</p>`` and ``<div>``, is left in place.

    Args:
        file_path: Path of the file to read.

    Returns:
        The cleaned file content as a single string.
    """
    with open(file_path, mode='r', encoding='latin-1') as source:
        text = source.read()
    cleaned = re.sub(r'\<\w{1,2}\>', '', text)
    return cleaned

# Combine texts and labels from all datasets (label 0 = non-spam, 1 = spam)
all_texts = []
all_labels = []

# Process all files in the LR+NN data folder
# Every file is read regardless of extension. A directory whose path contains
# 'nonspam' yields label 0; any other directory yields label 1.
for root, _, files in os.walk(lr_nn_data_folder):
    label = 0 if 'nonspam' in root else 1
    for file in files:
        # Fixed: this line referenced the undefined name `Aroot`, which raised
        # NameError as soon as the first file was visited.
        file_path = os.path.join(root, file)
        all_texts.append(process_file(file_path))
        all_labels.append(label)

# Process all files in the Naive Bayes folders
# The label is fixed per folder group: non-spam folders first, then spam.
for folders, label in ((nb_nonspam_folders, 0), (nb_spam_folders, 1)):
    for folder in folders:
        for root, _, files in os.walk(folder):
            for file in files:
                # A name ending in '.txt.txt' also ends in '.txt', so a single
                # suffix test selects exactly the same files as before.
                if file.endswith('.txt'):
                    file_path = os.path.join(root, file)
                    all_texts.append(process_file(file_path))
                    all_labels.append(label)

# Check if lengths of texts and labels are equal
print(f"Number of texts: {len(all_texts)}")
print(f"Number of labels: {len(all_labels)}")

# Hold out 20% of the e-mails for testing; the fixed random_state keeps the
# split repeatable between runs.
X_train_texts, X_test_texts, y_train, y_test = train_test_split(
    all_texts,
    all_labels,
    test_size=0.2,
    random_state=42,
)

# Turn both label sequences into NumPy arrays.
y_train, y_test = np.array(y_train), np.array(y_test)

# TF-IDF features capped at 5000 terms. The vocabulary is learned from the
# training texts only and then applied unchanged to the test texts.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train_texts)
X_test_tfidf = vectorizer.transform(X_test_texts)

# Persist the fitted vectorizer.
with open('D:\\LR+NN+NB\\more accurate\\tfidf_vectorizer.pkl', 'wb') as vectorizer_out:
    pickle.dump(vectorizer, vectorizer_out)

# Multinomial Naive Bayes on the TF-IDF matrix (fit() returns the estimator).
nb_model = MultinomialNB().fit(X_train_tfidf, y_train)

# Persist the fitted Naive Bayes model.
with open('D:\\LR+NN+NB\\more accurate\\nb_model.pkl', 'wb') as nb_out:
    pickle.dump(nb_model, nb_out)

# Integer-sequence inputs shared by the neural network and the BiLSTM.
max_seq_length = 100  # every sequence is padded/truncated to this length

# Word index limited to 5000 entries, learned from the training texts only.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train_texts)

# Texts -> lists of word indices (train first, then test).
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(split_texts)
    for split_texts in (X_train_texts, X_test_texts)
)

# Index lists -> fixed-width matrices; padding='post' pads at the end.
X_train_pad, X_test_pad = (
    pad_sequences(split_seq, maxlen=max_seq_length, padding='post')
    for split_seq in (X_train_seq, X_test_seq)
)

# Persist the fitted tokenizer.
with open('D:\\LR+NN+NB\\more accurate\\tokenizer.pkl', 'wb') as tokenizer_out:
    pickle.dump(tokenizer, tokenizer_out)

# Clear any previous TensorFlow/Keras session state
tf.keras.backend.clear_session()

# Create and train the Neural Network model
# Layers, in order: 64-dimensional embedding, global average pooling,
# 64-unit ReLU dense layer, single sigmoid output unit.
nn_model = Sequential()
# input_dim=5000 mirrors Tokenizer(num_words=5000); input_length is the padded width.
nn_model.add(Embedding(input_dim=5000, output_dim=64, input_length=max_seq_length))
nn_model.add(GlobalAveragePooling1D())
nn_model.add(Dense(64, activation='relu'))
nn_model.add(Dense(1, activation='sigmoid'))

# Binary cross-entropy matches the 0/1 labels and the sigmoid output.
nn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# No validation data is passed, so only training-set metrics are tracked.
nn_model.fit(X_train_pad, y_train, epochs=10, batch_size=32, verbose=1)

# Save the Neural Network model
# The architecture goes to nn_model.json and the weights to nn_model.h5,
# both under the absolute output directory used throughout this cell.
nn_model_json = nn_model.to_json()
with open('D:\\LR+NN+NB\\more accurate\\nn_model.json', 'w') as json_file:
    json_file.write(nn_model_json)
nn_model.save_weights('D:\\LR+NN+NB\\more accurate\\nn_model.h5')

# Create and train the BiLSTM model
# Layers, in order: 64-dimensional embedding, bidirectional LSTM with 64 units
# per direction, 64-unit ReLU dense layer, single sigmoid output unit.
bilstm_model = Sequential()
# Same vocabulary size and padded width as the neural network above.
bilstm_model.add(Embedding(input_dim=5000, output_dim=64, input_length=max_seq_length))
bilstm_model.add(Bidirectional(LSTM(64)))
bilstm_model.add(Dense(64, activation='relu'))
bilstm_model.add(Dense(1, activation='sigmoid'))

bilstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# Trained on the same padded sequences and labels; no validation data is passed.
bilstm_model.fit(X_train_pad, y_train, epochs=10, batch_size=32, verbose=1)

# Save the BiLSTM model
# The architecture goes to bilstm_model.json and the weights to
# bilstm_model.h5, both under the absolute output directory used in this cell.
bilstm_model_json = bilstm_model.to_json()
with open('D:\\LR+NN+NB\\more accurate\\bilstm_model.json', 'w') as json_file:
    json_file.write(bilstm_model_json)
bilstm_model.save_weights('D:\\LR+NN+NB\\more accurate\\bilstm_model.h5')

# Meta-features for the training split: the dense TF-IDF matrix with the
# neural network's and the BiLSTM's predictions appended as two extra columns.
# NOTE(review): these predictions are made on the same rows the two networks
# were trained on -- confirm that in-sample stacking is intended.
nn_train_predictions, bilstm_train_predictions = (
    base_model.predict(X_train_pad) for base_model in (nn_model, bilstm_model)
)
X_train_meta = np.hstack(
    (X_train_tfidf.toarray(), nn_train_predictions, bilstm_train_predictions)
)

# Logistic Regression is the meta-model fitted on those stacked features
# (fit() returns the estimator itself).
lr_model = LogisticRegression(max_iter=1000).fit(X_train_meta, y_train)

# Persist the fitted meta-model.
with open('D:\\LR+NN+NB\\more accurate\\lr_model.pkl', 'wb') as lr_out:
    pickle.dump(lr_model, lr_out)

# Same feature layout for the held-out split.
# NOTE(review): X_test_meta is not referenced again in the visible code.
nn_test_predictions, bilstm_test_predictions = (
    base_model.predict(X_test_pad) for base_model in (nn_model, bilstm_model)
)
X_test_meta = np.hstack(
    (X_test_tfidf.toarray(), nn_test_predictions, bilstm_test_predictions)
)

# Function to predict if a given email text is spam or not
def predict_email(email_text):
    """Classify one e-mail with all four models and print a weighted consensus.

    Relies on the module-level ``vectorizer``, ``tokenizer``, ``nb_model``,
    ``nn_model``, ``bilstm_model``, ``lr_model`` and ``max_seq_length``.
    The text is featurised exactly as the training data was: TF-IDF for
    Naive Bayes, a padded index sequence for the two networks, and TF-IDF
    plus both network outputs for the Logistic Regression meta-model.

    Args:
        email_text: Raw e-mail text to classify.

    Returns:
        None. The individual predictions and the consensus are printed.
    """
    sample = [email_text]

    # Naive Bayes works on the TF-IDF row.
    tfidf_row = vectorizer.transform(sample)
    nb_prediction = nb_model.predict(tfidf_row)

    # Both networks share one padded index sequence.
    padded_row = pad_sequences(
        tokenizer.texts_to_sequences(sample),
        maxlen=max_seq_length,
        padding='post',
    )
    nn_prediction = nn_model.predict(padded_row)
    bilstm_prediction = bilstm_model.predict(padded_row)

    # Meta-model input: TF-IDF row followed by the two network outputs.
    meta_row = np.hstack((tfidf_row.toarray(), nn_prediction, bilstm_prediction))
    lr_prediction = lr_model.predict(meta_row)

    # Report each model's raw output.
    for title, value in (
        ("Logistic Regression Prediction:", lr_prediction),
        ("Neural Network Prediction:", nn_prediction),
        ("BiLSTM Prediction:", bilstm_prediction),
        ("Naive Bayes Prediction:", nb_prediction),
    ):
        print(title, value)

    # Weighted vote: the meta-model counts double. The two network terms are
    # sigmoid outputs, while the LR and NB terms come from .predict().
    weights = {'lr': 0.4, 'nn': 0.2, 'bilstm': 0.2, 'nb': 0.2}
    final_score = (
        (lr_prediction * weights['lr'])
        + (nn_prediction * weights['nn'])
        + (bilstm_prediction * weights['bilstm'])
        + (nb_prediction * weights['nb'])
    )

    # A score of at least 0.5 is reported as spam.
    if final_score >= 0.5:
        print("Consensus: The email is predicted to be spam.")
    else:
        print("Consensus: The email is not predicted to be spam.")

# Ask the user to enter an email
# One line is read from standard input and classified; predict_email prints
# each model's prediction followed by the consensus.
email_text = input("Enter the email text: ")
predict_email(email_text)


Number of texts: 7473
Number of labels: 7473
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


Enter the email text:  hii


Logistic Regression Prediction: [0]
Neural Network Prediction: [[0.7428211]]
BiLSTM Prediction: [[0.14241438]]
Naive Bayes Prediction: [0]
Consensus: The email is not predicted to be spam.
