In [7]:
import numpy as np
import pandas as pd
import nltk
import re
from collections import Counter
import spacy
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the dataset into a pandas DataFrame
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/spam_ham_dataset.csv/spam_ham_dataset.csv')

# Clean the dataset by removing escape characters such as "\n" and "\r"
df['text'] = df['text'].str.replace('\n', '').str.replace('\r', '')

# Remove unnecessary columns
df.drop(['Unnamed: 0', 'label_num'], axis=1, inplace=True)

# Remove stop words from the text
stop_words = set(stopwords.words('english'))
df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

# Extract features
df['word_count'] = df['text'].apply(lambda x: len(x.split()))
df['cleaned_text'] = df['text'].apply(lambda x: re.findall(r'\b[A-Za-z]+\b', x.lower()))  # Extract only words
df['most_appearing_word'] = df['cleaned_text'].apply(lambda x: Counter(x).most_common(1)[0][0])
df['capital_letters_count'] = df['text'].apply(lambda x: sum(1 for c in x if c.isupper()))
df['capital_letter_ratio'] = df['text'].apply(lambda x: sum(1 for c in x if c.isupper()) / max(1, sum(1 for c in x if c.isalpha())))
df['exclamation_mark_count'] = df['text'].str.count('!')
df['email_length'] = df['text'].apply(len)
df['spam_word_count'] = df['text'].apply(lambda x: sum(word in spam_words_list for word in x.split()))

# Sentiment analysis (simplified approach)
def sentiment_analysis(text):
    if "good" in text.lower():
        return "positive"
    elif "bad" in text.lower():
        return "negative"
    else:
        return "neutral"

df['sentiment'] = df['text'].apply(sentiment_analysis)

# Feature Engineering - Extracting spam-related words
spam_words_list = [
    "100% more", "100% free", "100% satisfied", "Additional income", "Be your own boss",
    "Best price", "Big bucks", "Billion", "Cash bonus", "Cents on the dollar",
    # Add more spam-related words here...
]

spam_words_list = [word.lower() for word in spam_words_list]
df['text'] = df['text'].str.lower()
df['spam_word_count'] = df['text'].apply(lambda x: sum(word in spam_words_list for word in x.split()))

# Select relevant features for modeling
features = ['word_count', 'capital_letters_count', 'capital_letter_ratio', 'exclamation_mark_count',
            'email_length', 'spam_word_count']

# Prepare data for modeling
X = df[features]
y = df['label']
le = LabelEncoder()
y = le.fit_transform(y)

# Initialize TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english')
X_text = vectorizer.fit_transform(df['text'])

# Merge features and text data
X = np.hstack((X.to_numpy(), X_text.toarray()))

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize BernoulliNB model
clf = BernoulliNB()

# Fit the model
clf.fit(X_train, y_train)

# Predict and evaluate the model
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Create a DataFrame to store the results
results_df = pd.DataFrame([['BernoulliNB', accuracy, precision, recall, f1]],
                          columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])

# Display the results
print(results_df)


           Model  Accuracy  Precision    Recall  F1 Score
0  MultinomialNB  0.756522        1.0  0.139932  0.245509


In [5]:
spam_words_list = [
    "100% more", "100% free", "100% satisfied", "Additional income", "Be your own boss",
    "Best price", "Big bucks", "Billion", "Cash bonus", "Cents on the dollar",
    # Add more spam-related words here...
]

In [3]:
!pip install nltk
import nltk
nltk.download('stopwords')



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True