In [99]:
import pandas as pd
import numpy as np
import string
import math
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import nltk

In [100]:
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [101]:
# define preprocessing function
def preprocess(text):
    if isinstance(text, str):
        # remove punctuation
        text = text.translate(str.maketrans('', '', string.punctuation))
        # convert to lowercase
        text = text.lower()
        # remove stop words
        stop_words = set(stopwords.words('english'))
        text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

In [102]:
# load data
data = pd.read_csv('spam_ham_dataset.csv')
data = data.dropna()
data['text'] = data['text'].apply(preprocess)

# split into training and testing sets
train_data = data.sample(frac=0.8, random_state=1)
test_data = data.drop(train_data.index)

In [103]:
# get vocabulary
words = set()
for text in train_data['text']:
    for word in text.split():
        words.add(word)

# count occurrences of words in each class
spam_words = {}
ham_words = {}
spam_total_words = 0
ham_total_words = 0
for text, label in zip(train_data['text'], train_data['label']):
    for word in text.split():
        if label == 'spam':
            spam_words[word] = spam_words.get(word, 0) + 1
            spam_total_words += 1
        else:
            ham_words[word] = ham_words.get(word, 0) + 1
            ham_total_words += 1

In [104]:
# calculate prior probabilities
spam_prior = train_data[train_data['label'] == 'spam'].shape[0] / train_data.shape[0]
ham_prior = train_data[train_data['label'] == 'ham'].shape[0] / train_data.shape[0]

In [105]:
# classify test data
predictions = []
for text in test_data['text']:
    spam_score = 0
    ham_score = 0
    for word in text.split():
        spam_count = spam_words.get(word, 0)
        ham_count = ham_words.get(word, 0)
        # apply Laplace smoothing
        spam_score += math.log((spam_count + 1) / (spam_count +ham_count+1))
        ham_score += math.log((ham_count + 1) / (spam_count +ham_count+1))
    spam_score += math.log(spam_prior)
    ham_score += math.log(ham_prior)
    if spam_score > ham_score:
        predictions.append('spam')
    else:
        predictions.append('ham')

# evaluate accuracy
accuracy = (predictions == test_data['label']).mean()
print(f'Accuracy: {accuracy}')

Accuracy: 0.925531914893617


In [106]:
import os
# create directory to store text files
directory = "email_text_files"
if not os.path.exists(directory):
    os.makedirs(directory)

# load email data from dataframe
test_emails =  pd.Series(test_data['text'])

# iterate over each email
for i, row in enumerate(test_emails):
    # create file name using email id
    filename = f"email_{i}.txt"
    filepath = os.path.join(directory, filename)
    # write email content to file
    with open(filepath, "w") as f:
        f.write(row)

In [107]:
import os
import pandas as pd

# Ask for path of directory containing text files
path = input("Enter path of directory: ")

# Create an empty list to store file names and text
file_names = []
texts = []

# Loop through each file in the directory
for file in os.listdir(path):
    # Check if file is a text file
    if file.endswith(".txt"):
        # Store file name in list
        file_names.append(file)
        # Open file and read contents
        with open(os.path.join(path, file), 'r') as f:
            text = f.read()
            # Append text to list
            texts.append(text)

# Create DataFrame from file names and text
test_emails= pd.DataFrame({'file_name': file_names, 'text': texts})

Enter path of directory: /content/email_text_files


In [108]:
# classify test data
predictions = []
test_emails['text']= test_emails['text'].apply(preprocess)
for text in test_emails['text']:
    spam_score = 0
    ham_score = 0
    for word in text.split():
        spam_count = spam_words.get(word, 0)
        ham_count = ham_words.get(word, 0)
        # apply Laplace smoothing
        spam_score += math.log((spam_count + 1) / (spam_count +ham_count+1))
        ham_score += math.log((ham_count + 1) / (spam_count +ham_count+1))
    spam_score += math.log(spam_prior)
    ham_score += math.log(ham_prior)
    if spam_score > ham_score:
        predictions.append('+1')
    else:
        predictions.append('0')

# evaluate accuracy
for filename,classification in zip(test_emails['file_name'],predictions):
    print(filename,":",classification)

email_646.txt : 0
email_744.txt : +1
email_526.txt : 0
email_980.txt : +1
email_970.txt : 0
email_871.txt : 0
email_910.txt : +1
email_953.txt : 0
email_537.txt : +1
email_890.txt : 0
email_223.txt : +1
email_233.txt : 0
email_377.txt : 0
email_480.txt : 0
email_20.txt : 0
email_828.txt : 0
email_204.txt : 0
email_983.txt : 0
email_217.txt : 0
email_143.txt : 0
email_934.txt : 0
email_436.txt : 0
email_333.txt : 0
email_98.txt : 0
email_409.txt : 0
email_451.txt : 0
email_842.txt : 0
email_36.txt : 0
email_69.txt : 0
email_1004.txt : 0
email_224.txt : 0
email_798.txt : 0
email_546.txt : 0
email_213.txt : 0
email_459.txt : 0
email_831.txt : 0
email_635.txt : 0
email_397.txt : 0
email_67.txt : +1
email_918.txt : 0
email_954.txt : 0
email_725.txt : 0
email_307.txt : 0
email_925.txt : +1
email_388.txt : 0
email_84.txt : 0
email_114.txt : 0
email_784.txt : +1
email_17.txt : 0
email_295.txt : 0
email_345.txt : 0
email_888.txt : 0
email_61.txt : 0
email_186.txt : 0
email_677.txt : 0
email_48.