### Data collection and understanding

In [1]:
import pandas as pd

In [2]:
email_df = pd.read_csv('spam.csv', encoding='latin-1')

In [3]:
email_df.columns

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

In [4]:
email_df = email_df[['v1', 'v2']]
email_df = email_df.rename(columns={'v1': 'label', 'v2': 'email_text'})

In [5]:
email_df.head()

Unnamed: 0,label,email_text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### Text preprocessing and feature engineering

In [6]:
#import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import string
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
import os
from textblob import TextBlob
from nltk.stem import PorterStemmer
from textblob import Word
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import sklearn.feature_extraction.text as text
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm

### Pre-processing steps like lower case, stemming and lemmatization

In [7]:
email_df['email_text'] = email_df['email_text'].apply(lambda x: " ".join(x.lower() for x in x.split()))

stop = stopwords.words('english')
email_df['email_text'] = email_df['email_text'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

st = PorterStemmer()

In [8]:
email_df['email_text'] = email_df['email_text'].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))
email_df['email_text'] = email_df['email_text'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))

email_df.head()

Unnamed: 0,label,email_text
0,ham,"go jurong point, crazy.. avail bugi n great wo..."
1,ham,ok lar... joke wif u oni...
2,spam,free entri 2 wkli comp win fa cup final tkt 21...
3,ham,u dun say earli hor... u c alreadi say...
4,ham,"nah think goe usf, live around though"


### Splitting data into train and validation

In [9]:
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(email_df['email_text'], email_df['label'], test_size=0.2, random_state=42)

In [10]:
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.transform(valid_y)

### TFIDF feature generation for a maximum of 5000 features

In [11]:
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)

tfidf_vect.fit(email_df['email_text'])
xtrain_tfidf = tfidf_vect.transform(train_x)
xvalid_tfidf = tfidf_vect.transform(valid_x)

In [12]:
xtrain_tfidf.data

array([0.45929507, 0.31119737, 0.33094025, ..., 0.57002701, 0.4622825 ,
       0.54414827])

In [13]:
xtrain_tfidf.shape

(4457, 5000)

### Model training and evaluation

In [14]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    return metrics.accuracy_score(predictions, valid_y)

In [None]:
# Naive Bayes training
accuracy = train_model(naive_bayes.MultinomialNB(alpha=0.2), xtrain_tfidf, train_y, xvalid_tfidf)

print ("Accuracy: ", accuracy)

Accuracy:  0.9874439461883409


In [16]:
# Linear Classifier on Word Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(),

xtrain_tfidf, train_y, xvalid_tfidf)

print ("Accuracy: ", accuracy)

Accuracy:  0.9533632286995516


### Testing the model on unseen data

In [19]:
# Tạo test_data từ spam và ham
test_data = spam_test_data + ham_test_data

In [20]:
# Tạo DataFrame
spam_df = pd.DataFrame({
    'label': ['spam'] * len(spam_test_data),
    'email_text': spam_test_data
})

ham_df = pd.DataFrame({
    'label': ['ham'] * len(ham_test_data),
    'email_text': ham_test_data
})

# Kết hợp cả spam và ham vào một DataFrame duy nhất
test_df = pd.concat([spam_df, ham_df], ignore_index=True)

# Hiển thị DataFrame kết quả
test_df.head()

Unnamed: 0,label,email_text
0,spam,Congratulations! You have won a lottery of $10...
1,spam,Your subscription has been confirmed. Thank yo...
2,spam,Get rich quick! Invest in our new cryptocurren...
3,spam,Reminder: Your appointment is scheduled for to...
4,spam,Limited time offer! Buy one get one free on al...


In [21]:
test_x = test_df['email_text']
test_y = test_df['label']

test_y = encoder.transform(test_y)

In [22]:
test_tfidf = tfidf_vect.transform(test_x)
test_tfidf.data
test_tfidf.shape

(12, 5000)

In [23]:
# best model
best_model = naive_bayes.MultinomialNB(alpha=0.2)
best_model.fit(xtrain_tfidf, train_y)

In [24]:
### Predicting the test data
y_pred = best_model.predict(test_tfidf)

# Tính toán độ chính xác
accuracy = metrics.accuracy_score(y_pred, test_y)
print("Test Accuracy: ", accuracy)

Test Accuracy:  0.5833333333333334
