# DETECT SPAM E-MAILS USING NAIVE BAYES


# PROBLEM STATEMENT


- The SMS Spam Collection is a set of tagged SMS messages that have been collected for SMS spam research. It contains one set of 5,574 SMS messages in English, each tagged as either ham (legitimate) or spam.

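- The files contain one message per line. Each line is composed of two columns: v1 contains the label (ham or spam) and v2 contains the raw text. (Note: the `emails.csv` file loaded in this notebook names these columns `spam`, encoded as 0/1, and `text`.)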


### IMPORTING DATA AND LIBRARIES


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [None]:
# Load the dataset from 'emails.csv' (path relative to the working directory)
# and preview its first 10 rows.
spam_df = pd.read_csv('emails.csv')
spam_df.head(10)

In [None]:
# Count how many rows carry each value of the 'spam' label column.
spam_df['spam'].value_counts()

### DATA ANALYSIS


In [None]:
# Partition the rows by label: 0 selects ham, 1 selects spam.
ham = spam_df.loc[spam_df['spam'] == 0]
spam = spam_df.loc[spam_df['spam'] == 1]

In [None]:
# Display the ham subset (rows where spam == 0).
ham

In [None]:
# Display the spam subset (rows where spam == 1).
spam

In [None]:
# Add a 'length' column holding len() of each message text, then preview.
spam_df['length'] = spam_df['text'].map(len)
spam_df.head()

In [None]:
# Share of each class, as a percentage of all rows in the dataset.
total_messages = len(spam_df)
spam_percent = 100 * len(spam) / total_messages
ham_percent = 100 * len(ham) / total_messages

# Rounded to two decimals for display only; the variables keep full precision.
print(f'Spam Percent : {round(spam_percent, 2)}%')
print(f'Ham Percent : {round(ham_percent, 2)}%')

In [None]:
# Bar chart of the number of rows for each value of the 'spam' column.
fig, ax = plt.subplots(figsize=(10, 7))
sns.countplot(x='spam', data=spam_df, ax=ax)
ax.set_title('Spam vs Ham', fontsize=14)
plt.show()

### DATA CLEANING


In [None]:
# Download NLTK's "stopwords" corpus, which the cleaning step reads via
# stopwords.words('english').
import nltk
nltk.download("stopwords")

In [None]:
import string
from nltk.corpus import stopwords

# Inspect the two filter lists used by message_cleaning: NLTK's English stop
# words and the punctuation characters from the string module.
# NOTE(review): a notebook cell normally echoes only its final expression, so
# the stop-word list on the next line is probably evaluated but not shown — confirm.
stopwords.words('english')
string.punctuation

In [None]:
# Removing punctuation and stop words
from functools import lru_cache


@lru_cache(maxsize=1)
def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, built once and cached."""
    return frozenset(stopwords.words('english'))


def message_cleaning(message):
    """Strip punctuation from *message* and drop English stop words.

    Args:
        message: Raw message text (a ``str``).

    Returns:
        A list of the remaining whitespace-separated words, in their original
        order and original casing. Only the stop-word comparison is
        case-insensitive.
    """
    # Single pass that deletes every character listed in string.punctuation.
    without_punctuation = message.translate(
        str.maketrans('', '', string.punctuation)
    )
    words = without_punctuation.split()
    if not words:
        # Nothing left to filter, so the stop-word list is not needed at all.
        return []
    # Previously the stop-word list was re-read for every single word and
    # searched linearly; a cached set makes each membership test O(1).
    stop_words = _english_stopwords()
    return [word for word in words if word.lower() not in stop_words]

In [None]:
# Run message_cleaning over every message text, then print the list of words
# kept for the row labelled 0.
spam_df_clean = spam_df['text'].apply(message_cleaning)
print(spam_df_clean[0])

In [None]:
# Print the raw, uncleaned text of the row labelled 0.
print(spam_df['text'][0])

### COUNT VECTORIZER


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Three toy sentences used to demonstrate CountVectorizer in the next cells.
sample_data = ['This is the first document',
              'And this document is the second document',
              'This one is the third']

In [None]:
# Fit a CountVectorizer on the toy sentences and print the resulting matrix
# as a dense array.
sample_vectorizer = CountVectorizer()
x_transformed = sample_vectorizer.fit_transform(sample_data)
print(x_transformed.toarray())

## A 2 in the matrix means the word occurs twice in that sentence
## (e.g. 'document' in the second one); hence the name "count" vectorizer.

In [None]:
# Print the feature names (vocabulary) learned from the toy sentences.
print(sample_vectorizer.get_feature_names_out())

### APPLYING COUNT VECTORIZER TO MAIL


In [None]:
# Vectorize the messages, passing message_cleaning as the analyzer so each
# message is tokenized by the cleaning function defined earlier.
vectorizer = CountVectorizer(analyzer=message_cleaning)
spamham_cv = vectorizer.fit_transform(spam_df['text'])
# Only a few rows are converted to a dense array for the preview: calling
# .toarray() on the whole matrix would allocate rows x vocabulary cells
# just to print a truncated view.
print(spamham_cv[:5].toarray())

In [None]:
# Print the feature names (vocabulary) learned from the message texts.
print(vectorizer.get_feature_names_out())

In [None]:
## Shape of the count matrix: (samples, extracted words).
## On the author's run: 5728 samples, 37229 words extracted.
spamham_cv.shape

### TRAINING MODEL


In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Target vector: the values of the 'spam' column.
label = spam_df['spam'].values

In [None]:
# Fit a multinomial Naive Bayes classifier on the count features of the
# whole dataset (no train/test split yet).
NB_classifier = MultinomialNB()
NB_classifier.fit(spamham_cv, label)

In [None]:
# Two hand-written messages used to try out the fitted classifier.
testing_sample = ['Free Money!!!',
                  'Hi Jane, Please let me know if you want to modify our project.']

# Encode them with the already-fitted vectorizer (transform, not fit_transform)
# so they are mapped onto the existing vocabulary.
test_sample_vectorizer = vectorizer.transform(testing_sample)

In [None]:
# Predict a label for each sample message and display the predictions.
test_pred = NB_classifier.predict(test_sample_vectorizer)
test_pred

In [None]:
# A second pair of hand-written messages to try out the classifier.
testing_sample = ['Hello, I am Boo, I would like to book a hotel in Bali by January 24th',
                  'money vaigra!!']

# Encode with the already-fitted vectorizer, as for the first pair.
test_sample_vectorizer = vectorizer.transform(testing_sample)

# Predict a label for each message and display the predictions.
test_pred = NB_classifier.predict(test_sample_vectorizer)
test_pred

### DIVIDING TRAIN TEST AND TRAINING


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Features are the count matrix; targets are the 'spam' labels.
# NOTE(review): spamham_cv comes from a vectorizer that was fitted on all
# messages before this split, so the held-out rows contributed to the
# vocabulary. Consider splitting the raw text first and fitting the
# vectorizer on the training part only.
X = spamham_cv
y = label

# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=7)

In [None]:
# Create a fresh classifier and fit it on the training split only; this
# rebinds NB_classifier, replacing the model fitted on the full dataset.
NB_classifier = MultinomialNB()
NB_classifier.fit(X_train, y_train)

### EVALUATION


In [None]:
# Score of the classifier on the data it was trained on.
NB_classifier.score(X_train, y_train)

In [None]:
# Score of the classifier on the held-out test split.
NB_classifier.score(X_test, y_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Confusion matrix of the classifier's predictions on the training split.
y_pred_train = NB_classifier.predict(X_train)
cm = confusion_matrix(y_train, y_pred_train)
# fmt='d' prints each cell as a plain integer; seaborn's default annotation
# format ('.2g') would show large counts in scientific notation.
sns.heatmap(cm, annot=True, fmt='d', cmap='ocean')
# confusion_matrix puts the true labels on rows and the predictions on columns.
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.title('Y-TRAIN PREDICTIONS')
plt.show()

In [None]:
# Confusion matrix of the classifier's predictions on the held-out test split.
y_pred_test = NB_classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred_test)
# fmt='d' prints each cell as a plain integer; seaborn's default annotation
# format ('.2g') would show large counts in scientific notation.
sns.heatmap(cm, annot=True, fmt='d', cmap='ocean')
# confusion_matrix puts the true labels on rows and the predictions on columns.
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.title('Y-TEST PREDICTIONS')
plt.show()

In [None]:
# Print the classification report comparing the test-split predictions with
# the true test labels.
print(classification_report(y_test, y_pred_test))