**Spam-Ham Email Classification using Logistic Regression and SVM**

In [232]:
import re
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import contractions 
from spellchecker import SpellChecker
spell = SpellChecker()
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, classification_report

nltk.download('punkt_tab')
nltk.download('wordnet')

df= pd.read_csv("email.csv")

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\mahap\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mahap\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [233]:
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [234]:
df.shape

(5573, 2)

In [235]:
df.isnull().sum()

Category    0
Message     0
dtype: int64

Applying Pre-processing

In [236]:
def correct_spelling(text):
    corrected_words = []
    for word in text.split():
        corrected = spell.correction(word)
        if corrected is None:
            corrected_words.append(word) 
        else:
            corrected_words.append(corrected)
    return ' '.join(corrected_words)

In [237]:
def cleaning(text):
    # lowercasing the text
    text = text.lower()

    # removing the html tags
    text = re.sub(r'<.*?>','',text)

    # removing the urls (if any)
    text = re.sub('https?://\S+|www\.\S+','',text)

    # removing punctuations
    text = re.sub(r'[^\w\s]', '', text)

    # fixing the short forms
    text = contractions.fix(text)

    # remove single characters
    text = re.sub(r'\b\w\b', '', text)

    # correcting misspellings
    # text = correct_spelling(text)

    # tokenization 
    tokens = word_tokenize(text)

    # lemmatizing
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]


    return ' '.join(lemmatized_tokens)

In [238]:
df['Message'] = df['Message'].apply(cleaning)

In [239]:
df

Unnamed: 0,Category,Message
0,ham,go until jurong point crazy available only in ...
1,ham,ok lar joking wif you oni
2,spam,free entry in wkly comp to win fa cup final tk...
3,ham,you dun say so early hor you already then say
4,ham,nah do not think he go to usf he life around h...
...,...,...
5568,ham,will going to esplanade fr home
5569,ham,pity wa in mood for that soany other suggestion
5570,ham,the guy did some bitching but acted like id be...
5571,ham,rofl it true to it name


In [240]:
df['Category'].unique()

array(['ham', 'spam', '{"mode":"full"'], dtype=object)

In [241]:
df = df[df['Category'] != '{"mode":"full"']

In [242]:
df['Category'] = df['Category'].map({'spam' : 1,'ham' : 0})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Category'] = df['Category'].map({'spam' : 1,'ham' : 0})


In [243]:
df['Category'].unique()

array([0, 1])

In [244]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df['Message'],df['Category'], random_state=42, test_size=0.2)

In [245]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

In [246]:
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [247]:
len(vectorizer.vocabulary_)

7798

**Using Logistic Regression**

In [248]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [249]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print('Accuracy: ',accuracy)
print('Precision: ', precision)
print('Report: ', report)

Accuracy:  0.968609865470852
Precision:  0.9913793103448276
Report:                precision    recall  f1-score   support

           0       0.97      1.00      0.98       966
           1       0.99      0.77      0.87       149

    accuracy                           0.97      1115
   macro avg       0.98      0.89      0.93      1115
weighted avg       0.97      0.97      0.97      1115



In [250]:
test1 = vectorizer.transform(["Lets go to park"])
prediction = model.predict(test1)
proba = model.predict_proba(test1)
print("Predicted Class:", prediction[0])
print("Probability:", proba)

print('\n') 
test2 = vectorizer.transform(["free gifts money cash"])
prediction = model.predict(test2)
proba = model.predict_proba(test2)
print("Predicted Class:", prediction[0])
print("Probability:", proba)

Predicted Class: 0
Probability: [[0.88149936 0.11850064]]


Predicted Class: 1
Probability: [[0.44003292 0.55996708]]


**Using SVM**

In [251]:
from sklearn.svm import SVC

model = SVC()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [252]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print('Accuracy: ',accuracy)
print('Precision: ', precision)
print('Report: ', report)

Accuracy:  0.9856502242152466
Precision:  1.0
Report:                precision    recall  f1-score   support

           0       0.98      1.00      0.99       966
           1       1.00      0.89      0.94       149

    accuracy                           0.99      1115
   macro avg       0.99      0.95      0.97      1115
weighted avg       0.99      0.99      0.99      1115

