## Spam Email Detection
### Comparing the Performance of Support Vector and Logistic Regression Models

In [20]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

### Read Data and Preprocess

In [7]:
# Load the raw mail dataset from the CSV file (path is relative to the notebook).
df = pd.read_csv(
    filepath_or_buffer='../data/mail_data.csv',
)

In [9]:
# Keep every non-null cell as-is and substitute an empty string for each
# missing value; the result is a new frame, `df` itself is left untouched.
data = df.where(df.notna(), other='')

In [12]:
# Encode the text labels as numbers in place: spam -> 0, ham -> 1.
# Rows whose Category matches neither name are left unchanged.
for category_name, category_code in (('spam', 0), ('ham', 1)):
    data.loc[data['Category'] == category_name, 'Category'] = category_code

In [13]:
# Split the frame into model input (message text) and target (encoded label).
X, Y = data['Message'], data['Category']

In [14]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

### Vectorize Message Data (TF-IDF)

In [16]:
# TF-IDF vectorizer: lowercases the text and drops English stop words.
feature_extraction = TfidfVectorizer(
    min_df=1,
    stop_words='english',
    lowercase=True,
)

# Fit the vectorizer on the training messages only, then reuse that fitted
# vectorizer to transform the held-out test messages.
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# Cast the labels to integers before handing them to the classifiers.
# NOTE(review): presumably needed because the label column is not an integer
# dtype after the in-place encoding; confirm against the encoding cell.
Y_train, Y_test = Y_train.astype('int'), Y_test.astype('int')

### Model Training

In [21]:
# Instantiate both classifiers with their library-default hyperparameters.
lr_model, svc_model = LogisticRegression(), SVC()

In [22]:
# Train the logistic regression model on the TF-IDF training features.
lr_model.fit(X=X_train_features, y=Y_train)

In [23]:
# Train the support vector classifier on the same TF-IDF training features.
svc_model.fit(X=X_train_features, y=Y_train)

### Prediction

In [27]:
# Logistic regression: predict on the held-out features and score the result.
prediction_lr = lr_model.predict(X=X_test_features)
accuracy_lr = accuracy_score(y_true=Y_test, y_pred=prediction_lr)

# Support vector classifier: same evaluation on the same held-out features.
prediction_svc = svc_model.predict(X=X_test_features)
accuracy_svc = accuracy_score(y_true=Y_test, y_pred=prediction_svc)

In [29]:
# Report the test-set accuracy of each model, one line per model.
for report_label, report_score in (
    ("Prediction on Test Data (Logistic Regression) - ", accuracy_lr),
    ("Prediction on Test Data (SVC) - ", accuracy_svc),
):
    print(report_label + str(report_score))

Prediction on Test Data (Logistic Regression) - 0.967713004484305
Prediction on Test Data (SVC) - 0.9847533632286996


### Test on Actual Mail Data

In [30]:
# SVC
input_mail = ["Mail Data Here"]
input_mail_features = feature_extraction.transform(input_mail)
prediction_svc = svc_model.predict(input_mail_features)
if prediction_svc==1:
    print("Ham Mail")
else:
    print("Spam Mail")
    
prediction_lr = lr_model.predict(input_mail_features)
if prediction_lr==1:
    print("Ham Mail")
else:
    print("Spam Mail")

Ham Mail
Ham Mail
