In [1]:
import pandas as pd
import numpy as np
import string
import re
import nltk
import ipynb.fs.full.TextCleaner as cleaner
import ipynb.fs.full.Naive_Bayes_Model_Generator as model_generator
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split

In [2]:
np.random.seed(500)

In [3]:
# Load Dataset

DATASET_DIR = '../../Dataset/big_spam_dataset.csv'

data = pd.read_csv(DATASET_DIR, encoding='cp1252')
data.drop(labels=['Unnamed: 0'], inplace=True, axis=1)
print(len(data))
data.head()

1082


Unnamed: 0,Message_body,Label
0,"UpgrdCentre Orange customer, you may now claim...",Spam
1,"Loan for any purpose £500 - £75,000. Homeowner...",Spam
2,Congrats! Nokia 3650 video camera phone is you...,Spam
3,URGENT! Your Mobile number has been awarded wi...,Spam
4,Someone has contacted our dating service and e...,Spam


In [4]:
# Feature Engineering

# Create new features (Feature Engineering)

def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count / (len(text) - text.count(" ")), 3)

data['body_len'] = data['Message_body'].apply(lambda x: len(x) - x.count(' '))
data['punct%'] = data['Message_body'].apply(lambda x: count_punct(x))

data.head()

Unnamed: 0,Message_body,Label,body_len,punct%
0,"UpgrdCentre Orange customer, you may now claim...",Spam,147,0.054
1,"Loan for any purpose £500 - £75,000. Homeowner...",Spam,135,0.067
2,Congrats! Nokia 3650 video camera phone is you...,Spam,132,0.015
3,URGENT! Your Mobile number has been awarded wi...,Spam,115,0.035
4,Someone has contacted our dating service and e...,Spam,136,0.015


In [5]:
# Split Train and Test sets

# No feature Engineering
X_train, X_test, y_train, y_test = train_test_split(data[['Message_body']], data['Label'], test_size=0.2)

#Split with Feature Engineered features
X_feat_train, X_feat_test, y_feat_train, y_feat_test = train_test_split(data[['Message_body', 'body_len', 'punct%']],
                                                                        data['Label'], test_size=0.2)

In [6]:
# Text Cleaning

# Stemming

X_train_processed = X_train['Message_body'].apply(lambda x: cleaner.clean_text(x, digit_opt='remove', 
                                                                                  root_opt= 'stemming', 
                                                                                  return_type='sentence'))

X_test_processed = X_test['Message_body'].apply(lambda x: cleaner.clean_text(x, digit_opt='remove', 
                                                                                  root_opt= 'stemming', 
                                                                                  return_type='sentence'))



In [7]:
# TF-IDF - base 

tfidf_vect = TfidfVectorizer()
tfidf_vect_fit = tfidf_vect.fit(X_train_processed)

tfidf_train = tfidf_vect_fit.transform(X_train_processed)
tfidf_test = tfidf_vect_fit.transform(X_test_processed)

X_train_tfidf_vect = pd.concat([X_feat_train[['body_len', 'punct%']].reset_index(drop=True), 
           pd.DataFrame(tfidf_train.toarray())], axis=1)

X_test_tfidf_vect = pd.concat([X_feat_test[['body_len', 'punct%']].reset_index(drop=True), 
           pd.DataFrame(tfidf_test.toarray())], axis=1)

# CountVectorizer - base
count_vect = CountVectorizer()
X_count_vect_fit = count_vect.fit(X_train_processed)

count_train = X_count_vect_fit.transform(X_train_processed)
count_test = X_count_vect_fit.transform(X_test_processed)

X_train_count_vect = pd.concat([X_feat_train[['body_len', 'punct%']].reset_index(drop=True),
                               pd.DataFrame(count_train.toarray())], axis=1)

X_test_count_vect = pd.concat([X_feat_test[['body_len', 'punct%']].reset_index(drop=True),
                               pd.DataFrame(count_test.toarray())], axis=1)

# n-gram - base
ngram_vect = CountVectorizer(ngram_range=(1,3))
X_ngram_fit = ngram_vect.fit(X_train_processed)

ngram_train = X_ngram_fit.transform(X_train_processed)
ngram_test = X_ngram_fit.transform(X_test_processed)

X_train_ngram_vect = pd.concat([X_feat_train[['body_len', 'punct%']].reset_index(drop=True),
                               pd.DataFrame(ngram_train.toarray())], axis=1)

X_test_ngram_vect = pd.concat([X_feat_test[['body_len', 'punct%']].reset_index(drop=True),
                               pd.DataFrame(ngram_test.toarray())], axis=1)

In [8]:
# TF-IDF 

Naive_base_model, precision, recall, accuracy = model_generator.generate_naive_bayes_model(X_train_ngram_vect, 
                                                                                           X_test_ngram_vect, 
                                                                                           y_train, y_test)
print('TF-IDF vectorized / Stemming')
print('Precision: {} / Recall: {} / Acuracy: {}'.format(round(precision, 3),
                                                       round(recall, 3),
                                                       accuracy))
# Count-Vectorizer

Naive_base_model, precision, recall, accuracy = model_generator.generate_naive_bayes_model(X_train_ngram_vect, 
                                                                                           X_test_ngram_vect, 
                                                                                           y_train, y_test)
print('Count vectorized / Stemming')
print('Precision: {} / Recall: {} / Acuracy: {}'.format(round(precision, 3),
                                                       round(recall, 3),
                                                       accuracy))



TF-IDF vectorized / Stemming
Precision: 1.0 / Recall: 0.514 / Acuracy: 0.922
Count vectorized / Stemming
Precision: 1.0 / Recall: 0.514 / Acuracy: 0.922


