# Import the required libraries

In [1]:
# Regular expression to remove the unwanted charaters
import re

# Pandas to store the dataset and make the necessary operations for EDA
import pandas as pd

# NLTK tools to help clean the string for the model to train
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer

# Tfdf vectorizer to convert the words into vectors
from sklearn.feature_extraction.text import TfidfVectorizer

# To split the data for train and test
from sklearn.model_selection import train_test_split

# Model we used to train our data
from sklearn.naive_bayes import MultinomialNB

# Metrics to measure the performance and accuracy for the model
from sklearn.metrics import confusion_matrix, accuracy_score

# Calculate the model running time
import time

# Exploratory Data Analysis

### Load the data

In [2]:
spam_df = pd.read_csv('spam.csv', encoding='latin-1')
spam_df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


### Drop the unwanted columns

In [3]:
spam_df.columns

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

In [4]:
data = spam_df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])
data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### Split the data into dependent and independed variable (i.e. split features and target column)

In [5]:
X = data['v2']
y = data['v1']

X.head, y.head()

(<bound method NDFrame.head of 0       Go until jurong point, crazy.. Available only ...
 1                           Ok lar... Joking wif u oni...
 2       Free entry in 2 a wkly comp to win FA Cup fina...
 3       U dun say so early hor... U c already then say...
 4       Nah I don't think he goes to usf, he lives aro...
                               ...                        
 5567    This is the 2nd time we have tried 2 contact u...
 5568                Will Ì_ b going to esplanade fr home?
 5569    Pity, * was in mood for that. So...any other s...
 5570    The guy did some bitching but I acted like i'd...
 5571                           Rofl. Its true to its name
 Name: v2, Length: 5572, dtype: object>,
 0     ham
 1     ham
 2    spam
 3     ham
 4     ham
 Name: v1, dtype: object)

### Convert the target column into numerical values

In [6]:
y_one_hot = pd.get_dummies(y, drop_first=True)
y_one_hot.head()

Unnamed: 0,spam
0,0
1,0
2,1
3,0
4,0


In [7]:
class SpamClassifier():
    def __init__(self, X, y):
        self.X = X
        self.y = y
        self.test_size = 0.2
        self.random_state = 42
    
    # Clean the feature column
    def clean_dataset(self, X, clean_type, method):
        corpus = []
        for i in range(len(X)):
            sentence = re.sub('[^a-zA-Z]', ' ', X[i])
            sentence = sentence.lower()
            sentence = sentence.split()

            # remove the words which are present in stop words and either stem or lemmatize them
            words = []
            if clean_type == 'stemmer':
                words = [method.stem(word) for word in sentence if not word in stopwords.words('english')]
            else:
                words = [method.lemmatize(word) for word in sentence if not word in stopwords.words('english')]
            sentence = ' '.join(words)
            corpus.append(sentence)
        return corpus
    
    # Convert the feature column into vectors
    def data_into_vector(self, X):
        tfidf = TfidfVectorizer()
        return tfidf.fit_transform(X)
    
    # Clean the feature using portal stemming
    def using_stemming(self, X):
        stemmer = PorterStemmer()
        return self.clean_dataset(X, 'stemmer', stemmer)
    
    # Clean the feature using snowball stemming
    def using_stemming(self, X):
        stemmer = SnowballStemmer('english')
        return self.clean_dataset(X, 'stemmer', stemmer)
    
    # Clean the feature using lemmatization
    def using_lemmatization(self, X):
        lemma = WordNetLemmatizer()
        return self.clean_dataset(X, 'lemmatization', lemma)
    
    # Training the model
    def model_train(self, X_train, y_train):
        return MultinomialNB().fit(X_train, y_train)
    
    # Prediction from model
    def model_prediction(self, model, X_test):
        return model.predict(X_test)
    
    # Count number of prediction are true or false using confusion matrix
    def model_confusion_matrix(self, y_test, y_pred):
        return confusion_matrix(y_test, y_pred)
    
    # Model accuracy score
    def model_accuracy(self, y_test, y_pred):
        return accuracy_score(y_test, y_pred)
    
    # Split the data into train and test data and train the model
    def split_and_train(self, X, y):
        X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                            test_size=self.test_size, 
                                                            random_state=self.random_state)

        model = self.model_train(X_train, y_train)
        y_pred = self.model_prediction(model, X_test)

        confusion_mat = self.model_confusion_matrix(y_test, y_pred)
        accuracy = self.model_accuracy(y_test, y_pred)

        return (model, y_pred, confusion_mat, accuracy)
    
    # Call this to train model using stemming
    def train_model_using_porter_stemming(self):
        corpus = self.using_stemming(self.X)
        X_vec = self.data_into_vector(corpus)
        model, y_pred, confusion_mat, accuracy = self.split_and_train(X_vec, self.y)
        
        print("==============================================================")
        print("Confusion Matrix:\n", confusion_mat)
        print("==============================================================")
        print("Accuracy:", accuracy)
        print("==============================================================")
    
    # Call this to train model using stemming
    def train_model_using_snowball_stemming(self):
        corpus = self.using_stemming(self.X)
        X_vec = self.data_into_vector(corpus)
        model, y_pred, confusion_mat, accuracy = self.split_and_train(X_vec, self.y)
        
        print("==============================================================")
        print("Confusion Matrix:\n", confusion_mat)
        print("==============================================================")
        print("Accuracy:", accuracy)
        print("==============================================================")
    
    # Call this to train model using lemmatization
    def train_model_using_lemmatization(self):
        corpus = self.using_lemmatization(self.X)
        X_vec = self.data_into_vector(corpus)
        model, y_pred, confusion_mat, accuracy = self.split_and_train(X_vec, self.y)
        
        print("==============================================================")
        print("Confusion Matrix:\n", confusion_mat)
        print("==============================================================")
        print("Accuracy:", accuracy)
        print("==============================================================")

In [8]:
spam_classifier_stem = SpamClassifier(X, y_one_hot)

In [9]:
start_time = time.time()
spam_classifier_stem.train_model_using_porter_stemming()
end_time = time.time()
print("Model running time:", end_time-start_time)

Confusion Matrix:
 [[964   1]
 [ 41 109]]
Accuracy: 0.9623318385650225
Model running time: 57.09599733352661


  y = column_or_1d(y, warn=True)


In [10]:
start_time = time.time()
spam_classifier_stem.train_model_using_snowball_stemming()
end_time = time.time()
print("Model running time:", end_time-start_time)

Confusion Matrix:
 [[964   1]
 [ 41 109]]
Accuracy: 0.9623318385650225
Model running time: 56.72276854515076


  y = column_or_1d(y, warn=True)


In [11]:
start_time = time.time()
spam_classifier_stem.train_model_using_lemmatization()
end_time = time.time()
print("Model running time:", end_time-start_time)

Confusion Matrix:
 [[965   0]
 [ 42 108]]
Accuracy: 0.9623318385650225
Model running time: 57.69467544555664


  y = column_or_1d(y, warn=True)
