# Spam Mail Classifier using NLTK

In [48]:
from nltk.classify.util import apply_features
from nltk import NaiveBayesClassifier
from nltk.classify import accuracy
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk import word_tokenize, WordNetLemmatizer
from nltk.probability import FreqDist

#### Read data from file

In [88]:
data = pd.read_csv('emails.csv')

In [89]:
data.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [90]:
# Check for null values

data.isna().sum()

text    0
spam    0
dtype: int64

In [91]:
data['spam'].value_counts()/len(data)

0    0.761173
1    0.238827
Name: spam, dtype: float64

#### Split data into train and test sets

Because of presence of class imbalance, we also need to stratify while splitting our data. Our original dataset contains 76% spam and 24% non-spam values. Our train and test sets should contain similar class weightage. 

In [92]:
train_X, test_X, train_Y, test_Y = train_test_split(data["text"].values,
                                                    data["spam"].values,
                                                    test_size = 0.25,
                                                    random_state = 50,
                                                    shuffle = True,
                                                    stratify=data["spam"].values)

In [93]:
unique_elements, counts_elements = np.unique(train_Y, return_counts=True)
perc_elements = counts_elements/len(train_Y)
print("Frequency of unique values of the train_Y array:")
print(np.asarray((unique_elements, counts_elements)))
print(perc_elements)

Frequency of unique values of the train_Y array:
[[   0    1]
 [3270 1026]]
[0.76117318 0.23882682]


In [94]:
unique_elements, counts_elements = np.unique(test_Y, return_counts=True)
perc_elements = counts_elements/len(test_Y)
print("Frequency of unique values of the test_Y array:")
print(np.asarray((unique_elements, counts_elements)))
print(perc_elements)

Frequency of unique values of the test_Y array:
[[   0    1]
 [1090  342]]
[0.76117318 0.23882682]


### Define Classifier Class

In [87]:
class SpamClassifier:

    word_features = []      # collection of unique words
    classifier = None       # placeholder variable for trained model
    
    def extract_tokens(self, text, target):
        """returns array of tuples where each tuple is defined by (tokenized_text, label)
         parameters:
                text: array of texts
                target: array of target labels
                
        NOTE: consider only those words which have all alphabets and atleast 3 characters.
        """
        tup_array = []
        lem = WordNetLemmatizer()
        stop_words_eng = set(stopwords.words('english'))
        for i in range(len(text)):
            tokenized_text = [lem.lemmatize(x.lower()) for x in word_tokenize(text[i]) 
                              if len(x)>=3 
                                  and x.isalpha()
                                  and x not in stop_words_eng
                             ]
            label = target[i]
            tup_array.append(tuple([tokenized_text, label]))
        return tup_array
        
    
    def get_features(self, corpus):
        """ 
        returns a Set of unique words in complete corpus. 
        parameters:- corpus: tokenized corpus along with target labels
        
        Return Type is a set
        """
        all_words = []
        for tokens, _ in corpus:
            for word in tokens:
                all_words.append(word)
        
        fdist = FreqDist(all_words)
        uniqueWords = [x[0] for x in fdist.most_common(2000)]
        return uniqueWords
    
    def extract_features(self, document):
        """
        maps each input text into feature vector
        parameters:- document: string
        
        Return type : A dictionary with keys being the train data set word features.
                      The values correspond to True or False
        """
        feature_vector = dict()
        for word in self.word_features:
            if word in document:
                feature_vector[word] = True
            else:
                feature_vector[word] = False
        return feature_vector

    def train(self, text, labels):
        """
        Returns trained model and set of unique words in training data
        also set trained model to 'self.classifier' variable and set of 
        unique words to 'self.word_features' variable.
        """
        trainTokens = self.extract_tokens(text, labels)
        self.word_features = self.get_features(trainTokens)
        train_feature_vectors = [(self.extract_features(d), c) for (d,c) in trainTokens]
        self.classifier = NaiveBayesClassifier.train(train_feature_vectors)
#         return self.classifier, self.word_features
        
    
    def predict(self, text):
        """
        Returns prediction labels of given input text. 
        Allowed Text can be a simple string i.e one input email, a list of emails, or a dictionary of emails identified by their labels.
        """
        # tokenize
        tokenized_test_X = []
        lem = WordNetLemmatizer()
        for i in range(len(text)):
            tokenized_text = [lem.lemmatize(x) for x in word_tokenize(text[i]) if len(x)>=3 and x.isalpha()]
            tokenized_test_X.append(tokenized_text)
        
        # create feature vectors
        test_feature_vectors = [self.extract_features(x) for x in tokenized_test_X]
        test_results = self.classifier.classify_many(test_feature_vectors)
        
        return test_results, test_feature_vectors
    
    def accuracy(self, feature_vectors, actual_labels):
        """
        Returns accuracy.
        Input Parameters: Feature vectors for test data strings, Actual labels for test set
        To be called after running Predict function
        """
        vector_labels = list(zip(feature_vectors, actual_labels))
        acc = accuracy(self.classifier, vector_labels)
        return acc

#### Create Object of Model Class and Train the Model

In [95]:
classifier = SpamClassifier()
classifier.train(train_X, train_Y)

#### Make Predictions and Test Model Accuracy

In [96]:
predictions, feature_vectors = classifier.predict(test_X)
predictions[:5]

[0, 0, 0, 0, 0]

In [98]:
print(f"Model Accuracy is {round(classifier.accuracy(feature_vectors, test_Y)*100,2)}")

Model Accuracy is 94.69


Our model gives ~95% accuracy!