In [None]:
import datetime
import math
import pickle
import re
from collections import OrderedDict
from heapq import nlargest
from itertools import islice

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [None]:
# Neural Network spam classifier
## To classify messages as spam or ham, we use a neural network. The model has one input layer, one hidden layer and one output layer, and uses the sigmoid function as its activation function.

In [None]:
class NeuralNetwork:
    """Fully connected network with one hidden layer and sigmoid activations.

    Weights are drawn uniformly from [-1, 1) and every bias starts at 1.
    Internally all vectors are column vectors of shape (nodes, 1).
    """

    def __init__(self, input_nodes, hidden_nodes, output_nodes, learning_rate=0.1):
        """Create the weight matrices and bias vectors for the given layer sizes."""
        self.input_nodes = input_nodes
        self.hidden_nodes = hidden_nodes
        self.output_nodes = output_nodes
        self.weights_input_hidden = np.random.uniform(-1,1,size=(hidden_nodes, input_nodes))
        self.weights_hidden_output = np.random.uniform(-1,1,size=(output_nodes, hidden_nodes))
        self.bias_hidden = np.ones((hidden_nodes,1))
        self.bias_output = np.ones((output_nodes,1))
        self.learning_rate=learning_rate

    def sigmoid(self,x):
        """Return the logistic function 1 / (1 + e^-x) of a scalar or an array.

        The exponential is only ever evaluated on non-positive values, so very
        negative inputs saturate to 0.0 instead of raising OverflowError.
        A scalar input yields a Python float; an array input yields an array.
        """
        values = np.asarray(x, dtype=float)
        decay = np.exp(-np.abs(values))
        result = np.where(values >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
        return float(result) if result.ndim == 0 else result

    def derivate(self,x):
        """Sigmoid derivative expressed in terms of the sigmoid *output* x."""
        return x*(1-x)

    def _as_column(self, vector, size):
        """Coerce `vector` to a float column vector of shape (size, 1)."""
        # dtype=float also converts object-dtype arrays (e.g. a slice of a
        # mixed-type DataFrame row), so the weights never become object dtype.
        return np.asarray(vector, dtype=float).reshape((size, 1))

    def _forward(self, input_vector):
        """Run the forward pass on a column vector; return (hidden, output)."""
        hidden = self.sigmoid(np.dot(self.weights_input_hidden, input_vector) + self.bias_hidden)
        output = self.sigmoid(np.dot(self.weights_hidden_output, hidden) + self.bias_output)
        return hidden, output

    def feedforward(self,input_v):
        """Return the network output, shape (output_nodes, 1), for one input vector."""
        _, output = self._forward(self._as_column(input_v, self.input_nodes))
        return output

    def backpropagation(self, input_v, target_v):
        """Perform one training step on a single sample and update the parameters.

        Returns the output error (target - output) summed over axis 0.
        """
        input_vector = self._as_column(input_v, self.input_nodes)
        target_vector = self._as_column(target_v, self.output_nodes)

        hidden, output = self._forward(input_vector)

        output_error = target_vector - output
        error = output_error.sum(0)

        # The hidden-layer error must be derived from the weights that produced
        # this forward pass, so compute it BEFORE weights_hidden_output is updated.
        # As in the original code, the raw output error is what gets propagated.
        hidden_error = np.dot(np.transpose(self.weights_hidden_output), output_error)

        gradient = self.derivate(output) * output_error * self.learning_rate
        self.weights_hidden_output = self.weights_hidden_output + np.dot(gradient, np.transpose(hidden))
        self.bias_output = self.bias_output + gradient

        hidden_gradient = self.derivate(hidden) * hidden_error * self.learning_rate
        self.weights_input_hidden = self.weights_input_hidden + np.dot(hidden_gradient, np.transpose(input_vector))
        self.bias_hidden = self.bias_hidden + hidden_gradient

        return error

    def train(self, train_dataframe, epochs, error_sample=200):
        """Train on every row of `train_dataframe` for `epochs` passes.

        Each row must have a 'label_tag' column, used as the target. The input
        vector is the row with its first and last entries dropped.
        NOTE(review): presumably the first column is the raw message/id and the
        last one is 'label_tag' -- confirm against the frame built by the caller.

        Every `error_sample`-th step the error is printed and recorded; the
        recorded errors are returned as an array.
        """
        spam = 0
        iteration = 0
        errors = []

        for i in range(epochs):
            print("Epoch", i)
            for index, row in train_dataframe.iterrows():
                spam += row['label_tag']
                input_v = row.to_numpy()[1:-1]
                target_v = np.array([row['label_tag']])
                error = self.backpropagation(input_v, target_v)

                if iteration%error_sample == 0:
                    errors.append(error)
                    print("Iteration", iteration, "error", error)
                iteration += 1
            print("\n")

        # spam accumulates the label once per visited row, across all epochs
        ham = (len(train_dataframe)*epochs)-spam
        print(f"Spam:{spam} - Ham:{ham}")
        print("Done")

        return np.array(errors)

In [None]:
# These are utility classes for parsing, reading and saving data from the datasets

In [None]:
class DataUtil:
    """Static helpers for normalising, tokenising and ranking message text."""

    # Shared NLTK resources, built once when the class is defined.
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    @staticmethod
    def normalize_data(message):
        """Replace prices, percentages, URLs and e-mail addresses with
        placeholder words, then turn every non-letter character into a space."""
        message = re.sub(r"\$[\d]+",'price',message)
        # NOTE(review): this matches '%50', not '50%' -- confirm that is intended.
        message = re.sub(r"\%[\d]+",'percentage',message)
        message = re.sub(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+",'url',message)
        # The dot is escaped so that only a literal "www." starts a URL
        # (unescaped, any word containing "www" plus two characters matched).
        message = re.sub(r"www\.(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+",'url',message)
        # No ^...$ anchors: addresses embedded anywhere in the message are
        # replaced, not only messages consisting solely of an address.
        message = re.sub(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9.-]+",'email',message)
        message = re.sub(r'[\W\s\d]',' ',message)
        return message

    @staticmethod
    def clean_data(message):
        """Lower-case, normalise and tokenise `message`; return the lemmatised
        tokens that are longer than two characters and are not stop words."""
        message = DataUtil.normalize_data(message.lower())
        return [
            DataUtil.lemmatizer.lemmatize(word)
            for word in nltk.word_tokenize(message)
            if word not in DataUtil.stop_words and len(word)>2
        ]

    @staticmethod
    def order_and_take(data, key, n=None):
        """Sort `data` (a dict of dicts) by each value's `key` entry, descending.

        Returns an OrderedDict of all items, or a plain dict holding only the
        first `n` items when `n` is given.
        """
        data = OrderedDict(sorted(data.items(), key=lambda i: i[1][key], reverse=True))
        if n is not None:
            data = dict(islice(data.items(), n))
        return data


class DocumentReader:
    """Builds a vocabulary from the 'message' column of a CSV file."""

    def __init__(self, document):
        """Remember the CSV path (or buffer) that `get_words` will read."""
        self.document = document
        self.words_data = {}

    def get_words(self, min_count=10, top_n=3000):
        """Return the most frequent cleaned words of the document.

        Every message is tokenised with DataUtil.clean_data. Words occurring
        more than `min_count` times are kept, and the `top_n` most frequent of
        those are returned as a list, most frequent first.
        """
        df = pd.read_csv(self.document)
        words_list = {}

        for message in df['message']:
            for word in DataUtil.clean_data(message):
                words_list[word] = words_list.get(word, 0) + 1

        result = {key: val for key, val in words_list.items() if val > min_count}
        # heapq.nlargest over a dict iterates its keys, so this yields words.
        return nlargest(top_n, result, key=result.get)

In [None]:
class Data:
    """Feature-building helpers: term frequencies, tf-idf and vocabulary vectors."""

    @staticmethod
    def tf(sentences):
        """Map each cleaned word to {'sentences': {sentence_index: frequency}}.

        The frequency is the word's occurrence count in that sentence divided
        by the number of cleaned tokens of the sentence.
        """
        term_stats = {}
        for position, sentence in enumerate(sentences):
            tokens = DataUtil.clean_data(sentence)
            for token in tokens:
                per_sentence = term_stats.setdefault(token, {'sentences': {}})['sentences']
                per_sentence[position] = per_sentence.get(position, 0) + 1/len(tokens)
        return term_stats

    @staticmethod
    def tf_idf(message):
        """Return {word: [score per sentence]} for the sentences of `message`.

        A score is the word's term frequency in the sentence multiplied by
        log(sentence count / number of sentences containing the word);
        sentences that do not contain the word keep a 0.
        """
        sentences = nltk.sent_tokenize(message)
        sentence_total = len(sentences)
        scores = {}

        for term, stats in Data.tf(sentences).items():
            occurrences = stats['sentences']
            idf = math.log(sentence_total/len(occurrences))
            row = [0] * sentence_total
            for position, ratio in occurrences.items():
                row[position] = ratio * idf
            scores[term] = row
        return scores

    @staticmethod
    def get_inputs_count(message, words_list):
        """Return a float vector with 1 where the vocabulary word at that
        position occurs in the cleaned `message`, and 0 elsewhere."""
        tokens = DataUtil.clean_data(message)
        return np.array([1.0 if vocab_word in tokens else 0.0 for vocab_word in words_list])

    @staticmethod
    def load_unique_words(dataframe):
        """Return a dict mapping every value of the 'word' column to 0."""
        return {row['word']: 0 for _, row in dataframe.iterrows()}

In [None]:
# First, we need to collect the unique words in our dataset: we parse each message into tokens, discard words that occur 10 times or fewer, and keep the 3000 most frequent of the rest