In [2]:
import pandas as pd

df = pd.read_csv('spam.csv', encoding='latin-1', names=['class', 'text', '', '', ''], header=None)
df = df[1:]
df = df[['class', 'text']]
df['class'] = (df['class'] == 'spam').astype(int)
df.head()

Unnamed: 0,class,text
1,0,"Go until jurong point, crazy.. Available only ..."
2,0,Ok lar... Joking wif u oni...
3,1,Free entry in 2 a wkly comp to win FA Cup fina...
4,0,U dun say so early hor... U c already then say...
5,0,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
from sklearn.model_selection import StratifiedShuffleSplit

sss = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
train_index, test_index = next(sss.split(df['text'], df['class']))
df.iloc[train_index].head()

Unnamed: 0,class,text
4913,1,"Goal! Arsenal 4 (Henry, 7 v Liverpool 2 Henry ..."
2542,0,I dont. Can you send it to me. Plus how's mode.
5324,0,Aah bless! How's your arm?
5172,0,Oh k. . I will come tomorrow
2533,0,Yup ok...


In [4]:
df.iloc[0].text

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [5]:
from nltk import word_tokenize

tokens = word_tokenize(df.iloc[0].text)
tokens

['Go',
 'until',
 'jurong',
 'point',
 ',',
 'crazy..',
 'Available',
 'only',
 'in',
 'bugis',
 'n',
 'great',
 'world',
 'la',
 'e',
 'buffet',
 '...',
 'Cine',
 'there',
 'got',
 'amore',
 'wat',
 '...']

In [6]:
from nltk.corpus import stopwords

filtered_tokens = list(filter(lambda x: x not in stopwords.words('english'), tokens) )
filtered_tokens

['Go',
 'jurong',
 'point',
 ',',
 'crazy..',
 'Available',
 'bugis',
 'n',
 'great',
 'world',
 'la',
 'e',
 'buffet',
 '...',
 'Cine',
 'got',
 'amore',
 'wat',
 '...']

In [7]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")
stemmed_tokens = [stemmer.stem(x) for x in filtered_tokens]
stemmed_tokens

['go',
 'jurong',
 'point',
 ',',
 'crazy..',
 'avail',
 'bugi',
 'n',
 'great',
 'world',
 'la',
 'e',
 'buffet',
 '...',
 'cine',
 'got',
 'amor',
 'wat',
 '...']

In [243]:
import math as m
import random as rnd
import re

class NaiveBayesText():
    def __init__(self, categories):
        self.categories = categories
        self.stemmer = SnowballStemmer('english')
        self.stopwords = stopwords.words('english')
        self.replace = {
            'http|www\.': 'weblink',
            '\d{3,}': '00000',
            '\d{1,3}': '11111',
            '\@': '@@@@@',
            '\d{1,2}\.{\d{1,2}}': 'timee',
            '\!': '!!!!!' 
        }
        
    def fit(self, texts, categories):
        self.category_probabilities = {}
        self.word_probabilities = {}
        self.category_counts = {1:0, 0:0}
        self.word_counts = {}
        self.category_lengths = {1:0, 0:0}
        
        coll = zip(texts, categories)
        for text, category in coll:
            self._fit_one_text(text, category)
            
        assert sum(self.category_counts.values()) == len(texts)
            
        for c in self.categories:
            self.category_probabilities[c] = self.category_counts[c]/len(texts)
        
        for word, counts in self.word_counts.items():
            self.word_probabilities[word] = {}
            
            for c in self.categories:
                self.word_probabilities[word][c] = (counts[c] + 1)/(counts['all'] + len(self.categories))
                
            assert sum(self.word_probabilities[word].values()) == 1
                
    def predict(self, texts):
        word_probas = {}
        probas = []
        for text in texts:
            proba = self._predict_one(text)
            probas.append(proba)
        return probas
    
    def generate(self, category, n):
        s = ''
        wp = list(self.word_probabilities.items())
        i = 0
        while i < n:
            wl = len(wp)
            windex = rnd.randint(0, wl)
            w = wp[windex]
            if rnd.random() < w[1][category]:
                s += w[0] + ' '
                i += 1
        
        return s
    
    def _predict_one(self, text):
        stemmed_filtered_tokens = self._stem_tokenize_stop_words(text)
        
        log_proba_spam = self.category_probabilities[1] #m.log(self.category_probabilities[1])
        log_proba_hum = self.category_probabilities[0] #m.log(self.category_probabilities[0])
        for word in stemmed_filtered_tokens:
            if word not in self.word_probabilities: continue
                
            log_proba_word_spam = self.word_probabilities[word][1] #m.log(self.word_probabilities[word][1])
            log_proba_word_hum = self.word_probabilities[word][0] #m.log(self.word_probabilities[word][0])
            log_proba_spam *= log_proba_word_spam
            log_proba_hum *= log_proba_word_hum
        
        proba_spam = log_proba_spam#m.exp(log_proba_spam)
        proba_hum = log_proba_hum#m.exp(log_proba_hum)
        
        return proba_spam/(proba_hum+proba_spam)
    
    def _smart_replace(self, token):
        token = token.lower()
        for pattern, replace in self.replace.items():
            if re.search(pattern, token) is not None:
                token = replace
        return token
    
    def _smart_filter(self, token):
        return len(token) > 4 and token not in self.stopwords
        #return True
        
    def _stem_tokenize_stop_words(self, text):
        return filter(self._smart_filter, [self._smart_replace(x) for x in word_tokenize(text)])
            
    def _fit_one_text(self, text, category):
        stemmed_filtered_tokens = list(self._stem_tokenize_stop_words(text))
            
        self.category_counts[category] += 1
        
        for word in stemmed_filtered_tokens:
            self._fit_one_word(word, category)
            
    def _fit_one_word(self, word, category):
        if word not in self.word_counts:
            self.word_counts[word] = {'all':0, 0:0, 1:0}
        
        self.word_counts[word][category] += 1
        self.word_counts[word]['all'] += 1

In [244]:
b = NaiveBayesText([0,1])
b.fit(list(df.iloc[train_index]['text']), list(df.iloc[train_index]['class']))

In [245]:
b.generate(1, 8)

'lucozade model..sony isn\x89ûªt since addie wiskey andrews-boy rocking '

In [246]:
prediction_train = b.predict(list(df.iloc[train_index]['text']))

In [247]:
from sklearn.metrics import precision_score, recall_score, f1_score
threshold = 0.5

y_pred_train = [1 if x > threshold else 0 for x in prediction_train]
y_true_train = list(df.iloc[train_index]['class'])
precision_score(y_true_train, y_pred_train)

0.97967479674796742

In [248]:
recall_score(y_true_train, y_pred_train)

0.92160611854684515

In [249]:
prediction = b.predict(list(df.iloc[test_index]['text']))

In [250]:
threshold = 0.5

y_pred = [1 if x > threshold else 0 for x in prediction]
y_true = list(df.iloc[test_index]['class'])
precision_score(y_true, y_pred)

0.9563106796116505

In [251]:
recall_score(y_true, y_pred)

0.8794642857142857

In [252]:
f1_score(y_true, y_pred)

0.91627906976744178