In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline 
%config Completer.use_jedi = False

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

# Tasks 

- [x] Explain Simple and Optimized Implementations of the Algorithm Step by Step

# Imports

In [3]:
# from drawdata import draw_scatter
import pandas as pd
import numpy as np
import plotly as pl
import plotly.express as px
import plotly.graph_objects as go
import math
from pathlib import Path
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from tqdm import tqdm
from tqdm.notebook import tqdm
import gc
import re
from collections import defaultdict
from typing import List, Union

from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer

In [4]:
np.random.seed(42)
pl.io.renderers.default = 'iframe_connected'

# Bayes' Theorom Python Implementation

In [4]:
def bayes_theorom(p_a, p_b_given_a, p_b_given_not_a):
    not_a = 1 - p_a
    p_b = p_b_given_a * p_a + p_b_given_not_a * not_a
    p_a_given_b = (p_b_given_a * p_a)/p_b
    return p_a_given_b

# Naive Bayes Classifier

## Dummy Example

In [5]:
from sklearn.datasets import make_blobs

In [6]:
X, y = make_blobs(n_samples=100, centers=2, n_features=2, random_state=1)
X.shape, y.shape

((100, 2), (100,))

In [7]:
X[:5]

array([[-0.79415228,  2.10495117],
       [-9.15155186, -4.81286449],
       [-3.10367371,  3.90202401],
       [-1.42946517,  5.16850105],
       [-7.4693868 , -4.20198333]])

In [8]:
y[:5]

array([0, 1, 0, 0, 1])

In [9]:
from scipy.stats import norm

In [10]:
# fit a probability distribution to a univariate data sample
def fit_distribution(data):
    mu = np.mean(data)
    sigma = np.std(data)
    print(mu, sigma)
    dist = norm(mu, sigma)
    return dist

In [11]:
fit_distribution(X[:,0])

-5.6222329955827375 4.145474011953628


<scipy.stats._distn_infrastructure.rv_frozen at 0x130058b8820>

In [12]:
# data by class
Xy0 = X[y==0]
Xy1 = X[y==1]

In [13]:
Xy0.shape, Xy1.shape

((50, 2), (50, 2))

In [14]:
# calculate the priors 
prior0 = len(Xy0)/len(X)
prior1 = len(Xy1)/len(X)

In [15]:
prior0, prior1

(0.5, 0.5)

In [16]:
X0y0= fit_distribution(Xy0[:,0])
X1y0 = fit_distribution(Xy0[:,1])

-1.5632888906409914 0.787444265443213
4.426680361487157 0.958296071258367


In [17]:
X0y1= fit_distribution(Xy1[:,0])
X1y1 = fit_distribution(Xy1[:,1])

-9.681177100524485 0.8943078901048118
-3.9713794295185845 0.9308177595208521


In [18]:
def probability(X, prior, dist1, dist2):
    return prior * dist1.pdf(X[0]) * dist2.pdf(X[1])

In [19]:
# classify one example
Xsample, ysample = X[0], y[0]

In [20]:
py0 = probability(Xsample, prior0, X0y0, X1y0)
py1 = probability(Xsample, prior1, X0y1, X1y1)

In [21]:
print('P(y=0 | %s) = %.3f' % (Xsample, py0*100))
print('P(y=1 | %s) = %.3f' % (Xsample, py1*100))

P(y=0 | [-0.79415228  2.10495117]) = 0.348
P(y=1 | [-0.79415228  2.10495117]) = 0.000


In [22]:
ysample

0

# Implement from Scratch

### Load IRIS Data

In [23]:
from sklearn import datasets

In [24]:
data = datasets.load_iris()

In [25]:
X = data['data']
y = data['target']

In [26]:
X.shape

(150, 4)

In [27]:
X[:1, :]

array([[5.1, 3.5, 1.4, 0.2]])

In [28]:
data['feature_names']

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [29]:
data.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'])

In [30]:
data['target_names']

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [31]:
X.shape

(150, 4)

In [32]:
data = np.concatenate((X, y.reshape(-1,1)), axis=1)

## Gaussian NB 

### 1. Separate by class

In [33]:
set(data[:,-1])

{0.0, 1.0, 2.0}

In [34]:
def separate_by_class(data):
    separated = dict()
    classes = set(data[:,-1])
    for c in classes:
        separated[c] = data[data[:,-1]==c]
    return separated

In [35]:
separated = separate_by_class(data)
separated.keys()

dict_keys([0.0, 1.0, 2.0])

### 2. Summarize Dataset

In [36]:
from math import sqrt

In [37]:
def mean(numbers):
    return sum(numbers)/float(len(numbers))

In [38]:
def std(numbers):
    avg = mean(numbers)
    variance =sum([(x - avg)**2 for x in numbers])/float(len(numbers)-1)
    return np.sqrt(variance)

In [39]:
[(mean(data[:, col]), std(data[:, col]), len(data[:, col])) for col in range(data.shape[1]-1)]

[(5.843333333333335, 0.8280661279778629, 150),
 (3.057333333333334, 0.435866284936698, 150),
 (3.7580000000000027, 1.7652982332594667, 150),
 (1.199333333333334, 0.7622376689603465, 150)]

In [40]:
def summarize_dataset(data):
    summaries = [(mean(data[:, col]), std(data[:, col]), len(data[:, col])) for col in range(data.shape[1]-1)]
    return summaries

### 3. Summarize Data by Class

In [41]:
def summarize_by_class(data):
    separated = separate_by_class(data)
    summaries = dict()
    for label, rows in separated.items():
        summaries[label] = summarize_dataset(rows)
    return summaries

In [42]:
summary = summarize_by_class(data)

In [43]:
summary

{0.0: [(5.005999999999999, 0.3524896872134512, 50),
  (3.428000000000001, 0.3790643690962886, 50),
  (1.4620000000000002, 0.1736639964801841, 50),
  (0.2459999999999999, 0.10538558938004569, 50)],
 1.0: [(5.936, 0.5161711470638635, 50),
  (2.7700000000000005, 0.3137983233784114, 50),
  (4.26, 0.46991097723995806, 50),
  (1.3259999999999998, 0.197752680004544, 50)],
 2.0: [(6.587999999999998, 0.635879593274432, 50),
  (2.9739999999999998, 0.3224966381726376, 50),
  (5.552, 0.5518946956639835, 50),
  (2.026, 0.27465005563666733, 50)]}

In [44]:
summary[0][0][2]

50

### 4. Gaussian Probability Distribution Function

In [45]:
def calculate_gaussian_probability(x, mean, std):
    exponent = np.exp(-((x-mean)**2/(2*std**2)))
    return (1/(np.sqrt(2*np.pi)*std)) * exponent

In [46]:
calculate_gaussian_probability(1, 1, 1)

0.3989422804014327

In [47]:
calculate_gaussian_probability(2, 1, 1), calculate_gaussian_probability(0, 1, 1)

(0.24197072451914337, 0.24197072451914337)

### Class Probabilities

In [48]:
def calculate_class_probabilities(summaries, row):
    total_rows = sum([summaries[label][0][2] for label in summaries])
    probabilities = dict()
    for label, class_summaries in summaries.items():
        probabilities[label] = summaries[label][0][2]/float(total_rows)
        for i in range(len(class_summaries)):
            mean, std, count =  class_summaries[i]
            probabilities[label] *= calculate_gaussian_probability(row[i], mean, std)
    return probabilities

In [49]:
sample = data[0][:-1]

In [50]:
sample

array([5.1, 3.5, 1.4, 0.2])

In [51]:
calculate_class_probabilities(summary, sample)

{0.0: 2.7915339171768885,
 1.0: 8.322426199968131e-18,
 2.0: 6.008422572010989e-25}

## Gaussian NB Optimized

### Calculation Tests

In [52]:
data = pd.DataFrame(data, columns=['sepal length', 'sepal width', 'petal length', 'petal width', 'target'])

In [53]:
m, v = data.groupby('target').apply(np.mean).values[:,:-1], data.groupby('target').apply(np.var).values[:,:-1]

In [54]:
m

array([[5.006, 3.428, 1.462, 0.246],
       [5.936, 2.77 , 4.26 , 1.326],
       [6.588, 2.974, 5.552, 2.026]])

In [55]:
v

array([[0.121764, 0.140816, 0.029556, 0.010884],
       [0.261104, 0.0965  , 0.2164  , 0.038324],
       [0.396256, 0.101924, 0.298496, 0.073924]])

In [56]:
np.log(0.33333333)

-1.0986122986681097

In [57]:
np.log(1)

0.0

In [58]:
np.log(33)

3.4965075614664802

In [59]:
np.exp(3.4965075614664802)

33.0

In [62]:
# np.sum(np.log((np.exp((-1/2)*((samples.iloc[0,:-1].values-m)**2)/(2*v))))[0])+np.log(0.33333333)

In [None]:
np.sum(np.log((np.exp((-1/2)*((samples.iloc[0,:-1].values-m)**2)/(2*v))))[1])+np.log(0.33333333)

In [None]:
np.sum(np.log((np.exp((-1/2)*((samples.iloc[0,:-1].values-m)**2)/(2*v))))[2])+np.log(0.33333333)

In [None]:
stats[0][0]

In [None]:
stats[0][1:]

In [None]:
(data.groupby('target')[data.columns[0]].count()/data.shape[0]).values

In [None]:
len(data.iloc[0,:-1].values)

In [None]:
data['target'].unique()

In [None]:
data.target.value_counts(normalize=True)

### GNB with Numpy

In [75]:
class GaussianNBCustom():
    def __init__(self,):
        pass
    
    def fit(self, data, target):
        """ calculate mean and variance of each feature per class
            data: pd.DataFrame
            target: target column name
        """
        self.classes = data['target'].unique()
        
        self.mean, self.var = data.groupby('target').apply(np.mean).values[:,:-1], data.groupby('target').apply(np.var).values[:,:-1]
        
        self.prior = (data.groupby('target')[data.columns[0]].count()/data.shape[0]).values
    
    def gaussian_pdf(self, class_idx, x):
        mean, var = self.mean[class_idx], self.var[class_idx]
        numerator = np.exp((-1/2)*((x-mean)**2)/(2*var))
        denominator = np.sqrt(2*np.pi*var)
        prob = numerator/denominator
        return prob
    
    def calc_posterior(self, x):
        posteriors = []
        for i in range(len(self.classes)):
            prior = np.log(self.prior[i])
            conditional = np.sum(np.log(self.gaussian_pdf(i, x)))
            posterior = prior + conditional
            posteriors.append(posterior)
        
        return self.classes[np.argmax(posteriors)]
        
    def predict(self, X):
        preds = [self.calc_posterior(x) for x in X.values]
        return preds        
        

In [76]:
gnb = GaussianNBCustom()

In [77]:
gnb.fit(data, 'target')

In [78]:
samples = data.sample(10)

In [79]:
preds = gnb.predict(samples.iloc[:,:-1])

In [80]:
[(x==y) for x, y in zip(preds, samples.target.tolist())]

[True, True, True, True, True, True, True, True, True, True]

## Multinomial Naive Bayes

### Get Data

In [81]:
Path.cwd()

WindowsPath('D:/amit/ml_indepth/naivebayes_mle_map')

In [82]:
sms_df = pd.read_csv('../datasets/SMSSpamCollection', header=None, sep='\t', names=['label', 'sms'])

In [83]:
sms_df

Unnamed: 0,label,sms
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [84]:
sms_df.label.value_counts(normalize=True)

ham     0.865937
spam    0.134063
Name: label, dtype: float64

### Split Train Test

#### Manually

In [85]:
sms_randomized = sms_df.sample(frac=1, random_state=42)

In [86]:
training_test_idx = round(len(sms_randomized)*0.8)
training_test_idx

4458

In [87]:
train_set = sms_randomized[:training_test_idx].reset_index(drop=True)

In [88]:
test_set = sms_randomized[training_test_idx:].reset_index(drop=True)

In [89]:
train_set.shape, test_set.shape

((4458, 2), (1114, 2))

In [90]:
train_set.label.value_counts(normalize=True)

ham     0.866981
spam    0.133019
Name: label, dtype: float64

In [91]:
test_set.label.value_counts(normalize=True)

ham     0.861759
spam    0.138241
Name: label, dtype: float64

#### Stratified Shuffle Split

In [92]:
from sklearn.model_selection import StratifiedShuffleSplit

In [93]:
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)

In [94]:
sss.get_n_splits(sms_df.sms.values, sms_df.label.values)

5

In [95]:
for train_index, test_index in sss.split(sms_df.sms.values, sms_df.label.values):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = sms_df.sms.values[train_index], sms_df.sms.values[test_index]
    y_train, y_test = sms_df.label.values[train_index], sms_df.label.values[test_index]

TRAIN: [ 184 2171 5422 ... 2309 1904  762] TEST: [2825 3695 3904 ... 2015 3380  785]
TRAIN: [3535 3026 1592 ... 3319  545 2131] TEST: [1573  966 4323 ... 3427 3784 2526]
TRAIN: [2528  635  699 ...  485 2784 3898] TEST: [1113 5041 5065 ... 3516 2366 1821]
TRAIN: [1011 3634 1222 ... 2441 4439 2403] TEST: [5059 1488 4018 ...  496 4096 2110]
TRAIN: [ 752 4008 5511 ... 4794 1155 1479] TEST: [1115 3969 3191 ...   31 2069 2314]


In [96]:
X_train.shape, y_train.shape

((4457,), (4457,))

In [97]:
X_test.shape, y_test.shape

((1115,), (1115,))

In [98]:
from collections import Counter

In [99]:
{k:round(v/len(y_train),5) for k, v in Counter(y_train).items()}

{'spam': 0.13417, 'ham': 0.86583}

In [100]:
train_set = pd.DataFrame(zip(X_train, y_train), columns=['sms', 'label'])
test_set = pd.DataFrame(zip(X_test, y_test), columns=['sms', 'label'])

In [101]:
train_set.shape, test_set.shape

((4457, 2), (1115, 2))

In [102]:
train_set.label.value_counts(normalize=True)

ham     0.865829
spam    0.134171
Name: label, dtype: float64

In [103]:
test_set.label.value_counts(normalize=True)

ham     0.866368
spam    0.133632
Name: label, dtype: float64

### Data Cleaning

In [None]:
train_set.rename(columns={'sms':'TEXT'}, inplace=True)

In [302]:
train_set.TEXT = train_set.TEXT.str.lower().str.replace('\W', ' ', regex=True)

In [303]:
train_set.TEXT.values[0]

'you have an important customer service announcement from premier '

In [305]:
%%time
vocab = set(' '.join(train_set.TEXT.values).replace('\W', ' ').split())
vocab = list(vocab - set(stopwords.words('english')))

Wall time: 10.3 ms


In [306]:
len(vocab)

7670

In [109]:
for idx, sms in enumerate(train_set.sms):
    for word in sms.split():
        word_counts_per_sms[word][idx] += 1

In [310]:
word_counts = pd.DataFrame(word_counts_per_sms)

In [311]:
word_counts

Unnamed: 0,cs,cricket,serena,hit,sing,linerental,enters,meet,ate,changes,...,norm150p,prometazine,09066362220,med,sportsx,scool,poop,toss,bribe,vijaykanth
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4452,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4453,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4454,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4455,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [312]:
train_set_words = pd.concat([train_set, word_counts], axis=1)
train_set_words.head()

Unnamed: 0,TEXT,label,n_words_in_text,cs,cricket,serena,hit,sing,linerental,enters,...,norm150p,prometazine,09066362220,med,sportsx,scool,poop,toss,bribe,vijaykanth
0,you have an important customer service announc...,spam,9,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,i m reaching home in 5 min,ham,7,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,it s reassuring in this crazy world,ham,7,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,are you staying in town,ham,5,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,you will go to walmart i ll stay,ham,8,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [314]:
train_set_words.groupby('label').sum()

Unnamed: 0_level_0,n_words_in_text,cs,cricket,serena,hit,sing,linerental,enters,meet,ate,...,norm150p,prometazine,09066362220,med,sportsx,scool,poop,toss,bribe,vijaykanth
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ham,57196,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
spam,15148,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [313]:
train_set_words.label.value_counts(normalize=True, sort=True, ascending=True)

spam    0.134171
ham     0.865829
Name: label, dtype: float64

In [115]:
p_spam, p_ham = train_set_words.label.value_counts(normalize=True, sort=True, ascending=True).values

In [116]:
p_spam, p_ham

(0.13417096701817366, 0.8658290329818263)

In [117]:
train_set_words['n_words_in_text'] = train_set_words.TEXT.str.split().apply(len)

In [118]:
n_ham, n_spam = train_set_words.groupby('label').n_words_in_text.sum().values

In [119]:
n_ham, n_spam, p_ham, p_spam

(57196, 15148, 0.8658290329818263, 0.13417096701817366)

In [120]:
train_set_words.drop(columns='n_words_in_text', inplace=True)

In [121]:
n_vocab = len(vocab)

In [122]:
alpha = 1

In [266]:
train_set.TEXT

0       you have an important customer service announc...
1                             i m reaching home in 5 min 
2                   it s reassuring  in this crazy world 
3                               are you staying in town  
4                      you will go to walmart  i ll stay 
                              ...                        
4452    claim a 200 shopping spree  just call 08717895...
4453    jus telling u dat i ll b leaving 4 shanghai on...
4454    or u ask they all if next sat can a not  if al...
4455    did u find a sitter for kaitlyn  i was sick an...
4456    i think its far more than that but find out  c...
Name: TEXT, Length: 4457, dtype: object

### Model

In [333]:
class MultinomialNaiveBayesCustom():
    def __init__(self, alpha=1):
        """
        alpha: smoothing variable, if "1": laplacian smoothing
        """
        self.alpha = alpha

    def fit(self, data, X, y):
        """
        data: pandas dataframe containing data
        X: feature column name
        y: target column name
        """
        self.X, self.y = X, y
        self.labels = data[y].unique()

        priors = data.label.value_counts(
            normalize=True, sort=True, ascending=True)
        self.stats = {'p_' + c: priors[c] for c in self.labels}

        data['n_words_in_text'] = data[X].str.split().apply(len)

        feature_value_counts = data.groupby(y).n_words_in_text.sum()
        self.stats.update(
            {'n_'+c: feature_value_counts[c] for c in self.labels})

        data[X] = data[X].str.lower().str.replace('\W', ' ', regex=True)

        self.vocab = set(' '.join(data[X].values).split())
        
        self.vocab = list(self.vocab - set(stopwords.words('english')))

        self.stats.update({'n_vocab': len(self.vocab)})

        word_counts_per_sms = {unique_word: [0] * len(data) for unique_word in self.vocab}

        for idx, text in enumerate(data[X]):
            for word in text.split():
                if word in self.vocab:
                    word_counts_per_sms[word][idx] += 1

        word_counts = pd.DataFrame(word_counts_per_sms)

        self.data = pd.concat([data, word_counts], axis=1)

        self.data.drop(columns='n_words_in_text', inplace=True)
        del word_counts, priors, word_counts_per_sms, feature_value_counts, data
        gc.collect()

        self.parameters = self.calc_prob_feature_given_label()

    def calc_prob_feature_given_label(self):
        label_wise_feature_count = self.data.loc[:, self.data.columns != self.X].groupby(
            self.y).sum()
        parameters = {'param_'+c: ((label_wise_feature_count.loc[c]+self.alpha)/(
            self.stats['n_'+c]+(self.stats['n_vocab']*self.alpha))).to_dict() for c in self.labels}

        del label_wise_feature_count
        gc.collect()

        return parameters

    def predict(self, X: Union[str, list]):
        """ predicts the lables of input string or list of strings
            X: text messages
        """
        if not isinstance(X, list):
            X = list(X)

        X = [list(set(re.sub('\W', ' ', x.lower()).split())) for x in X]

        self.test_features_ = X

        self.log_probabilities = defaultdict(list)
        
        {self.log_probabilities[idx].append((np.log(self.stats['p_'+c]) +
                                                 sum(np.log([self.parameters['param_'+c][word]
                                                             for word in words if word in self.vocab])
                                                     ))) for idx, words in enumerate(X) for c in self.labels}

        self.predictions = [self.labels[np.argmax(
            v)] for k, v in self.log_probabilities.items()]

        return self.predictions

    def evaluate_metrics(self, y):

        if isinstance(y, list):
            list(y)
        truth_values = [p == t for p, t in zip(self.predictions, y)]
        self.misclassified_idx_ = [idx for idx,
                                   t in enumerate(truth_values) if not t]
        self.metrics = {'accuracy': sum(truth_values)/len(y)}
        self.prediction_df = pd.DataFrame(self.log_probabilities).T
        self.prediction_df.columns = self.labels
        self.prediction_df['text'] = self.test_features_
        return self.metrics

    def get_parameter_value(self, word):
        if word in mnb.vocab:
            return [mnb.parameters['param_'+c][word] for c in mnb.labels]
        else:
            return np.nan


In [335]:
%%time
mnb = MultinomialNaiveBayesCustom(alpha=1)
mnb.fit(train_set, 'TEXT', 'label')

Wall time: 12.3 s


In [184]:
mnb.stats

{'p_spam': 0.13417096701817366,
 'p_ham': 0.8658290329818263,
 'n_spam': 15148,
 'n_ham': 57196,
 'n_vocab': 7670}

In [185]:
mnb.data.head()

Unnamed: 0,TEXT,label,cricket,sing,enters,changes,45pm,collected,manege,quizzes,...,lit,shindig,thepub,forgets,prometazine,09066362220,sportsx,scool,toss,vijaykanth
0,you have an important customer service announc...,spam,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,i m reaching home in 5 min,ham,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,it s reassuring in this crazy world,ham,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,are you staying in town,ham,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,you will go to walmart i ll stay,ham,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [186]:
mnb.labels

array(['spam', 'ham'], dtype=object)

In [187]:
%%time
preds = mnb.predict(test_set.sms.tolist())

Wall time: 2.25 s


In [188]:
%%time
metrics = mnb.evaluate_metrics(test_set.label)

Wall time: 30.9 ms


In [189]:
metrics

{'accuracy': 0.9856502242152466}

In [190]:
test_set[test_set.index.isin(mnb.misclassified_idx_)]

Unnamed: 0,sms,label
233,U're welcome... Caught u using broken english ...,ham
290,"Hi, Mobile no. &lt;#&gt; has added you in th...",ham
353,"Hi, Mobile no. &lt;#&gt; has added you in th...",ham
379,Are you free now?can i call now?,ham
394,"Text me when you get off, don't call, my phone...",ham
417,Would you like to see my XXX pics they are so ...,spam
575,Missed call alert. These numbers called but le...,spam
631,Cheers for the message Zogtorius. Ive been st...,ham
673,Total video converter free download type this ...,ham
733,and picking them up from various points,ham


In [191]:
sorted(mnb.parameters['param_spam'].items(), key=lambda k:k[1], reverse=True)[:50]

[('call', 0.012533964414059077),
 ('free', 0.008239109474975897),
 ('2', 0.0071873082654045056),
 ('u', 0.0060478569550354985),
 ('txt', 0.005390481199053379),
 ('ur', 0.0050398807958629156),
 ('stop', 0.004470155140678412),
 ('mobile', 0.004470155140678412),
 ('4', 0.004470155140678412),
 ('text', 0.004207204838285564),
 ('claim', 0.004119554737487948),
 ('1', 0.003944254535892716),
 ('reply', 0.0038127793846962923),
 ('www', 0.0036374791831010607),
 ('prize', 0.003286878779910597),
 ('get', 0.0028048032255237093),
 ('cash', 0.0026733280743272856),
 ('send', 0.0025856779735296694),
 ('uk', 0.0024980278727320536),
 ('urgent', 0.00236655272153563),
 ('new', 0.00236655272153563),
 ('nokia', 0.002322727671136822),
 ('150p', 0.0022789026207380137),
 ('contact', 0.00214742746954159),
 ('com', 0.00214742746954159),
 ('please', 0.002103602419142782),
 ('win', 0.002059777368743974),
 ('msg', 0.002059777368743974),
 ('50', 0.002059777368743974),
 ('tone', 0.002059777368743974),
 ('co', 0.002015

In [192]:
sorted(mnb.parameters['param_ham'].items(), key=lambda k:k[1], reverse=True)[:50]

[('u', 0.012271451916258133),
 ('2', 0.003946597601208646),
 ('get', 0.003653686060493941),
 ('gt', 0.0036228532667344987),
 ('lt', 0.0035920204729750563),
 ('ok', 0.0033453581228995158),
 ('go', 0.0033145253291400733),
 ('ur', 0.0031603613603428606),
 ('good', 0.0029753645977862054),
 ('got', 0.002944531804026763),
 ('know', 0.002944531804026763),
 ('come', 0.002836617025868714),
 ('call', 0.0028057842321092714),
 ('like', 0.002605371072672895),
 ('day', 0.002605371072672895),
 ('time', 0.0024820398976351248),
 ('love', 0.0024512071038756823),
 ('4', 0.002327875928837912),
 ('going', 0.002142879166281257),
 ('one', 0.0020966299756420928),
 ('ü', 0.002050380785002929),
 ('want', 0.001988715197484044),
 ('home', 0.001988715197484044),
 ('lor', 0.001973298800604323),
 ('sorry', 0.001973298800604323),
 ('da', 0.0019424660068448802),
 ('need', 0.001927049609965159),
 ('k', 0.0018962168162057163),
 ('still', 0.0018808004193259951),
 ('see', 0.0018037184349273888),
 ('dont', 0.00175746924428

### Try on a new dataset

In [170]:
from nltk.corpus import stopwords

In [193]:
imdb_data = pd.read_csv('../datasets/imdb_labelled.txt', sep='\t', header=None, names=['TEXT', 'label'])

In [194]:
imdb_data.label = imdb_data.label.apply(lambda x: 'positive' if x==1 else 'negative')

In [195]:
imdb_data.head().values

array([['A very, very, very slow-moving, aimless movie about a distressed, drifting young man.  ',
        'negative'],
       ['Not sure who was more lost - the flat characters or the audience, nearly half of whom walked out.  ',
        'negative'],
       ['Attempting artiness with black & white and clever camera angles, the movie disappointed - became even more ridiculous - as the acting was poor and the plot and lines almost non-existent.  ',
        'negative'],
       ['Very little music or anything to speak of.  ', 'negative'],
       ['The best scene in the movie was when Gerardo is trying to find a song that keeps running through his head.  ',
        'positive']], dtype=object)

In [196]:
imdb_data.label.value_counts(normalize=True)

positive    0.516043
negative    0.483957
Name: label, dtype: float64

In [197]:
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)

In [198]:
sss.get_n_splits(imdb_data.TEXT.values, imdb_data.label.values)

5

In [199]:
for train_index, test_index in sss.split(imdb_data.TEXT.values, imdb_data.label.values):
#     print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = imdb_data.TEXT.values[train_index], imdb_data.TEXT.values[test_index]
    y_train, y_test = imdb_data.label.values[train_index], imdb_data.label.values[test_index]

In [200]:
imdb_train = pd.DataFrame(zip(X_train, y_train), columns=['TEXT', 'label'])
imdb_test = pd.DataFrame(zip(X_test, y_test), columns=['TEXT', 'label'])

In [201]:
imdb_train.label.value_counts(normalize=True)

positive    0.516722
negative    0.483278
Name: label, dtype: float64

In [202]:
imdb_test.label.value_counts(normalize=True)

positive    0.513333
negative    0.486667
Name: label, dtype: float64

In [203]:
imdb_train.shape, imdb_test.shape

((598, 2), (150, 2))

In [204]:
%%time
mnb = MultinomialNaiveBayesCustom(alpha=1)
mnb.fit(imdb_train, 'TEXT', 'label')

Wall time: 1.39 s


In [205]:
mnb.stats

{'p_positive': 0.5167224080267558,
 'p_negative': 0.48327759197324416,
 'n_positive': 5016,
 'n_negative': 5378,
 'n_vocab': 2345}

In [206]:
mnb.data.head()

Unnamed: 0,TEXT,label,fest,sing,flat,bunch,loose,changes,singing,definitely,...,skilled,backed,bertolucci,awkwardly,researched,based,hayao,elderly,short,survivors
0,think of the film being like a dream,positive,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,meredith m was better than all right,positive,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,imdb ratings only go as low 1 for awful it s ...,negative,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,it s a case of so bad it is laughable,negative,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,unfortunately any virtue in this film s produ...,negative,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [207]:
mnb.labels

array(['positive', 'negative'], dtype=object)

In [208]:
%%time
preds = mnb.predict(imdb_test.TEXT.tolist())

Wall time: 138 ms


In [209]:
%%time
metrics = mnb.evaluate_metrics(imdb_test.label)

Wall time: 4.99 ms


In [210]:
metrics

{'accuracy': 0.7466666666666667}

In [211]:
imdb_test[imdb_test.index.isin(mnb.misclassified_idx_)]

Unnamed: 0,TEXT,label
10,"Being a 90's child, I truly enjoyed this show ...",positive
11,I was left shattered from the experience of wa...,negative
12,"putting the race card aside, lets look at the ...",negative
14,It just blew.,negative
18,"The writers were ""smack on"" and I think the be...",positive
22,Considering the relations off screen between T...,negative
24,"Lovely little thriller from Hitchcock, with lo...",positive
27,It is not good.,negative
31,There's also enough hypocrisy in this film to ...,negative
34,the movie is littered with overt racial slurs ...,negative


In [212]:
mnb.parameters.keys()

dict_keys(['param_positive', 'param_negative'])

In [213]:
sorted(mnb.parameters['param_negative'].items(), key=lambda k:k[1], reverse=True)[:50]

[('movie', 0.008675385213000129),
 ('0', 0.006992101514955328),
 ('bad', 0.006474168069403082),
 ('film', 0.006474168069403082),
 ('1', 0.006474168069403082),
 ('one', 0.004402434287194096),
 ('even', 0.003237084034701541),
 ('like', 0.002848633950537356),
 ('acting', 0.0027191505891492945),
 ('plot', 0.002460183866373171),
 ('time', 0.002201217143597048),
 ('would', 0.002201217143597048),
 ('really', 0.002071733782208986),
 ('good', 0.0018127670594328628),
 ('ever', 0.0018127670594328628),
 ('awful', 0.0018127670594328628),
 ('script', 0.0016832836980448013),
 ('made', 0.0016832836980448013),
 ('stupid', 0.0016832836980448013),
 ('films', 0.0016832836980448013),
 ('see', 0.0015538003366567395),
 ('movies', 0.0015538003366567395),
 ('show', 0.0015538003366567395),
 ('make', 0.0015538003366567395),
 ('story', 0.0015538003366567395),
 ('work', 0.0015538003366567395),
 ('characters', 0.0015538003366567395),
 ('seen', 0.0015538003366567395),
 ('nothing', 0.0015538003366567395),
 ('worst', 

## Multinomial Naive Bayes with Dot Product Trick