# MultinomialNB model

In [1]:
# ref - https://towardsdatascience.com/multinomial-na%C3%AFve-bayes-for-documents-classification-and-natural-language-processing-nlp-e08cc848ce6
# pyproj/VS file - https://github.com/arthurratz/bayes_classifier_multi/tree/main/bayes_classifier_multi/bayes_classifier_multi


#-----------------------------------------------------------------------------------
#   Multinomial Naїve Bayes' Classifier v.0.0.1
#
#        Pr,Cs = compute(D,C,S)
#
#        D - set of documents, C - set of classes, S - input sample
#
#        Pr - the posteriors of each class in C, Cs - the class of the sample S
#
#        The worst-case complexity of the Multinomial Bayes' classifier:
#
#                   p = O(nmz + 2nm), where n - # number of terms in S, 
#                                           m - # of classes in C
#                                           z - # of documents in D
#
#                   An Example: n = 100, m = 15, z = 10^4 => p ~ O(1,5e+07)
#
#   GNU Public License (C) 2021 Arthur V. Ratz
#-----------------------------------------------------------------------------------

In [2]:
import os
import re
import csv
import math
import nltk
import numpy as np

In [167]:
def log_p(n):
    # Get the probability's n natural algorithm
    return abs(math.log(abs(n))) if n != 0 and n != 1 else 1

def get_fea_class(D,k):
#     # Get all documents in D belonging to the k-th class in C
    return np.array([d for d in D if k == int(d[0])])
#     return np.array([d for d in D])

def get_count_class(D,k):
    # Get the count p(Ck) of the class Ck in documents D
    return len(get_fea_class(D, k))  # 여기서 0이 안나오는데

def get_counts_term(D,w):
    # Get the count of the term w occurrences in each document from D
    count_wt = np.array([len([term for term in d[1] if w == term ]) for d in D])
    print("counts term count_wt ", count_wt)
    # Get the total count of documents from D, containing the term w
    print("total count of documents from D, containing the term w", len(np.array([f_wt for f_wt in count_wt if f_wt > 0])))
    return len(np.array([f_wt for f_wt in count_wt if f_wt > 0]))

def get_prob_class(D,k):
    # Get the probability p(Ck) of the k-th class Ck
    return get_count_class(D,k) / len(D) 

def get_probs_term(D,w):
    # Get the probability of the term w occurrence 
    # in each document from the class Ck
    return get_counts_term(D,w) / len(D)

def parse(S):
    W = S.lower().split()

    # Parse the string S, performing 
    # the normalization and word-stamming using NLTK library
    W = np.array([re.sub(r"""[,.;@#?!&$\']+\ *""", '', w) for w in W])
    W = np.array([tag[0] for tag in nltk.pos_tag(W) if re.match('NN', tag[1]) != None or re.match('JJ', tag[1]) != None])

    return np.array([w for w in W if len(w) > 2])
    
def build_model(D):
    # Build the class prediction model, 
    # based on the corpus of documents in D
    D = np.array([np.array([d[0], parse(d[1])], dtype=object) for d in D], dtype=object)
    return np.array([d for d in D if len(d[1]) > 0])

def compute(D,C,S):
    W = parse(S)                 # A set of terms W in the sample S
    Pr = np.empty(0)             # A set of posteriors Pr(Ck | W)

#     n = len(W)  # n - # of terms W in S
#     m = len(C)  # m - # of classes in C

    # For each k-th class Ck, compute the posterior Pr(Ck | W)
#     for k in range(len(C)):
    for c in C:
        k = int(c[0])
        pr_ck_w = 0                  # pr_ck_w - the likelihood P(Ck | wi) 
                                     # of Ck is the class of the term wi

        d_ck = get_fea_class(D,k)    # d_ck - A set of documents from the class Ck
        try: p_ck = get_prob_class(D,k)   # p_ck - Probability of the k-th class Ck in documents D
        except Exception as e: print(e)
        else: pass

        # For each term W[i], compute the likelihood P(Ck | wi)
        for i in range(len(W)):
            # Obtain the count and probability of the 
            # term W[i] in the documents from class Ck
            prob_wd_n = get_probs_term(d_ck, W[i])  ## 
            count_wt_n = get_counts_term(d_ck, W[i])
            
            pr_ck_w += count_wt_n * log_p(prob_wd_n) if count_wt_n > 0 else 0  #### LOG_P !!!!

        pr_ck_w += p_ck

        # Append the posterior Pr(Ck | W) of the class Ck to the array Pr
        Pr = np.append(Pr, pr_ck_w)

    # Obtain an index of the class Cs as the class in C, 
    # having the maximum posterior Pr(Ck | W)
    Cs = np.where(Pr == np.max(Pr))[0][0]
   
    return Pr,Cs   # Return the array of posteriors Pr
                   # and the index of sample S class Cs

def evaluate(T,D,C):
    print('Classification:')
    print('===============\n')

    # For each sample S in the set T, compute the class of S
    # Estimate the real classification's multinomial entropy and its expectation
    for s in T:
        pr_s = '\0'
        Pr,Cs = compute(D,C,s)
        for ci,p in zip(range(len(C)),Pr):
            pr_s += prob_stats % (C[ci][1],p)

        print(sampl_stats % (s, C[Cs][1] if np.sum(Pr) > 0 else 'None', pr_s))


In [168]:
print(len(D))
print()

for d in D: 
    W = parse(d)  # parsed 
#     print(W)
#     print(len(W))  # # of words
#     print(range(len(W)))  # 0~# of words
    print([W[i] for i in range(len(W))])  # the words/terms
    print()
    
    for i in range(len(W)):
        w = W[i]
        print(w)
        print("probs term ", get_probs_term(D,w))
        print("counts term ", get_counts_term(D,w))
        print()
    print()
#     w = W[i]


58

['pies', 'paper', 'flowers', 'stage', 'growth', 'power', 'accurate', 'observation', 'cynicism']

pies
counts term count_wt  [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
total count of documents from D, containing the term w 0
probs term  0.0
counts term count_wt  [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
total count of documents from D, containing the term w 0
counts term  0

paper
counts term count_wt  [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
total count of documents from D, containing the term w 0
probs term  0.0
counts term count_wt  [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
total count of documents from D, containing the term w 0
counts term  0

flowers
counts term count_wt  [0

['computers', 'tools', 'devil"', 'problem', 'computers', 'tools', 'somebody', 'certain']

computers
counts term count_wt  [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
total count of documents from D, containing the term w 0
probs term  0.0
counts term count_wt  [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
total count of documents from D, containing the term w 0
counts term  0

tools
counts term count_wt  [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
total count of documents from D, containing the term w 0
probs term  0.0
counts term count_wt  [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
total count of documents from D, containing the term w 0
counts term  0

devil"
counts term count_wt  [0 0 0 0 

['message', 'i’ve', 'myself:', 'cinema', 'conservative', 'story', 'unchallenging', 'zone', 'cinema', 'better', 'television', 'daring']

message
counts term count_wt  [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
total count of documents from D, containing the term w 0
probs term  0.0
counts term count_wt  [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
total count of documents from D, containing the term w 0
counts term  0

i’ve
counts term count_wt  [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
total count of documents from D, containing the term w 0
probs term  0.0
counts term count_wt  [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
total count of documents from D, containing the term w 0
counts term

In [136]:
[[term for term in d[1] if w == term] for d in D]

NameError: name 'w' is not defined

In [133]:
[len([term for term in d[1] if w == term ]) for d in D]

NameError: name 'w' is not defined

In [132]:
np.array([len([term for term in d[1] if w == term ]) for d in D])

NameError: name 'w' is not defined

In [131]:
get_counts_term(D,w)

NameError: name 'w' is not defined

In [94]:
[int(c[0]) for c in C]

[0, 1, 2, 3, 4]

In [84]:
len(C)

5

In [87]:
[k for k in range(len(C))]

[0, 1, 2, 3, 4]

In [123]:
for c in C:
    k = int(c[0])
    print("k:", k)
        
#     print(get_fea_class(D,k))  # CLASS OF LINES
#     print(np.array([d for d in D if k == int(d[0])]))
    print(len(get_fea_class(D,k)))  # => count class
    
    print("count_class : ", get_count_class(D,k))  # COUNTS OF LINES IN CLASS
#     print("len D : ", len(D))  # LEN OF TOTAL LINES
#     print("count_class / len D : ", get_count_class(D,k)/len(D))
    
    print("prob class : ", get_prob_class(D,k))

    print("===============")

k: 0
14
count_class :  14
prob class :  0.2413793103448276
k: 1
12
count_class :  12
prob class :  0.20689655172413793
k: 2
11
count_class :  11
prob class :  0.1896551724137931
k: 3
11
count_class :  11
prob class :  0.1896551724137931
k: 4
10
count_class :  10
prob class :  0.1724137931034483


In [100]:
14/58

0.2413793103448276

In [153]:
def output_data(T,D,C,fmt):

    print(model_stats % (len(C),len(D),len(T)))

    print('Classes:')
    print('========\n')

    for c in C:
        k = int(c[0])
        dc = get_fea_class(D,k)
        p_ck = get_count_class(D,k) / len(D)
        pd_stats = fmt % (len(dc), k + 1, p_ck)
        print('C%d: %s %s' % (k + 1, '{0: <12}'.format(c[2:]), pd_stats))

    print('\n')

    print('Documents:')
    print('==========\n')

    for d in D:
        print('C%d: \"%s...\"' % (int(d[0]) + 1, d[1][:80]))

    print("\n")


In [46]:
prob_stats  = 'Pr(%s) = %f '
class_stats = '[ Documents: %3d, P(C%d) = %f ]'
sampl_stats = 'Text: [ \"%s\" ]\nClass: \"%s\" [%s]\n'
model_stats = '[ Classes: %d Documents: %d Samples: %d ]\n'
multi_stats = 'Multinomial Entropy: [ max: %f, real: %f ] classes/term\n'

In [125]:
prob_stats

'Pr(%s) = %f '

In [62]:
T

['0_Pasteboard pies and paper flowers are being banished from the stage by the growth of that power of accurate observation which is commonly called cynicism by those who have not got it...',
 '1_People wonder why the novel is the most popular form of literature; people wonder why it is read more than books of science or books of metaphysics. The reason is very simple; it is merely that the novel is more true than they are.',
 '2_A good novel tells us about the truth about its hero, but a bad novel tells us the truth about its author...A sincere novel exhibits the simplicity of one particular man, an insincere novel exhibits the simplicity of mankind',
 '3_It has been well said that an author who expects results from a first novel is in a position similar to that of a man who drops a rose petal down the Grand Canyon of Arizona and listens for the echo.',
 '4_There are no significant bugs in our released software that any significant number of users want fixed. ... I\'m saying we don\'t

In [67]:
len(D)

58

In [138]:
[d for d in D]

['0_Pasteboard pies and paper flowers are being banished from the stage by the growth of that power of accurate observation which is commonly called cynicism by those who have not got it...',
 '0_My method is to take the utmost trouble to find the right thing to say, and then to say it with the utmost levity.',
 '0_We have no more right to consume happiness without producing it than to consume wealth without producing it.',
 '0_While the guardians of “literary” fiction still give each other prizes and authors can still achieve stardom and create good work, the fact remains that it is a movement that has lost all its creative force as a movement.',
 '0_There is first the literature of knowledge, and secondly, the literature of power. The function of the first is—to teach; the function of the second is—to move, the first is a rudder, the second an oar or a sail. The first speaks to the mere discursive understanding; the second speaks ultimately, it may happen, to the higher understanding

In [66]:
D

['0_Pasteboard pies and paper flowers are being banished from the stage by the growth of that power of accurate observation which is commonly called cynicism by those who have not got it...',
 '0_My method is to take the utmost trouble to find the right thing to say, and then to say it with the utmost levity.',
 '0_We have no more right to consume happiness without producing it than to consume wealth without producing it.',
 '0_While the guardians of “literary” fiction still give each other prizes and authors can still achieve stardom and create good work, the fact remains that it is a movement that has lost all its creative force as a movement.',
 '0_There is first the literature of knowledge, and secondly, the literature of power. The function of the first is—to teach; the function of the second is—to move, the first is a rudder, the second an oar or a sail. The first speaks to the mere discursive understanding; the second speaks ultimately, it may happen, to the higher understanding

In [117]:
[c[2:] for c in C]

['Literature', 'Computers', 'Biology', 'Fashion', 'Cinema']

In [65]:
C

['0_Literature', '1_Computers', '2_Biology', '3_Fashion', '4_Cinema']

In [154]:
# def main():
#     print(app_banner)
#     print('===========================================================================\n')

# f = open('words.txt', 'r')

# CONVERT TXT >> CSV

import pandas as pd

# T = open('else/arthurratz_multinomialNB_tutorial/eval.txt', 'rt', encoding='UTF8').readlines()
T = open('else/arthurratz_multinomialNB_tutorial/eval.txt', 'r', encoding='UTF8').read().split('\n')
# file_path = 'else/arthurratz_multinomialNB_tutorial/eval.txt'
# T = np.loadtxt(file_path, delimiter='\n', encoding='utf8')
# T = open(file_path, 'rt', encoding='utf8').read().split('\n')


# D = open('else/arthurratz_multinomialNB_tutorial/trainset.txt', 'rt', encoding='UTF8').readlines()
D = open('else/arthurratz_multinomialNB_tutorial/trainset.txt', 'r', encoding='UTF8').read().split('\n')

# C = open('else/arthurratz_multinomialNB_tutorial/classes.txt', 'rt', encoding='UTF8').readlines()
C = open('else/arthurratz_multinomialNB_tutorial/classes.txt', 'r', encoding='UTF8').read().split('\n')


M = build_model(D)

output_data(T,D,C,class_stats)
entropy = evaluate(T,M,C)

[ Classes: 5 Documents: 58 Samples: 21 ]

Classes:

C1: Literature   [ Documents:  14, P(C1) = 0.241379 ]
C2: Computers    [ Documents:  12, P(C2) = 0.206897 ]
C3: Biology      [ Documents:  11, P(C3) = 0.189655 ]
C4: Fashion      [ Documents:  11, P(C4) = 0.189655 ]
C5: Cinema       [ Documents:  10, P(C5) = 0.172414 ]


Documents:

C1: "_..."
C1: "_..."
C1: "_..."
C1: "_..."
C1: "_..."
C1: "_..."
C1: "_..."
C1: "_..."
C1: "_..."
C1: "_..."
C1: "_..."
C1: "_..."
C1: "_..."
C1: "_..."
C2: "_..."
C2: "_..."
C2: "_..."
C2: "_..."
C2: "_..."
C2: "_..."
C2: "_..."
C2: "_..."
C2: "_..."
C2: "_..."
C2: "_..."
C2: "_..."
C3: "_..."
C3: "_..."
C3: "_..."
C3: "_..."
C3: "_..."
C3: "_..."
C3: "_..."
C3: "_..."
C3: "_..."
C3: "_..."
C3: "_..."
C4: "_..."
C4: "_..."
C4: "_..."
C4: "_..."
C4: "_..."
C4: "_..."
C4: "_..."
C4: "_..."
C4: "_..."
C4: "_..."
C4: "_..."
C5: "_..."
C5: "_..."
C5: "_..."
C5: "_..."
C5: "_..."
C5: "_..."
C5: "_..."
C5: "_..."
C5: "_..."
C5: "_..."


Classification:

divisio

ZeroDivisionError: division by zero