# Table of Contents
 <p><div class="lev1"><a href="#Laplacian-Smoothing"><span class="toc-item-num">1&nbsp;&nbsp;</span>Laplacian Smoothing</a></div><div class="lev2"><a href="#Test---Query-Perfect-Storm"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Test - Query Perfect Storm</a></div>

# Laplacian Smoothing

In [1]:
%pdb

Automatic pdb calling has been turned ON


In [2]:
from IPython.display import display
import pprint

import bow
import numpy as np
import string

In [3]:
def laplacian_smoothing(
    v1: int, v2: int, k: int, k_class: int
):
    """Return the additively smoothed ratio ``(v1 + k) / (v2 + k * k_class)``.

    Args:
        v1: Observed count for the outcome of interest.
        v2: Total observed count the estimate is normalised by.
        k: Pseudo-count added to the numerator.
        k_class: Number of times ``k`` is added to the denominator.

    Returns:
        The smoothed estimate, computed with true division.

    Raises:
        ZeroDivisionError: If ``v2 + k * k_class`` is zero.
    """
    smoothed_count = v1 + k
    smoothed_total = v2 + k * k_class
    return smoothed_count / smoothed_total

In [4]:
def tokenize(text: str):
    """Split ``text`` into upper-cased word tokens.

    The text is upper-cased and split on whitespace, then leading and
    trailing punctuation (``string.punctuation``) is stripped from each
    piece. Pieces that consist solely of punctuation (for example a
    stand-alone ``-``) are dropped rather than returned as empty strings,
    which would otherwise be treated as a word downstream.

    Args:
        text: Raw input text.

    Returns:
        A list of non-empty, upper-cased tokens in their original order.
    """
    stripped = (
        word.strip(string.punctuation) for word in text.upper().split()
    )
    # Discard tokens that were nothing but punctuation.
    return [token for token in stripped if token]

In [5]:
def p_query(
    query: list,
    domain: bow.BagOfWords,
    bag: bow.BagOfWords,
):
    """Return one smoothed per-word estimate for each word of ``query``.

    Each word is scored with ``laplacian_smoothing`` using ``k=1``: the
    numerator is the word's count in ``domain`` (0 if absent) and the
    denominator is the total count in ``domain`` plus ``len(bag)``.

    Args:
        query: Tokens to score, in order.
        domain: Word counts of the class being scored; used through
            ``in``, indexing and ``.values()``.
        bag: Reference bag whose ``len`` sets the smoothing denominator
            term; the notebook passes the combined bag of all classes.

    Returns:
        A list of floats, one per word in ``query``, in the same order.
    """
    # Both terms are independent of the word, so compute them once rather
    # than on every iteration.
    total_count = sum(domain.values())
    # len(bag) is the size of the reference bag -- presumably the number of
    # distinct words in the combined vocabulary -- not the number of classes.
    vocabulary_size = len(bag)
    return [
        laplacian_smoothing(
            v1=domain[word] if word in domain else 0,
            v2=total_count,
            k=1,
            k_class=vocabulary_size,
        ) for word in query
    ]

## Test - Query Perfect Storm

In [6]:
# Toy corpus of movie titles: one title per line, wrapped in a leading and a
# trailing newline.
s_movie = """
A PERFECT WORLD
MY PERFECT WOMAN
PRETTY WOMAN
"""

# Toy corpus of song titles, in the same one-title-per-line layout.
s_song = """
A PERFECT DAY
ELECTRIC STORM
ANOTHER RAIN DAY
"""

In [7]:
# Build one bag of words per class, plus the combined bag over both classes.
movie = bow.BagOfWords(tokenize(s_movie))
song = bow.BagOfWords(tokenize(s_song))
bag = movie + song

print('\nDICTIONARIES')
# Show each bag under its heading: movies, songs, then the combined bag.
for heading, words in (
    ('\nmovies:', movie),
    ('\nsongs:', song),
    ('\nall:', bag),
):
    print(heading)
    pprint.pprint(str(words))


DICTIONARIES

movies:
"{'MY': 1, 'PERFECT': 2, 'WORLD': 1, 'A': 1, 'WOMAN': 2, 'PRETTY': 1}"

songs:
("{'RAIN': 1, 'PERFECT': 1, 'A': 1, 'STORM': 1, 'DAY': 2, 'ELECTRIC': 1, "
 "'ANOTHER': 1}")

all:
("{'MY': 1, 'PERFECT': 3, 'DAY': 2, 'ELECTRIC': 1, 'WORLD': 1, 'A': 2, 'RAIN': "
 "1, 'STORM': 1, 'WOMAN': 2, 'ANOTHER': 1, 'PRETTY': 1}")


In [8]:
def _count_entries(titles: str) -> int:
    """Return the number of non-blank lines (one title each) in ``titles``."""
    return sum(1 for line in titles.splitlines() if line.strip())


print('\nDICTIONARIES')
# Count titles by non-blank lines. Counting newlines and subtracting one is
# only right when the text has exactly one leading and one trailing newline
# and no blank lines in between.
n_movie = _count_entries(s_movie)
n_song = _count_entries(s_song)
print('movie entries:', n_movie, '| song entries:',  n_song)


DICTIONARIES
movie entries: 3 | song entries: 3


In [9]:
# Smoothed class priors. Both calls share the same total, pseudo-count and
# number of classes; only the per-class entry count differs.
prior_kwargs = dict(
    v2=n_movie + n_song,
    k=1,
    k_class=2,  # movie and song
)
p_movie = laplacian_smoothing(v1=n_movie, **prior_kwargs)
p_song = laplacian_smoothing(v1=n_song, **prior_kwargs)

print('P(MOVIE): ', p_movie, '| P(SONG):', p_song)

P(MOVIE):  0.5 | P(SONG): 0.5


In [10]:
# Tokenize each query text on its own, giving one token list per query.
queries = [tokenize(text) for text in ('Perfect', 'Storm')]
print('query:', queries)

query: [['PERFECT'], ['STORM']]


In [11]:
for query in queries:
    print('\nQuery: ', query)
    label = ','.join(query)

    # Likelihood of the query under each class: product of per-word estimates.
    p_q_movie = np.prod(p_query(query, movie, bag))
    p_q_song = np.prod(p_query(query, song, bag))

    print('P(%s|MOVIE)' % label, p_q_movie)
    print('P(%s|SONG)' % label, p_q_song)

    # Bayes' rule: each posterior is likelihood * prior over the shared
    # evidence term (the sum of both joint terms).
    joint_movie = p_q_movie*p_movie
    joint_song = p_q_song*p_song
    evidence = joint_movie + joint_song
    p_movie_q = joint_movie / evidence
    p_song_q = joint_song / evidence

    print('P(MOVIE|%s):' % label, p_movie_q)
    print('P(SONG|%s):' % label, p_song_q)


Query:  ['PERFECT']
P(PERFECT|MOVIE) 0.157894736842
P(PERFECT|SONG) 0.105263157895
P(MOVIE|PERFECT): 0.6
P(SONG|PERFECT): 0.4

Query:  ['STORM']
P(STORM|MOVIE) 0.0526315789474
P(STORM|SONG) 0.105263157895
P(MOVIE|STORM): 0.333333333333
P(SONG|STORM): 0.666666666667
