In [39]:
## Written by: Wai Zin Linn
## Attribution: Hugh Liu's solutions for CS540 2021 Epic and Hongtao Hao

import string
import re
from collections import Counter
from itertools import product
from itertools import permutations
import random
from numpy import cumsum
import numpy as np

# adjust on your own
P_my = 0.63
P_fake = 0.37
num_charactors = 1000

with open('darkknight.txt', encoding = 'utf-8') as f:
    data = f.read()

def process_text(data):
    data = data.lower()
    data = re.sub(r'[^a-z ]+', '', data)
    data = ' '.join(data.split())
    return data 
data = process_text(data)

allchar = ' ' + string.ascii_lowercase
unigram = Counter(data)
unigram_prob = {ch: round(unigram[ch]/len(data),4) for ch in allchar}
uni_list = [unigram_prob[c] for c in allchar]

##q2
print("Unigram Prob: ")
print(uni_list)

# to distinguish between fake_unigram_prob below
my_unigram_prob = unigram_prob

def ngram(n):
    # all possible n-grams
    d = dict.fromkeys([''.join(i) for i in product(allchar, repeat=n)],0)
    # update counts
    d.update(Counter(data[x:x+n] for x in range(len(data)-1)))
    return d

bigram = ngram(2)
bigram_prob = {c: bigram[c] / unigram[c[0]] for c in bigram}
bigram_lst = []
big_lst = []

print('\n')

##q3
count = 0;
for c in bigram:
    bigram_lst.append(round(bigram[c] / unigram[c[0]],4))
    count += 1
    if count%27 == 0:
        big_lst.append(bigram_lst)
        bigram_lst = []

print("Bigram prob without Laplace Smoothing: ")
counter = 0;
for lst in big_lst:
    for c in lst:
        print(c,end=",")
        counter += 1;
        if counter%27 == 0:
            print('\n')
    
bigram_prob_L = {
    c: (bigram[c] + 1) / (unigram[c[0]] + 27) for c in bigram}

##q4

print("Bigram prob with Laplace Smoothing: ")

counter = 0
for c in bigram:
    print(round(bigram_prob_L[c],4),end=",")
    counter += 1;
    if counter%27 == 0:
        print('\n')

trigram= ngram(3)
trigram_prob_L = {c: (trigram[c] + 1) / (bigram[c[:2]] + 27) for c in trigram}

# based on https://python-course.eu/numerical-programming/weighted-probabilities.php
def weighted_choice(collection, weights):
    weights = np.array(weights)
    weights_sum = weights.sum()
    weights = weights.cumsum()/weights_sum
    x = random.random()
    for i in range(len(weights)):
        if x < weights[i]:
            return collection[i]

def gen_bi(c):
    w = [bigram_prob[c+i] for i in allchar]
    return weighted_choice(allchar, weights = w)[0]

def gen_tri(ab):
    w = [trigram_prob_L[ab+i] for i in allchar]
    return weighted_choice(allchar, weights=w)[0]

def gen_sen(c, num):
    # generate the second char
    res = c + gen_bi(c)
    for i in range(num-2):
        if bigram[res[-2:]] == 0:
            t = gen_bi(res[-1])
        else:
            t = gen_tri(res[-2:])
        res += t
    return res


sentences = []
for char in allchar:
    sentence = gen_sen(char, num_charactors)
    sentences.append(sentence)
    
#q5

with open('q5.txt', 'w') as f:
    for sentence in sentences:
        f.write(sentence)
        f.write("\n")

## fake script
with open('script.txt', encoding = 'utf-8') as f:
    data = f.read()

data = process_text(data)

unigram = Counter(data)
unigram_prob = {ch: round(unigram[ch]/len(data),4) for ch in allchar}
uni_list = [unigram_prob[c] for c in allchar]

#q7
print("Likelihood probabilities of the Naive Bayes estimator for the fake script","\n")
print(uni_list)
print('\n')

fake_unigram_prob = unigram_prob

#q8
count = 0
for char in allchar:
    count += 1
    print(P_fake*fake_unigram_prob[char]/(P_fake*fake_unigram_prob[char] + P_my*my_unigram_prob[char]),end=",")

print("\n")

#q9
lst = []
for sentence in sentences:
    my = 0
    fake = 0
    for char in sentence:
        my += np.log10(my_unigram_prob[char])
        fake += np.log10(fake_unigram_prob[char])
    if my > fake:
        lst.append(int('0'))
    else:
        lst.append(int('1'))

print(lst)

Unigram Prob: 
[0.1642, 0.067, 0.0158, 0.0233, 0.0305, 0.0907, 0.0166, 0.0222, 0.0449, 0.0509, 0.0024, 0.0125, 0.037, 0.0222, 0.0598, 0.0713, 0.0163, 0.0003, 0.0471, 0.0566, 0.0713, 0.0286, 0.0069, 0.0181, 0.0028, 0.0191, 0.0016]


Bigram prob without Laplace Smoothing: 
0.0,0.0903,0.0593,0.0548,0.0453,0.0148,0.0366,0.0347,0.0595,0.0532,0.0119,0.0099,0.0366,0.0417,0.0181,0.0549,0.0311,0.0007,0.0272,0.0732,0.1488,0.0138,0.0059,0.0523,0.0,0.0252,0.0001,

0.088,0.0003,0.0167,0.0538,0.0322,0.0003,0.0058,0.023,0.0017,0.0259,0.0012,0.0213,0.082,0.0411,0.1769,0.0003,0.025,0.0,0.1059,0.059,0.1174,0.0319,0.015,0.0092,0.0,0.0647,0.0014,

0.0196,0.3032,0.0257,0.0,0.0012,0.1149,0.0012,0.0012,0.0,0.0416,0.0012,0.0,0.0868,0.0024,0.0,0.1797,0.0,0.0,0.0391,0.0183,0.0,0.11,0.0037,0.0,0.0,0.0501,0.0,

0.0157,0.1258,0.0,0.0083,0.0,0.1283,0.0,0.0,0.2111,0.043,0.0,0.0969,0.0439,0.0008,0.0,0.1639,0.0,0.0,0.0712,0.0,0.0397,0.0439,0.0,0.0008,0.0,0.0066,0.0,

0.3548,0.0619,0.0044,0.0013,0.0088,0.1679,0.0019,0.

Likelihood probabilities of the Naive Bayes estimator for the fake script 

[0.094, 0.0021, 0.0143, 0.0107, 0.0052, 0.0024, 0.0082, 0.0101, 0.003, 0.002, 0.1311, 0.0179, 0.0045, 0.0092, 0.0031, 0.0025, 0.0109, 0.212, 0.0024, 0.0035, 0.002, 0.007, 0.0231, 0.0064, 0.1338, 0.0112, 0.2674]


0.2516169172225196,0.018075232046897895,0.3470646113479829,0.21241549522480951,0.09101660438052889,0.01530269348084578,0.22487399940705605,0.21085594989561587,0.03775895499540769,0.02255616179473893,0.9697714868350027,0.4568216305697337,0.06666666666666667,0.19574468085106383,0.029545864351768373,0.02017712241514702,0.2819885330722976,0.9975963067061772,0.029056640816727198,0.03504451600681948,0.016207100462121378,0.12567934782608697,0.6628664495114006,0.17195555878294963,0.9655939145699239,0.25616616183470364,0.989914553859084,

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
