In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import re
import numpy as np 
from random import shuffle
from time import time 
import random

from operator import mul
from functools import reduce

Using TensorFlow backend.


# 1.  Gibbs Sampling Implementation (20 points) 

## 1.1 Implementation BPA

In [2]:
# Load the boundary-label lines and the character lines, one entry per line.
# `with` closes each file handle as soon as it has been read.
with open('cbt-boundaries.txt',encoding="utf8") as boundary_file:
    train_y = boundary_file.read().strip().split("\n")
with open('cbt-characters.txt',encoding="utf8") as character_file:
    train_x = character_file.read().strip().split("\n")

def BPA(pred,truth):
    """Return the fraction of scored positions where ``pred`` agrees with ``truth``.

    For every row, all positions except the last one are scored (the final
    label of a row is ignored).  ``pred[j][i]`` is compared with
    ``truth[j][i]``; the result is matches divided by scored positions.

    Raises ZeroDivisionError when no position is scored at all.
    """
    matched = 0
    scored = 0
    for row, guess in enumerate(pred):
        inner = len(guess) - 1  # the last label of each row is not scored
        if inner <= 0:
            continue
        scored += inner
        matched += sum(1 for col in range(inner) if guess[col] == truth[row][col])
    return matched/scored

# Baseline: label every position "0" (no boundary) and score it against train_y.
rows = len(train_y)
unsegmented = ["0"*len(gold) for gold in train_y]

print("Unsegmented BPA:",BPA(unsegmented,train_y))

    
    

Unsegmented BPA: 0.7327530010591656


In [35]:
# Two-way vocabulary: characters receive consecutive integer ids in order of
# first appearance in train_x.
char_index = {}
index_char = {}

for sent in train_x:
    for c in sent:
        if c in char_index:
            continue
        new_id = len(char_index)
        char_index[c] = new_id
        index_char[new_id] = c

# m ends up as the number of distinct characters (the next unused id).
m = len(char_index)


## 1.2 Implementation Gibbs

In [36]:
import time


# Hard-coded character count. Code that reads this global uses it as the number
# of random initial labels to draw.
# NOTE(review): presumably equals sum(len(x) for x in train_x) -- confirm.
total_char = 938089

def compute_G(seg,beta,P,cache_G):
    """Return the score of segment ``seg``, memoised in ``cache_G``.

    The score is ``(1-beta)**(len(seg)-1) * beta`` multiplied by the product of
    ``P[char_index[c]]`` over the characters ``c`` of ``seg``.  ``char_index``
    is read from the notebook's global scope.  A value already present in
    ``cache_G`` is returned as is; otherwise the new value is stored first.
    """
    try:
        return cache_G[seg]
    except KeyError:
        pass

    # Multiply left to right, starting from 1.
    char_product = 1
    for ch in seg:
        char_product = char_product * P[char_index[ch]]

    score = ((1-beta)**(len(seg)-1)) * beta * char_product
    cache_G[seg] = score
    return score

def run_gibbs(K,sents,gamma,beta,s,P):
    """Run ``K`` sweeps of the Gibbs sampler over per-character boundary labels.

    Each character position carries a label: "1" when a segment ends at that
    character, "0" otherwise.  The last position of every sentence is fixed to
    "1" and is never resampled.  For each other position the segment(s)
    touching it are removed from the counts, both hypotheses are scored and a
    new label is drawn with ``random.choices``.

    Parameters
    ----------
    K : int
        Number of full sweeps over all sentences.
    sents : sequence of str
        Sentences to segment.  Empty sentences get an empty label list.
    gamma : float
        Weight of "1" in the random initial labelling; also enters the
        boundary hypothesis through the factor ``(1 - gamma)``.
    beta : float
        Forwarded to ``compute_G``.
    s : float
        Multiplier of ``compute_G`` added to the observed segment counts.
    P : sequence of float
        Forwarded to ``compute_G``.

    Returns
    -------
    list of list of str
        The labels after the last sweep, one list per sentence.

    Notes
    -----
    After every sweep the number of changed labels, the number of distinct
    segments and ``BPA`` against the global ``train_y`` are printed.
    """

    def wrapper(ss,dic):
        # Count one more occurrence of segment ss.
        dic[ss] = dic.get(ss, 0) + 1

    def get_val(ss,dic):
        # Current count of segment ss (0 when absent).
        return dic.get(ss, 0)

    def remove_val(ss,dic):
        # Drop one occurrence of ss; return 1 if something was removed, else 0.
        if ss not in dic:
            return 0
        dic[ss] -= 1
        if dic[ss] == 0:
            dic.pop(ss)
        return 1

    possible_val = ["0","1"]
    ll = len(sents)
    length = [len(sent) for sent in sents]
    cache_G = {}

    # Size the random initial labelling from the input itself rather than from
    # a hard-coded global character count, so any corpus can be passed in.
    init = random.choices(possible_val,[1-gamma,gamma],k=sum(length))

    prev = []
    start = 0
    for i in range(ll):
        temp = init[start:start+length[i]]
        if temp:
            temp[-1] = "1"  # a sentence always ends with a boundary
        prev.append(temp)
        start += length[i]

    # Segment counts (dic) and total number of segments (n_) of the initial labelling.
    dic = {}
    n_ = 0
    for i in range(ll):
        y_prev = 0
        for k in range(length[i]):
            if prev[i][k] == "1":
                wrapper(sents[i][y_prev:k+1],dic)
                n_ += 1
                y_prev = k+1

    for j in range(K):
        print("iter:",j)
        cnt = 0
        for i in range(ll):
            if length[i] == 0:
                continue
            y_prev = 0                     # start of the segment containing position k
            y_next = prev[i].index("1")    # nearest boundary at or after position k

            for k in range(length[i]-1):
                if prev[i][k] == "1":
                    # Position k is a boundary itself: look for the next one
                    # to its right (search in place, no list copy).
                    y_next = prev[i].index("1", k+1)

                full_seg = sents[i][y_prev:y_next+1]
                prev_seg = sents[i][y_prev:k+1]
                next_seg = sents[i][k+1:y_next+1]

                # Take the segment(s) touching position k out of the counts.
                if prev[i][k] == "1":
                    n_ -= remove_val(prev_seg,dic)
                    n_ -= remove_val(next_seg,dic)
                else:
                    n_ -= remove_val(full_seg,dic)

                n_full = get_val(full_seg,dic)
                n_prev = get_val(prev_seg,dic)
                n_next = get_val(next_seg,dic)

                # prob[0]: no boundary at k; prob[1]: boundary at k.
                prob = [1,1]
                prob[0] = (n_full + s * compute_G(full_seg,beta,P,cache_G))/(n_ + s)
                prob[1] = ((1-gamma)
                           * (n_prev + s * compute_G(prev_seg,beta,P,cache_G))
                           * (n_next + (next_seg == prev_seg) + s * compute_G(next_seg,beta,P,cache_G))
                           / ((n_ + s)*(n_ + 1 + s)))

                summ = prob[0] + prob[1]
                prob[0] /= summ
                prob[1] /= summ

                tt = random.choices(possible_val,prob)[0]

                if tt != prev[i][k]:
                    cnt += 1

                # Record the sampled label and put the resulting segment(s) back.
                prev[i][k] = tt
                if tt == "1":
                    y_prev = k+1
                    n_ += 2
                    wrapper(prev_seg,dic)
                    wrapper(next_seg,dic)
                else:
                    n_ += 1
                    wrapper(full_seg,dic)

        print("changed:",cnt)
        print("unique segments:",len(dic))
        print("Evaluation BPA:",BPA(["".join(labels) for labels in prev],train_y))

    return prev

# 2. Experimentation

## 2.1 uniform distribution

In [56]:
# Uniform P: every character id gets the same value 1/len(char_index).
vocab_size = len(char_index)
P = [1/vocab_size]*vocab_size
gamma, beta, s = 0.2, 0.3, 1
start_time = time.time()
cur = run_gibbs(100,train_x,gamma,beta,s,P)
# Join each label list into a string and print the final BPA.
final_labels = ["".join(labels) for labels in cur]
print("Evaluation BPA:",BPA(final_labels,train_y))
print("Time elapsed:",time.time()-start_time)
     

iter: 0
changed: 300603
unique segments: 24306
Evaluation BPA: 0.5832210057440612
iter: 1
changed: 190844
unique segments: 19732
Evaluation BPA: 0.6656010361075285
iter: 2
changed: 86408
unique segments: 18054
Evaluation BPA: 0.7007485273502865
iter: 3
changed: 43237
unique segments: 17154
Evaluation BPA: 0.7148861800969519
iter: 4
changed: 29410
unique segments: 16545
Evaluation BPA: 0.7222540079669083
iter: 5
changed: 25561
unique segments: 16055
Evaluation BPA: 0.7278838559663998
iter: 6
changed: 24188
unique segments: 15667
Evaluation BPA: 0.732588146179946
iter: 7
changed: 23369
unique segments: 15303
Evaluation BPA: 0.7369228597688369
iter: 8
changed: 22772
unique segments: 14948
Evaluation BPA: 0.7408276576923118
iter: 9
changed: 22508
unique segments: 14623
Evaluation BPA: 0.7447238357528211
iter: 10
changed: 21997
unique segments: 14315
Evaluation BPA: 0.7483495656127807
iter: 11
changed: 21510
unique segments: 14033
Evaluation BPA: 0.7517738061759163
iter: 12
changed: 21070
u

In [57]:
# Re-run with gamma = 0.5, reusing the P already in scope.
gamma, beta, s = 0.5, 0.3, 1
start_time = time.time()
cur = run_gibbs(100,train_x,gamma,beta,s,P)
final_labels = ["".join(labels) for labels in cur]
print("Evaluation BPA:",BPA(final_labels,train_y))
print("Time elapsed:",time.time()-start_time)

iter: 0
changed: 318473
unique segments: 19503
Evaluation BPA: 0.632470592798751
iter: 1
changed: 88617
unique segments: 17828
Evaluation BPA: 0.6767325116448961
iter: 2
changed: 24604
unique segments: 17216
Evaluation BPA: 0.685563561253285
iter: 3
changed: 14364
unique segments: 16855
Evaluation BPA: 0.6894381896563799
iter: 4
changed: 12967
unique segments: 16562
Evaluation BPA: 0.6925575025671029
iter: 5
changed: 12674
unique segments: 16278
Evaluation BPA: 0.6954947208726749
iter: 6
changed: 12418
unique segments: 16037
Evaluation BPA: 0.6980849896938763
iter: 7
changed: 12224
unique segments: 15809
Evaluation BPA: 0.7009403193012739
iter: 8
changed: 11779
unique segments: 15584
Evaluation BPA: 0.7033549584145486
iter: 9
changed: 11842
unique segments: 15365
Evaluation BPA: 0.7057146459014167
iter: 10
changed: 12083
unique segments: 15161
Evaluation BPA: 0.7083577113832833
iter: 11
changed: 12052
unique segments: 14983
Evaluation BPA: 0.7110514185600735
iter: 12
changed: 11957
uni

In [58]:
# Re-run with gamma = 0.1, reusing the P already in scope.
gamma, beta, s = 0.1, 0.3, 1
start_time = time.time()
cur = run_gibbs(100,train_x,gamma,beta,s,P)
final_labels = ["".join(labels) for labels in cur]
print("Evaluation BPA:",BPA(final_labels,train_y))
print("Time elapsed:",time.time()-start_time)

iter: 0
changed: 375694
unique segments: 20490
Evaluation BPA: 0.546434663054944
iter: 1
changed: 252221
unique segments: 14244
Evaluation BPA: 0.6318693573568914
iter: 2
changed: 139666
unique segments: 12465
Evaluation BPA: 0.6897388073773097
iter: 3
changed: 73178
unique segments: 11603
Evaluation BPA: 0.7153656599744206
iter: 4
changed: 44694
unique segments: 11126
Evaluation BPA: 0.7267115546030607
iter: 5
changed: 34222
unique segments: 10767
Evaluation BPA: 0.7332497206625658
iter: 6
changed: 31164
unique segments: 10475
Evaluation BPA: 0.7385746410096445
iter: 7
changed: 29323
unique segments: 10214
Evaluation BPA: 0.7426787732642021
iter: 8
changed: 28339
unique segments: 9991
Evaluation BPA: 0.7463842368565946
iter: 9
changed: 27686
unique segments: 9791
Evaluation BPA: 0.750071383240185
iter: 10
changed: 27131
unique segments: 9609
Evaluation BPA: 0.7536238442649358
iter: 11
changed: 26175
unique segments: 9451
Evaluation BPA: 0.7564242222459269
iter: 12
changed: 26126
uniqu

In [59]:
# Best-performing setting among these runs: gamma = 0.2, beta = 0.1.
gamma, beta, s = 0.2, 0.1, 1
start_time = time.time()
cur = run_gibbs(100,train_x,gamma,beta,s,P)
final_labels = ["".join(labels) for labels in cur]
print("Evaluation BPA:",BPA(final_labels,train_y))
print("Time elapsed:",time.time()-start_time)

iter: 0
changed: 281720
unique segments: 25893
Evaluation BPA: 0.5952166225437431
iter: 1
changed: 174142
unique segments: 20528
Evaluation BPA: 0.6691039329202264
iter: 2
changed: 76710
unique segments: 18747
Evaluation BPA: 0.6998294344615656
iter: 3
changed: 40134
unique segments: 17850
Evaluation BPA: 0.7123605602479934
iter: 4
changed: 28466
unique segments: 17208
Evaluation BPA: 0.7193426492502335
iter: 5
changed: 25069
unique segments: 16695
Evaluation BPA: 0.7249531025580521
iter: 6
changed: 23984
unique segments: 16269
Evaluation BPA: 0.7301228653717478
iter: 7
changed: 23455
unique segments: 15868
Evaluation BPA: 0.7346967801579375
iter: 8
changed: 22933
unique segments: 15490
Evaluation BPA: 0.738925900425498
iter: 9
changed: 22279
unique segments: 15153
Evaluation BPA: 0.7429287492902081
iter: 10
changed: 21646
unique segments: 14830
Evaluation BPA: 0.7467645883099573
iter: 11
changed: 21035
unique segments: 14527
Evaluation BPA: 0.7500735382059264
iter: 12
changed: 20803
u

In [60]:
# Re-run with s = 5, reusing the P already in scope.
gamma, beta, s = 0.2, 0.3, 5
start_time = time.time()
cur = run_gibbs(100,train_x,gamma,beta,s,P)
final_labels = ["".join(labels) for labels in cur]
print("Evaluation BPA:",BPA(final_labels,train_y))
print("Time elapsed:",time.time()-start_time)

iter: 0
changed: 300787
unique segments: 24179
Evaluation BPA: 0.5822835956465382
iter: 1
changed: 191122
unique segments: 19633
Evaluation BPA: 0.6650946191582919
iter: 2
changed: 86349
unique segments: 17978
Evaluation BPA: 0.700644011511827
iter: 3
changed: 43383
unique segments: 17057
Evaluation BPA: 0.714639436519558
iter: 4
changed: 29457
unique segments: 16469
Evaluation BPA: 0.7221419497483539
iter: 5
changed: 25593
unique segments: 16017
Evaluation BPA: 0.7273839039143876
iter: 6
changed: 24511
unique segments: 15572
Evaluation BPA: 0.7325008700674182
iter: 7
changed: 23374
unique segments: 15195
Evaluation BPA: 0.7368043366530581
iter: 8
changed: 22297
unique segments: 14835
Evaluation BPA: 0.7407877908260954
iter: 9
changed: 21961
unique segments: 14533
Evaluation BPA: 0.7446505669176124
iter: 10
changed: 21709
unique segments: 14227
Evaluation BPA: 0.7481297591071546
iter: 11
changed: 21402
unique segments: 13951
Evaluation BPA: 0.7514462513832186
iter: 12
changed: 20707
un

In [61]:
# Re-run with s = 0.1, reusing the P already in scope.
gamma, beta, s = 0.2, 0.3, 0.1
start_time = time.time()
cur = run_gibbs(100,train_x,gamma,beta,s,P)
final_labels = ["".join(labels) for labels in cur]
print("Evaluation BPA:",BPA(final_labels,train_y))
print("Time elapsed:",time.time()-start_time)

iter: 0
changed: 302075
unique segments: 24121
Evaluation BPA: 0.5819000117445633
iter: 1
changed: 191083
unique segments: 19519
Evaluation BPA: 0.6653241230097544
iter: 2
changed: 86814
unique segments: 17889
Evaluation BPA: 0.700564277779394
iter: 3
changed: 44302
unique segments: 17027
Evaluation BPA: 0.7147374874607931
iter: 4
changed: 30478
unique segments: 16463
Evaluation BPA: 0.7221117802279738
iter: 5
changed: 26129
unique segments: 15978
Evaluation BPA: 0.7276597395292909
iter: 6
changed: 24678
unique segments: 15585
Evaluation BPA: 0.732687274604052
iter: 7
changed: 23712
unique segments: 15214
Evaluation BPA: 0.7370683199563834
iter: 8
changed: 22775
unique segments: 14878
Evaluation BPA: 0.7408556722469505
iter: 9
changed: 22404
unique segments: 14570
Evaluation BPA: 0.7445794530481452
iter: 10
changed: 21801
unique segments: 14295
Evaluation BPA: 0.7480780399293602
iter: 11
changed: 21732
unique segments: 13999
Evaluation BPA: 0.7514871957323058
iter: 12
changed: 21297
un

The best performance is obtained with gamma = 0.2 and beta = 0.1. Both gamma and beta strongly affect the final BPA, while the value of s has little effect.

## 2.2 unigram distribution

In [92]:
from time import time

# Unigram P: relative frequency of each character id over all of train_x.
char_counts = [0]*len(char_index)
for sent in train_x:
    for ch in sent:
        char_counts[char_index[ch]] += 1
char_counts = np.array(char_counts)
P = (char_counts/ sum(char_counts)).tolist()
gamma, beta, s = 0.2, 0.1, 1
start_time = time()
cur = run_gibbs(100,train_x,gamma,beta,s,P)
final_labels = ["".join(labels) for labels in cur]
print("Evaluation BPA:",BPA(final_labels,train_y))
print("Time elapsed:",time()-start_time)
     


iter: 0
changed: 206149
unique segments: 37200
Evaluation BPA: 0.6559327823085932
iter: 1
changed: 96289
unique segments: 28126
Evaluation BPA: 0.6851314906221279
iter: 2
changed: 44013
unique segments: 24723
Evaluation BPA: 0.6975537906386133
iter: 3
changed: 27387
unique segments: 22980
Evaluation BPA: 0.7039033971957431
iter: 4
changed: 22268
unique segments: 21858
Evaluation BPA: 0.7089718766195915
iter: 5
changed: 20070
unique segments: 21062
Evaluation BPA: 0.7131999194042813
iter: 6
changed: 19320
unique segments: 20424
Evaluation BPA: 0.717701642838133
iter: 7
changed: 19140
unique segments: 19871
Evaluation BPA: 0.7219275306570814
iter: 8
changed: 18833
unique segments: 19386
Evaluation BPA: 0.72586788551529
iter: 9
changed: 18528
unique segments: 18956
Evaluation BPA: 0.729613215973899
iter: 10
changed: 18077
unique segments: 18555
Evaluation BPA: 0.7331053379578898
iter: 11
changed: 17826
unique segments: 18203
Evaluation BPA: 0.7368722180739131
iter: 12
changed: 17698
uniqu

Yes, it has higher BPA (though only slightly higher).

## 2.3 Use annealing technique

In [23]:
def run_gibbs_annealing(K,sents,gamma,beta,s,P,rate,start_pw):
    """Run ``K`` annealed sweeps of the Gibbs sampler over boundary labels.

    Works like the plain sampler, except that the two normalised hypothesis
    probabilities are raised to the power ``start_pw`` and renormalised before
    a label is drawn.  After every sweep, while ``start_pw`` is below 1, it is
    increased by ``rate`` and capped at 1.

    Parameters
    ----------
    K : int
        Number of full sweeps over all sentences.
    sents : sequence of str
        Sentences to segment.  Empty sentences get an empty label list.
    gamma : float
        Weight of "1" in the random initial labelling; also enters the
        boundary hypothesis through the factor ``(1 - gamma)``.
    beta : float
        Forwarded to ``compute_G``.
    s : float
        Multiplier of ``compute_G`` added to the observed segment counts.
    P : sequence of float
        Forwarded to ``compute_G``.
    rate : float
        Amount added to the exponent after each sweep.
    start_pw : float
        Exponent used during the first sweep.

    Returns
    -------
    list of list of str
        The labels after the last sweep, one list per sentence.

    Notes
    -----
    After every sweep the number of changed labels, the number of distinct
    segments and ``BPA`` against the global ``train_y`` are printed.
    """

    def wrapper(ss,dic):
        # Count one more occurrence of segment ss.
        dic[ss] = dic.get(ss, 0) + 1

    def get_val(ss,dic):
        # Current count of segment ss (0 when absent).
        return dic.get(ss, 0)

    def remove_val(ss,dic):
        # Drop one occurrence of ss; return 1 if something was removed, else 0.
        if ss not in dic:
            return 0
        dic[ss] -= 1
        if dic[ss] == 0:
            dic.pop(ss)
        return 1

    possible_val = ["0","1"]
    ll = len(sents)
    length = [len(sent) for sent in sents]
    cache_G = {}

    # Size the random initial labelling from the input itself rather than from
    # a hard-coded global character count, so any corpus can be passed in.
    init = random.choices(possible_val,[1-gamma,gamma],k=sum(length))

    prev = []
    start = 0
    for i in range(ll):
        temp = init[start:start+length[i]]
        if temp:
            temp[-1] = "1"  # a sentence always ends with a boundary
        prev.append(temp)
        start += length[i]

    # Segment counts (dic) and total number of segments (n_) of the initial labelling.
    dic = {}
    n_ = 0
    for i in range(ll):
        y_prev = 0
        for k in range(length[i]):
            if prev[i][k] == "1":
                wrapper(sents[i][y_prev:k+1],dic)
                n_ += 1
                y_prev = k+1

    for j in range(K):
        print("iter:",j)
        cnt = 0
        for i in range(ll):
            if length[i] == 0:
                continue
            y_prev = 0                     # start of the segment containing position k
            y_next = prev[i].index("1")    # nearest boundary at or after position k

            for k in range(length[i]-1):
                if prev[i][k] == "1":
                    # Position k is a boundary itself: look for the next one
                    # to its right (search in place, no list copy).
                    y_next = prev[i].index("1", k+1)

                full_seg = sents[i][y_prev:y_next+1]
                prev_seg = sents[i][y_prev:k+1]
                next_seg = sents[i][k+1:y_next+1]

                # Take the segment(s) touching position k out of the counts.
                if prev[i][k] == "1":
                    n_ -= remove_val(prev_seg,dic)
                    n_ -= remove_val(next_seg,dic)
                else:
                    n_ -= remove_val(full_seg,dic)

                n_full = get_val(full_seg,dic)
                n_prev = get_val(prev_seg,dic)
                n_next = get_val(next_seg,dic)

                # prob[0]: no boundary at k; prob[1]: boundary at k.
                prob = [1,1]
                prob[0] = (n_full + s * compute_G(full_seg,beta,P,cache_G))/(n_ + s)
                prob[1] = ((1-gamma)
                           * (n_prev + s * compute_G(prev_seg,beta,P,cache_G))
                           * (n_next + (next_seg == prev_seg) + s * compute_G(next_seg,beta,P,cache_G))
                           / ((n_ + s)*(n_ + 1 + s)))

                summ = prob[0] + prob[1]
                prob[0] /= summ
                prob[1] /= summ

                # Annealing: raise the normalised probabilities to the current
                # exponent, then normalise again.
                prob[0] = prob[0] ** start_pw
                prob[1] = prob[1] ** start_pw

                summ = prob[0] + prob[1]
                prob[0] /= summ
                prob[1] /= summ

                tt = random.choices(possible_val,prob)[0]

                if tt != prev[i][k]:
                    cnt += 1

                # Record the sampled label and put the resulting segment(s) back.
                prev[i][k] = tt
                if tt == "1":
                    y_prev = k+1
                    n_ += 2
                    wrapper(prev_seg,dic)
                    wrapper(next_seg,dic)
                else:
                    n_ += 1
                    wrapper(full_seg,dic)

        # Move the exponent towards 1 after each sweep.
        if start_pw < 1:
            start_pw += rate
            if start_pw > 1:
                start_pw = 1

        print("changed:",cnt)
        print("unique segments:",len(dic))
        print("Evaluation BPA:",BPA(["".join(labels) for labels in prev],train_y))

    return prev

# Hyper-parameters read by the annealing runs in the following cells.
gamma, beta, s = 0.2, 0.1, 1

     

In [101]:
# Annealed run: exponent starts at 0.1 and grows by 0.1 per sweep.
start_time = time()
rate, start_pw = 0.1, 0.1
cur = run_gibbs_annealing(100,train_x,gamma,beta,s,P,rate,start_pw)
final_labels = ["".join(labels) for labels in cur]
print("Evaluation BPA:",BPA(final_labels,train_y))
print("Time elapsed:",time()-start_time)

iter: 0
changed: 433446
unique segments: 33224
Evaluation BPA: 0.5284708686343659
iter: 1
changed: 391948
unique segments: 19284
Evaluation BPA: 0.5408748514420492
iter: 2
changed: 348272
unique segments: 14129
Evaluation BPA: 0.5735441320821603
iter: 3
changed: 298910
unique segments: 11908
Evaluation BPA: 0.6167598150608401
iter: 4
changed: 243487
unique segments: 10587
Evaluation BPA: 0.6615367707191875
iter: 5
changed: 182926
unique segments: 9702
Evaluation BPA: 0.7034874888076467
iter: 6
changed: 126688
unique segments: 9096
Evaluation BPA: 0.7376027514602587
iter: 7
changed: 80692
unique segments: 8639
Evaluation BPA: 0.7608914662279156
iter: 8
changed: 48891
unique segments: 8267
Evaluation BPA: 0.7749062859273195
iter: 9
changed: 29967
unique segments: 8044
Evaluation BPA: 0.7827535936747445
iter: 10
changed: 22174
unique segments: 7849
Evaluation BPA: 0.7871626535817147
iter: 11
changed: 19823
unique segments: 7706
Evaluation BPA: 0.7903573902933878
iter: 12
changed: 19512
un

In [25]:
# Annealed run: exponent starts at 0.1 and grows by 0.02 per sweep.
# An earlier cell executes `from time import time`, which makes `time.time()`
# fail on a top-to-bottom run.  Import the module under an alias so this cell
# works whichever way `time` is currently bound, without rebinding it.
import time as time_module

start_time = time_module.time()
rate = 0.02
start_pw = 0.1
cur = run_gibbs_annealing(100,train_x,gamma,beta,s,P,rate,start_pw)
print("Evaluation BPA:",BPA(list(map(lambda x: "".join(x),cur)),train_y))
print("Time elapsed:",time_module.time()-start_time)

iter: 0
changed: 456900
unique segments: 22855
Evaluation BPA: 0.5069955575381241
iter: 1
changed: 402693
unique segments: 16462
Evaluation BPA: 0.5153234226458885
iter: 2
changed: 387784
unique segments: 13934
Evaluation BPA: 0.5202065750159737
iter: 3
changed: 379652
unique segments: 12354
Evaluation BPA: 0.526341762481831
iter: 4
changed: 370498
unique segments: 11226
Evaluation BPA: 0.5330480158691677
iter: 5
changed: 362147
unique segments: 10364
Evaluation BPA: 0.5402466789284217
iter: 6
changed: 354955
unique segments: 9726
Evaluation BPA: 0.5489344233150053
iter: 7
changed: 345494
unique segments: 9178
Evaluation BPA: 0.5595239249684029
iter: 8
changed: 335724
unique segments: 8700
Evaluation BPA: 0.5698289711439313
iter: 9
changed: 324995
unique segments: 8302
Evaluation BPA: 0.5801350948023304
iter: 10
changed: 314524
unique segments: 7970
Evaluation BPA: 0.5915305536430234
iter: 11
changed: 302635
unique segments: 7722
Evaluation BPA: 0.6036576233529327
iter: 12
changed: 290

In [102]:
# Annealed run: exponent starts at 0.2 and grows by 0.2 per sweep.
start_time = time()
rate, start_pw = 0.2, 0.2
cur = run_gibbs_annealing(100,train_x,gamma,beta,s,P,rate,start_pw)
final_labels = ["".join(labels) for labels in cur]
print("Evaluation BPA:",BPA(final_labels,train_y))
print("Time elapsed:",time()-start_time)

iter: 0
changed: 381501
unique segments: 31906
Evaluation BPA: 0.5614450769268896
iter: 1
changed: 326012
unique segments: 18904
Evaluation BPA: 0.6040875390183484
iter: 2
changed: 239060
unique segments: 14871
Evaluation BPA: 0.6671181319894967
iter: 3
changed: 142967
unique segments: 13116
Evaluation BPA: 0.7169937365920726
iter: 4
changed: 67750
unique segments: 12205
Evaluation BPA: 0.7427240275447721
iter: 5
changed: 32509
unique segments: 11714
Evaluation BPA: 0.7522026443584613
iter: 6
changed: 23301
unique segments: 11372
Evaluation BPA: 0.7579682551996629
iter: 7
changed: 21029
unique segments: 11113
Evaluation BPA: 0.762374082658021
iter: 8
changed: 20387
unique segments: 10877
Evaluation BPA: 0.7664308056662669
iter: 9
changed: 19924
unique segments: 10679
Evaluation BPA: 0.7698701309895926
iter: 10
changed: 18950
unique segments: 10496
Evaluation BPA: 0.7733266960388497
iter: 11
changed: 18778
unique segments: 10307
Evaluation BPA: 0.7765440598908079
iter: 12
changed: 18432

In [103]:
# Annealed run: exponent starts at 0.5 and grows by 0.1 per sweep.
start_time = time()
rate, start_pw = 0.1, 0.5
cur = run_gibbs_annealing(100,train_x,gamma,beta,s,P,rate,start_pw)
final_labels = ["".join(labels) for labels in cur]
print("Evaluation BPA:",BPA(final_labels,train_y))
print("Time elapsed:",time()-start_time)

iter: 0
changed: 283211
unique segments: 34643
Evaluation BPA: 0.6238216377955131
iter: 1
changed: 215862
unique segments: 23476
Evaluation BPA: 0.6665201289962492
iter: 2
changed: 147537
unique segments: 19134
Evaluation BPA: 0.7047933980469545
iter: 3
changed: 92845
unique segments: 16956
Evaluation BPA: 0.7317875764070041
iter: 4
changed: 54927
unique segments: 15768
Evaluation BPA: 0.7478711632181827
iter: 5
changed: 32793
unique segments: 14994
Evaluation BPA: 0.7565783022964392
iter: 6
changed: 23171
unique segments: 14548
Evaluation BPA: 0.7616155347170368
iter: 7
changed: 20774
unique segments: 14185
Evaluation BPA: 0.7656129961673934
iter: 8
changed: 20060
unique segments: 13857
Evaluation BPA: 0.7694553000843669
iter: 9
changed: 19311
unique segments: 13609
Evaluation BPA: 0.7729926763489278
iter: 10
changed: 18877
unique segments: 13337
Evaluation BPA: 0.7760818197392707
iter: 11
changed: 18457
unique segments: 13108
Evaluation BPA: 0.7788412533711745
iter: 12
changed: 18178

Yes, annealing can improve the results. The best BPA we obtained is 0.84.

## 2.4 Use annealing that ends with  tau  > 1

In [29]:
def run_gibbs_annealing(K,sents,gamma,beta,s,P,rate,start_pw,ending=1):
    """Run ``K`` annealed sweeps of the Gibbs sampler over boundary labels.

    Works like the plain sampler, except that the two normalised hypothesis
    probabilities are raised to the power ``start_pw`` and renormalised before
    a label is drawn.  After every sweep, while ``start_pw`` is below
    ``ending``, it is increased by ``rate`` and capped at ``ending``.

    Parameters
    ----------
    K : int
        Number of full sweeps over all sentences.
    sents : sequence of str
        Sentences to segment.  Empty sentences get an empty label list.
    gamma : float
        Weight of "1" in the random initial labelling; also enters the
        boundary hypothesis through the factor ``(1 - gamma)``.
    beta : float
        Forwarded to ``compute_G``.
    s : float
        Multiplier of ``compute_G`` added to the observed segment counts.
    P : sequence of float
        Forwarded to ``compute_G``.
    rate : float
        Amount added to the exponent after each sweep.
    start_pw : float
        Exponent used during the first sweep.
    ending : float, optional
        Largest value the exponent may reach.  Defaults to 1, so calls that
        pass only eight arguments keep working.

    Returns
    -------
    list of list of str
        The labels after the last sweep, one list per sentence.

    Notes
    -----
    After every sweep the number of changed labels, the number of distinct
    segments and ``BPA`` against the global ``train_y`` are printed.
    """

    def wrapper(ss,dic):
        # Count one more occurrence of segment ss.
        dic[ss] = dic.get(ss, 0) + 1

    def get_val(ss,dic):
        # Current count of segment ss (0 when absent).
        return dic.get(ss, 0)

    def remove_val(ss,dic):
        # Drop one occurrence of ss; return 1 if something was removed, else 0.
        if ss not in dic:
            return 0
        dic[ss] -= 1
        if dic[ss] == 0:
            dic.pop(ss)
        return 1

    possible_val = ["0","1"]
    ll = len(sents)
    length = [len(sent) for sent in sents]
    cache_G = {}

    # Size the random initial labelling from the input itself rather than from
    # a hard-coded global character count, so any corpus can be passed in.
    init = random.choices(possible_val,[1-gamma,gamma],k=sum(length))

    prev = []
    start = 0
    for i in range(ll):
        temp = init[start:start+length[i]]
        if temp:
            temp[-1] = "1"  # a sentence always ends with a boundary
        prev.append(temp)
        start += length[i]

    # Segment counts (dic) and total number of segments (n_) of the initial labelling.
    dic = {}
    n_ = 0
    for i in range(ll):
        y_prev = 0
        for k in range(length[i]):
            if prev[i][k] == "1":
                wrapper(sents[i][y_prev:k+1],dic)
                n_ += 1
                y_prev = k+1

    for j in range(K):
        print("iter:",j)
        cnt = 0
        for i in range(ll):
            if length[i] == 0:
                continue
            y_prev = 0                     # start of the segment containing position k
            y_next = prev[i].index("1")    # nearest boundary at or after position k

            for k in range(length[i]-1):
                if prev[i][k] == "1":
                    # Position k is a boundary itself: look for the next one
                    # to its right (search in place, no list copy).
                    y_next = prev[i].index("1", k+1)

                full_seg = sents[i][y_prev:y_next+1]
                prev_seg = sents[i][y_prev:k+1]
                next_seg = sents[i][k+1:y_next+1]

                # Take the segment(s) touching position k out of the counts.
                if prev[i][k] == "1":
                    n_ -= remove_val(prev_seg,dic)
                    n_ -= remove_val(next_seg,dic)
                else:
                    n_ -= remove_val(full_seg,dic)

                n_full = get_val(full_seg,dic)
                n_prev = get_val(prev_seg,dic)
                n_next = get_val(next_seg,dic)

                # prob[0]: no boundary at k; prob[1]: boundary at k.
                prob = [1,1]
                prob[0] = (n_full + s * compute_G(full_seg,beta,P,cache_G))/(n_ + s)
                prob[1] = ((1-gamma)
                           * (n_prev + s * compute_G(prev_seg,beta,P,cache_G))
                           * (n_next + (next_seg == prev_seg) + s * compute_G(next_seg,beta,P,cache_G))
                           / ((n_ + s)*(n_ + 1 + s)))

                summ = prob[0] + prob[1]
                prob[0] /= summ
                prob[1] /= summ

                # Annealing: raise the normalised probabilities to the current
                # exponent, then normalise again.
                prob[0] = prob[0] ** start_pw
                prob[1] = prob[1] ** start_pw

                summ = prob[0] + prob[1]
                prob[0] /= summ
                prob[1] /= summ

                tt = random.choices(possible_val,prob)[0]

                if tt != prev[i][k]:
                    cnt += 1

                # Record the sampled label and put the resulting segment(s) back.
                prev[i][k] = tt
                if tt == "1":
                    y_prev = k+1
                    n_ += 2
                    wrapper(prev_seg,dic)
                    wrapper(next_seg,dic)
                else:
                    n_ += 1
                    wrapper(full_seg,dic)

        # Move the exponent towards `ending` after each sweep.
        if start_pw < ending:
            start_pw += rate
            if start_pw > ending:
                start_pw = ending

        print("changed:",cnt)
        print("unique segments:",len(dic))
        print("Evaluation BPA:",BPA(["".join(labels) for labels in prev],train_y))

    return prev

# Hyper-parameters read by the annealing runs in the following cells.
gamma, beta, s = 0.2, 0.1, 1

     

In [22]:
# Annealed run: exponent starts at 0.1, grows by 0.1 per sweep, capped at 1.5.
# An earlier cell executes `from time import time`, which makes `time.time()`
# fail on a top-to-bottom run.  Import the module under an alias so this cell
# works whichever way `time` is currently bound, without rebinding it.
import time as time_module

start_time = time_module.time()
rate = 0.1
start_pw = 0.1
ending = 1.5
cur = run_gibbs_annealing(100,train_x,gamma,beta,s,P,rate,start_pw,ending)
print("Evaluation BPA:",BPA(list(map(lambda x: "".join(x),cur)),train_y))
print("Time elapsed:",time_module.time()-start_time)

iter: 0
changed: 457249
unique segments: 23077
Evaluation BPA: 0.5055431106283987
iter: 1
changed: 385360
unique segments: 13520
Evaluation BPA: 0.5324392380472132
iter: 2
changed: 342901
unique segments: 11050
Evaluation BPA: 0.5679757006062996
iter: 3
changed: 298807
unique segments: 9975
Evaluation BPA: 0.610134372888807
iter: 4
changed: 245337
unique segments: 9249
Evaluation BPA: 0.6543768970432793
iter: 5
changed: 186048
unique segments: 8646
Evaluation BPA: 0.6977951468016538
iter: 6
changed: 129997
unique segments: 8148
Evaluation BPA: 0.7310085562914763
iter: 7
changed: 83737
unique segments: 7758
Evaluation BPA: 0.753869510359459
iter: 8
changed: 51281
unique segments: 7449
Evaluation BPA: 0.7676645235532368
iter: 9
changed: 31359
unique segments: 7236
Evaluation BPA: 0.7753739134932103
iter: 10
changed: 20213
unique segments: 7066
Evaluation BPA: 0.7798573197182598
iter: 11
changed: 13847
unique segments: 6943
Evaluation BPA: 0.7824314262964005
iter: 12
changed: 9640
unique 

In [27]:
# Annealed run: exponent starts at 0.1, grows by 0.1 per sweep, capped at 1.2.
# An earlier cell executes `from time import time`, which makes `time.time()`
# fail on a top-to-bottom run.  Import the module under an alias so this cell
# works whichever way `time` is currently bound, without rebinding it.
import time as time_module

start_time = time_module.time()
rate = 0.1
start_pw = 0.1
ending = 1.2
cur = run_gibbs_annealing(100,train_x,gamma,beta,s,P,rate,start_pw,ending)
print("Evaluation BPA:",BPA(list(map(lambda x: "".join(x),cur)),train_y))
print("Time elapsed:",time_module.time()-start_time)

iter: 0
changed: 456153
unique segments: 23119
Evaluation BPA: 0.506426646582386
iter: 1
changed: 383913
unique segments: 13552
Evaluation BPA: 0.5326989114190557
iter: 2
changed: 343662
unique segments: 11063
Evaluation BPA: 0.5669477819476365
iter: 3
changed: 299217
unique segments: 9943
Evaluation BPA: 0.6088823377930349
iter: 4
changed: 245603
unique segments: 9236
Evaluation BPA: 0.6540967514968931
iter: 5
changed: 186146
unique segments: 8633
Evaluation BPA: 0.695831973011209
iter: 6
changed: 130128
unique segments: 8134
Evaluation BPA: 0.7290701646070582
iter: 7
changed: 83490
unique segments: 7723
Evaluation BPA: 0.7516067963309553
iter: 8
changed: 51474
unique segments: 7402
Evaluation BPA: 0.7656916524169557
iter: 9
changed: 31893
unique segments: 7194
Evaluation BPA: 0.7736445534857109
iter: 10
changed: 20281
unique segments: 7030
Evaluation BPA: 0.7782852722098851
iter: 11
changed: 13707
unique segments: 6907
Evaluation BPA: 0.7811330594371876
iter: 12
changed: 11061
unique

In [30]:
# Annealed run: exponent starts at 0.1, grows by 0.02 per sweep, capped at 1.2.
# An earlier cell executes `from time import time`, which makes `time.time()`
# fail on a top-to-bottom run.  Import the module under an alias so this cell
# works whichever way `time` is currently bound, without rebinding it.
import time as time_module

start_time = time_module.time()
rate = 0.02
start_pw = 0.1
ending = 1.2
cur = run_gibbs_annealing(100,train_x,gamma,beta,s,P,rate,start_pw,ending)
print("Evaluation BPA:",BPA(list(map(lambda x: "".join(x),cur)),train_y))
print("Time elapsed:",time_module.time()-start_time)

iter: 0
changed: 456983
unique segments: 22942
Evaluation BPA: 0.5070989958937128
iter: 1
changed: 402575
unique segments: 16295
Evaluation BPA: 0.5145756495336116
iter: 2
changed: 387911
unique segments: 14121
Evaluation BPA: 0.5197895891450066
iter: 3
changed: 378544
unique segments: 12478
Evaluation BPA: 0.5255110231885088
iter: 4
changed: 370521
unique segments: 11298
Evaluation BPA: 0.5324058360782209
iter: 5
changed: 362801
unique segments: 10428
Evaluation BPA: 0.5412789075185678
iter: 6
changed: 353002
unique segments: 9729
Evaluation BPA: 0.5509848732179781
iter: 7
changed: 345141
unique segments: 9207
Evaluation BPA: 0.5592782588738796
iter: 8
changed: 335748
unique segments: 8720
Evaluation BPA: 0.5689993093334799
iter: 9
changed: 325834
unique segments: 8335
Evaluation BPA: 0.5791664377015566
iter: 10
changed: 314493
unique segments: 8026
Evaluation BPA: 0.5912374783021886
iter: 11
changed: 302839
unique segments: 7735
Evaluation BPA: 0.6043041130753624
iter: 12
changed: 29

Yes, we can use a proper schedule to make it perform slightly better than when we limit max tau to 1.

# 3. Analysis

In [28]:
def run_gibbs_annealing(K,sents,gamma,beta,s,P,rate,start_pw):
    """Gibbs-sample segment boundaries for ``sents`` with an annealed sampler.

    Every character position carries a label: "1" means a segment ends at
    that character, "0" means it does not. The last position of each sentence
    is fixed to "1" and is never resampled. Each sweep resamples all other
    positions in order, keeping a running table of segment counts.

    Relies on the module-level ``compute_G``, ``BPA`` and ``train_y`` (the
    latter two only for the per-sweep progress report).

    Args:
        K: number of sweeps over the whole corpus.
        sents: sequence of unsegmented sentences (strings).
        gamma: probability of drawing "1" for a position in the random
            initialization; ``1 - gamma`` also scales the boundary hypothesis.
        beta: forwarded to ``compute_G``.
        s: weight of the ``compute_G`` score added to each segment count, and
            added to the total count in the denominators.
        P: character distribution forwarded to ``compute_G``.
        rate: amount added to the annealing exponent after every sweep.
        start_pw: initial annealing exponent; it grows by ``rate`` per sweep
            until it reaches 1.

    Returns:
        ``(prev, dic)``: ``prev`` is one list of "0"/"1" labels per sentence,
        ``dic`` maps each segment string in the final sample to its count.
    """

    def add_seg(seg, counts):
        # Record one more occurrence of `seg`.
        counts[seg] = counts.get(seg, 0) + 1

    def remove_seg(seg, counts):
        # Drop one occurrence of `seg`; returns how many were removed (0 or 1)
        # so the caller can keep the total segment count in sync.
        if seg not in counts:
            return 0
        counts[seg] -= 1
        if counts[seg] == 0:
            counts.pop(seg)
        return 1

    possible_val = ["0", "1"]
    cache_G = {}
    length = [len(sent) for sent in sents]

    # One initial label per character of `sents`. Sizing the draw from the
    # input (rather than a hard-coded corpus size) keeps the per-sentence
    # slices below the right length for any corpus.
    init = random.choices(possible_val, [1 - gamma, gamma], k=sum(length))

    prev = []
    start = 0
    for n_chars in length:
        labels = init[start:start + n_chars]
        if labels:
            # A sentence always ends a segment (empty sentences have no labels).
            labels[-1] = "1"
        prev.append(labels)
        start += n_chars

    # Segment counts and total number of segments of the initial sample.
    dic = {}
    n_ = 0
    for sent, labels in zip(sents, prev):
        seg_start = 0
        for k, label in enumerate(labels):
            if label == "1":
                add_seg(sent[seg_start:k + 1], dic)
                n_ += 1
                seg_start = k + 1

    for j in range(K):
        print("iter:", j)
        cnt = 0
        for i in range(len(sents)):
            labels = prev[i]
            if not labels:
                continue  # nothing to resample in an empty sentence
            sent = sents[i]
            y_prev = 0                    # first character of the current segment
            y_next = labels.index("1")    # closest boundary at or after position k

            for k in range(length[i] - 1):
                was_boundary = labels[k] == "1"
                if was_boundary:
                    # Look past k for the boundary closing the following segment.
                    # The final label is always "1", so the search succeeds.
                    y_next = labels.index("1", k + 1)

                full_seg = sent[y_prev:y_next + 1]
                prev_seg = sent[y_prev:k + 1]
                next_seg = sent[k + 1:y_next + 1]

                # Take the segment(s) touching position k out of the counts
                # before scoring the two hypotheses.
                if was_boundary:
                    n_ -= remove_seg(prev_seg, dic)
                    n_ -= remove_seg(next_seg, dic)
                else:
                    n_ -= remove_seg(full_seg, dic)

                n_full = dic.get(full_seg, 0)
                n_prev = dic.get(prev_seg, 0)
                n_next = dic.get(next_seg, 0)

                prob = [1, 1]
                # prob[0]: no boundary at k (one segment);
                # prob[1]: boundary at k (two segments).
                prob[0] = (n_full + s * compute_G(full_seg, beta, P, cache_G)) / (n_ + s)
                prob[1] = (1 - gamma) * (n_prev + s * compute_G(prev_seg, beta, P, cache_G)) * \
                          (n_next + (next_seg == prev_seg) + s * compute_G(next_seg, beta, P, cache_G)) / \
                          ((n_ + s) * (n_ + 1 + s))

                summ = prob[0] + prob[1]
                prob[0] /= summ
                prob[1] /= summ

                # Annealing: raise the normalized probabilities to the current
                # exponent, then renormalize.
                prob[0] = prob[0] ** start_pw
                prob[1] = prob[1] ** start_pw

                summ = prob[0] + prob[1]
                prob[0] /= summ
                prob[1] /= summ

                tt = random.choices(possible_val, prob)[0]

                if tt != labels[k]:
                    cnt += 1

                labels[k] = tt
                if tt == "1":
                    y_prev = k + 1
                    n_ += 2
                    add_seg(prev_seg, dic)
                    add_seg(next_seg, dic)
                else:
                    n_ += 1
                    add_seg(full_seg, dic)

        # Move the exponent towards 1 after every sweep, never past it.
        if start_pw < 1:
            start_pw = min(start_pw + rate, 1)

        print("changed:", cnt)
        print("unique segments:", len(dic))
        print("Evaluation BPA:", BPA(list(map(lambda x: "".join(x), prev)), train_y))

    return prev,dic
# Annealed run whose labels (`cur`) and segment counts (`dic`) are inspected
# by the analysis cells that follow.
rate, start_pw = 0.02, 0.1
cur, dic = run_gibbs_annealing(100, train_x, gamma, beta, s, P, rate, start_pw)


iter: 0
changed: 457008
unique segments: 23100
Evaluation BPA: 0.5069858601922876
iter: 1
changed: 401917
unique segments: 16456
Evaluation BPA: 0.5151283982462889
iter: 2
changed: 388375
unique segments: 13919
Evaluation BPA: 0.5187304234830927
iter: 3
changed: 378158
unique segments: 12432
Evaluation BPA: 0.5255508900547253
iter: 4
changed: 369744
unique segments: 11310
Evaluation BPA: 0.5319683780327102
iter: 5
changed: 362924
unique segments: 10481
Evaluation BPA: 0.54053005692342
iter: 6
changed: 354813
unique segments: 9762
Evaluation BPA: 0.5495561309314085
iter: 7
changed: 345675
unique segments: 9205
Evaluation BPA: 0.5586533188088643
iter: 8
changed: 336184
unique segments: 8751
Evaluation BPA: 0.570247034497769
iter: 9
changed: 324810
unique segments: 8356
Evaluation BPA: 0.5810003135475154
iter: 10
changed: 313310
unique segments: 8012
Evaluation BPA: 0.5922535446492739
iter: 11
changed: 303116
unique segments: 7714
Evaluation BPA: 0.6045928784847143
iter: 12
changed: 28963

In [125]:
print("Top 200 count segments:",end="\n\n")

# Segments ordered from most to least frequent in the final sample.
sort = sorted(dic.items(),key=lambda x: x[1],reverse=True)

# Slice rather than index over range(200): a sample with fewer than 200
# distinct segments then prints what exists instead of raising IndexError.
for seg, _count in sort[:200]:
    print(seg)



Top 200 count segments:

s
the
,
.
,and
and
to
ing
in
t
e
a
ed
that
of
d
he
her
was
it
you
his
i
n
for
with
y
as
she
``
they
be
er
at
so
on
r
this
not
him
.''
all
one
l
theprince
my
ly
had
is
them
f
en
st
ofthe
th
there
then
me
's
theking
c
your
inthe
tothe
no
re
what
,but
p
h
but
b
;
very
said
an
al
went
w
s,
when
would
were
king
have
or
out
their
thing
some
;and
!
which
le
will
o
-
by
up
where
est
prince
came
every
about
edto
?''
!''
how
down
who
like
:``
g
after
hewas
from
itwas
come
back
,''said
--
now
off
s.
ent
more
sof
,who
princess
hehad
onthe
could
un
,``
see
other
time
before
,''
little
se
made
into
go
m
only
again
never
than
tobe
man
are
br
,which
es
over
k
de
;but
ar
we
know
ight
thought
con
s,and
two
took
shewas
do
ful
,for
edthe
ever
day
sand
say
ch
great
got
here
put
head
let
away
upon
if
hand
take
way
,he
long
,andthe
co
ter
ted
us
even
ac


In [124]:
print("First 20 segemented sentences",end="\n\n")

# Frequency-ordered segment list; recomputed here but not used in this cell.
sort = sorted(dic.items(), key=lambda item: item[1], reverse=True)

# Rebuild each sentence, inserting a space after every character whose
# sampled label in `cur` is '1'.
for i in range(20):
    ss = "".join(
        ch + " " if cur[i][j] == '1' else ch
        for j, ch in enumerate(train_x[i])
    )
    print(ss)
    


First 20 segemented sentences

_bo ok _ ti t le _: and rew _ l ang __ _ prince _ prigio . t x t . out 
chapter i . ( chapter head ing p icture :p 1.j pg) how the fairi e swere not in vit edto cour t. 
once upon a time there re ign edin pant ou flia a king anda queen. 
witha l most every thing els e to make them happy ,they want ed one thing : they had no ch ildren. 
this ve xed theking even more than thequeen ,who was very clever and learn ed,and whohad hat ed do ll s when she wasa child . 
how ever, she ,too in spite of allthe books she read and allthe p icture s she pa int ed, wou ldhav e be engl ade n ought obe the mother of alittle pri nce. 
theking was anxi ous to con sult the fairi es ,but thequeen would noth ear of sucha thing . 
she didnot believe in fairi es : shesaid thatthe yhad never ex is ted ;and that she m a in tain ed, thought he his to ry ofthe royalfam i ly was full of chapter s about noth ing els e. 
well ,at long anda tlastth e yhad alittle boy ,who was gene ra llyr

## 3.1  Answers to questions
###  List 10 examples of each in your report
(a) Under_segmentation:
    - ",and"
    - "tobe"
    - "theking"
    - "theprince"
    - "tothe"
    - "inthe"
    - "anda"
    - "hewas"
    - "wasa"
    - "didnot"
    

(b) Over_segmentation: 
    - "s"
    - "p"
    - "icture"
    - "t"
    - "d"
    - "orm"
    - "ing"
    - "ch"
    - "ildren"
    - "ed"
    
   

### Are they reasonable errors? You may also want to look at other sentences in the final sample to see these segments

Yes. For under-segmentation, the error happens mostly because certain terms are often used together, so the model predicts them to be a single segment. For example, "theking" lacks a boundary after "the"; note that the phrase "the king" occurs very often in the document, so the model is prone to combining the two words into a single segment.

For over-segmentation, note for example that the segment "p" occurs very often in the document, whereas "picture" does not. The model therefore considers the split "p" + "icture" more likely than the single segment "picture", because the probability of the "p" segment is high owing to its frequent occurrence. The same goes for "ing": it appears very often, attached to many different verbs, and the model does not recognize it as a verb suffix (the present participle) but instead treats it as a likely standalone segment.




###  Are there cases where you think your model is doing a better job at placing spaces than the original text (i.e., better than the cultural process of language formation and evolution)? Do you see signs of overfitting to the domain of the given dataset?

I think the model is sometimes better at handling spaces before punctuation. In everyday writing we do not put a space between the last word and the period, but the original file does. The model, for instance, attaches the period directly to the preceding text in "ildren.", which is closer to normal usage than the original.

Yes, I think overfitting happens when the model considers a recurring substructure of a segment to be a standalone segment, such as "ing" or "ed".

# 4. Bigram  

In [4]:
# Estimate the character-bigram distribution from the unsegmented text:
# count every pair of adjacent characters within a sentence, then divide by
# the total number of pairs.
# NOTE(review): dividing by P.sum() makes P[a][b] a joint pair frequency, not
# a conditional P(b | a) -- confirm that this is what compute_G is meant to
# multiply along a segment.
n_chars = len(char_index)
P = np.zeros((n_chars, n_chars))

for sent in train_x:
    for left, right in zip(sent, sent[1:]):
        P[char_index[left]][char_index[right]] += 1

P = P / P.sum()

In [17]:
# no annealing
import time


total_char = 938089

def compute_G(seg,beta,P,cache_G):
    """Score segment ``seg`` under the bigram character model, with memoization.

    The score is ``(1 - beta) ** (len(seg) - 1) * beta`` times the product of
    ``P[a][b]`` over every pair of adjacent characters ``a, b`` in ``seg``
    (characters are mapped to indices through the module-level
    ``char_index``). A one-character segment has no pairs, so its product is 1.

    Results are stored in ``cache_G`` keyed by ``seg``; a cached value is
    returned without recomputation.
    """
    if seg not in cache_G:
        pair_product = 1
        for left, right in zip(seg, seg[1:]):
            pair_product *= P[char_index[left]][char_index[right]]

        cache_G[seg] = ((1 - beta) ** (len(seg) - 1)) * beta * pair_product

    return cache_G[seg]

def run_gibbs_bigram(K,sents,gamma,beta,s,P):
    """Gibbs-sample segment boundaries for ``sents`` (no annealing).

    Every character position carries a label: "1" means a segment ends at
    that character, "0" means it does not. The last position of each sentence
    is fixed to "1" and is never resampled. Each sweep resamples all other
    positions in order, keeping a running table of segment counts.

    Relies on the module-level ``compute_G``, ``BPA`` and ``train_y`` (the
    latter two only for the per-sweep progress report).

    Args:
        K: number of sweeps over the whole corpus.
        sents: sequence of unsegmented sentences (strings).
        gamma: probability of drawing "1" for a position in the random
            initialization; ``1 - gamma`` also scales the boundary hypothesis.
        beta: forwarded to ``compute_G``.
        s: weight of the ``compute_G`` score added to each segment count, and
            added to the total count in the denominators.
        P: character distribution forwarded to ``compute_G``.

    Returns:
        One list of "0"/"1" labels per sentence (the final sample).
    """

    def add_seg(seg, counts):
        # Record one more occurrence of `seg`.
        counts[seg] = counts.get(seg, 0) + 1

    def remove_seg(seg, counts):
        # Drop one occurrence of `seg`; returns how many were removed (0 or 1)
        # so the caller can keep the total segment count in sync.
        if seg not in counts:
            return 0
        counts[seg] -= 1
        if counts[seg] == 0:
            counts.pop(seg)
        return 1

    possible_val = ["0", "1"]
    cache_G = {}
    length = [len(sent) for sent in sents]

    # One initial label per character of `sents`. Sizing the draw from the
    # input (rather than a hard-coded corpus size) keeps the per-sentence
    # slices below the right length for any corpus.
    init = random.choices(possible_val, [1 - gamma, gamma], k=sum(length))

    prev = []
    start = 0
    for n_chars in length:
        labels = init[start:start + n_chars]
        if labels:
            # A sentence always ends a segment (empty sentences have no labels).
            labels[-1] = "1"
        prev.append(labels)
        start += n_chars

    # Segment counts and total number of segments of the initial sample.
    dic = {}
    n_ = 0
    for sent, labels in zip(sents, prev):
        seg_start = 0
        for k, label in enumerate(labels):
            if label == "1":
                add_seg(sent[seg_start:k + 1], dic)
                n_ += 1
                seg_start = k + 1

    for j in range(K):
        print("iter:", j)
        cnt = 0
        for i in range(len(sents)):
            labels = prev[i]
            if not labels:
                continue  # nothing to resample in an empty sentence
            sent = sents[i]
            y_prev = 0                    # first character of the current segment
            y_next = labels.index("1")    # closest boundary at or after position k

            for k in range(length[i] - 1):
                was_boundary = labels[k] == "1"
                if was_boundary:
                    # Look past k for the boundary closing the following segment.
                    # The final label is always "1", so the search succeeds.
                    y_next = labels.index("1", k + 1)

                full_seg = sent[y_prev:y_next + 1]
                prev_seg = sent[y_prev:k + 1]
                next_seg = sent[k + 1:y_next + 1]

                # Take the segment(s) touching position k out of the counts
                # before scoring the two hypotheses.
                if was_boundary:
                    n_ -= remove_seg(prev_seg, dic)
                    n_ -= remove_seg(next_seg, dic)
                else:
                    n_ -= remove_seg(full_seg, dic)

                n_full = dic.get(full_seg, 0)
                n_prev = dic.get(prev_seg, 0)
                n_next = dic.get(next_seg, 0)

                prob = [1, 1]
                # prob[0]: no boundary at k (one segment);
                # prob[1]: boundary at k (two segments).
                prob[0] = (n_full + s * compute_G(full_seg, beta, P, cache_G)) / (n_ + s)
                prob[1] = (1 - gamma) * (n_prev + s * compute_G(prev_seg, beta, P, cache_G)) * \
                          (n_next + (next_seg == prev_seg) + s * compute_G(next_seg, beta, P, cache_G)) / \
                          ((n_ + s) * (n_ + 1 + s))

                summ = prob[0] + prob[1]
                prob[0] /= summ
                prob[1] /= summ

                tt = random.choices(possible_val, prob)[0]

                if tt != labels[k]:
                    cnt += 1

                labels[k] = tt
                if tt == "1":
                    y_prev = k + 1
                    n_ += 2
                    add_seg(prev_seg, dic)
                    add_seg(next_seg, dic)
                else:
                    n_ += 1
                    add_seg(full_seg, dic)

        print("changed:", cnt)
        print("unique segments:", len(dic))
        print("Evaluation BPA:", BPA(list(map(lambda x: "".join(x), prev)), train_y))

    return prev
# Hyper-parameters for the bigram run (no annealing).
gamma, beta, s = 0.2, 0.1, 1

# Timed run: 100 sweeps over train_x, then score the final labels.
start_time = time.time()
cur = run_gibbs_bigram(100, train_x, gamma, beta, s, P)
# BPA compares strings, so each per-sentence label list is joined first.
print("Evaluation BPA:", BPA(["".join(labels) for labels in cur], train_y))
print("Time elapsed:", time.time() - start_time)
     

iter: 0
changed: 377391
unique segments: 19224
Evaluation BPA: 0.5483967593625181
iter: 1
changed: 221047
unique segments: 16603
Evaluation BPA: 0.6496639869667672
iter: 2
changed: 119387
unique segments: 15212
Evaluation BPA: 0.702621192579591
iter: 3
changed: 62222
unique segments: 14395
Evaluation BPA: 0.7254379698498743
iter: 4
changed: 37528
unique segments: 13885
Evaluation BPA: 0.735355122191945
iter: 5
changed: 28945
unique segments: 13511
Evaluation BPA: 0.7411509025535267
iter: 6
changed: 26103
unique segments: 13162
Evaluation BPA: 0.745927384119411
iter: 7
changed: 25038
unique segments: 12863
Evaluation BPA: 0.749782617830833
iter: 8
changed: 24162
unique segments: 12594
Evaluation BPA: 0.753706810445981
iter: 9
changed: 23085
unique segments: 12341
Evaluation BPA: 0.7571148887660558
iter: 10
changed: 22871
unique segments: 12103
Evaluation BPA: 0.7602234268480716
iter: 11
changed: 22191
unique segments: 11893
Evaluation BPA: 0.7629160565419911
iter: 12
changed: 21428
uniq

In [16]:
#with annealing
import time


total_char = 938089

def compute_G(seg,beta,P,cache_G):
    """Return the memoized bigram-model score of segment ``seg``.

    Multiplies ``P[a][b]`` over each adjacent character pair ``a, b`` of
    ``seg`` (indices come from the module-level ``char_index``) and scales the
    product by ``(1 - beta) ** (len(seg) - 1) * beta``. Segments with a single
    character contribute no pair, leaving the product at 1.

    ``cache_G`` maps segments to previously computed scores and is updated in
    place on a miss.
    """
    if seg in cache_G:
        return cache_G[seg]

    transitions = 1
    for first, second in zip(seg, seg[1:]):
        transitions *= P[char_index[first]][char_index[second]]

    score = ((1 - beta) ** (len(seg) - 1)) * beta * transitions
    cache_G[seg] = score
    return score

def run_gibbs_bigram(K,sents,gamma,beta,s,P,rate=0.02,start_pw=0.1):
    """Gibbs-sample segment boundaries for ``sents`` with an annealed sampler.

    NOTE: this redefines ``run_gibbs_bigram`` from the "no annealing" cell;
    whichever cell ran last is the version in effect.

    Every character position carries a label: "1" means a segment ends at
    that character, "0" means it does not. The last position of each sentence
    is fixed to "1" and is never resampled. Each sweep resamples all other
    positions in order, keeping a running table of segment counts.

    Relies on the module-level ``compute_G``, ``BPA`` and ``train_y`` (the
    latter two only for the per-sweep progress report).

    Args:
        K: number of sweeps over the whole corpus.
        sents: sequence of unsegmented sentences (strings).
        gamma: probability of drawing "1" for a position in the random
            initialization; ``1 - gamma`` also scales the boundary hypothesis.
        beta: forwarded to ``compute_G``.
        s: weight of the ``compute_G`` score added to each segment count, and
            added to the total count in the denominators.
        P: character distribution forwarded to ``compute_G``.
        rate: amount added to the annealing exponent after every sweep
            (default 0.02, the value that used to be hard-coded).
        start_pw: initial annealing exponent; it grows by ``rate`` per sweep
            until it reaches 1 (default 0.1, the value that used to be
            hard-coded).

    Returns:
        One list of "0"/"1" labels per sentence (the final sample).
    """

    def add_seg(seg, counts):
        # Record one more occurrence of `seg`.
        counts[seg] = counts.get(seg, 0) + 1

    def remove_seg(seg, counts):
        # Drop one occurrence of `seg`; returns how many were removed (0 or 1)
        # so the caller can keep the total segment count in sync.
        if seg not in counts:
            return 0
        counts[seg] -= 1
        if counts[seg] == 0:
            counts.pop(seg)
        return 1

    possible_val = ["0", "1"]
    cache_G = {}
    length = [len(sent) for sent in sents]

    # One initial label per character of `sents`. Sizing the draw from the
    # input (rather than a hard-coded corpus size) keeps the per-sentence
    # slices below the right length for any corpus.
    init = random.choices(possible_val, [1 - gamma, gamma], k=sum(length))

    prev = []
    start = 0
    for n_chars in length:
        labels = init[start:start + n_chars]
        if labels:
            # A sentence always ends a segment (empty sentences have no labels).
            labels[-1] = "1"
        prev.append(labels)
        start += n_chars

    # Segment counts and total number of segments of the initial sample.
    dic = {}
    n_ = 0
    for sent, labels in zip(sents, prev):
        seg_start = 0
        for k, label in enumerate(labels):
            if label == "1":
                add_seg(sent[seg_start:k + 1], dic)
                n_ += 1
                seg_start = k + 1

    for j in range(K):
        print("iter:", j)
        cnt = 0
        for i in range(len(sents)):
            labels = prev[i]
            if not labels:
                continue  # nothing to resample in an empty sentence
            sent = sents[i]
            y_prev = 0                    # first character of the current segment
            y_next = labels.index("1")    # closest boundary at or after position k

            for k in range(length[i] - 1):
                was_boundary = labels[k] == "1"
                if was_boundary:
                    # Look past k for the boundary closing the following segment.
                    # The final label is always "1", so the search succeeds.
                    y_next = labels.index("1", k + 1)

                full_seg = sent[y_prev:y_next + 1]
                prev_seg = sent[y_prev:k + 1]
                next_seg = sent[k + 1:y_next + 1]

                # Take the segment(s) touching position k out of the counts
                # before scoring the two hypotheses.
                if was_boundary:
                    n_ -= remove_seg(prev_seg, dic)
                    n_ -= remove_seg(next_seg, dic)
                else:
                    n_ -= remove_seg(full_seg, dic)

                n_full = dic.get(full_seg, 0)
                n_prev = dic.get(prev_seg, 0)
                n_next = dic.get(next_seg, 0)

                prob = [1, 1]
                # prob[0]: no boundary at k (one segment);
                # prob[1]: boundary at k (two segments).
                prob[0] = (n_full + s * compute_G(full_seg, beta, P, cache_G)) / (n_ + s)
                prob[1] = (1 - gamma) * (n_prev + s * compute_G(prev_seg, beta, P, cache_G)) * \
                          (n_next + (next_seg == prev_seg) + s * compute_G(next_seg, beta, P, cache_G)) / \
                          ((n_ + s) * (n_ + 1 + s))

                summ = prob[0] + prob[1]
                prob[0] /= summ
                prob[1] /= summ

                # Annealing: raise the normalized probabilities to the current
                # exponent, then renormalize.
                prob[0] = prob[0] ** start_pw
                prob[1] = prob[1] ** start_pw

                summ = prob[0] + prob[1]
                prob[0] /= summ
                prob[1] /= summ

                tt = random.choices(possible_val, prob)[0]

                if tt != labels[k]:
                    cnt += 1

                labels[k] = tt
                if tt == "1":
                    y_prev = k + 1
                    n_ += 2
                    add_seg(prev_seg, dic)
                    add_seg(next_seg, dic)
                else:
                    n_ += 1
                    add_seg(full_seg, dic)

        # Move the exponent towards 1 after every sweep, never past it.
        if start_pw < 1:
            start_pw = min(start_pw + rate, 1)

        print("changed:", cnt)
        print("unique segments:", len(dic))
        print("Evaluation BPA:", BPA(list(map(lambda x: "".join(x), prev)), train_y))

    return prev
# Hyper-parameters for the annealed bigram run.
gamma, beta, s = 0.2, 0.1, 1

# Timed run: 100 sweeps over train_x, then score the final labels.
start_time = time.time()
cur = run_gibbs_bigram(100, train_x, gamma, beta, s, P)
# BPA compares strings, so each per-sentence label list is joined first.
print("Evaluation BPA:", BPA(["".join(labels) for labels in cur], train_y))
print("Time elapsed:", time.time() - start_time)
     

iter: 0
changed: 456865
unique segments: 23028
Evaluation BPA: 0.5065408597666818
iter: 1
changed: 402787
unique segments: 16447
Evaluation BPA: 0.5141231067279108
iter: 2
changed: 389045
unique segments: 14127
Evaluation BPA: 0.5195331482217761
iter: 3
changed: 379022
unique segments: 12456
Evaluation BPA: 0.524823589116992
iter: 4
changed: 370696
unique segments: 11321
Evaluation BPA: 0.5318681721257337
iter: 5
changed: 363297
unique segments: 10466
Evaluation BPA: 0.5396400560722086
iter: 6
changed: 354918
unique segments: 9738
Evaluation BPA: 0.5490033822187311
iter: 7
changed: 345316
unique segments: 9169
Evaluation BPA: 0.558582204939397
iter: 8
changed: 336376
unique segments: 8718
Evaluation BPA: 0.5693936680641619
iter: 9
changed: 325111
unique segments: 8337
Evaluation BPA: 0.5788809047408169
iter: 10
changed: 314800
unique segments: 8006
Evaluation BPA: 0.5915219337800577
iter: 11
changed: 302621
unique segments: 7710
Evaluation BPA: 0.6043881567392783
iter: 12
changed: 2898

Contrary to what I expected (I thought the bigram model would be better because it captures more information), the bigram model gives results very similar to those of the unigram model, with or without annealing. This is probably due to the way we estimate the bigram distribution (i.e. counting all consecutive character pairs without accounting for segment breaks), which adds noise to the distribution.