In [56]:
import matplotlib.pyplot as plt
import numpy as np
import collections
import random
from tqdm.auto import tqdm


%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

Let's take a look at the data:

In [57]:
with open('text_seg/data_small.txt', 'r', encoding='utf-8') as file:
    text = file.read()
    print(text[:1000])

TributespouredinfromaroundtheworldThursdaytothelateLabourPartyleaderJohnSmith,whodiedearlierfromamassiveheartattackaged55.InWashington,theUSStateDepartmentissuedastatementregretting"theuntimelydeath"oftherapier-tonguedScottishbarristerandparliamentarian."Mr.Smith,throughouthisdistinguishedcareeringovernmentandinopposition,leftaprofoundimpressiononthehistoryofhispartyandhiscountry,"StateDepartmentspokesmanMichaelMcCurrysaid."Secretary(ofStateWarren)ChristopherextendshisdeepestcondolencestoMrs.SmithandtotheSmithchildren."InBonn,theheadoftheGermanSocialDemocraticParty,RudolfScharping,saidinastatementhewas"veryaffectedbythesuddendeathofJohnSmith."AgoodfriendofGermansocialdemocracyhasleftustooearly.Hewasveryclosetoachievinghislife'sgoalofmakingtheLabourPartythelargestpoliticalforceinBritain"andwouldbe"cruellymissed"inEurope,hesaid.HongKongGovernorChrisPatten,aformerConservativePartychairman,offeredhiscondolencestotheSmithfamilyandsaidhisformerpolitcalopponentwasa"goodanddecentman,widelyresp

In [58]:
text_size = len(text)
text_size

652297

In [59]:
C = len(set(text))

In [60]:
print("Number of unique characters: {}".format(C))

Number of unique characters: 74


- There is 2 𝑛−1 possible segmentations for 𝑛-characters long data.
- 𝑛 − 1 latent binary variables $𝑠_𝑖$: denoting whether there is of isn’t a separator between two characters.
- Collapsed Gibbs sampling. Sample one variable conditioned by all the others.
- Exchangeability: if we reorder the words in the sequence, overall probability is the same.
- We can virtually move the changed words at the end of the sequence, compute the overal probablility of the two possibilities and then move the words virtually back.


if $s_i$ is 1, then there is a separator between characters $c_i$ and $c_{i+1}$

In [61]:
# Fixing the random seeds
np.random.seed(1234)
random.seed(1234)

In [62]:
import collections

def segmentation(text, text_size, s):
    words = []
    current_word = ""
    for idx, character in enumerate(text):
        if idx == text_size - 1:
            current_word += character
            continue
        
        if s[idx] == 1:
            current_word += character
            words.append(current_word)
            current_word = ""
        else:
            current_word += character
    return words

def get_word_counts(words):
    count = {}
    counter = collections.Counter(words)
    return counter
    for key, value in counter.items():
        count[key] = value
    return count

In [63]:
def get_prev_word(text, s, i):
    if i == 0:
        return text[i]
    
    start_idx = i
    end_idx = i + 1
    
    # Edge cases
    if s[start_idx] == 1 and s[start_idx-1] == 1:
        return text[i]
    
    if s[start_idx] == 1 and s[start_idx-1] == 0:
        start_idx -= 1
        
    while start_idx >= 0:
        if s[start_idx] == 1:
            start_idx += 1
            break
        start_idx -= 1
    
    if start_idx < 0:
        start_idx = 0
        
    return text[start_idx:end_idx]

In [64]:
def get_next_word(text, s, i):
    word = ""
    start_idx = i + 1
    end_idx = i + 1
    while end_idx <= len(text) - 1:
        if s[end_idx] == 1:
            break
        end_idx += 1
    
    if end_idx == len(text):
        end_idx -= 1
    
    word = text[start_idx:end_idx+1]
    return word

In [66]:
def p0(word, p_c):
    uniform = 1.0 / float(C)
    return uniform**len(word) * p_c**(len(word)-1) * (1 - p_c)

In [67]:
def CRPTextSegmentation(text, text_size, iterations, alpha, p_c, p_cont, T = 1, T_decrease = 1):
    # randomly initialize text segmentation
    s = np.random.randint(low=0, high=2, size=text_size)
    
    # create the initial segmentation
    words = segmentation(text, text_size, s)
    count = get_word_counts(words)
    
    # total number of words
    t = sum(count.values())
    
    processing_progress_bar = tqdm(range(1, text_size - 1), desc="Text processing")
    
    for iteration in tqdm(range(iterations), desc="Iterations"):
        processing_progress_bar.reset()
        for i in np.random.permutation(range(0, text_size - 1)):
            
            prev_word = get_prev_word(text, s, i)
            next_word = get_next_word(text, s, i)
            
            joined = prev_word + next_word
            if s[i] == 0:
                count[joined] = max(0, count[joined] - 1)
                t -= 1
            else:
                count[prev_word] = max(0, count[prev_word] - 1)
                count[next_word] = max(0, count[next_word] - 1)
                t -= 2
            
            p_0 = (alpha * p0(joined, p_c) + count[joined]) / (alpha + t)
            p_1 = (alpha * p0(prev_word, p_c) + count[prev_word]) / (alpha + t)
            p_1 *= (alpha * p0(next_word, p_c) + count[next_word]) / (alpha + t + 1)
            p_1 *= p_cont
            
            # Annealing
            p_0 = p_0 ** (1/T)
            p_1 = p_1 ** (1/T)
            
            # Normalization
            suma = p_0 + p_1
            p_0 /= suma
            p_1 /= suma
            
            s[i] = np.random.choice([0, 1], p=[p_0, p_1])
            
            if s[i] == 0:
                count[joined] += 1
                t += 1
            else:
                count[prev_word] += 1
                count[next_word] += 1
                t += 2

            processing_progress_bar.update(1)
        
    words_updated = segmentation(text, text_size, s)
    final_output = " ".join(words_updated)
    return final_output

## Task 3

**Download the gold data and the evaluation script. What precision and recall you get?**

In [68]:
def save_and_evaluate(output, output_file_path):
    output_file_path = "text_seg/" + output_file_path
    with open(output_file_path, 'w', encoding='utf-8') as file:
        file.write(output)
    !perl text_seg/eval.pl text_seg/data_small_gold.txt $output_file_path
    
    with open('text_seg/results.txt', 'r') as f_results:
        lines = f_results.readlines()
        precision = round(float(lines[0]), 4)
        recall = round(float(lines[1]), 4)
        f1 = round(float(lines[2]), 4)
        
        return precision, recall, f1

In [69]:
def fit(run_name, iterations, alpha, p_c, p_cont, T=1, T_decrease=1):
    output = CRPTextSegmentation(text, text_size, iterations, alpha, p_c, p_cont, T, T_decrease)
    
    output_file_path = "{}.txt".format(run_name)
    print("Evaluating run: {} in file {}".format(run_name, output_file_path))
    precision, recall, f1 = save_and_evaluate(output, output_file_path)
    
    print(output[:2000])
    print()
    print()
    
    return precision, recall, f1

In [70]:
# The list that will keep all configurations for different runs
# and the evaluation measures calculated for these configs
runs = []
best_run = None

In [71]:
name, iterations, alpha, p_c, p_cont, T, T_decrease = "basic", 100, 100, 0.5, 0.99, 1, 1
precision, recall, f1 = fit(name, iterations, alpha, p_c, p_cont, T, T_decrease)
config_and_results = (name, iterations, alpha, p_c, p_cont,
                      T, T_decrease, precision, recall, f1)

runs.append(config_and_results)

if best_run == None:
    best_run = config_and_results
else:
    if best_run[9] < config_and_results[9]:
        best_run = config_and_results

Text processing:   0%|          | 0/652295 [00:00<?, ?it/s]

Iterations:   0%|          | 0/100 [00:00<?, ?it/s]

Evaluating run: basic in file basic.txt
P:0.172, R:0.289, F:0.216
T ri but esp our edin from a round the world Thursday tothe late Lab ourP arty lead er John S mith ,who died ear li er from am as sive hear tattack aged 55 .In Wash ingto n, theUS St at eDep art ment issue da state ment reg re t ting "the unti me ly death " ofthe ra pi er- ton gu ed Sco t t ish bar r i ster and parli a ment arian ."M r .S mith , th rough out his dis t in gu ish ed ca re er in governm ent and in op positi on , lef t a pro f ound imp re ssion onthe his tory of his part yand his country ," State Depa r t ments p okesm an Mich a el McC ur ry said. " Secre tary ( of State Warr en ) Chri sto pher ext end s his de e p est con do l ence s to M r s.S mith and tothe Smi th c hildr en ."In B on n, the head ofthe G erman S ocial Demo cratic Part y, Ru do l f Sch ar ping ,said i nasta te me nthewa s" very aff ected bythe su d den death of J ohnS mith ."A good frie nd ofG erman s ocial de mocrac y has lef t us to o ea

## Trying out different parameters

- ... try to change the parameters to obtain better segmentations
- What precision and recall you get?
- Try to do annealing and run the model for different temperatures.

In [73]:
import csv

with open('text_seg/config.csv', mode='r') as csv_file:
    csv_reader = csv.DictReader(csv_file)
    for row in csv_reader:
        name = row["name"]
        iterations = int(row["iterations"])
        alpha = int(row["alpha"])
        p_c = float(row["p_c"])
        p_cont = float(row["p_cont"])
        T = float(row["T"])
        T_decrease = float(row["T_decrease"])
        
        print("Currently working on: {}".format(name))
        print("Configuration:")
        print(iterations, alpha, p_c, p_cont, T, T_decrease)
        precision, recall, f1 = fit(
            name, 
            iterations, 
            alpha, 
            p_c, 
            p_cont, 
            T, 
            T_decrease)
        
        config_and_results = (name, iterations, alpha, p_c, p_cont,
                              T, T_decrease, precision, recall, f1)
        
        runs.append(config_and_results)
        
        if best_run == None:
            best_run = config_and_results
        else:
            if best_run[9] < config_and_results[9]:
                best_run = config_and_results
        

Currently working on: run2
Configuration:
100 100 0.4 0.99 1.0 1.0


Text processing:   0%|          | 0/652295 [00:00<?, ?it/s]

Iterations:   0%|          | 0/100 [00:00<?, ?it/s]

Evaluating run: run2 in file run2.txt
P:0.173, R:0.290, F:0.216
T ri but es p our edin from a round the world Thurs day tothe late Lab our Party leader John Smi th ,who died e arlier from a mass ive hear t at tack aged 55 .In Wa shingto n ,theU S State D epar tment issue da s tatement re gre tting " the un time ly de a th " ofthe rap i er- t on gu ed S co t tish bar ris ter and p arliam ent arian ."M r.S mi th , th rough out his dis t ing u ished c are er ing overnm ent and in oppo sition , left a prof ound imp re ssion onthe histo ryof his party and his c ountry , " State D epar tment s po kesman Mi chael M cCur r ysaid ."S e cretar y( of State War re n ) Chri stoph er ext end s his d eep est con dol ences to M r s. Smi th and tothe Smi th chi ldren . "InB on n , the head ofthe Germa nSoci al D emoc ratic Party , Ru d olf S char ping ,said ina state ment hewas" very aff ect edbythe s udde nd eath of John Smi th ."A good frien d of G erman s ocial democr acy has left us to o e arly .He

Text processing:   0%|          | 0/652295 [00:00<?, ?it/s]

Iterations:   0%|          | 0/100 [00:00<?, ?it/s]

Evaluating run: run3 in file run3.txt
P:0.178, R:0.295, F:0.222
T ri but espo ur edin from arou ndthe worl d Thursday tothe late Labou r Party lead er John S mith ,who died e a rlier from a mass ive hear t att ack aged 55 .In W ashing ton , theUS State Dep artme nt issu eda state ment reg re tting "the un time ly death " ofthe rap i er- to n gu edS co t t ish b arr is ter and parli a ment arian ."M r .S mith , th rough out his dis ting u i shed car e er ing overnm ent and in oppos i tion, left a pro found imp re s sion onthe history of his p arty and his coun t ry ," State Dep artme n t s pokesma n M ichae lMcC ur ry said. " Se cretary ( of St ateWar re n ) Chri stoph e r exten d s his deep est cond o le nces to M r s.S mith and totheS mith chi ldren ."In Bo n n ,the head ofthe German Soci al D emocr atic Party , Rud o lf Sch ar p ing ,said ina state ment he was" very aff ected bythe su dden death ofJ ohnS mith ."A good frie nd of German so cial democr ac y has left us too early .He wa

Text processing:   0%|          | 0/652295 [00:00<?, ?it/s]

Iterations:   0%|          | 0/150 [00:00<?, ?it/s]

Evaluating run: run4 in file run4.txt
P:0.181, R:0.301, F:0.226
Tri but es po u red in from around the world T hursday tothe late Labo ur Party leader J ohnS mi th ,who di ed earlier from am a ssive hear t attack aged 55 .In Was hington ,the US State Depar tment issued astate ment re gre t ting "the unti mely death " ofthe rap i e r-t on gu ed Scot t ish barr ister and par liam en tarian ."M r .S mi th , throu g hou this dist in gu ished ca re er in govern ment and in oppo sitio n , left a prof ound imp re ssion onthe his tory of his party an d his country , " State Depar tment spokes man Mic hael Mc Curr ysaid ."S e cre tary ( of State War re n ) Chris top here x tend s his deep e st con do le nce sto M rs . Smith an d tothe S mi th chi ldren ."I n Bonn ,the h eadoft he G erman S ocia l Demo cratic Party , Rudo l f Sc harp ing ,said in astate men the was" very aff ected bythe s ud d end e ath of Jo hnSm ith ."A good fri endof Germ ans ocia ld emocra cy has left us to o early .He was v

Text processing:   0%|          | 0/652295 [00:00<?, ?it/s]

Iterations:   0%|          | 0/150 [00:00<?, ?it/s]

Evaluating run: run5 in file run5.txt
P:0.178, R:0.297, F:0.223
Tri but e spo ur ed in from around the worl dThu rsday tothe late L abour Part y leader John Smi th ,who died earlier from am ass ive he art at tack ag ed 55 .In Wash ingto n, theUS State Depar tment issu edas tatement re gret ting "the unti mely death " ofthe ra pi er - to n gu ed Sco tti sh b arri ster and par liam ent a rian ."M r .Smi th , th rough out his dis t ing u ished care er in governm ent and in oppo s ition , left a prof ound impr ession onthe his tory of his part y and his coun try ," State Depar tment spokes man Micha elM c C ur ry said ."S ecret ary ( of State War re n ) Chri stop here xten d s h is d eep est con do lence sto M r s .Smi tha nd tothe Smi th chi ldre n ."I n Bonn ,the head oft heGe rman Soci al Democ ra ticPa rty , Rudo l f Sch ar p ing ,said ina s tate men the was" very aff ected bythe s ud d en death of John Smi th ."A good fr i endof Germa n social d emocr ac y has left us to o e arly .He 

Text processing:   0%|          | 0/652295 [00:00<?, ?it/s]

Iterations:   0%|          | 0/150 [00:00<?, ?it/s]

Evaluating run: run6 in file run6.txt
P:0.183, R:0.304, F:0.229
T ribu te s p our edin from around the world T hursday tothe late La bour Party leade r John Smith ,who died e arlier froma ma ssi ve hear t a ttack aged 5 5. In Wa shington , the US State De partment issue da state ment re g re t ting "the un time ly death " ofthe rap i er- ton gue d Sco t t ish barr ister and parli ament ar ian ."M r.S mi th , through out his dist ing u ish ed ca re er ing overnment and in o pposit ion , left a prof ound imp re ssion onthe his tory of his part y and his countr y ," State D epartme n t spokes man Mich a el McC ur r ysaid ." Secret ary ( of State Wa rren ) Chri stop here xtend s his de ep est c ondol ence s to M rs. Smith and tothe Smith c hildr en ."In Bo n n ,the head ofthe German Social D emocra tic Party , Rud ol fS char p ing ,said ina st atement hewas " very aff ected bythe su d den death of John Smith ."A good f riend of German s ocial d emocra c y has left us to o early .He was ver

Text processing:   0%|          | 0/652295 [00:00<?, ?it/s]

Iterations:   0%|          | 0/250 [00:00<?, ?it/s]

Evaluating run: run7 in file run7.txt
P:0.098, R:0.216, F:0.135
T r ib ut e sp ou red i n froma roun d the world Thur sda y t o the late Labour Party leader John Smith ,w ho d i ed ea r l ier froma mass iv e he a r t attack a g ed 5 5 .In Washi ngt on , the US StateD epart ment issue d a st ate ment re gr et t ing "the u n ti me ly d e ath " of ther ap ier - t o n gue d Scot t i sh bar ri ster and par lia ment ar ian ."M r. Smith ,th rough ou th is d i st ing ui sh ed car e er ing overn ment a nd in o ppo sit ion, left ap rof ound im pre ssion onthe his tory ofhi s party an d h i s countr y ," StateD epar t ments pokes man Micha elMcC ur r y said ." Secret ar y ( ofS t at e Wa r r en ) Ch ri stopher ext end s his d eep e st con d o l en ce sto M r s .S m it h andt oth e Smith c hi ld re n . "I n Bo n n ,th e headof th e Germa n So cial D e mocrat i c Party , Rudo lf Sc ha rp i ng, saidi n a sta t emen the was" very a ffect e d bythe s ud de n d ea t h of John Smith ."A g ood fri e ndof

Text processing:   0%|          | 0/652295 [00:00<?, ?it/s]

Iterations:   0%|          | 0/250 [00:00<?, ?it/s]

Evaluating run: run8 in file run8.txt
P:0.104, R:0.224, F:0.142
T ri but es p o u redi n froma ro und the world Thursday to t he lat e Labour Party leader John Smith ,who die d earlier f r om am as sive h ear t attack a g ed 55 .In Washingto n , t he US State Depart ment issue da sta t e m ent reg re t t ing "the un tim e ly death " ofthe r a p i er -t on gu ed S co t t ish b arri st e r and parlia ment arian ." M r .S m ith , through ou thi s d is ting ui shed c ar e e ri n go ve r n ment and in op po s it i o n , lef t a pr of ound im pres sio n onthe hi s t o ryof his pa rty and his c ount r y, " State De part ment spoke sm an Micha e l M cCur ry s a i d. " S ecreta r y( o f State Warr en ) Christ opher e xt end s his de ep e st cond o l en c es to M r s. Smith and tothe Smith child ren ."I nB onn ,th e he ad of t he Ger m an SocialD e mocra ti c Party , Rudol fS ch a r p ing, said i na state m e nt h e was" very a ffect edby t he sudde nd e a t h of John S m ith . "A good f ri e n 

Text processing:   0%|          | 0/652295 [00:00<?, ?it/s]

Iterations:   0%|          | 0/250 [00:00<?, ?it/s]

Evaluating run: run9 in file run9.txt
P:0.112, R:0.238, F:0.152
Tr ibu t es po u red in fro m a r ou ndthe wo rl d Thursday tothe la te Labou r Party leader John Smith ,wh o die de ar li er froma m ass iv e hear t attack ag ed 55 .I n Wa s hing t on, theUS State Depar tment i ss u ed a s tat e ment reg ret ting "th e u n t i m e l y d e ath " oft h e r a p i er- t o n g u edS co t t i s h bar r ister and pa r lia ment ar ia n. "Mr .Sm ith , th rough out his d ist ing u ish e dc a re eri ng overnmen t and in op posit i o n , lef t a pro found i mp res sion on t h e history of hi s pa rty a n d h i s co un t r y," State Depar tment spokesman Michae lMcC u rry said ." Secret a r y( of State War r en )C h r is top h e r exte nd sh is de e p est co nd o l enc e sto M rs . Smith an dt ot h e Smith c h i l dr e n ."In B on n , th e h e ad o fth e Germa n Soci a l Democra t ic Par t y , Ru do l f Sc har ping ,said ina stat eme nt hew a s " ve r y a ffect edby th es ud d end ea t ho f John Smit

Text processing:   0%|          | 0/652295 [00:00<?, ?it/s]

Iterations:   0%|          | 0/175 [00:00<?, ?it/s]

Evaluating run: run10 in file run10.txt
P:0.055, R:0.140, F:0.079
Tr i but e s pou r edin f rom around th eworld T h ursday tot he l a t e Labour Pa r ty le a d er Joh nS m i t h ,w ho di ede arli er froma mas s i v eh ea r t a t t a ck a ged 55 .In W ash ing t o n ,t he USS tate Depart m e n t is s ue d as tate me n t r egr e tt in g "th e un t i me l y d eat h " ofthe ra p i er -to n gued S c o t ti sh b ar ri s ter andp ar li a menta r ia n." Mr. S mi th, through o u th isd is ting ui s h e dca re er i ng ove r n ment a nd i n opposi tion , left ap r of ou nd i m pr e ssio no nthe hi s to r y ofhis par t yan d his countr y," St at e De part m ent spokes m an M i cha e lM cC ur ry s a i d." Secre tary ( of Stat eW arre n ) Christ o p her ext en ds h isd ee p e st cond ole n c e sto M r s . Smith and t o the Smi t h ch il d r en ."I n B on n , t he head of the German S oc ia l Democr a ti c Party , R u dol fS ch ar ping ,said i n a s tat e men th e w as" ve ry af fe c t e d b y th esu

Text processing:   0%|          | 0/652295 [00:00<?, ?it/s]

Iterations:   0%|          | 0/175 [00:00<?, ?it/s]

Evaluating run: run11 in file run11.txt
P:0.058, R:0.145, F:0.083
T ri but es p our ed i n f rom a r ou n d t h e wor ld T hursday t o th e l ate Labour Party l e a d er John Smith ,w h o d i ed e ar lier from a ma s sive he ar ta t tack a g ed 55 .I n Wash ing to n ,t heUS Sta t e D epart ment i s su eda s tat e ment re g r e t t ing " th e u nt ime l y d ea th " of th er ap ier -to ng u ed Scot ti sh b arr ist er an dp ar l ia men ta r i an ."M r .S mi t h, through out hi sd is t ing u ish edc ar ee r i n government andi n opp o s it ion , le f t a p rof o u n d i m pres s io n o nthe his tor yof his part y an dhis countr y , " State D e par t ment spokesm a n Michael McCur r y s a id ." Sec retary ( of StateW a r r e n ) Christ o pher e xt e n d s h i s d eep es t c o n d o lenc e sto M rs .S m i t ha n dto theS mi th children ."In Bo n n,t h e head o ft h e German Social Democra t i c Pa rty , R u d o lf S c ha r pin g, said inas t at e m ent he wa s" ver y af fe ct e d byt h esu d

Text processing:   0%|          | 0/652295 [00:00<?, ?it/s]

Iterations:   0%|          | 0/175 [00:00<?, ?it/s]

Evaluating run: run12 in file run12.txt
P:0.058, R:0.145, F:0.083
T ribu t esp ou red i nf rom around th ew o rld T hursday to t hel a te Labour Part y lea d erJo hnSmi th , wh odi ed ear lie rf ro m am a s si v e h ear ta t t ac k a g e d 55 .In Washington ,the U S S tat e Dep a rtm ent issu eda s t at e men tre g r e tt ing "the u n ti me l y d eath " of t her api er -t o ng u e dS c o tti s hb ar ris te ran d parli ame n ta r ian ."M r .Sm ith ,th r ou g hout h i sd is t in g u i s h e dca r ee ring o vernment andi n opp osi ti o n , lef tap ro f ou ndi m pres sion o nt heh i st or yof h i sp a r ty an d hi s countr y," St a te D e p a r tment spokesm an M i c h aelMcC u r r y said." Secret ary ( of State War re n) C h ri stop her e x tend sh is d e ep es t c on d ole n c est oM rs .S mi th an dt ot h e Smith childr en ."I n Bo n n, th ehea dof t h e German Soc ial Democr atic Pa r ty , Rudo lf Sch ar p ing, s a i d in a state m e n t he w as" v e ry aff ect ed by thes ud de n de a 

In [74]:
import pandas as pd

df = pd.DataFrame.from_records(runs, columns =['Run name', 'Iterations', 'alpha', 'p_c', 'p_cont', 
                                              'T', 'T_decrease', 'Precision', 'Recall', 'F1'])

In [75]:
df

Unnamed: 0,Run name,Iterations,alpha,p_c,p_cont,T,T_decrease,Precision,Recall,F1
0,basic,100,100,0.5,0.99,1.0,1.0,0.1725,0.2894,0.2161
1,run2,100,100,0.4,0.99,1.0,1.0,0.1727,0.2896,0.2164
2,run3,100,100,0.6,0.99,1.0,1.0,0.1776,0.2951,0.2217
3,run4,150,100,0.4,0.99,1.0,1.0,0.181,0.3008,0.226
4,run5,150,100,0.5,0.99,1.0,1.0,0.178,0.2973,0.2227
5,run6,150,100,0.6,0.99,1.0,1.0,0.183,0.3044,0.2286
6,run7,250,100,0.4,0.99,2.5,0.9935,0.0982,0.2161,0.135
7,run8,250,100,0.5,0.99,2.5,0.9935,0.1035,0.2244,0.1417
8,run9,250,100,0.6,0.99,2.5,0.9935,0.1118,0.238,0.1521
9,run10,175,150,0.35,0.99,3.5,0.989,0.0551,0.1401,0.0791


## Best run:

In [76]:
print(best_run)

('run6', 150, 100, 0.6, 0.99, 1.0, 1.0, 0.183, 0.3044, 0.2286)


## Task 5

Instead of Chinese Restaurant Process, try to employ the Pitman-Yor Process. Does it improve your results?

In [77]:
def PitmanYorTextSegmentation(text, text_size, iterations, alpha, p_c, p_cont, T = 1, T_decrease = 1):
    # randomly initialize text segmentation
    s = np.random.randint(low=0, high=2, size=text_size)
    
    # create the initial segmentation
    words = segmentation(text, text_size, s)
    count = get_word_counts(words)
    
    # total number of words
    t = sum(count.values())
    
    processing_progress_bar = tqdm(range(1, text_size - 1), desc="Text processing")
    
    for iteration in tqdm(range(iterations), desc="Iterations"):
        processing_progress_bar.reset()
        for i in np.random.permutation(range(0, text_size - 1)):
            
            prev_word = get_prev_word(text, s, i)
            next_word = get_next_word(text, s, i)
            
            joined = prev_word + next_word
            if s[i] == 0:
                if count[joined] == 0:
                    print(joined + " not found")
                    print(count)
                count[joined] = max(0, count[joined] - 1)
                t -= 1
            else:
                count[prev_word] = max(0, count[prev_word] - 1)
                count[next_word] = max(0, count[next_word] - 1)
                t -= 2
            
            p_0 = (alpha * p0(joined, p_c) + count[joined]) / (alpha + t)
            p_1 = (alpha * p0(prev_word, p_c) + count[prev_word]) / (alpha + t)
            p_1 *= (alpha * p0(next_word, p_c) + count[next_word]) / (alpha + t + 1)
            p_1 *= p_cont
            
            # Annealing
            p_0 = p_0 ** (1/T)
            p_1 = p_1 ** (1/T)
            
            # Normalization
            suma = p_0 + p_1
            p_0 /= suma
            p_1 /= suma
            
            s[i] = np.random.choice([0, 1], p=[p_0, p_1])
            
            if s[i] == 0:
                count[joined] += 1
                t += 1
            else:
                count[prev_word] += 1
                count[next_word] += 1
                t += 2

            processing_progress_bar.update(1)
        
    words_updated = segmentation(text, text_size, s)
    final_output = " ".join(words_updated)
    return final_output