In [1]:
import matplotlib.pyplot as plt
import numpy as np
import collections
import random
from tqdm.auto import tqdm


%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

Let's take a look at the data:

In [2]:
with open('text_seg/data_small.txt', 'r', encoding='utf-8') as file:
    text = file.read()
    print(text[:1000])

TributespouredinfromaroundtheworldThursdaytothelateLabourPartyleaderJohnSmith,whodiedearlierfromamassiveheartattackaged55.InWashington,theUSStateDepartmentissuedastatementregretting"theuntimelydeath"oftherapier-tonguedScottishbarristerandparliamentarian."Mr.Smith,throughouthisdistinguishedcareeringovernmentandinopposition,leftaprofoundimpressiononthehistoryofhispartyandhiscountry,"StateDepartmentspokesmanMichaelMcCurrysaid."Secretary(ofStateWarren)ChristopherextendshisdeepestcondolencestoMrs.SmithandtotheSmithchildren."InBonn,theheadoftheGermanSocialDemocraticParty,RudolfScharping,saidinastatementhewas"veryaffectedbythesuddendeathofJohnSmith."AgoodfriendofGermansocialdemocracyhasleftustooearly.Hewasveryclosetoachievinghislife'sgoalofmakingtheLabourPartythelargestpoliticalforceinBritain"andwouldbe"cruellymissed"inEurope,hesaid.HongKongGovernorChrisPatten,aformerConservativePartychairman,offeredhiscondolencestotheSmithfamilyandsaidhisformerpolitcalopponentwasa"goodanddecentman,widelyresp

In [3]:
text_size = len(text)
text_size

652297

In [4]:
C = len(set(text))

In [5]:
print("Number of unique characters: {}".format(C))

Number of unique characters: 74


- There is 2 𝑛−1 possible segmentations for 𝑛-characters long data.
- 𝑛 − 1 latent binary variables $𝑠_𝑖$: denoting whether there is of isn’t a separator between two characters.
- Collapsed Gibbs sampling. Sample one variable conditioned by all the others.
- Exchangeability: if we reorder the words in the sequence, overall probability is the same.
- We can virtually move the changed words at the end of the sequence, compute the overal probablility of the two possibilities and then move the words virtually back.


if $s_i$ is 1, then there is a separator between characters $c_i$ and $c_{i+1}$

In [6]:
# Fixing the random seeds
np.random.seed(1234)
random.seed(1234)

In [7]:
import collections

def segmentation(text, text_size, s):
    words = []
    current_word = ""
    for idx, character in enumerate(text):
        if idx == text_size - 1:
            current_word += character
            continue
        
        if s[idx] == 1:
            current_word += character
            words.append(current_word)
            current_word = ""
        else:
            current_word += character
    return words

def get_word_counts(words):
    count = {}
    counter = collections.Counter(words)
    return counter
    for key, value in counter.items():
        count[key] = value
    return count

In [8]:
def get_prev_word(text, s, i):
    if i == 0:
        return text[i]
    
    start_idx = i
    end_idx = i + 1
    
    # Edge cases
    if s[start_idx] == 1 and s[start_idx-1] == 1:
        return text[i]
    
    if s[start_idx] == 1 and s[start_idx-1] == 0:
        start_idx -= 1
        
    while start_idx >= 0:
        if s[start_idx] == 1:
            start_idx += 1
            break
        start_idx -= 1
    
    if start_idx < 0:
        start_idx = 0
        
    return text[start_idx:end_idx]

In [9]:
def get_next_word(text, s, i):
    word = ""
    start_idx = i + 1
    end_idx = i + 1
    while end_idx <= len(text) - 1:
        if s[end_idx] == 1:
            break
        end_idx += 1
    
    if end_idx == len(text):
        end_idx -= 1
    
    word = text[start_idx:end_idx+1]
    return word

In [10]:
def p0(word, p_c):
    uniform = 1.0 / float(C)
    return uniform**len(word) * p_c**(len(word)-1) * (1 - p_c)

In [11]:
def CRPTextSegmentation(text, text_size, iterations, alpha, p_c, p_cont, T = 1, T_decrease = 1):
    # randomly initialize text segmentation
    s = np.random.randint(low=0, high=2, size=text_size)
    
    # create the initial segmentation
    words = segmentation(text, text_size, s)
    count = get_word_counts(words)
    
    # total number of words
    t = sum(count.values())
    
    processing_progress_bar = tqdm(range(1, text_size - 1), desc="Text processing")
    
    for iteration in tqdm(range(iterations), desc="Iterations"):
        processing_progress_bar.reset()
        for i in np.random.permutation(range(0, text_size - 1)):
            
            prev_word = get_prev_word(text, s, i)
            next_word = get_next_word(text, s, i)
            
            joined = prev_word + next_word
            if s[i] == 0:
                count[joined] = max(0, count[joined] - 1)
                t -= 1
            else:
                count[prev_word] = max(0, count[prev_word] - 1)
                count[next_word] = max(0, count[next_word] - 1)
                t -= 2
            
            p_0 = (alpha * p0(joined, p_c) + count[joined]) / (alpha + t)
            p_1 = (alpha * p0(prev_word, p_c) + count[prev_word]) / (alpha + t)
            p_1 *= (alpha * p0(next_word, p_c) + count[next_word]) / (alpha + t + 1)
            p_1 *= p_cont
            
            # Annealing
            p_0 = p_0 ** (1/T)
            p_1 = p_1 ** (1/T)
            
            # Normalization
            suma = p_0 + p_1
            p_0 /= suma
            p_1 /= suma
            
            
            #print(f"p0: {p_0}, p1: {p_1}, suma: {suma}, T: {T}")
            
            s[i] = np.random.choice([0, 1], p=[p_0, p_1])
            
            if s[i] == 0:
                count[joined] += 1
                t += 1
            else:
                count[prev_word] += 1
                count[next_word] += 1
                t += 2

            processing_progress_bar.update(1)
            
        # Decreasing the temperature
        T = T * T_decrease
    
        
    words_updated = segmentation(text, text_size, s)
    final_output = " ".join(words_updated)
    return final_output

## Task 3

**Download the gold data and the evaluation script. What precision and recall you get?**

In [12]:
def save_and_evaluate(output, output_file_path):
    output_file_path = "text_seg/" + output_file_path
    with open(output_file_path, 'w', encoding='utf-8') as file:
        file.write(output)
    !perl text_seg/eval.pl text_seg/data_small_gold.txt $output_file_path
    
    with open('text_seg/results.txt', 'r') as f_results:
        lines = f_results.readlines()
        precision = round(float(lines[0]), 4)
        recall = round(float(lines[1]), 4)
        f1 = round(float(lines[2]), 4)
        
        return precision, recall, f1

In [13]:
def fit(run_name, iterations, alpha, p_c, p_cont, T=1, T_decrease=1):
    output = CRPTextSegmentation(text, text_size, iterations, alpha, p_c, p_cont, T, T_decrease)
    
    output_file_path = "{}.txt".format(run_name)
    print("Evaluating run: {} in file {}".format(run_name, output_file_path))
    precision, recall, f1 = save_and_evaluate(output, output_file_path)
    
    print(output[:2000])
    print()
    print()
    
    return precision, recall, f1

In [14]:
# The list that will keep all configurations for different runs
# and the evaluation measures calculated for these configs
runs = []
best_run = None

In [15]:
name, iterations, alpha, p_c, p_cont, T, T_decrease = "basic", 100, 100, 0.5, 0.99, 1, 1
precision, recall, f1 = fit(name, iterations, alpha, p_c, p_cont, T, T_decrease)
config_and_results = (name, iterations, alpha, p_c, p_cont,
                      T, T_decrease, precision, recall, f1)

runs.append(config_and_results)

if best_run == None:
    best_run = config_and_results
else:
    if best_run[9] < config_and_results[9]:
        best_run = config_and_results

Text processing:   0%|          | 0/652295 [00:00<?, ?it/s]

Iterations:   0%|          | 0/100 [00:00<?, ?it/s]

Evaluating run: basic in file basic.txt
P:0.172, R:0.289, F:0.216
T ri but esp our edin from a round the world Thursday tothe late Lab ourP arty lead er John S mith ,who died ear li er from am as sive hear tattack aged 55 .In Wash ingto n, theUS St at eDep art ment issue da state ment reg re t ting "the unti me ly death " ofthe ra pi er- ton gu ed Sco t t ish bar r i ster and parli a ment arian ."M r .S mith , th rough out his dis t in gu ish ed ca re er in governm ent and in op positi on , lef t a pro f ound imp re ssion onthe his tory of his part yand his country ," State Depa r t ments p okesm an Mich a el McC ur ry said. " Secre tary ( of State Warr en ) Chri sto pher ext end s his de e p est con do l ence s to M r s.S mith and tothe Smi th c hildr en ."In B on n, the head ofthe G erman S ocial Demo cratic Part y, Ru do l f Sch ar ping ,said i nasta te me nthewa s" very aff ected bythe su d den death of J ohnS mith ."A good frie nd ofG erman s ocial de mocrac y has lef t us to o ea

## Trying out different parameters

- ... try to change the parameters to obtain better segmentations
- What precision and recall you get?
- Try to do annealing and run the model for different temperatures.

In [16]:
import csv

with open('text_seg/config.csv', mode='r') as csv_file:
    csv_reader = csv.DictReader(csv_file)
    for row in csv_reader:
        name = row["name"]
        iterations = int(row["iterations"])
        alpha = int(row["alpha"])
        p_c = float(row["p_c"])
        p_cont = float(row["p_cont"])
        T = float(row["T"])
        T_decrease = float(row["T_decrease"])
        
        print("Currently working on: {}".format(name))
        print("Configuration:")
        print(iterations, alpha, p_c, p_cont, T, T_decrease)
        precision, recall, f1 = fit(
            name, 
            iterations, 
            alpha, 
            p_c, 
            p_cont, 
            T, 
            T_decrease)
        
        config_and_results = (name, iterations, alpha, p_c, p_cont,
                              T, T_decrease, precision, recall, f1)
        
        runs.append(config_and_results)
        
        if best_run == None:
            best_run = config_and_results
        else:
            if best_run[9] < config_and_results[9]:
                best_run = config_and_results
        

Currently working on: run2
Configuration:
100 100 0.4 0.99 1.0 1.0


Text processing:   0%|          | 0/652295 [00:00<?, ?it/s]

Iterations:   0%|          | 0/100 [00:00<?, ?it/s]

Evaluating run: run2 in file run2.txt
P:0.174, R:0.292, F:0.218
T ri but esp our edin from aroun d theworl dTh ursday tothe late Labou r Party l eader Joh nSmit h ,who died e arlier from ama s sive hear t att a ckag ed 55 .In W ashing ton ,the USS tateD e part ment issue d a state ment reg re tting " the un time ly death " ofthe rap i er- t ong u ed Sco t t ish bar ris ter and parli ament arian ."M r .S mith , throug hou th is dis tin gui shed care er in govern ment and in op posi tion, lef t a prof ound im press ion on the his tory ofhis part yand hisc ountry ," State De part ments pokesma n M ichael Mc C ur ry said ."S ecre tary ( of State War re n ) Chris tophe r ex t en d s his d eepe st con dol ence sto M r s .S mith and tothe Smith childr e n ."In Bo n n ,the head ofthe Germ an Soc ial Dem ocratic Party , Ru dol f S char p ing ,said ina state ment hewa s" very a ffect edbythe su d den death ofJ ohnS mith ."A good f riend of Germ an soci al d emocra cy has lef t us to o early .He 

Text processing:   0%|          | 0/652295 [00:00<?, ?it/s]

Iterations:   0%|          | 0/100 [00:00<?, ?it/s]

Evaluating run: run3 in file run3.txt
P:0.176, R:0.294, F:0.220
Tr i but es po ur edin from a round thew orld Thurs day tothe late Labo ur Party leader John Smit h ,who di ed earli er from am ass ive hear t att ack a ged 55 .In W a shington , theUS State Depar tment issu eda state ment re gre tting "the un time ly death " ofthe r a pi er - ton gu ed Sco t t ish bar ris ter and p arlia ment ari an ."M r .S mith , t hrough outh is dis ting u ish ed care er in govern ment and in oppos i tion, left a pro foun d im pres sion onthe his tory of his party and his countr y," Stat eDepar t ments pokesman Mich ael McC urry said ."S ecretary ( of Stat eWarre n ) Chri stop her ext end s his de ep est cond ol ence sto M rs. Smit hand tothe S mith chi ldren ."I n Bo n n ,the headof the Ger man So cial Demo cratic Party ,R u d ol f S char p ing ,said ina state ment he was" ver y affec tedby the su dden death ofJ ohnS mith ."A good fri end of Germ an social demo c racy has left us to o early .Hewas ver

Text processing:   0%|          | 0/652295 [00:00<?, ?it/s]

Iterations:   0%|          | 0/150 [00:00<?, ?it/s]

Evaluating run: run4 in file run4.txt
P:0.179, R:0.299, F:0.224
Tr i but es p our ed in from a round the worl d Thursd ay tothe late Labou r Party lead er John Smith ,who di e de arlier from a mass ive hear ta ttack age d 55 .In Wa shington ,the US State De part ment issueda state ment reg re t ting "the un time ly death " ofthe rap i er- to n gu ed Scot t ish bar rist er and par liam en taria n ."M r . Smith , th rough out his dis ting u ish ed c a re er in governm ent and in oppos ition , left a pro found impr ession onthe his toryof his part y and his countr y," State Depa rtment spoke sman Mich a el Mc Cu rry said. " Sec retary ( of State Warre n ) Christ op here xte nd s his de ep e st con dol ences to M r s .S mi th and tothe Smith c hild r e n ."In B on n ,the head ofthe German So cial D emoc rati c Part y,R u dol f S char ping ,said ina s tateme n the was" very aff ected bythe su d den death of John Smith ."A good friend of German s ocial de mocra c y has left u sto o early .He

Text processing:   0%|          | 0/652295 [00:00<?, ?it/s]

Iterations:   0%|          | 0/150 [00:00<?, ?it/s]

Evaluating run: run5 in file run5.txt
P:0.178, R:0.296, F:0.222
T ri but es p our edin from arou ndthe world T hursday tothe late L abour Par tylea der John S mi th ,who died earli er from am as sive hear t attac k aged 55 .In Washing t on ,the US State De part me ntiss ued a state ment re gr e tti ng"t he un time ly death " ofthe r ap ier - ton gu ed Sco t t i sh b arri ster and par lia ment arian ."M r .S mit h,t h roug h out his dis ting uish ed ca re er ing over n ment and in oppos ition , left a prof ound im pre ssion onthe his tory of his party and his countr y," State De part ment spoke sman Mich ael McC ur ry said." Secre tary ( o fSta teWa rren ) Chris top her ext end s his de ep est cond ol ence sto M r s .S m i t hand tothe Smith child re n ."In B on n ,the head ofthe Germ an Soci alDe mocra tic Party ,Ru do lfS char ping ,said ina state men thewa s"v e ry a ffect edbythe s udd en deathof John Smith ."A good fri end of Germ an soci al democ racy has left u stoo early .He was

Text processing:   0%|          | 0/652295 [00:00<?, ?it/s]

Iterations:   0%|          | 0/150 [00:00<?, ?it/s]

Evaluating run: run6 in file run6.txt
P:0.179, R:0.300, F:0.224
T ri but e spo ur edin from ar ound the world T hursda y tothe late La bour Party leader John Smit h ,who di ed earl ier f roma mass ive hear t a ttack aged 55 .In Wash ington ,the US State Depar tment issue da state ment re gre tting " the un time ly death " ofthe ra pi er- to n gue dS cott ish bar r ister and par lia ment ari an ."M r .S mi th , through out his dis ting uish ed car e er in govern ment and in oppos i tion, left a pro found im pre ssion onthe his tory of his party and his countr y ," State Depar tment s pokesma n Mich a el Mc Cu r ry said." Se cretary ( of State W ar r en ) Chri stoph er e xten d s his de ep est con dol ence s to M r s.S mi th and tothe Smi th child r en ."I nB on n ,the head ofthe G erman Soci al De mocr atic Party , Ru dolf Sch ar p ing ,said ina state men the was" very aff ected bythe s ud den death of JohnS mi th ."A good fri end ofG erman s ocia l de mocr acy has left u stoo early .He

Text processing:   0%|          | 0/652295 [00:00<?, ?it/s]

Iterations:   0%|          | 0/250 [00:00<?, ?it/s]

Evaluating run: run7 in file run7.txt
P:0.199, R:0.319, F:0.245
Tr i but es p our edin from a round the world Thursday tothe late Labo ur Party leader John S mi th ,who died e arlier from a mass ive hear t attack a ged 55 .In Wash ington , theUS StateD epart ment issu eda state ment re gre t ting "the un time ly death " ofthe ra pi er- ton gu ed Sco t ti sh bar ris ter and parli a ment a rian ."M r . Smith , throug hou this dis tin gu ish ed car e er ing o vernment and in opposit ion, left a pro fo und im pre ssion onthe his to ryof his part y and his countr y," StateD epart ment spokes man Mich a elMcCu r ry said ."S ecret ary ( of State War ren ) Chr ist opher ex tend s his de ep est cond ol ence sto M rs .S mit hand tothe Smith chi ldre n ."I n Bonn ,the head ofthe Germa n S ocial De mo cra tic Party , Rudo lf Sch ar p ing ,said ina state men the was " very a ffec tedby the sud den death of John Smith ."A good frien dof Germ an s ocial democr acy has left us to o early .He was very 

Text processing:   0%|          | 0/652295 [00:00<?, ?it/s]

Iterations:   0%|          | 0/250 [00:00<?, ?it/s]

Evaluating run: run8 in file run8.txt
P:0.209, R:0.334, F:0.257
Tr i but es p our edin from a round the world Thursday tothe late Labour Party leader John Smith ,who died earli er from a mas sive he art a ttack aged 55 .In Washi ngton , theUS State De part ment issue da state ment re gre t ting " the un time ly death " ofthe rap i er- ton gue d Scot t ish bar ris ter and par lia ment arian ."M r . Smith , t hrough out his dis ting u ished c are er in gover nment and in oppos ition , left a prof ound imp re ssion onthe his tory ofhis party and his countr y," State De part ments pokes man Mich ael McC ur ry said ." Secret ary ( of StateW arre n ) Christ oph er ext end s his de e p e st cond olenc esto M r s. Smith and tothe Smith childr en ."In Bo n n ,the head ofthe Germa nS ocial Democr at ic Party , Rudo lf Sch ar p ing ,said ina state men the was" very a ffec ted bythe su d den death of John Smith ."A good fri end of Germ ans ocial de mocra cy has left us too early .He was very close

Text processing:   0%|          | 0/652295 [00:00<?, ?it/s]

Iterations:   0%|          | 0/250 [00:00<?, ?it/s]

Evaluating run: run9 in file run9.txt
P:0.201, R:0.322, F:0.248
T ri but esp our edin from a round the worl d Thursda y tothe late La bour Party leader J ohnSmi th ,who di ed e arlie r from am as sive he art attack a ged 55 .In Washin gton ,the US State Depart ment issued a state ment reg re tting "the un time ly death " of ther ap ier - t ong ued S co t t ish bar r ister and parli ament arian ."M r . Smit h, th rough out his dis tin gu ish ed car e er in governm ent and in oppos ition , left a pro found imp re ssion onthe his tory ofhis party and his c ountry ," State Depart ment spokes man Mi cha el McC ur ry said." Sec retary ( of State War re n ) Christ op here x t end s his d eep est cond ol ence sto M r s. Smit hand tothe Smit h chi ldren ."In B on n ,the head ofthe Germ an Soci al Democr a tic Party , Ru d olf S ch ar p ing , said ina state ment hewas " very a ffec t edbythe s ud den death of J ohnSmi th ."A good fri end ofG erman so cial democ racy has left us too early .He was

Text processing:   0%|          | 0/652295 [00:00<?, ?it/s]

Iterations:   0%|          | 0/175 [00:00<?, ?it/s]

Evaluating run: run10 in file run10.txt
P:0.215, R:0.342, F:0.264
T ri but es p our edin from around the world Thursda y tothe late Labo ur Party leader John S mi th ,who died earlier from am as sive hear ta ttack aged 55 .In Wash ingto n ,the US State Depar tment issue da state ment reg re tting "the un time ly death " ofthe ra pi er - to ng u ed Sco t t ish b arri ster and par liamenta rian ."M r .S mi th , throug hou this dis tin gu ished ca re er in govern ment and in oppo s ition , left a prof ound im pre ssion onthe his tory of his party and his countr y," State Depar tment spokesm an Michae l McC ur ry said." Secre tary ( of State War ren ) Chris top her ext end s his d eep est cond ol ence sto M r s. Smith and tothe Smith ch ildr en ."In Bonn ,the head ofthe German Soci al Demo cra tic Party , Rudol f S char p ing ,said ina state men the was" very aff ected bythe s ud den death of John Smith ."A good fri end of German s oc ial d emocra cy has left us too early .He was very clos

Text processing:   0%|          | 0/652295 [00:00<?, ?it/s]

Iterations:   0%|          | 0/175 [00:00<?, ?it/s]

Evaluating run: run11 in file run11.txt
P:0.216, R:0.342, F:0.265
Tri but e sp our edin from around the world Thursda y tothe late Labour Party leader John S mith ,who died e arlie r from ama s sive hear t at tack age d 55 .In Washi ngton , theUS State De part ment issu eda state ment re gre t ting " the unti me ly death " ofthe rap i er- ton gu ed S co t ti sh bar r ister and par liam ent ari an ."M r .S mith , t hrough out his dis ting u ish ed c are er ing over n ment and in oppos ition , left a pro found imp re ssion onthe his tory of his party and his count r y," State De part ments pokesm an Mich ael McC ur ry said ."S e cretar y( of StateW arre n ) Chris to pher ext end sh is d eep est cond ol ence sto M r s.S mith and tothe Smit h childr en ."In Bonn ,the head ofthe German Soci al Democ ra tic Party , Rud ol f Sch ar p ing ,said ina state ment he was" very aff ected bythe s ud den death of John Smit h ."A good fri end of German s ocia l demo c racy has left us too early .He was

Text processing:   0%|          | 0/652295 [00:00<?, ?it/s]

Iterations:   0%|          | 0/175 [00:00<?, ?it/s]

Evaluating run: run12 in file run12.txt
P:0.215, R:0.335, F:0.262
Tri but esp our edin from aroun dthe world Thursda y tothe late Labour Party leader John S mi th ,who died e arlier from ama ssive hear ta ttack ag ed 55 .In Washing ton ,the US State Depar tment issu eda state ment re gre tting "the unti me ly death " ofthe rap ier - ton gu ed Sco t t ish bar ris ter and parli ament ari an ."M r . Smith , t hrough ou this dis ting u ish ed car e er in governmen t and in opposi tion, left a prof ound imp re ssion onthe his tory of his party and his count ry, " State Depar tment spoke sman Micha elMcC ur ry said." Secr etary ( of State War re n ) Christ op her extend s his de ep est cond ol ence sto M r s.S mit hand tothe Smith chil dr en ."In B on n ,the head ofthe German Soci al Demo cratic Party , Rudol f S char p ing ,said ina state men the was" very a ffec tedby the s ud den death of John Smith ."A good f rie nd of German so cial de mocra cy has left us too early .He was very close t

In [17]:
import pandas as pd

df = pd.DataFrame.from_records(runs, columns =['Run name', 'Iterations', 'alpha', 'p_c', 'p_cont', 
                                              'T', 'T_decrease', 'Precision', 'Recall', 'F1'])

In [18]:
df

Unnamed: 0,Run name,Iterations,alpha,p_c,p_cont,T,T_decrease,Precision,Recall,F1
0,basic,100,100,0.5,0.99,1.0,1.0,0.1725,0.2894,0.2161
1,run2,100,100,0.4,0.99,1.0,1.0,0.1745,0.2919,0.2184
2,run3,100,100,0.6,0.99,1.0,1.0,0.1761,0.294,0.2203
3,run4,150,100,0.4,0.99,1.0,1.0,0.1788,0.2988,0.2238
4,run5,150,100,0.5,0.99,1.0,1.0,0.1779,0.2965,0.2224
5,run6,150,100,0.6,0.99,1.0,1.0,0.179,0.2999,0.2242
6,run7,250,100,0.4,0.99,2.5,0.9935,0.1987,0.3191,0.2449
7,run8,250,100,0.5,0.99,2.5,0.9935,0.2093,0.3342,0.2574
8,run9,250,100,0.6,0.99,2.5,0.9935,0.2014,0.3218,0.2477
9,run10,175,150,0.35,0.99,3.5,0.989,0.215,0.3421,0.264


## Best run:

In [19]:
print(best_run)

('run11', 175, 150, 0.5, 0.99, 3.5, 0.989, 0.216, 0.3415, 0.2647)


## Task 5

Instead of Chinese Restaurant Process, try to employ the Pitman-Yor Process. Does it improve your results?

In [20]:
def PitmanYorTextSegmentation(text, text_size, iterations, alpha, p_c, p_cont, T = 1, T_decrease = 1):
    # randomly initialize text segmentation
    s = np.random.randint(low=0, high=2, size=text_size)
    
    # create the initial segmentation
    words = segmentation(text, text_size, s)
    count = get_word_counts(words)
    
    # total number of words
    t = sum(count.values())
    
    processing_progress_bar = tqdm(range(1, text_size - 1), desc="Text processing")
    
    for iteration in tqdm(range(iterations), desc="Iterations"):
        processing_progress_bar.reset()
        for i in np.random.permutation(range(0, text_size - 1)):
            
            prev_word = get_prev_word(text, s, i)
            next_word = get_next_word(text, s, i)
            
            joined = prev_word + next_word
            if s[i] == 0:
                count[joined] = max(0, count[joined] - 1)
                t -= 1
            else:
                count[prev_word] = max(0, count[prev_word] - 1)
                count[next_word] = max(0, count[next_word] - 1)
                t -= 2
            
            p_0 = 0
            if (count[joined] == 0):
                p_0 = (alpha + d) / (alpha + t - 1)
                p_0 *= p0(joined, p_c)
            else:
                p_1 = count[]
            p_0 = (alpha * p0(joined, p_c) + count[joined]) / (alpha + t)
            
            p_1 = (alpha * p0(prev_word, p_c) + count[prev_word]) / (alpha + t)
            p_1 *= (alpha * p0(next_word, p_c) + count[next_word]) / (alpha + t + 1)
            p_1 *= p_cont
            
            # Annealing
            p_0 = p_0 ** (1/T)
            p_1 = p_1 ** (1/T)
            
            # Normalization
            suma = p_0 + p_1
            p_0 /= suma
            p_1 /= suma
            
            s[i] = np.random.choice([0, 1], p=[p_0, p_1])
            
            if s[i] == 0:
                count[joined] += 1
                t += 1
            else:
                count[prev_word] += 1
                count[next_word] += 1
                t += 2

            processing_progress_bar.update(1)
        
    words_updated = segmentation(text, text_size, s)
    final_output = " ".join(words_updated)
    return final_output

SyntaxError: invalid syntax (1338032384.py, line 35)