In [1]:
# Input locations used by the rest of the notebook.
# NOTE(review): these are absolute cluster paths; the recorded FastText load
# below already failed with FileNotFoundError, so they must be reachable
# (or made configurable) before the notebook can be re-run.

# Saved gensim FastText model, opened with FastText.load().
FASTTEXT_MODEL = "/lnet/troja/projects/neuralpiece/data/czeng.cs.lc.fasttext"
# Subword embedding matrix in text form, read with np.loadtxt(); rows are
# addressed by the subword's index in the vocabulary file.
EMBEDDINGS = "/lnet/troja/projects/neuralpiece/out_nosample/subword_embeddings.00.txt"
# Subword vocabulary, one subword per line; the line position is the index.
SUBWORD_VOCAB = "/lnet/troja/projects/neuralpiece/out_nosample/subwords.00"

In [2]:
import numpy as np
from gensim.models.fasttext import FastText

In [3]:
# Load the saved FastText model; later cells read word vectors via `fasttext.wv`.
# NOTE(review): the recorded run of this cell raised FileNotFoundError, so the
# model path has to be reachable for a clean Restart & Run All.
fasttext = FastText.load(FASTTEXT_MODEL)

FileNotFoundError: [Errno 2] No such file or directory: '/lnet/troja/projects/neuralpiece/data/czeng.cs.lc.fasttext'

In [4]:
# Parse the subword embedding matrix from its text file; later cells pick rows
# with `subword_embeddings[subword2idx[...]]`.
subword_embeddings = np.loadtxt(EMBEDDINGS)

In [5]:
# Read the subword vocabulary (one subword per line, surrounding whitespace
# stripped) and build the inverse lookup subword -> position, which is used to
# address rows of `subword_embeddings`.
# The encoding is pinned to UTF-8: the vocabulary contains non-ASCII (Czech)
# subwords and the platform default encoding is not guaranteed to decode them.
with open(SUBWORD_VOCAB, encoding="utf-8") as f_sub:
    subword_vocab = [line.strip() for line in f_sub]
subword2idx = {sbwrd: i for i, sbwrd in enumerate(subword_vocab)}

In [6]:
# Memo of previous results, keyed by (word, max_len) so that calls with
# different length limits never share an entry.
SUBSTR_CACHE = {}


def get_substrings(word, max_len=10):
    """Return every substring of `word` that is a known subword.

    Substrings are enumerated by increasing length (1 .. max_len) and, within
    one length, by increasing start position. A substring that occurs several
    times in `word` is returned once per occurrence.

    Args:
        word: String to decompose.
        max_len: Longest substring length to consider.

    Returns:
        A new list with the substrings of `word` that are keys of
        `subword2idx`. The list is a fresh copy, so callers may modify it
        without corrupting the cache.
    """
    key = (word, max_len)
    cached = SUBSTR_CACHE.get(key)
    if cached is None:
        longest = min(len(word), max_len)
        # Stored as a tuple so the cached value itself can never be mutated.
        cached = tuple(
            word[start:start + sub_len]
            for sub_len in range(1, longest + 1)
            for start in range(len(word) - sub_len + 1)
            if word[start:start + sub_len] in subword2idx
        )
        SUBSTR_CACHE[key] = cached
    return list(cached)

In [7]:
# Sanity check: list the known substrings of one example word, shortest first.
get_substrings("bohyně")

['b',
 'o',
 'h',
 'y',
 'n',
 'ě',
 'bo',
 'oh',
 'hy',
 'yn',
 'ně',
 'boh',
 'ohy',
 'hyn',
 'yně',
 'bohy',
 'ohyn',
 'hyně',
 'bohyn',
 'ohyně',
 'bohyně']

In [8]:
# Peek at the first five vocabulary entries.
subword_vocab[0:5]

['e', 'a', 'o', 'i', 'í']

In [9]:
from scipy.spatial import distance
def score_words_subwords(word):
    """Rank the known substrings of `word` by similarity to the word's vector.

    Each substring returned by `get_substrings` is compared with the FastText
    vector of `word` using cosine similarity (1 - cosine distance).

    Returns:
        A list of (substring, similarity) pairs, most similar first.
    """
    word_vec = fasttext.wv[word]
    candidates = get_substrings(word)

    similarities = []
    for piece in candidates:
        piece_vec = subword_embeddings[subword2idx[piece]]
        similarities.append(1 - distance.cosine(word_vec, piece_vec))

    ranked = list(zip(candidates, similarities))
    ranked.sort(key=lambda pair: -pair[1])
    return ranked

In [10]:
# Example: rank the substrings of "hokejista" by cosine similarity to the word vector.
score_words_subwords("hokejista")

[('okej', 0.5387656218362069),
 ('hoke', 0.5381985294810631),
 ('hokej', 0.538123337319575),
 ('kej', 0.5137320840094882),
 ('hok', 0.4997065823654565),
 ('ista', 0.4950580548752803),
 ('oke', 0.46147526352534374),
 ('hokeji', 0.44656221404446383),
 ('okeji', 0.4464365404902182),
 ('keji', 0.4463839787790089),
 ('kejist', 0.43510503931035194),
 ('kejis', 0.4351048881357409),
 ('okejist', 0.43510383142185094),
 ('hokejist', 0.43510383142185094),
 ('okejis', 0.4351037756884848),
 ('hokejis', 0.4351037756884848),
 ('ejis', 0.4111547016544026),
 ('ejist', 0.4111029570416187),
 ('jista', 0.3428880205420658),
 ('ke', 0.3428178899701393),
 ('ejista', 0.3402602077507616),
 ('kejista', 0.3390162089701769),
 ('okejista', 0.3390162089701769),
 ('jis', 0.3381212525617786),
 ('jist', 0.3358093593863216),
 ('ist', 0.3338906136469849),
 ('ok', 0.3281772565348855),
 ('ej', 0.31685628383078446),
 ('sta', 0.3082406762617851),
 ('is', 0.2994437159888421),
 ('ji', 0.2965662964116653),
 ('ta', 0.2869723736

In [11]:
from sklearn.neighbors import NearestNeighbors

In [12]:
# Cosine nearest-neighbour index over the subword embedding matrix.
neigh = NearestNeighbors(n_neighbors=15, metric='cosine', radius=0.4)
neigh.fit(subword_embeddings)


def words_subword_neighbor(word, n_neighbors=15):
    """Find the subword embeddings closest to the FastText vector of `word`.

    Args:
        word: Word whose vector is used as the query.
        n_neighbors: How many neighbours to return (default 15, the value
            that was previously hard-coded in the query).

    Returns:
        A list of (subword, subword index, cosine distance) triples in the
        order returned by `neigh.kneighbors`.
    """
    dist, idx = neigh.kneighbors([fasttext.wv[word]], n_neighbors, return_distance=True)
    return [(subword_vocab[i], i, d) for i, d in zip(idx[0], dist[0])]

In [13]:
# Example: subword embeddings closest (cosine distance) to the vector of "hokej".
words_subword_neighbor("hokej")

[('hoke', 12922, 0.5018756418699996),
 ('hokej', 8854, 0.501953737112405),
 ('okej', 7706, 0.502786900498497),
 ('kej', 5277, 0.520176339821193),
 ('zápas', 2270, 0.5364101806778853),
 ('ápas', 1913, 0.5364101917914591),
 ('hok', 9756, 0.5447219618194686),
 ('tbal', 5384, 0.5469650896529104),
 ('hráč', 3701, 0.5473802733804807),
 ('klu', 4470, 0.551055699883775),
 ('ráč', 3021, 0.5551500216802667),
 ('tba', 6392, 0.5555692977968594),
 ('fotb', 9692, 0.557380654169357),
 ('otb', 9639, 0.5577641542782832),
 ('fotba', 9721, 0.55841467856955)]

In [37]:
# Cosine nearest-neighbour index over all FastText word vectors.
neigh_word = NearestNeighbors(n_neighbors=15, metric='cosine', radius=0.4)
neigh_word.fit(fasttext.wv.vectors)


def subwords_word_neighbor(subword, n_neighbors=15):
    """Find the FastText words closest to the embedding of `subword`.

    Args:
        subword: Subword to look up; must be a key of `subword2idx`.
        n_neighbors: How many neighbours to return (default 15, the value
            that was previously hard-coded in the query).

    Returns:
        A list of (word, word index, cosine distance) triples in the order
        returned by `neigh_word.kneighbors`.
    """
    query = subword_embeddings[subword2idx[subword]]
    dist, idx = neigh_word.kneighbors([query], n_neighbors, return_distance=True)
    return [(fasttext.wv.index_to_key[i], i, d) for i, d in zip(idx[0], dist[0])]


In [39]:
# Example: words closest to the embedding of the subword "al".
# The recorded output consists mostly of punctuation and rare tokens.
subwords_word_neighbor("al")

[(',', 0, 0.35515189404199177),
 ('.', 1, 0.42521785749897933),
 ('fotografie.autor', 184324, 0.4442394900927684),
 ('monsivais', 192770, 0.458302724825445),
 ('vojinovic', 195879, 0.4621077680007388),
 ('fotografie.foto', 149119, 0.4630778352283317),
 ('sportfotodienst', 78597, 0.4658188801112991),
 ('lovetsky', 169858, 0.46880309021851996),
 ('¨', 33903, 0.4748163206401683),
 ('wijngaert', 185098, 0.47533954170381754),
 ('wigglesworth', 173726, 0.48167438516393435),
 ('aflo', 166335, 0.48643437946279644),
 ('jdem.cz', 84367, 0.4899041306430938),
 (';', 520, 0.4914034881774495),
 ('Černyševova', 167133, 0.4921384856956448)]

In [123]:
# Same query for the subword "."; the recorded neighbours largely coincide
# with those recorded for "al".
subwords_word_neighbor(".")

[(',', 0, 0.3416288854144136),
 ('.', 1, 0.3813665153424428),
 ('fotografie.autor', 184324, 0.45086635083753124),
 ('(', 22, 0.45106032543232266),
 ('14-Čr', 169633, 0.45603927229756525),
 ('10-usa', 188593, 0.4636186492685056),
 ('monsivais', 192770, 0.4644338161419529),
 (';', 520, 0.466182897499972),
 ('14-rus', 159766, 0.4669639501837155),
 ('jdem.cz', 84367, 0.46839445392672185),
 ('vojinovic', 195879, 0.46848098869754196),
 ('karabükspor', 191652, 0.4701720998358283),
 ('Černyševova', 167133, 0.4703040102095488),
 ('akhisar', 164045, 0.4713175818139965),
 ('fotografie.foto', 149119, 0.472464186033523)]

In [41]:
# Query the word index with the sum of all word vectors.
# Despite the name `avg`, this is a sum rather than a mean. Because the index
# uses the cosine metric, which ignores vector length, the neighbours are the
# same as they would be for the mean.
avg = fasttext.wv.vectors.sum(0)
[fasttext.wv.index_to_key[i] for i in neigh_word.kneighbors([avg], 15, return_distance=False)[0]]

['18.230',
 'oprandiová',
 '16.839',
 'linetteová',
 'wickmayerová',
 '18.532',
 'gringore',
 'condra',
 '17.113',
 'rukolou',
 '19.289',
 'baldassarri',
 'govorcovová',
 'moorks',
 'paszeková']

In [122]:
# Reverse direction: subword embeddings closest to the word vector of ".".
words_subword_neighbor(".")

[('dodal', 2213, 0.3262356898551284),
 ('odal', 1414, 0.3378039212625138),
 ('doda', 5463, 0.3443870723480196),
 ('doplnil', 14949, 0.34783080969313174),
 ('oplnil', 10974, 0.34783080969313174),
 ('oplni', 21346, 0.3511172259900821),
 ('doplni', 21351, 0.35115261834746336),
 ('ky.cz', 3921, 0.3589692034097457),
 ('ky.c', 13961, 0.3589702001739542),
 ('ky.', 13513, 0.35932444928878593),
 ('y.cz', 2960, 0.3606077601019242),
 ('inky.', 18216, 0.3606203382295998),
 ('nky.', 18128, 0.3606691076784574),
 ('y.c', 12955, 0.3606800975293718),
 ('inky.cz', 9892, 0.36070116668303887)]

In [15]:
# Example: subword embeddings closest to the word vector of "tráva".
words_subword_neighbor("tráva")

[('strom', 15925, 0.5959826226345131),
 ('vka', 2041, 0.6063627883274925),
 ('trom', 12574, 0.6107001735467371),
 ('zele', 15195, 0.6141099859320901),
 ('tráv', 14218, 0.6168458567483815),
 ('zelen', 15928, 0.618660639763914),
 ('ička', 2591, 0.6187618645902503),
 ('rostl', 8803, 0.6189336155594285),
 ('ostl', 7648, 0.6202619039215742),
 ('slun', 22266, 0.6202698300668059),
 ('čka', 778, 0.6256985633389611),
 ('tlin', 23450, 0.6273565514107209),
 ('pís', 11778, 0.6290883701680853),
 ('ploc', 28313, 0.6291485825504466),
 ('zahrad', 20373, 0.6292947841623461)]

In [44]:
# Reference point: FastText's own most similar words for "požíval".
fasttext.wv.most_similar("požíval")

[('požívali', 0.7996586561203003),
 ('požívá', 0.7707567811012268),
 ('požívat', 0.7109697461128235),
 ('požívají', 0.6895473003387451),
 ('požívající', 0.6842960715293884),
 ('užíval', 0.638585090637207),
 ('konzumoval', 0.6023831963539124),
 ('pěstoval', 0.5836661458015442),
 ('dopřával', 0.5830966234207153),
 ('využíval', 0.5762752294540405)]

In [44]:
from neuralpiece.unigram_segment import viterbi_segment
from scipy.special import log_softmax, logsumexp

ImportError: cannot import name 'forward_backward' from 'neuralpiece.unigram_segment' (/lnet/troja/projects/neuralpiece/notebooks/neuralpiece/unigram_segment.py)

In [35]:
def try_segment(word, n_samples=200):
    """Segment `word` using subword scores derived from its FastText vector.

    Every known substring of `word` is scored by the dot product of its
    subword embedding with the word vector. The scores are turned into
    log-probabilities by subtracting the logsumexp over all distinct
    candidate substrings. `viterbi_segment` is then called `n_samples` times
    (with `sample=True` whenever `n_samples > 1`).

    Args:
        word: Word to segment.
        n_samples: Number of segmentations to draw.

    Returns:
        The distinct (space-joined segmentation, score) pairs, sorted by
        decreasing score.
    """
    vector = fasttext.wv[word]

    # get_substrings repeats a substring once per occurrence in the word.
    # Scores live in a dict keyed by substring, so each distinct substring is
    # scored and normalized exactly once; looping over the raw list would
    # subtract the normalizer several times from repeated substrings.
    raw_scores = {
        substr: vector.dot(subword_embeddings[subword2idx[substr]])
        for substr in get_substrings(word)
    }
    normalizer = logsumexp(list(raw_scores.values()))
    subword_scores = {
        substr: score - normalizer for substr, score in raw_scores.items()
    }

    use_sampling = n_samples > 1

    def draw():
        pieces, score = viterbi_segment(word, subword_scores, sample=use_sampling)
        return " ".join(pieces), score

    return sorted({draw() for _ in range(n_samples)}, key=lambda x: -x[1])

In [46]:
from neuralpiece.forward_backward import expected_counts

ImportError: cannot import name 'expected_counts' from 'neuralpiece.forward_backward' (/lnet/troja/projects/neuralpiece/notebooks/neuralpiece/forward_backward.py)

In [43]:
# Example: draw 100 segmentations; the recorded run produced two distinct ones.
try_segment("hokejový", n_samples=100)

[('hokejový', -9.34614749411162), ('hoke jový', -11.621097727517697)]

In [72]:
# NOTE(review): the `try_segment` definition visible in this notebook has no
# `freq_weight` parameter, so this call does not match it. The recorded output
# presumably comes from a different version of the function — confirm, then
# update or remove this cell.
try_segment("všemohoucí", freq_weight=1)

[('vše mohoucí', 0.7177633477957808),
 ('vše mo houcí', 0.7288197056583124),
 ('všem ohoucí', 0.735930218139992),
 ('vš emo houcí', 0.7372167618386521),
 ('vš emohoucí', 0.742135131874113),
 ('všemo houcí', 0.7445305217825682),
 ('vš emohou cí', 0.761853820436603),
 ('vše m ohoucí', 0.76458741457057),
 ('vše mohouc í', 0.7649684256421766),
 ('všem oho ucí', 0.7691480688547919),
 ('vš emo ho ucí', 0.7734113134161034),
 ('vš emoh oucí', 0.7747657854435128),
 ('všem ohouc í', 0.7770731933690808),
 ('vš emohouc í', 0.7812391237607539),
 ('všemo houc í', 0.7827739000565325),
 ('vše mohou cí', 0.783774622230163),
 ('vše moh oucí', 0.7868120565083477),
 ('v šem ohoucí', 0.7876322892303028),
 ('vš emoh ouc í', 0.7879234861146838),
 ('všem ohou cí', 0.789031637476768),
 ('v še mohoucí', 0.7900843113594765),
 ('v š emo houcí', 0.7919571661348475),
 ('v šemo houcí', 0.7929032724711673),
 ('v šem oh oucí', 0.794341075767783),
 ('vše moho ucí', 0.7954051085275736),
 ('v šem oho ucí', 0.799620159493

In [80]:
# Reference point: FastText's 20 most similar words for "hokejista".
fasttext.wv.most_similar("hokejista", topn=20)

[('Hokejista', 0.7815737128257751),
 ('fotbalista', 0.7748795747756958),
 ('hokejový', 0.7478811144828796),
 ('basketbalista', 0.7446302175521851),
 ('sportovec', 0.7333778738975525),
 ('hráč', 0.7302396297454834),
 ('Jágr', 0.7297383546829224),
 ('útočník', 0.72944575548172),
 ('forvard', 0.7293649315834045),
 ('krasobruslař', 0.7186452150344849),
 ('olympionik', 0.7177209854125977),
 ('bruslař', 0.7146347761154175),
 ('odchovanec', 0.7041434049606323),
 ('zadák', 0.6981070637702942),
 ('bitkař', 0.6956697702407837),
 ('kanonýr', 0.6916385889053345),
 ('tvrďák', 0.6900885701179504),
 ('atlet', 0.6891803741455078),
 ('reprezentant', 0.6827546954154968),
 ('šikula', 0.680549681186676)]

In [71]:
from scipy.sparse.csgraph import maximum_flow
from scipy.sparse import csr_matrix

def score_subwords_by_flow(word, scale=1000000):
    """Score the known substrings of `word` by max-flow through a lattice.

    The character boundaries 0 .. len(word) are graph nodes. Every known
    substring word[i:j] is an edge i -> j whose capacity is its cosine
    similarity to the word vector, multiplied by `scale` and truncated to an
    integer. The maximum flow from node 0 to node len(word) is computed, and
    each substring occurrence is reported with the flow on its own edge.

    Args:
        word: Word to analyse.
        scale: Factor that converts similarities to integer capacities.

    Returns:
        A list of (substring, cosine similarity, flow / scale) triples, one
        per element of `get_substrings(word)`, sorted by increasing flow.
    """
    vector = fasttext.wv[word]
    subwords = get_substrings(word)
    subword_scores = {
        substr: 1 - distance.cosine(vector, subword_embeddings[subword2idx[substr]])
        for substr in subwords}

    n_nodes = len(word) + 1
    sources, targets, capacities = [], [], []
    # Every (start, end) span at which a substring occurs, in increasing start
    # order. A substring can occur several times, and each occurrence is a
    # separate edge with its own flow.
    spans = {}
    for i in range(len(word)):
        for j in range(i + 1, n_nodes):
            subword = word[i:j]
            if subword not in subword_scores:
                continue
            spans.setdefault(subword, []).append((i, j))
            capacity = int(scale * subword_scores[subword])
            # Only positive capacities are meaningful for a flow network; a
            # missing edge simply reports zero flow below.
            if capacity > 0:
                sources.append(i)
                targets.append(j)
                capacities.append(capacity)

    # Build the matrix in one go instead of assigning single CSR elements,
    # which is slow and triggers SparseEfficiencyWarning.
    graph_matrix = csr_matrix(
        (capacities, (sources, targets)), shape=(n_nodes, n_nodes), dtype=int)

    flow = maximum_flow(graph_matrix, 0, len(word))

    # get_substrings lists repeated substrings in increasing start order, so
    # the k-th repetition in `subwords` corresponds to the k-th recorded span.
    seen = {}
    subword_flows = []
    for sub in subwords:
        occurrence = seen.get(sub, 0)
        seen[sub] = occurrence + 1
        positions = spans[sub]
        span = positions[min(occurrence, len(positions) - 1)]
        subword_flows.append((sub, subword_scores[sub], flow.flow[span] / scale))

    return sorted(subword_flows, key=lambda x: x[2])
    
# Example on a short word; each output row is (substring, cosine similarity, flow).
score_subwords_by_flow("hokej")

[('o', 0.2090151608069004, 0.0),
 ('k', 0.21261804534090234, 0.0),
 ('e', 0.20749454175495774, 0.0),
 ('ok', 0.25524046078376594, 0.0),
 ('ke', 0.2679057940891988, 0.0),
 ('oke', 0.4245530125206245, 0.0),
 ('ho', 0.20918457498852494, 0.209184),
 ('kej', 0.4798513016513297, 0.209184),
 ('h', 0.21244079773179547, 0.21244),
 ('okej', 0.49715396622464514, 0.21244),
 ('j', 0.2255831990906667, 0.225583),
 ('hoke', 0.4980545983569453, 0.225583),
 ('ej', 0.24203071625930195, 0.24203),
 ('hok', 0.4549685024394131, 0.24203),
 ('hokej', 0.497990245283086, 0.49799)]

In [137]:
# Example on a word in which several substrings ("k", "o", "ko") occur more
# than once, so they appear repeatedly in the recorded output.
score_subwords_by_flow("kokosový")

[('k', 0.27573386933442756, 0.0),
 ('o', 0.26976678538537924, 0.0),
 ('k', 0.27573386933442756, 0.0),
 ('o', 0.26976678538537924, 0.0),
 ('s', 0.2634236225142079, 0.0),
 ('o', 0.26976678538537924, 0.0),
 ('v', 0.26652513024653857, 0.0),
 ('ko', 0.28117371886235176, 0.0),
 ('ok', 0.30254341411143615, 0.0),
 ('ko', 0.28117371886235176, 0.0),
 ('os', 0.2769685500616067, 0.0),
 ('so', 0.2740604645557587, 0.0),
 ('ov', 0.2670259446811948, 0.0),
 ('oko', 0.2920192223040281, 0.0),
 ('kos', 0.3219955004880495, 0.0),
 ('oso', 0.2458218594931425, 0.0),
 ('sov', 0.20031801374429292, 0.0),
 ('okos', 0.34901423476430926, 0.0),
 ('koso', 0.3208508022923472, 0.0),
 ('osov', 0.31116523930097384, 0.0),
 ('okoso', 0.3339482637546354, 0.0),
 ('kosov', 0.31888464298232644, 0.0),
 ('okosov', 0.33356721839581915, 0.0),
 ('okosový', 0.3100058102402343, 0.275733),
 ('kosový', 0.31035020042224826, 0.281173),
 ('ý', 0.2973983504070741, 0.297398),
 ('kokosov', 0.33362547432413303, 0.297398),
 ('koko', 0.33679928

In [97]:
def min_max_segment(word: str, vocab):
    """Segment `word` so that its weakest piece scores as high as possible.

    Dynamic programming over prefixes: the value of a segmentation is the
    minimum `vocab` score among its pieces, and for every prefix the
    segmentation with the highest such minimum is kept. Ties go to the piece
    that starts earliest. A position that no vocabulary piece can reach is
    covered by a single-character piece and the prefix value drops to -1000.

    Args:
        word: String to segment.
        vocab: Mapping from subword to its score.

    Returns:
        A pair (list of pieces, value of the chosen segmentation). For an
        empty word this is ([], 10000.0).
    """
    n_chars = len(word)
    # best[k]: value of the best segmentation of word[:k].
    best = [10000.0] * (n_chars + 1)
    # back[k]: start index of the piece that ends at character k.
    back = [0] * n_chars

    # Forward pass: fill in the best value for every prefix.
    for end in range(1, n_chars + 1):
        top_score = None
        top_start = None
        for start in range(end):
            piece = word[start:end]
            if piece not in vocab:
                continue
            bottleneck = min(best[start], vocab[piece])
            # Strict comparison keeps the earliest start among equal scores.
            if top_start is None or bottleneck > top_score:
                top_score = bottleneck
                top_start = start

        if top_start is None:
            best[end] = -1000
            back[end - 1] = end - 1
        else:
            best[end] = top_score
            back[end - 1] = top_start

    # Backward pass: follow the back-pointers from the last character.
    pieces = []
    last = n_chars - 1
    while last >= 0:
        first = back[last]
        pieces.append(word[first:last + 1])
        last = first - 1
    pieces.reverse()
    return pieces, best[-1]

In [120]:
def try_segment_min_max(word):
    """Segment `word` with `min_max_segment` using cosine-similarity scores.

    Each known substring of `word` is scored by the cosine similarity
    (1 - cosine distance) between its subword embedding and the FastText
    vector of `word`; the resulting score table is passed to
    `min_max_segment`, whose result is returned unchanged.
    """
    word_vec = fasttext.wv[word]

    similarity = {}
    for piece in get_substrings(word):
        piece_vec = subword_embeddings[subword2idx[piece]]
        similarity[piece] = 1 - distance.cosine(word_vec, piece_vec)

    return min_max_segment(word, similarity)

In [138]:
# Example: min-max segmentation of "kokosový"; the output is (pieces, score).
try_segment_min_max("kokosový")

(['kokos', 'ový'], 0.32433132550422095)

In [133]:
# Inspect ten entries near the end of the FastText vocabulary list.
fasttext.wv.index_to_key[-20:-10]

['muravský',
 'Úryvky',
 'dominantami',
 'opilcem',
 'raškovicích',
 'záložnami',
 'otáhalová',
 'zapotit',
 'demokraciím',
 'psychotické']