In [9]:
"""
Top-k segmentations of unintentionally funny URLs.
"""
import numpy as np
from collections import Counter
from hypergraphs.semirings import LazySort
from arsenal import colors
from arsenal.iterextras import take


def segmentation(x, g, L=None):
    """
    Combine the scores of every way to cut `x` into consecutive pieces.

    Args:
        x: sliceable sequence to segment (the notebook passes URL strings).
        g: callable mapping a candidate piece ``x[j:i]`` to its weight, or to
           ``None`` when that piece is not allowed.
        L: maximum piece length; ``None`` (the default) allows a piece as long
           as the whole input.

    Returns:
        The chart entry for the full input: the semiring sum, over all
        admissible segmentations, of the semiring product of
        ``LazySort(weight, piece)`` over their pieces. For empty `x` this is
        ``LazySort.one()``; if no segmentation is admissible the entry is
        still the initial ``LazySort.zero()``.
    """
    n = len(x)
    if L is None:
        L = n

    # chart[i] accumulates the total over all segmentations of the prefix x[:i].
    chart = np.full(n + 1, LazySort.zero())
    chart[0] = LazySort.one()

    for end in range(1, n + 1):
        # The last piece x[start:end] may be at most L items long.
        earliest = max(end - L, 0)
        for start in range(earliest, end):
            piece = x[start:end]
            weight = g(piece)
            if weight is not None:
                # Extend every segmentation of x[:start] with this piece.
                chart[end] += chart[start] * LazySort(weight, piece)

    return chart[n]


def fit(path='unigrams.txt'):
    """
    Build a unigram scoring function from a word-count file.

    Each non-blank line of the file must hold exactly two
    whitespace-separated fields, ``<word> <count>``. Lines are lowercased
    before parsing; counts of words that become equal after lowercasing are
    added together. Counts are then divided by their total to give relative
    frequencies.

    Args:
        path: location of the count file. Defaults to ``'unigrams.txt'``
            (resolved against the current working directory).

    Returns:
        A function ``score(w)`` returning the relative frequency of `w`, or
        ``None`` if `w` did not occur in the file.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a non-blank line does not have exactly two fields or
            its second field is not a valid float.
    """
    freq = Counter()
    # `with` guarantees the handle is closed, even if a malformed line raises.
    with open(path) as lines:
        for line in lines:
            fields = line.lower().split()
            if not fields:
                # Blank line (e.g. a trailing newline at end of file).
                continue
            w, x = fields
            # Accumulate rather than assign: lowercasing can map several
            # entries ("The", "the") onto the same key.
            freq[w] += float(x)

    z = sum(freq.values())
    freq = {w: count / z for w, count in freq.items()}

    def score(w):
        # dict.get yields None for unseen words; callers treat None as
        # "not a word" and skip the candidate.
        return freq.get(w)

    return score

In [10]:
# Run-together names that can be split into words in more than one way.
examples = [
    'penisland',
    'powergenitalia',
    'bobwehadababyitsaboy',
    'speedofart',
    'expertsexchange',
    'whorepresents',
]

# Unigram scorer; fit() reads 'unigrams.txt' from the working directory.
score = fit()
for x in examples:
    # Blank separator line, then the input itself as a colored header.
    print()
    print(colors.light.yellow % f'{x}')
    # Print at most five results per input. In the recorded output each one is
    # a (score, nested segmentation) pair, listed in decreasing score.
    for y in take(5, segmentation(x, score)):
        print(y)


[1;33mpenisland[0m
(5.509575944547597e-09, ['penis', 'land'])
(4.842743573989047e-09, ['pen', 'island'])
(2.9858347243500054e-10, [['penis', 'l'], 'and'])
(3.90571246926652e-11, [['pen', 'is'], 'land'])
(1.4154876403578446e-11, [['p', 'en'], 'island'])

[1;33mpowergenitalia[0m
(2.3455655590397707e-10, ['power', 'genitalia'])
(4.423267111598216e-12, ['powergen', 'italia'])
(1.2017919308422304e-13, [[['power', 'genital'], 'i'], 'a'])
(6.361229222553756e-14, [['power', 'gen'], 'italia'])
(5.481432032987714e-14, [['power', 'genital'], 'ia'])

[1;33mbobwehadababyitsaboy[0m
(5.933983622953188e-25, [[[[[[['bob', 'we'], 'had'], 'a'], 'baby'], 'its'], 'a'], 'boy'])
(1.285912411401011e-26, [[[[[[['bob', 'we'], 'had'], 'a'], 'baby'], 'it'], 'sa'], 'boy'])
(5.067177776368366e-27, [[[[[[['bob', 'we'], 'had'], 'aba'], 'by'], 'its'], 'a'], 'boy'])
(4.643487231808153e-27, [[[[[['bob', 'we'], 'hada'], 'baby'], 'its'], 'a'], 'boy'])
(3.2328411117258136e-27, [[[[[['bob', 'we'], 'had'], 'a'], 'baby