In [1]:
import os

import pandas as pd
import sentencepiece as spm

In [2]:
# Load the trained SentencePiece model used for every segmentation below.
# NOTE(review): presumably a unigram-LM model trained on COCA, judging only
# by the file name -- confirm.
sp = spm.SentencePieceProcessor(
    model_file='./coca_unilm.model',
)

In [3]:
# Read the tab-separated table of complex words and preview its first rows.
complex_words = pd.read_csv(
    "./complex_words.tsv",
    sep="\t",
)
complex_words.head()

Unnamed: 0,class,word,bases,sequence,linearity,semantic relation
0,blend,adhocrat,adhoc aristocrat,AAAAXBBB,linear,time
1,blend,ambidancetrous,ambidextrous dance,AAAAXBBBBAAAAA,nonlinear,attribute
2,blend,analingus,anal cunnilingus,AAAXBBBBB,linear,loc_part_whole
3,blend,aquamation,aqua cremation,AAAABBBBBB,linear,causal
4,blend,artveillance,art surveillance,AAABBBBBBBBB,linear,purpose


In [4]:
# Segment every entry of the 'word' column with the loaded model, keeping the
# pieces as strings (out_type=str), and store one list of pieces per row.
complex_words['unilm_segmentation_raw'] = (
    complex_words['word']
    .apply(lambda word: sp.encode(word, out_type=str))
)
complex_words['unilm_segmentation_raw']

0                 [▁a, d, ho, cra, t]
1           [▁am, bi, dance, t, rous]
2                  [▁an, al, ing, us]
3                 [▁a, qua, m, ation]
4               [▁art, ve, il, lance]
                    ...              
307    [▁, sha, gg, y, d, ood, le, s]
308                 [▁, s, hit, hole]
309                  [▁solo, mo, ons]
310                  [▁walk, book, s]
311                  [▁word, mill, s]
Name: unilm_segmentation_raw, Length: 312, dtype: object

In [5]:
def _to_continuation_style(pieces):
    """Rewrite SentencePiece pieces in ``##``-continuation notation.

    The "▁" marker is stripped from the first piece, and every later piece is
    prefixed with "##".  When the first piece consists of the marker alone
    (e.g. ``['▁', 's', 'hit', 'hole']``) it is dropped instead of being kept
    as an empty token, so the first real piece stays unprefixed:
    ``['s', '##hit', '##hole']`` rather than ``['', '##s', '##hit', '##hole']``.

    Args:
        pieces: list of string pieces for a single word.

    Returns:
        A new list of strings; the input list is not modified.
    """
    converted = list(pieces)
    if converted:
        # Only the first piece is stripped, matching the per-word input where
        # the marker is expected at the very start.
        converted[0] = converted[0].replace("▁", "")
        if not converted[0]:
            # Marker-only piece carries no characters of the word: skip it.
            converted = converted[1:]
    return [
        piece if position == 0 else "##" + piece
        for position, piece in enumerate(converted)
    ]


# One converted piece list per row, in row order.
segmentations_as_bpe = [
    _to_continuation_style(pieces)
    for pieces in complex_words['unilm_segmentation_raw']
]
complex_words['unilm_segmentation_bpe'] = segmentations_as_bpe
complex_words.head()

Unnamed: 0,class,word,bases,sequence,linearity,semantic relation,unilm_segmentation_raw,unilm_segmentation_bpe
0,blend,adhocrat,adhoc aristocrat,AAAAXBBB,linear,time,"[▁a, d, ho, cra, t]","[a, ##d, ##ho, ##cra, ##t]"
1,blend,ambidancetrous,ambidextrous dance,AAAAXBBBBAAAAA,nonlinear,attribute,"[▁am, bi, dance, t, rous]","[am, ##bi, ##dance, ##t, ##rous]"
2,blend,analingus,anal cunnilingus,AAAXBBBBB,linear,loc_part_whole,"[▁an, al, ing, us]","[an, ##al, ##ing, ##us]"
3,blend,aquamation,aqua cremation,AAAABBBBBB,linear,causal,"[▁a, qua, m, ation]","[a, ##qua, ##m, ##ation]"
4,blend,artveillance,art surveillance,AAABBBBBBBBB,linear,purpose,"[▁art, ve, il, lance]","[art, ##ve, ##il, ##lance]"


In [6]:
# Flatten each row's list of ##-marked pieces into one space-separated string.
complex_words['segmentation_str'] = [
    ' '.join(bpe_pieces)
    for bpe_pieces in complex_words['unilm_segmentation_bpe']
]

In [7]:
# Create the output directory first so the write also succeeds on a fresh
# checkout where ./results does not exist yet.
os.makedirs("./results", exist_ok=True)
# index=False: the DataFrame's row index is not written to the file.
complex_words.to_csv("./results/unilm_segmentations.tsv", sep="\t", index=False)