In [1]:
import pickle
import heapq
from tqdm import tqdm
from scipy import sparse

from embeddings import read

# Reverse-engineering language dictionaries from synsets

This notebook serves (until it is later replaced with a script) to extract a word-level dictionary from synset alignment tables, where the English table is given and the target-language table (starting with Italian) is learned as part of the large iterative process.

Our first step is to re-extract the English synset dictionary in its reduced form for the top 20,000 vocabulary items, as in `map_sense_embeddings.py`. This includes a later removal of resulting empty synset columns.

Note that we don't have to re-align the synsets with their identifiers; they're just serving as a vessel for dictionary alignment for now.

The git commit this code is copied from is [`00c662b`](https://github.com/yuvalpinter/vecmap/commit/00c662b740198e428f5e82d936f4af93f90e1ffa).

In [2]:
# load source alignment, trim
src_size = 20000
src_sns_filename = 'data/synsets/v3b_pairings.pkl'

# NOTE(review): pickle.load can execute arbitrary code; only load pairing files from a trusted source.
with open(src_sns_filename, 'rb') as src_sns_file:
    src_senses = pickle.load(src_sns_file)

print(src_senses.shape, src_senses.getnnz())  # should be (60167, 200000) 80296
if src_senses.shape[0] < 100000:  # We want words as rows
    src_senses = src_senses.transpose()
src_senses = src_senses[:src_size]

# new columns for words with no senses in original input
# (stored-entry counts per row are computed in one call rather than by
# materialising a separate sparse row for each of the src_size words)
row_nnz = src_senses.getnnz(axis=1)
newcols = [sparse.csc_matrix(([1], ([i], [0])), shape=(src_size, 1)) for i in range(src_size)
           if row_nnz[i] == 0]
# trim senses no longer used, add new ones
colsums = src_senses.sum(axis=0).tolist()[0]
used_cols = [i for i, j in enumerate(colsums) if j > 0]
src_senses = sparse.hstack([src_senses[:, used_cols]] + newcols).tocsr()

# this should be (20000, 35784) with 44795 nonzeros
print(f'trimmed sense dictionary dimensions: {src_senses.shape} with {src_senses.getnnz()} nonzeros')
sense_size = src_senses.shape[1]

(60167, 200000) 80296
trimmed sense dictionary dimensions: (20000, 35784) with 44795 nonzeros


In [3]:
# load word lists

encod = 'utf-8'
src_embs_file = 'data/embeddings/en.emb.txt'  # can also use en-words.txt
trg_embs_file = 'data/embeddings/it.emb.txt'

# The first item returned by `read` is used as the vocabulary; it is then
# capped at src_size entries (the same cap is applied to both languages).
with open(src_embs_file, encoding=encod, errors='surrogateescape') as src_handle:
    src_words = read(src_handle, threshold=src_size)[0]
    src_words = src_words[:src_size]

with open(trg_embs_file, encoding=encod, errors='surrogateescape') as trg_handle:
    trg_words = read(trg_handle, threshold=src_size)[0]
    trg_words = trg_words[:src_size]

# Show the last ten words kept for each language.
print(', '.join(src_words[len(src_words) - 10:] if len(src_words) > 10 else src_words))
print(', '.join(trg_words[len(trg_words) - 10:] if len(trg_words) > 10 else trg_words))

curvature, daytona, correspondents, radiology, persians, saffron, catalonia, skipton, cola, xl
minerva, irresponsabile, anzianita&apos;, rilassante, qualificanti, sgomento, deontologia, immaginiamo, gianluigi, lieti


Let's explore the target assignments. We'll start with a small table that only assigned a few dozen senses.

In [4]:
def trg_sns_filename(n):
    """Return the path of the pickled target sense table for iteration `n`.

    Args:
        n: integer iteration number; it is zero-padded to three digits
            in the file name (e.g. 1 -> 'it001').

    Returns:
        Relative path string under 'outputs/'.
    """
    return f'outputs/tsns-00c662b-it{n:03d}.pkl'

# Read the target sense table pickled for iteration 1 and report its size.
with open(trg_sns_filename(1), 'rb') as pkl_file:
    trg_senses_1iter = pickle.load(pkl_file)

print(
    f'First iteration alignment has shape {trg_senses_1iter.shape} with {trg_senses_1iter.getnnz()} nonzeros.'
)

First iteration alignment has shape (20000, 35784) with 18 nonzeros.


Getting a sense of which words got assigned senses.

In [5]:
def assigned_trg_words(table, vocab=None):
    """Return the vocabulary words whose row in `table` has at least one stored entry.

    Args:
        table: sparse matrix with one row per vocabulary word; it must
            support `getnnz(axis=1)`.
        vocab: sequence of words aligned with the rows of `table`. When
            omitted, the notebook-level `trg_words` is looked up at call
            time, so re-running the vocabulary cell is picked up without
            redefining this function.

    Returns:
        List of words, in vocabulary order, whose row holds one or more
        stored entries.
    """
    if vocab is None:
        vocab = trg_words
    # One vectorised count per row instead of slicing out every row.
    row_counts = table.getnnz(axis=1)
    return [word for count, word in zip(row_counts, vocab) if count > 0]

# List every target word that received at least one sense in iteration 1.
print(*assigned_trg_words(trg_senses_1iter, trg_words), sep=', ')

culturali, decretolegge, sedi, venti, porte, vitale, illustrato, richard, foggia, tasti, c.p., shock, camino, marittimi, fuso, iniziazione, gori


Oooookaaaay.
Which English words (senses) did they happen to align with?

In [6]:
def find_alignments(src_tab, trg_tab, src_vocab=None, trg_vocab=None, threshold=0.0):
    """List word pairs whose sense rows overlap, best-scoring first.

    The score of a (source word, target word) pair is the corresponding
    entry of `src_tab . trg_tab^T`, i.e. the dot product of the two words'
    rows over the shared sense columns.

    Args:
        src_tab: sparse word-by-sense matrix for the source language.
        trg_tab: sparse word-by-sense matrix for the target language, with
            the same number of columns as `src_tab`.
        src_vocab: words aligned with the rows of `src_tab`. When omitted,
            the notebook-level `src_words` is looked up at call time.
        trg_vocab: words aligned with the rows of `trg_tab`. When omitted,
            the notebook-level `trg_words` is looked up at call time.
        threshold: only pairs scoring strictly above this value are kept.

    Returns:
        List of `(source_word, target_word, score)` tuples sorted by
        descending score.
    """
    if src_vocab is None:
        src_vocab = src_words
    if trg_vocab is None:
        trg_vocab = trg_words
    # COO format exposes the stored entries as parallel row/col/data arrays.
    common_synsets = sparse.coo_matrix(src_tab.dot(trg_tab.transpose()), copy=True)
    alignments = [
        (src_vocab[i], trg_vocab[j], d)
        for i, j, d in zip(common_synsets.row, common_synsets.col, common_synsets.data)
        if d > threshold
    ]
    return sorted(alignments, key=lambda x: -x[-1])  # order by descending match scores

# Align the source table with the first-iteration target table and print every pair.
iter1_align = find_alignments(src_senses, trg_senses_1iter)
print('\n'.join('{: <12}{: <12} {:.3f}'.format(*pair) for pair in iter1_align))

lawns       foggia       0.031
palette     sedi         0.022
purchase    shock        0.022
ads         culturali    0.018
alison      iniziazione  0.017
co-operate  gori         0.014
penelope    illustrato   0.013
sect        decretolegge 0.012
postcode    porte        0.011
robertson   richard      0.008
districts   tasti        0.008
bang        fuso         0.008
delicious   camino       0.006
kitchen     marittimi    0.006
buzz        vitale       0.005
vascular    c.p.         0.003
belmont     venti        0.003


In [7]:
# sanity check: print the stored sense-column indices of two word pairs
# that appear in the first-iteration alignment listing
lawns_idx = [pos for pos, word in enumerate(src_words) if word == 'lawns'][0]
foggia_idx = [pos for pos, word in enumerate(trg_words) if word == 'foggia'][0]
print(src_senses[lawns_idx].indices)
print(trg_senses_1iter[foggia_idx].indices)

# blank separator line between the two pairs
print()

bang_idx = [pos for pos, word in enumerate(src_words) if word == 'bang'][0]
fuso_idx = [pos for pos, word in enumerate(trg_words) if word == 'fuso'][0]
print(src_senses[bang_idx].indices)
print(trg_senses_1iter[fuso_idx].indices)

[34646]
[34646]

[13282 17786 17902 17903 17904 17905 17906 17907]
[17902]


One positive finding is that the first iteration was pretty conservative in its alignment scores.

The words themselves seem pretty random except for maybe some POS agreement but hey, it's the first iteration.

In [8]:
# Read the target sense table pickled for iteration 2 and report its size.
with open(trg_sns_filename(2), 'rb') as pkl_file:
    trg_senses_2iter = pickle.load(pkl_file)

print(
    f'Second iteration alignment has shape {trg_senses_2iter.shape} with {trg_senses_2iter.getnnz()} nonzeros.\n'
)

# Show the 20 best-scoring alignments (the list is already sorted by descending score).
iter2_align = find_alignments(src_senses, trg_senses_2iter)
print('\n'.join('{: <12}{: <12} {:.3f}'.format(*pair) for pair in iter2_align[:20]))

Second iteration alignment has shape (20000, 35784) with 882 nonzeros.

co-operate  gori         0.695
lawns       foggia       0.694
robertson   richard      0.691
alison      iniziazione  0.690
ads         culturali    0.690
palette     sedi         0.689
purchase    shock        0.688
penelope    illustrato   0.686
bang        fuso         0.685
districts   tasti        0.683
sect        decretolegge 0.682
belmont     venti        0.680
postcode    porte        0.678
buzz        vitale       0.669
kitchen     marittimi    0.665
vascular    c.p.         0.664
delicious   camino       0.646
belmont     dieci        0.433
belmont     trenta       0.428
belmont     quindici     0.413


Higher confidence, lots of new alignments, some of the old ones self-fed, healthy overall. Let's jump straight to the last iterations.

In [9]:
# Read the target sense table pickled for iteration 4 and report its size.
with open(trg_sns_filename(4), 'rb') as pkl_file:
    trg_senses_4iter = pickle.load(pkl_file)

print(
    f'Fourth iteration alignment has shape {trg_senses_4iter.shape} with {trg_senses_4iter.getnnz()} nonzeros.\n'
)

# Keep only pairs scoring above 0.5, then show the 20 best of them.
iter4_align = find_alignments(src_senses, trg_senses_4iter, threshold=0.5)
print('\n'.join('{: <14}{: <14} {:.3f}'.format(*pair) for pair in iter4_align[:20]))

Fourth iteration alignment has shape (20000, 35784) with 59943 nonzeros.

bang          fuso           0.618
wards         alt            0.600
waverley      arca           0.600
denise        ascensione     0.582
outlines      incisione      0.574
bosch         supplementare  0.568
kitchens      rotte          0.567
buildings     borsa          0.557
anderson      gordon         0.549
destinations  legittimi      0.545
temperate     circondario    0.539
townships     leve           0.537
cosmetic      classificazione 0.537
biographies   sottile        0.535
garden        crotone        0.525
grading       lit            0.524
tasty         casale         0.522
foliage       caltanissetta  0.521
sentiments    decisione      0.519
seafood       canna          0.517


In [10]:
# Read the target sense table pickled for iteration 5 and report its size.
with open(trg_sns_filename(5), 'rb') as pkl_file:
    trg_senses_5iter = pickle.load(pkl_file)

print(
    f'Fifth iteration alignment has shape {trg_senses_5iter.shape} with {trg_senses_5iter.getnnz()} nonzeros.\n'
)

# Keep only pairs scoring above 0.5, then show the 20 best of them.
iter5_align = find_alignments(src_senses, trg_senses_5iter, threshold=0.5)
print('\n'.join('{: <14}{: <14} {:.3f}'.format(*pair) for pair in iter5_align[:20]))

Fifth iteration alignment has shape (20000, 35784) with 79018 nonzeros.

bang          fuso           0.600
wards         alt            0.588
waverley      arca           0.582
biographies   sottile        0.580
denise        ascensione     0.565
greatness     datata         0.542
greetings     paternità      0.539
hazel         tormentato     0.537
feelings      rinuncia       0.535
anderson      gordon         0.534
diary         trame          0.533
garden        crotone        0.531
brad          chiamiamo      0.529
passport      ragionevolmente 0.529
nicola        danza          0.528
kitchens      rotte          0.528
celebrities   ordinata       0.528
culinary      frazione       0.518
counsellors   prostituzione  0.515
stores        mercati        0.515


Now that there are a lot of Italian sense assignments, let's see what the top polysemic words are, if any exist at all.
We'll threshold the top two entries and sort by the best second-ranked mapping.

In [11]:
topk = 10


def push_cap(h, val):
    """Push `val` onto the min-heap `h` while keeping at most `topk` items.

    Once the heap already holds `topk` items, the new value is pushed and
    the smallest item is popped in a single step, so `h` retains the
    `topk` largest values seen so far.
    """
    if len(h) >= topk:
        heapq.heappushpop(h, val)
        return
    heapq.heappush(h, val)

# Collect the topk target words ranked by their second-largest sense weight.
kheap = []
for j in tqdm(range(src_size)):
    row = trg_senses_5iter[j]
    sense_count = row.getnnz()
    if sense_count <= 1:
        # a second-ranked sense needs at least two stored entries
        continue
    second_val = sorted(row.data, reverse=True)[1]
    push_cap(kheap, (second_val, sense_count, j, trg_words[j]))

# Two-line column header, then one line per word in descending second-score order.
print('2nd   total   word   word', 'score senses  index', sep='\n')
ranked = sorted(kheap, key=lambda x: -x[0])
print('\n'.join('{:.3f}   {: <3d}   {: <6d} {}'.format(*entry) for entry in ranked))

100%|█████████████████████████████████████████| 20000/20000 [00:03<00:00, 6585.62it/s]


2nd   total   word   word
score senses  index
0.261   2     9295   dea
0.178   2     1941   ospedale
0.166   6     5002   avanzate
0.142   3     16776  centrocampista
0.136   2     8464   113
0.130   5     4928   situato
0.130   4     7017   juve
0.129   3     11091  sudan
0.128   4     15316  usiamo
0.123   3     14069  decorazione


Welp, can't say anything too valuable about these, or about the confidence of polysemy mapping.

The top three and bottom three do seem polysemic, but certainly there are more prominent specimens.