In [1]:
import io
import os
import pickle
from typing import Dict, List

from gensim.models import Word2Vec, KeyedVectors

In [2]:
# Selects which embedding this notebook processes. The loading cell below
# recognises 'clique_word', 'nt_word' and 'sentence_id'; the value is also
# embedded in the names of the output files written later.
emb_name = 'sentence_id'

In [3]:
def load_vec(PATH_TO_FASTTEXT):
    """Read word vectors from a text file into a dictionary.

    The first line of the file must contain two integers: the number of
    words and the vector dimensionality. Every following non-blank line has
    the form ``<word> <v1> <v2> ...`` with fields separated by single spaces.

    Args:
        PATH_TO_FASTTEXT: Path of the text file to read. It is decoded as
            UTF-8 and undecodable bytes are ignored.

    Returns:
        A dict mapping each word to its vector as a list of floats. When a
        word occurs on several lines, the last occurrence wins.

    Raises:
        AssertionError: If a line's vector length differs from the
            dimensionality announced in the header.
        ValueError: If the header or a vector component cannot be parsed.
    """
    word_to_vector: Dict[str, List[float]] = dict()
    # The context manager guarantees the handle is closed, including when
    # parsing fails half-way through the file.
    with io.open(PATH_TO_FASTTEXT, 'r', encoding='utf-8', newline='\n', errors='ignore') as input_file:
        # The word count from the header is parsed but not checked.
        _no_of_words, vector_size = map(int, input_file.readline().split())
        for line in input_file:
            stripped = line.rstrip()
            if not stripped:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            tokens = stripped.split(' ')
            word = tokens[0]
            vector = list(map(float, tokens[1:]))
            assert len(vector) == vector_size, (
                f"expected {vector_size} components for {word!r}, got {len(vector)}"
            )
            word_to_vector[word] = vector
    return word_to_vector

In [8]:
# Load the embedding selected by `emb_name` into `word_vec`.
if emb_name in ('clique_word', 'nt_word'):
    # Text-format embeddings: load_vec returns a plain dict (word -> list of floats).
    # NOTE(review): later cells access `word_vec.key_to_index`, which a plain
    # dict does not provide -- confirm these two options are still supported.
    word_vec = load_vec(f"./{emb_name}.vec")
elif emb_name == 'sentence_id':
    # These two values only determine the file name of the saved model.
    epochs = 50
    emb_dim = 200
    # Keep only the model's `.wv` attribute; `word_vec` is never bound to the
    # full model object.
    word_vec = Word2Vec.load(f"./word2vec_{epochs}_{emb_dim}.model").wv
else:
    # Fail loudly instead of leaving `word_vec` undefined (or stale from an
    # earlier run of this cell).
    raise ValueError(f"Unknown emb_name: {emb_name!r}")

In [12]:
# Size of the loaded embedding (number of entries).
print(len(word_vec))

8641799


In [13]:
# Check the container type of key_to_index before iterating over it below.
type(word_vec.key_to_index)

dict

In [14]:
from collections import defaultdict
# Build the per-language vocabularies and a merged "<lang>:<word>" -> vector
# table from the keys of the loaded embedding.

merged_word_vec = dict()

# Maps a 3-letter language code (or 'additional') to the set of its words.
vocabs = defaultdict(set)

for key in word_vec.key_to_index:
    parts = key.split(':')
    if len(parts) == 1:
        # Keys without a "lang:" prefix are collected separately.
        vocabs['additional'].add(key)
    elif len(parts) == 2:
        lang, word = parts
        if len(lang) == 4:
            # Fold 4-character codes onto their 3-character prefix.
            lang = lang[:3]
        elif len(lang) != 3:
            print(lang)
            raise ValueError("The language code is not right")
        vocabs[lang].add(word)

        # Merge word vectors: the first key seen for a "<lang>:<word>" pair
        # wins. key_to_index maps keys to integer row indices, so the actual
        # vector has to be looked up explicitly.
        merged_key = f"{lang}:{word}"
        if merged_key not in merged_word_vec:
            merged_word_vec[merged_key] = word_vec.get_vector(key)
    # NOTE(review): keys containing more than one ':' are skipped silently --
    # confirm that this is intended.

In [7]:
# Build a standalone KeyedVectors from the merged table. The dimensionality
# is taken from the source embedding rather than hard-coded, so it cannot
# drift from the model that was actually loaded.
wv_from_text = KeyedVectors(vector_size=word_vec.vector_size)
wv_from_text.add_vectors(list(merged_word_vec.keys()), list(merged_word_vec.values()))

In [15]:
# print(wv_from_text.most_similar('eng:dogs', topn=1000))

In [16]:
# Collect every vocabulary key, in the order the keys were first inserted.
all_langs = list(vocabs.keys())

In [17]:
# Number of vocabulary keys collected from `vocabs` (this count includes the
# 'additional' bucket when it is present).
print(len(all_langs))

1336


In [19]:
# Inspect the keys that had no "lang:" prefix.
# NOTE(review): this prints the entire set; consider showing only its size
# or a small sample to keep the notebook output readable.
print(vocabs['additional'])

{'71036030', '06001006', '19094015', '27006027', '42001073', '18030007', '85008018', '42022017', '69011011', '09018005', '47011028', '09012014', '86012014', '24044026', '19119172', '14010010', '46009010', '07001027', '23015003', '14008012', '66019007', '42001013', '21002011', '09019016', '14002001', '46001002', '45012012', '27006022', '18028010', '07008001', '11018012', '11018025', '47001014', '19018015', '01010015', '44015039', '01001018', '20019003', '80006035', '20020004', '23057012', '25005022', '01017015', '06013009', '70005024', '56001008', '04007069', '26042011', '24028007', '46014007', '46012003', '18033016', '18019016', '42005039', '55004009', '35003013', '24049014', '58012022', '26037027', '16004010', '10021009', '77005063', '66020012', '05028048', '77014002', '05028049', '20023004', '20027001', '14034024', '19040018', '45008035', '11008039', '67005014', '19075005', '85017025', '41005039', '09010015', '68004014', '19136004', '06021033', '40008033', '06015027', '19109017', '44

In [20]:
# Persist the per-language vocabularies for later use.
store_name = f"./stored_vocab/vocab_{emb_name}.pickle"
# open() does not create missing parent directories, so make sure the target
# folder exists before writing.
os.makedirs(os.path.dirname(store_name), exist_ok=True)
with open(store_name, 'wb') as handle:
    # Store a plain dict rather than the defaultdict itself.
    pickle.dump(dict(vocabs), handle)
print(f"The vocabularies net has been stored in {store_name}!")

The vocabularies net has been stored in ./stored_vocab/vocab_sentence_id.pickle!


In [12]:
# Save the merged vectors to disk; the file name embeds `emb_name`.
wv_from_text.save(f"word_embeddings_{emb_name}.kv")

In [13]:
# Display the collected vocabulary keys.
# NOTE(review): this renders the full list; a slice such as all_langs[:20]
# would keep the output short.
all_langs

['additional',
 'ksw',
 'bsn',
 'rwo',
 'hix',
 'cbu',
 'toj',
 'ktj',
 'crq',
 'cbv',
 'crt',
 'azg',
 'kpr',
 'bhl',
 'mbd',
 'mtg',
 'wrs',
 'nwb',
 'bch',
 'gnn',
 'wbp',
 'med',
 'snn',
 'mxp',
 'kyz',
 'rkb',
 'jpn',
 'tee',
 'ots',
 'por',
 'tzj',
 'mhx',
 'yaq',
 'yle',
 'guo',
 'cak',
 'bbr',
 'agt',
 'mop',
 'arl',
 'apn',
 'deu',
 'bmh',
 'muh',
 'nak',
 'tzo',
 'gbr',
 'lww',
 'ipi',
 'mam',
 'cbt',
 'ppo',
 'agu',
 'nlc',
 'djr',
 'ian',
 'xav',
 'for',
 'gbi',
 'amm',
 'blw',
 'mnx',
 'aaz',
 'ura',
 'mcf',
 'mbi',
 'mit',
 'quc',
 'duo',
 'cuk',
 'amu',
 'szb',
 'atb',
 'ign',
 'msm',
 'kss',
 'lac',
 'kxw',
 'gvn',
 'tpz',
 'gnb',
 'ivb',
 'sqi',
 'nuy',
 'neb',
 'kmh',
 'aso',
 'aln',
 'mtp',
 'ozm',
 'khs',
 'pad',
 'tfr',
 'dgc',
 'mif',
 'kyf',
 'hla',
 'cbc',
 'aly',
 'ncu',
 'udu',
 'tlf',
 'tob',
 'spl',
 'sll',
 'xed',
 'tca',
 'mzl',
 'atd',
 'nab',
 'daa',
 'cni',
 'bcw',
 'cjo',
 'fij',
 'myu',
 'geb',
 'att',
 'mir',
 'yml',
 'cac',
 'kno',
 'avu',
 'byx',
 