In [1]:
import os
import pickle

import gensim
import numpy as np
import pandas as pd
from gensim.models.doc2vec import Doc2Vec

from embeddings import embedding_tools

In [2]:
def infer_emb(data, model, k, dest_file, overlap=False, method=None):
    """Infer embeddings for the sequences in a CSV file and pickle the result.

    Parameters
    ----------
    data
        Input table location; passed straight to ``pd.read_csv``.
    model
        Path of the model. It is forwarded unchanged to
        ``embedding_tools.get_embeddings_new`` and its base name is reused
        in the output file name.
    k
        Forwarded as the ``k`` keyword of ``get_embeddings_new``.
    dest_file
        Prefix of the output path. It is combined with the file name by
        plain string concatenation, so a directory must end with a path
        separator (e.g. ``'./outputs/embeddings/'``).
    overlap
        Forwarded as the ``overlap`` keyword of ``get_embeddings_new``.
    method
        When not ``None``, the sequences are first passed through
        ``embedding_tools.randomize_seqs(seqs, method=method)``.

    Returns
    -------
    None
        The result is written to ``dest_file + 'X_' + <model base name>`` as
        a pickled ``(embeds, terms)`` tuple: ``embeds`` is a DataFrame that
        shares the input table's index and ``terms`` lists its column
        positions ``0 .. n_columns - 1``.
    """
    df = pd.read_csv(data)
    seqs = embedding_tools.get_seqs(df)

    if method is not None:
        seqs = embedding_tools.randomize_seqs(seqs, method=method)

    embeds = embedding_tools.get_embeddings_new(model, seqs, k=k,
                                                overlap=overlap)

    # Keep the input table's index so rows can be matched back to it later.
    embeds = pd.DataFrame(embeds, index=df.index)
    terms = list(range(embeds.shape[1]))

    # basename() gives the same result as splitting on '/' for '/'-separated
    # paths, and also copes with the platform's native separator.
    name = os.path.basename(model)

    with open(dest_file + 'X_' + name, 'wb') as f:
        pickle.dump((embeds, terms), f)

In [3]:
# Keep only the entries of the model directory whose name ends in 'pkl'.
models = [
    fname
    for fname in os.listdir('./outputs/docvec_models/')
    if fname.endswith('pkl')
]
models

['small_3_5.pkl']

In [6]:
datasets = ['test']
dest = './outputs/embeddings/'

# infer_emb opens its output file under dest without creating the directory,
# so make sure it exists before any time is spent on inference.
if not os.path.isdir(dest):
    os.makedirs(dest)

for s in datasets:
    # Files currently in the output directory. Not used in this cell; kept in
    # case other cells read it. (The previous path expression only resolved
    # to dest when the dataset name contained no '/'.)
    Xs = os.listdir(dest)
    for model in models:
        print('Model ' + model + ' for dataset ' + s + ':')
        # NOTE(review): model files appear to be named '<name>_<k>_<n>.pkl'
        # (e.g. 'small_3_5.pkl' -> k = 3) -- confirm the naming scheme.
        # Reading the second-to-last '_' field copes with multi-digit values;
        # names that do not fit fall back to the original fixed offset.
        fields = os.path.splitext(model)[0].split('_')
        if len(fields) >= 3 and fields[-2].isdigit():
            k = int(fields[-2])
        else:
            k = int(model[-7])
        print('Inferring...')
        infer_emb('./inputs/' + s + '.txt', './outputs/docvec_models/' + model, k, dest)
        print('Done')

Model small_3_5.pkl for dataset test:
Inferring...
Done
