In [26]:
import gensim
import pickle
import os
import numpy as np
import multiprocessing as mp
import time

In [19]:
# A notebook has no `__file__` variable, so the *literal string* "__file__" is
# resolved against the working directory; taking its dirname therefore yields
# the directory the kernel is running in.
current_path = os.path.dirname(os.path.abspath("__file__"))

# Training corpora: every plain-text file in the processed Wikipedia folder.
# `endswith` (rather than a substring test) matters because output file names
# are later derived with `dataset[:-4]`, which is only correct when the name
# really ends in ".txt". Sorting makes the run order reproducible, since
# os.listdir returns entries in arbitrary order.
training_datasets = sorted(
    name
    for name in os.listdir(f"{current_path}/../data/processed/wikipedia/")
    if name.endswith('.txt')
)

# Hyper-parameter grid explored by the training loops.
windows = [3, 6, 9]
epochs = [10, 20, 30, 40, 50]
sg_types = [0,1]  # 0 = CBOW, 1 = skip-gram
hs_types = [0,1]  # 0 = negative sampling, 1 = hierarchical softmax

In [24]:
# Class for a memory-friendly iterator over the dataset
class MySentences(object):
    """Re-iterable stream of tokenised sentences read lazily from a text file.

    Each line of the file is treated as one sentence and split on whitespace.
    The file is re-opened on every call to ``__iter__``, so the same instance
    can be iterated several times without holding the corpus in memory.

    Parameters
    ----------
    filename : str
        Path of the text file to stream, one sentence per line.
    """

    def __init__(self, filename):
        self.filename = filename

    def __iter__(self):
        """Yield each line of the file as a list of whitespace-separated tokens."""
        # The context manager closes the handle when the pass finishes or the
        # generator is closed early; a bare open() here leaked one descriptor
        # per pass over the corpus.
        with open(self.filename) as corpus:
            for line in corpus:
                yield line.split()


def train_save_word2vec_model(dataset, min_count=2, size=50, window=3, workers=mp.cpu_count(), sg=1, hs=0, epochs=10):
    """Train a gensim Word2Vec model on one corpus file and save it to disk.

    Parameters
    ----------
    dataset : str
        File name (not path) of the corpus inside ``../data/processed/wikipedia/``.
        The last four characters are dropped to build the output file name.
    min_count, size, window, workers, sg, hs
        Forwarded unchanged to ``gensim.models.Word2Vec``
        (``sg``: 1 is skip-gram, 0 is CBOW; ``hs``: 1 is hierarchical softmax,
        0 is negative sampling).
    epochs : int
        Forwarded to gensim as ``iter``.

    Returns
    -------
    The trained model, after it has been saved under ``../models/``.
    """
    corpus = MySentences(f"{current_path}/../data/processed/wikipedia/{dataset}")

    hyperparams = {
        "min_count": min_count,
        "size": size,
        "window": window,
        "workers": workers,
        "sg": sg,  # 1 is sg, 0 is CBOW
        "hs": hs,  # 1 is hs, 0 is ns
        "iter": epochs,
    }
    trained = gensim.models.Word2Vec(sentences=corpus, **hyperparams)

    # The file name encodes the grid point so that runs do not overwrite each other.
    target = f"{current_path}/../models/{dataset[:-4]}_word2vec_win{window}_sg{sg}_hs{hs}_epochs{epochs}.model"
    trained.save(target)
    return trained


def save_pretrained_embeddings(model, dataset, min_count=2, size=50, window=3, workers=mp.cpu_count(), sg=1, hs=0, epochs=10, model_type='word2vec'):
    """Dump every vocabulary vector of ``model`` to a plain-text embeddings file.

    One line is written per word: the word, a space, then the vector
    components formatted with nine decimals and separated by single spaces.

    Parameters
    ----------
    model
        Object exposing ``wv.vocab`` (iterable of words) and ``wv[word]``.
    dataset : str
        Corpus file name; its last four characters are dropped for the output name.
    window, sg, hs, epochs, model_type
        Only used to build the output file name under ``../pretrained_embeddings/``.
    min_count, size, workers
        Accepted for signature symmetry with the training helpers; not used here.
    """
    destination = f"{current_path}/../pretrained_embeddings/{dataset[:-4]}_{model_type}_win{window}_sg{sg}_hs{hs}_epochs{epochs}.txt"
    keyed_vectors = model.wv

    with open(destination, 'w') as out:
        # Snapshot the vocabulary first so the iteration order is fixed up front.
        for word in list(keyed_vectors.vocab):
            components = " ".join('%.9f' % component for component in list(keyed_vectors[word]))
            out.write(word + ' ')
            out.write(components + '\n')
            
            
def process_word2vec(dataset, min_count=2, size=50, window=3, workers=mp.cpu_count(), sg=1, hs=0, epochs=10):
    """Run the full Word2Vec pipeline for one grid point.

    Trains and saves the model with ``train_save_word2vec_model``, then writes
    its vectors as text with ``save_pretrained_embeddings``. All parameters are
    forwarded unchanged to both helpers. Returns ``None``.
    """
    settings = (min_count, size, window, workers, sg, hs, epochs)
    trained = train_save_word2vec_model(dataset, *settings)
    save_pretrained_embeddings(trained, dataset, *settings)

In [25]:
# Exhaustive grid search: one Word2Vec model per (dataset, window, epoch, sg, hs).
for dataset in training_datasets:
    for window in windows:
        for epoch in epochs:
            for sg in sg_types:
                for hs in hs_types:
                    print(f"Starting dataset|{dataset} window|{window} epoch|{epoch} sg|{sg} hs|{hs}")
                    start_time = time.time()
                    # Pass the scalar `epoch` of this grid point -- not the
                    # whole `epochs` list, which would be handed to gensim as
                    # the iteration count and embedded in the output file names.
                    process_word2vec(dataset, min_count=2, size=50, window=window, workers=mp.cpu_count(), sg=sg, hs=hs, epochs=epoch)
                    # Measure once so minutes and seconds come from the same reading.
                    minutes, seconds = divmod(int(time.time() - start_time), 60)
                    print(f"    Training took {minutes} min and {seconds} sec")

In [None]:
def train_save_fasttext_model(dataset, min_count=2, size=50, window=3, workers=mp.cpu_count(), sg=1, hs=0, epochs=10):
    """Train a gensim FastText model on one corpus file and save it to disk.

    Parameters
    ----------
    dataset : str
        File name (not path) of the corpus inside ``../data/processed/wikipedia/``.
        The last four characters are dropped to build the output file name.
    min_count, size, window, workers, sg, hs
        Forwarded unchanged to ``gensim.models.FastText``
        (``sg``: 1 is skip-gram, 0 is CBOW; ``hs``: 1 is hierarchical softmax,
        0 is negative sampling).
    epochs : int
        Number of training passes; given both as ``iter`` to the constructor
        and as ``epochs`` to ``train``.

    Returns
    -------
    The trained model, after it has been saved under ``../models/``.
    """
    sentences = MySentences(f"{current_path}/../data/processed/wikipedia/{dataset}")
    model = gensim.models.FastText(size=size,
                                   window=window,
                                   workers=workers,
                                   min_count=min_count,
                                   # Honour the caller's choice: the values were
                                   # previously fixed to sg=1 / hs=0 while the
                                   # file name below advertised the arguments.
                                   sg=sg,  # 1 is sg, 0 is CBOW
                                   hs=hs,  # 1 is hs, 0 is ns
                                   iter=epochs)
    model.build_vocab(sentences=sentences)
    # MySentences is a lazy stream without __len__, so len(sentences) raises
    # TypeError; build_vocab has already counted the sentences into corpus_count.
    model.train(sentences=sentences, total_examples=model.corpus_count, epochs=epochs)  # train
    model.save(f"{current_path}/../models/{dataset[:-4]}_fasttext_win{window}_sg{sg}_hs{hs}_epochs{epochs}.model")
    # Return the model so callers can export its vectors (previously returned None).
    return model
    
def process_fasttext(dataset, min_count=2, size=50, window=3, workers=mp.cpu_count(), sg=1, hs=0, epochs=10):
    """Run the full FastText pipeline for one grid point.

    Calls ``train_save_fasttext_model`` and hands its return value to
    ``save_pretrained_embeddings`` with ``model_type='fasttext'``. All other
    parameters are forwarded unchanged to both helpers. Returns ``None``.
    """
    # NOTE(review): the export step needs a real model object, so this relies
    # on train_save_fasttext_model returning the trained model.
    settings = (min_count, size, window, workers, sg, hs, epochs)
    trained = train_save_fasttext_model(dataset, *settings)
    save_pretrained_embeddings(trained, dataset, *settings, model_type='fasttext')

In [None]:
# Exhaustive grid search: one FastText model per (dataset, window, epoch, sg, hs).
for dataset in training_datasets:
    for window in windows:
        for epoch in epochs:
            for sg in sg_types:
                for hs in hs_types:
                    print(f"Starting dataset|{dataset} window|{window} epoch|{epoch} sg|{sg} hs|{hs}")
                    start_time = time.time()
                    # Pass the scalar `epoch` of this grid point -- not the
                    # whole `epochs` list, which would be handed to gensim as
                    # the iteration count and embedded in the output file names.
                    process_fasttext(dataset, min_count=2, size=50, window=window, workers=mp.cpu_count(), sg=sg, hs=hs, epochs=epoch)
                    # Measure once so minutes and seconds come from the same reading.
                    minutes, seconds = divmod(int(time.time() - start_time), 60)
                    print(f"    Training took {minutes} min and {seconds} sec")