# FastText Model Training
adapted from: https://github.com/Abe2G/-Learning-Word-and-Sub-word-Vectors-for-Amharic-Less-Resourced-Language-/blob/master/train_word_embedding.ipynb

In [None]:

import logging
import os
from gensim.models.fasttext import FastText

from gensim.models import KeyedVectors
# Folder that trained embedding models are written to.
EMBEDDING_DIR = 'Model/'
# Folder holding the preprocessed corpus files read during training.
PREPROCESSED_DIR = 'processed/'
class WordEmbeddingConfig(object):
    """Hyper-parameters for the FastText training run in this cell.

    All values are plain class attributes and are read straight off the
    class (``WordEmbeddingConfig.<name>``) by the training code.
    """

    # --- architecture ---------------------------------------------------
    sg = 0          # 0 selects the CBOW model, 1 selects Skip-gram
    hs = 0          # hierarchical softmax switch; 0 leaves it unused
    negative = 15   # number of negative examples used by negative sampling
    emb_dim = 200   # size of the word vectors
    window = 10     # maximum skip length window between words

    # --- vocabulary -----------------------------------------------------
    minFreq = 1     # words appearing fewer than minFreq times are discarded
    # Down-sampling threshold for high-frequency words.
    # NOTE(review): the FastText constructor call in this cell does not pass
    # this value, so the library default applies -- confirm that is intended.
    sample = 0

    # --- optimisation ---------------------------------------------------
    iterate = 10    # forwarded to the model as its ``iter`` argument
    nthread = 100   # forwarded to the model as its ``workers`` argument
    # The three settings below are declared but not referenced by the
    # training code visible in this cell.
    emb_lr = 0.05   # learning rate for SGD estimation
    nepoach = 20    # number of training epochs
    binary = 0      # 0 means not saved as .bin; 1 would mean binary format

    # File name (without directory) the trained model is stored under.
    # Only sg values 0 and 1 define a name.
    if sg == 0:
        model_name = ''.join(('am_fasttext_cbow_', str(emb_dim), 'D'))
    elif sg == 1:
        model_name = ''.join(('am_fasttext_sg_', str(emb_dim), 'D'))

class corpus_sentences(object):
    """Re-iterable stream of tokenised sentences read from a directory.

    Every regular file directly inside ``dirname`` is read as UTF-8 text,
    one sentence per line, and each line is yielded as a list of
    whitespace-separated tokens. The files are reopened on every
    ``__iter__`` call, so the object can be iterated more than once
    (vocabulary pass, then training passes) without holding the corpus
    in memory.
    """

    def __init__(self, dirname):
        self.dirname = dirname

    def __iter__(self):
        for fname in os.listdir(self.dirname):
            path = os.path.join(self.dirname, fname)
            # Skip sub-directories (e.g. Jupyter's .ipynb_checkpoints);
            # opening one would raise and abort the whole pass.
            if not os.path.isfile(path):
                continue
            # ``with`` closes each file as soon as it has been consumed
            # instead of leaving the handle to the garbage collector.
            with open(path, encoding='utf8') as corpus_file:
                for line in corpus_file:
                    yield line.split()

# Configure root logging at DEBUG level; each record is rendered as
# "<time> : <level> : <message>".
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s : %(levelname)s : %(message)s',
)


def load_am_word_vectors():
    """Return the Amharic FastText model, training it only when no saved copy exists.

    The model file lives at ``EMBEDDING_DIR + WordEmbeddingConfig.model_name``.
    If that file is present it is loaded and returned. Otherwise a FastText
    model is built from the sentences under ``PREPROCESSED_DIR`` using the
    hyper-parameters in ``WordEmbeddingConfig``, saved to that same path and
    returned.

    Returns:
        The loaded or freshly trained model object.
    """
    model_type = 'CBOW' if WordEmbeddingConfig.sg == 0 else 'Skip-gram'

    # One path for the existence check, the load and the save. Previously the
    # check and load used the bare model name while the save wrote under
    # EMBEDDING_DIR, so a saved model was never found and was always retrained.
    model_path = EMBEDDING_DIR + WordEmbeddingConfig.model_name

    if os.path.exists(model_path):
        print('Loading FastText Amharic Pretrained '+model_type+' model with '+str(WordEmbeddingConfig.emb_dim)+' dimension\n')
        return KeyedVectors.load(model_path)

    # Make sure the output folder exists before the (long) training run so
    # the final save cannot fail on a missing directory.
    os.makedirs(EMBEDDING_DIR, exist_ok=True)

    print('Loading Sentences with memory freindly iterator ...\n')
    sentences = corpus_sentences(PREPROCESSED_DIR)  # streams files lazily, re-iterable

    print('Training FastText '+model_type+' with '+str(WordEmbeddingConfig.emb_dim)+' dimension\n')
    # NOTE(review): WordEmbeddingConfig.sample is not passed here, so the
    # library default down-sampling threshold applies -- confirm intended.
    am_model = FastText(size=WordEmbeddingConfig.emb_dim, window=WordEmbeddingConfig.window,
                        min_count=WordEmbeddingConfig.minFreq, workers=WordEmbeddingConfig.nthread,
                        sg=WordEmbeddingConfig.sg, iter=WordEmbeddingConfig.iterate,
                        negative=WordEmbeddingConfig.negative, hs=WordEmbeddingConfig.hs)
    am_model.build_vocab(sentences)
    am_model.train(sentences, total_examples=am_model.corpus_count, epochs=am_model.iter)

    # Trim unneeded model memory (uses much less RAM).
    am_model.init_sims(replace=True)

    am_model.save(model_path)
    return am_model
# This call is live (not commented out): running the cell invokes
# load_am_word_vectors() and binds the returned model to `am_model`.
am_model=load_am_word_vectors()

In [None]:
from gensim.models import Word2Vec,KeyedVectors
# Reload the saved CBOW FastText model from disk, then run a quick sanity
# check: print the result of doesnt_match() over six Amharic words.
am_model= KeyedVectors.load('Model/am_fasttext_cbow_200D')
print(am_model.wv.doesnt_match("አንድ ሺህ ሚሊዮን አምስት ብዙ ጅማ".split()))

In [None]:
# Query nearest neighbours through the model's `.wv` vectors, matching the
# `.wv` access used by the doesnt_match() check; the model-level
# most_similar() is a deprecated wrapper in gensim 3.x.
am_model.wv.most_similar('ሐገር')

# Word2Vec Model Training

In [None]:

import logging
import os
from gensim.models import Word2Vec,KeyedVectors
# Configure root logging at INFO level; each record is rendered as
# "<time> : <level> : <message>".
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)
# Output folder for trained models / input folder with the preprocessed corpus.
EMBEDDING_DIR = 'Model/'
PREPROCESSED_DIR = 'processed/'
class WordEmbeddingConfig(object):
    """Hyper-parameters for the word-level Word2Vec training run in this cell.

    All values are plain class attributes and are read straight off the
    class (``WordEmbeddingConfig.<name>``) by ``train_w2v_model``.
    """

    # --- architecture ---------------------------------------------------
    sg = 0          # 0 selects the CBOW model, 1 selects Skip-gram
    hs = 0          # hierarchical softmax switch; 0 leaves it unused
    negative = 10   # number of negative examples used by negative sampling
    emb_dim = 200   # size of the word vectors
    window = 5      # maximum skip length window between words

    # --- vocabulary -----------------------------------------------------
    minFreq = 5     # words appearing fewer than minFreq times are discarded
    sample = 0      # down-sampling threshold for high-frequency words

    # --- optimisation ---------------------------------------------------
    iterate = 20    # forwarded to the model as its ``iter`` argument
    nthread = 100   # forwarded to the model as its ``workers`` argument
    # The three settings below are declared but not referenced by the
    # training code visible in this cell.
    emb_lr = 0.05   # learning rate for SGD estimation
    nepoach = 10    # number of training epochs
    binary = 0      # 0 means not saved as .bin; 1 would mean binary format

    # File name (without directory) the trained model is stored under.
    # The "5w_10ng" prefix is a fixed literal; it is not derived from
    # ``window`` / ``negative``. Only sg values 0 and 1 define a name.
    if sg == 0:
        model_name = ''.join(('5w_10ng_am_w2v_cbow_', str(emb_dim), 'D'))
    elif sg == 1:
        model_name = ''.join(('5w_10ng_am_w2v_sg_', str(emb_dim), 'D'))

class corpus_sentences(object):
    """Re-iterable stream of tokenised sentences read from a directory.

    Every regular file directly inside ``dirname`` is read as UTF-8 text,
    one sentence per line, and each line is yielded as a list of
    whitespace-separated tokens. The files are reopened on every
    ``__iter__`` call, so the object can be iterated more than once
    without holding the corpus in memory.
    """

    def __init__(self, dirname):
        self.dirname = dirname

    def __iter__(self):
        for fname in os.listdir(self.dirname):
            path = os.path.join(self.dirname, fname)
            # Skip sub-directories (e.g. Jupyter's .ipynb_checkpoints);
            # opening one would raise and abort the whole pass.
            if not os.path.isfile(path):
                continue
            # ``with`` closes each file as soon as it has been consumed
            # instead of leaving the handle to the garbage collector.
            with open(path, encoding='utf8') as corpus_file:
                for line in corpus_file:
                    yield line.split()
    
def train_w2v_model():
    """Train a Word2Vec model on the preprocessed corpus, save it and return it.

    Sentences are streamed from ``PREPROCESSED_DIR``; hyper-parameters come
    from ``WordEmbeddingConfig``. The trained model is written to
    ``EMBEDDING_DIR + WordEmbeddingConfig.model_name``.

    Returns:
        The trained Word2Vec model.
    """
    # Make sure the output folder exists before the (long) training run so
    # the final save cannot fail on a missing directory.
    os.makedirs(EMBEDDING_DIR, exist_ok=True)

    print('Loading Sentences with memory freindly iterator ...\n')
    sentences = corpus_sentences(PREPROCESSED_DIR)  # streams files lazily, re-iterable

    model_type = 'CBOW' if WordEmbeddingConfig.sg == 0 else 'Skip-gram'
    print('Training Word2Vec '+model_type+' with '+str(WordEmbeddingConfig.emb_dim)+' dimension\n')

    # The corpus is handed to the constructor together with every setting.
    _model = Word2Vec(sentences, size=WordEmbeddingConfig.emb_dim, window=WordEmbeddingConfig.window,
                      min_count=WordEmbeddingConfig.minFreq, workers=WordEmbeddingConfig.nthread,
                      sg=WordEmbeddingConfig.sg, iter=WordEmbeddingConfig.iterate,
                      negative=WordEmbeddingConfig.negative, sample=WordEmbeddingConfig.sample,
                      hs=WordEmbeddingConfig.hs, sorted_vocab=1)

    # Trim unneeded model memory (uses much less RAM).
    _model.init_sims(replace=True)

    model_name = EMBEDDING_DIR + WordEmbeddingConfig.model_name
    _model.save(model_name)

    return _model
# Start training when the cell runs; the returned model is not bound to a
# name here (train_w2v_model() also saves it to disk).
train_w2v_model()

In [None]:
from gensim.models import Word2Vec,KeyedVectors
# Reload the saved CBOW Word2Vec model from disk, then run a quick sanity
# check: print the result of doesnt_match() over six Amharic words.
am_model= KeyedVectors.load('Model/5w_10ng_am_w2v_cbow_200D')
print(am_model.wv.doesnt_match("አንድ ሺህ ሚሊዮን አምስት ብዙ ጅማ".split()))

# Word2Vec using SentencePiece

In [None]:
import sentencepiece as spm,glob

In [None]:
# Train a SentencePiece model on every .txt file under processed/ with a
# 32000-entry vocabulary. Output files use the prefix 'amh_sp'; the training
# cell further down loads "amh_sp.model", presumably produced by this call.
spm.SentencePieceTrainer.Train(input=glob.glob('processed/*.txt'), vocab_size=32000, model_prefix='amh_sp')

In [None]:

import logging
import os
from gensim.models import Word2Vec,KeyedVectors
import sentencepiece as spm

# Configure root logging at INFO level; each record is rendered as
# "<time> : <level> : <message>".
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)
# Output folder for trained models / input folder with the preprocessed corpus.
EMBEDDING_DIR = 'Model/'
PREPROCESSED_DIR = 'processed/'
class WordEmbeddingConfig(object):
    """Hyper-parameters for the SentencePiece-based Word2Vec training run.

    All values are plain class attributes and are read straight off the
    class (``WordEmbeddingConfig.<name>``) by ``train_w2v_model``.
    """

    # --- architecture ---------------------------------------------------
    sg = 1          # 0 selects the CBOW model, 1 selects Skip-gram
    hs = 0          # hierarchical softmax switch; 0 leaves it unused
    negative = 15   # number of negative examples used by negative sampling
    emb_dim = 100   # size of the piece vectors
    window = 5      # maximum skip length window between tokens

    # --- vocabulary -----------------------------------------------------
    minFreq = 5     # tokens appearing fewer than minFreq times are discarded
    # Down-sampling threshold for high-frequency tokens.
    # NOTE(review): the Word2Vec constructor call in this cell does not pass
    # this value, so the library default applies -- confirm that is intended.
    sample = 0

    # --- optimisation ---------------------------------------------------
    iterate = 10    # forwarded to the model as its ``iter`` argument
    nthread = 100   # forwarded to the model as its ``workers`` argument
    # The three settings below are declared but not referenced by the
    # training code visible in this cell.
    emb_lr = 0.05   # learning rate for SGD estimation
    nepoach = 20    # number of training epochs
    binary = 0      # 0 means not saved as .bin; 1 would mean binary format

    # File name (without directory) the trained model is stored under.
    # Only sg values 0 and 1 define a name.
    if sg == 0:
        model_name = ''.join(('sp_am_w2v_cbow_', str(emb_dim), 'D'))
    elif sg == 1:
        model_name = ''.join(('sp_am_w2v_sg_', str(emb_dim), 'D'))

class corpus_sentences(object):
    """Re-iterable stream of SentencePiece-tokenised sentences from a directory.

    Every regular file directly inside ``dirname`` is read as UTF-8 text,
    one sentence per line, and each line is yielded as the list of pieces
    returned by ``EncodeAsPieces``. The files are reopened on every
    ``__iter__`` call, so the object can be iterated more than once.

    Args:
        dirname: directory containing the corpus files.
        sp_model_path: SentencePiece model file to load; defaults to
            "amh_sp.model", the path that was previously hard-coded.
    """

    def __init__(self, dirname, sp_model_path="amh_sp.model"):
        self.dirname = dirname
        self.sp = spm.SentencePieceProcessor()
        # Attribute kept for backward compatibility. It stores whatever
        # Load() returns (presumably a success flag), not the processor;
        # use ``self.sp`` for encoding.
        self.sp_model = self.sp.Load(sp_model_path)

    def __iter__(self):
        for fname in os.listdir(self.dirname):
            path = os.path.join(self.dirname, fname)
            # Skip sub-directories (e.g. Jupyter's .ipynb_checkpoints);
            # opening one would raise and abort the whole pass.
            if not os.path.isfile(path):
                continue
            # ``with`` closes each file as soon as it has been consumed
            # instead of leaving the handle to the garbage collector.
            with open(path, encoding='utf8') as corpus_file:
                for line in corpus_file:
                    yield self.sp.EncodeAsPieces(line)
    
def train_w2v_model():
    """Train a Word2Vec model over SentencePiece tokens, save it and return it.

    Sentences are streamed from ``PREPROCESSED_DIR`` and split into pieces by
    ``corpus_sentences``; hyper-parameters come from ``WordEmbeddingConfig``.
    The trained model is written to
    ``EMBEDDING_DIR + WordEmbeddingConfig.model_name``.

    Returns:
        The trained Word2Vec model.
    """
    # Make sure the output folder exists before the (long) training run so
    # the final save cannot fail on a missing directory.
    os.makedirs(EMBEDDING_DIR, exist_ok=True)

    print('Loading Sentences with memory freindly iterator ...\n')
    sentences = corpus_sentences(PREPROCESSED_DIR)  # streams files lazily, re-iterable

    model_type = 'CBOW' if WordEmbeddingConfig.sg == 0 else 'Skip-gram'
    print('Training Sentence Piece Word2Vec '+model_type+' with '+str(WordEmbeddingConfig.emb_dim)+' dimension\n')

    # NOTE(review): WordEmbeddingConfig.sample is not passed here, so the
    # library default down-sampling threshold applies -- confirm intended.
    _model = Word2Vec(sentences, size=WordEmbeddingConfig.emb_dim, window=WordEmbeddingConfig.window,
                      min_count=WordEmbeddingConfig.minFreq, workers=WordEmbeddingConfig.nthread,
                      sg=WordEmbeddingConfig.sg, iter=WordEmbeddingConfig.iterate,
                      negative=WordEmbeddingConfig.negative,
                      hs=WordEmbeddingConfig.hs, sorted_vocab=1)

    # Trim unneeded model memory (uses much less RAM).
    _model.init_sims(replace=True)

    model_name = EMBEDDING_DIR + WordEmbeddingConfig.model_name
    _model.save(model_name)

    return _model
# Start training when the cell runs; the returned model is not bound to a
# name here (train_w2v_model() also saves it to disk).
train_w2v_model()

In [None]:
from gensim.models import Word2Vec,KeyedVectors
# The SentencePiece training cell is configured with sg=1 and emb_dim=100, so
# the file it saves is 'Model/sp_am_w2v_sg_100D'. Load that file; a
# '..._cbow_100D' file is never produced by this notebook's current settings.
am_model= KeyedVectors.load('Model/sp_am_w2v_sg_100D')

In [None]:
# Report how many entries the loaded model's vocabulary holds.
print('Total Vocab: ',len(am_model.wv.vocab))

In [None]:
# Query nearest neighbours through the model's `.wv` vectors, matching the
# `.wv` access used for the vocabulary count; the model-level most_similar()
# is a deprecated wrapper in gensim 3.x.
# NOTE(review): this model is trained on SentencePiece pieces, which normally
# carry a leading word-boundary marker -- confirm 'በሬ' is in the vocabulary.
am_model.wv.most_similar('በሬ')

# Subword embedding from pretrained BPEmb

In [None]:
from bpemb import BPEmb
# Create a BPEmb instance for lang="am" with dim=50.
# NOTE(review): despite the `_en` suffix this object is configured for
# Amharic; the later cells in this section use `bpemb_am`, not this name.
bpemb_en = BPEmb(lang="am", dim=50)

In [None]:
# Create the BPEmb instance used by the following cells: lang="am" with
# vs=100000 (presumably the subword vocabulary size); dim is left at the
# library default.
bpemb_am = BPEmb(lang="am", vs=100000)

In [None]:
# Show how BPEmb encodes a single Amharic word; the notebook displays the result.
bpemb_am.encode('የአገልግሎቶችና')

In [None]:
# Look up the entries BPEmb ranks as most similar to 'በሬ'; the notebook displays the result.
bpemb_am.most_similar('በሬ')