#### Load input data

In [1]:
import numpy
import gensim
from gensim.models.word2vec import Word2Vec
from pathlib import Path
import datetime
import glob
import os

TaggedDocument = gensim.models.doc2vec.TaggedDocument
homePath = str(Path.home()) + "/MLClassificationData"
dataPath = homePath + "/w2v/target"
modelPath = homePath + "/w2v/models/"
vecPath = homePath + "/w2v/vectors/"
n_dim = 100
sentences = []
cuDir = os.getcwd()

def prepareData(path, sentences):
    os.chdir(path);
    for ff in glob.glob("*"):
        if os.path.isdir(ff):
            dPath = path + "/" + ff
            prepareData(dPath)
            continue
        fPath = path + "/" + ff
        count = 0
        with open(fPath, 'r', encoding='UTF-8') as f:
            for line in f:
                if len(line.strip()) == 0:
                    continue
                count += 1
                label = '%s_%s'%(ff, count)
                sentences.append(TaggedDocument(line.strip().split(), [label]))
                if len(sentences)%100 == 0:
                    print("Load %d lines"%(len(sentences)), end='\r')
        f.close()
        print ("Got %d lines from file %s"%(count, fPath))

def showTime(ds,de):
    result = ''
    seconds = (de-ds).total_seconds()
    hh = int(seconds/(60*24));
    if hh > 0:
        result = "%d h:"%(hh);
    seconds -= hh*60*24
    mm = int(seconds/60);
    if mm > 0:
        result += "%d min:"%(mm)
    ss = seconds - mm*60;
    result += "%d sec"%(ss)
    return result

ds = datetime.datetime.now()                                 
prepareData(dataPath, sentences)
de = datetime.datetime.now()
print ("At all: got %d lines in %s"%(len(sentences), showTime(ds,de)))

Got 2947557 lines from file /home/user/MLClassificationData/w2v/target/wiki_ar.txt
At all: got 2947557 lines in 3 min:7 sec


#### Create and train Word2Vec model

In [2]:
from gensim.models.callbacks import CallbackAny2Vec

class EpochLogger(CallbackAny2Vec):
    def __init__(self):
        self.epoch = 1
    
    def on_epoch_begin(self, model):
        print ("Epoch %d"%(self.epoch), end='\r')
    
    def on_epoch_end(self, model):
        self.epoch += 1

logger = EpochLogger()        
w2v = Word2Vec(size=n_dim, window=10, min_count=3, workers=10)
ds = datetime.datetime.now()   
print ("Build vocabulary...")
w2v.build_vocab([x[0] for x in sentences])
de = datetime.datetime.now()
print ("Vocabulary is built in %s"%(showTime(ds,de)))
print ("Train model...")
ds = datetime.datetime.now()  
w2v.train([x[0] for x in sentences], epochs=100, total_examples=len(sentences), callbacks=[logger])
de = datetime.datetime.now()
print ("W2V model is completed in %s"%(showTime(ds,de)))

Build vocabulary...
Vocabulary is built in 1 min:51 sec
Train model...
W2V model is completed in 9 h:6 min:42 sec


#### Check quality of the Word2Vec model
This can be done, for example, by using __most_similar()__ method.    
Most (if not all) of the words in its output should have direct connection with the input.

In [4]:
w2v.wv.most_similar('مال')

  if np.issubdtype(vec.dtype, np.int):


[('اموال', 0.7828260660171509),
 ('لمال', 0.6755176782608032),
 ('قرض', 0.6673386096954346),
 ('مدخراة', 0.648013710975647),
 ('ينفق', 0.6403728723526001),
 ('مبلغ', 0.6283261775970459),
 ('مدخرات', 0.6184841394424438),
 ('اموالا', 0.6087712049484253),
 ('ادخار', 0.6072934865951538),
 ('ثروة', 0.5970333218574524)]

#### Save Word2Vec model in binary format.
Model saved in binary format can be reloaded in the future using **gensim.models.Word2Vec.load()**.    
Then it can be used for
- word embedding
- creating file of vectors
- re-train by additional corpora.  

_Note: though the gensim interface allows to re-train the existing model, it is not recommended.    
The practice would be, when new data arrives, to shuffle it with the "old" data and retrain a fresh model with all the data._

In [16]:
modelPath = homePath + "/w2v/models/"
modelName = "model-%s"%(datetime.datetime.now().strftime("%Y-%b-%d-%H%M%S"))
ds = datetime.datetime.now() 
w2v.save(modelPath + modelName)
de = datetime.datetime.now()
print ("W2V model is saved in binary format in %s\n"%(showTime(ds,de)))

W2V model is saved in the binary format in 1 sec



#### Save Word2Vec model in text format (file of vectors)
Model saved in text format can be reloaded in the future using **gensim.models.KeyedVectors.load_word2vec_format()**.    
Then it can be used for word embedding.

In [17]:
vecPath = homePath + "/w2v/vectors/"
ds = datetime.datetime.now() 
w2v.wv.save_word2vec_format(vecPath + modelName + ".vec", binary=False)
de = datetime.datetime.now() 
print ("W2V model is saved in the text format in %s\n"%(showTime(ds,de)))

W2V model is saved in the text format in 39 sec

