#### Load input data
It is assumed here, that all related files are placed in the folder ~/MLClassificationData/w2v:    
- source files with tokenized content - under subfolder _target_, 
- models, saved in binary format - under subfolder _models_,
- models, saved in text format = under subfolder _vectors_.    

Script works recursively and merge contents of all files, found in subfolder _target_ and all its subfolders.    
Models are saved with unique names, containing the date and time of their creating.

In [1]:
import numpy
import gensim
from gensim.models.word2vec import Word2Vec
from pathlib import Path
import datetime
import glob
import os
import re
from nltk.stem.util import suffix_replace

homePath = str(Path.home()) + "/MLClassificationData"
dataPath = homePath + "/w2v/target"
modelPath = homePath + "/w2v/models/"
vecPath = homePath + "/w2v/vectors/"
n_dim = 100
sentences = []
cuDir = os.getcwd()

class ArabicNormalizer(object):
    __vocalization = re.compile(r'[\u064b-\u064c-\u064d-\u064e-\u064f-\u0650-\u0651-\u0652]')
    __kasheeda = re.compile(r'[\u0640]') # tatweel/kasheeda
    __arabic_punctuation_marks = re.compile(r'[\u060C-\u061B-\u061F]')
    __last_hamzat = ('\u0623', '\u0625', '\u0622', '\u0624', '\u0626') 
    __initial_hamzat = re.compile(r'^[\u0622\u0623\u0625]')
    __waw_hamza = re.compile(r'[\u0624]') 
    __yeh_hamza = re.compile(r'[\u0626]')
    __alefat = re.compile(r'[\u0623\u0622\u0625]')

    def normalize(self, token):
        """
        :param token: string
        :return: normalized token type string
        """
        # strip diacritics
        token = self.__vocalization.sub('', token)
        #strip kasheeda
        token = self.__kasheeda.sub('', token)
        # strip punctuation marks
        token = self.__arabic_punctuation_marks.sub('', token)
        # normalize last hamza
        for hamza in self.__last_hamzat:
            if token.endswith(hamza):
                token = suffix_replace(token, hamza, '\u0621')
                break
        # normalize other hamzat
        token = self.__initial_hamzat.sub('\u0627', token)
        token = self.__waw_hamza.sub('\u0648', token)
        token = self.__yeh_hamza.sub('\u064a', token)
        token = self.__alefat.sub('\u0627', token)
        return token

normalizer = ArabicNormalizer()

def prepareData(path, sentences):
    global normalizer
    os.chdir(path);
    for ff in glob.glob("*"):
        if os.path.isdir(ff):
            dPath = path + "/" + ff
            prepareData(dPath)
            continue
        fPath = path + "/" + ff
        count = 0
        with open(fPath, 'r', encoding='UTF-8') as f:
            for line in f:
                if len(line.strip()) == 0:
                    continue
                count += 1
                normalized_words = [normalizer.normalize(s) for s in line.strip().split()]
                sentences.append(normalized_words)
                if len(sentences)%100 == 0:
                    print("Load %d lines"%(len(sentences)), end='\r')
        f.close()
        print ("Got %d lines from file %s"%(count, fPath))

def showTime(ds,de):
    result = ''
    seconds = (de-ds).total_seconds()
    hh = int(seconds/(60*24));
    if hh > 0:
        result = "%d h:"%(hh);
    seconds -= hh*60*24
    mm = int(seconds/60);
    if mm > 0:
        result += "%d min:"%(mm)
    ss = seconds - mm*60;
    result += "%d sec"%(ss)
    return result

ds = datetime.datetime.now()                                 
prepareData(dataPath, sentences)
de = datetime.datetime.now()
print ("At all: got %d lines in %s"%(len(sentences), showTime(ds,de)))

Got 3879950 lines from file /home/user/MLClassificationData/w2v/target/wiki_ar.txt
At all: got 3879950 lines in 8 min:20 sec


In [2]:
numpy.random.shuffle(sentences)

#### Create and train Word2Vec model
Model is created with 100D vectors and trained in 100 epochs.

In [3]:
from gensim.models.callbacks import CallbackAny2Vec

class EpochLogger(CallbackAny2Vec):
    def __init__(self):
        self.epoch = 1
    
    def on_epoch_begin(self, model):
        print ("Epoch %d"%(self.epoch), end='\r')
    
    def on_epoch_end(self, model):
        self.epoch += 1

logger = EpochLogger()        
w2v = Word2Vec(size=n_dim, window=10, min_count=3, workers=10)
ds = datetime.datetime.now()   
print ("Build vocabulary...")
w2v.build_vocab(sentences)
de = datetime.datetime.now()
print ("Vocabulary is built in %s"%(showTime(ds,de)))
print ("Train model...")
ds = datetime.datetime.now()  
w2v.train(sentences, epochs=100, total_examples=len(sentences), callbacks=[logger])
de = datetime.datetime.now()
print ("W2V model is completed in %s"%(showTime(ds,de)))

Build vocabulary...
Vocabulary is built in 1 min:38 sec
Train model...
W2V model is completed in 14 h:15 min:1 sec


#### Check quality of the Word2Vec model
This can be done, for example, by using __most_similar()__ method.    
Most (if not all) of the words in its output should have direct connection with the input.

In [4]:
w2v.wv.most_similar('مال')

[('اموال', 0.7668006420135498),
 ('ماله', 0.6916441321372986),
 ('قرض', 0.6779076457023621),
 ('مالها', 0.6629146337509155),
 ('ادخار', 0.6276471614837646),
 ('مدخراته', 0.6235713958740234),
 ('اموالها', 0.6218032240867615),
 ('ربح', 0.6210994124412537),
 ('بالمال', 0.6199471354484558),
 ('بمال', 0.6176666021347046)]

#### Save Word2Vec model in binary format.
Model saved in binary format can be reloaded in the future using **gensim.models.Word2Vec.load()**.    
Then it can be used for
- word embedding
- creating file of vectors
- re-train by additional corpora.  

_Note: though the gensim interface allows to re-train the existing model, it is not recommended.    
The practice would be, when new data arrives, to shuffle it with the "old" data and retrain a fresh model with all the data._

In [5]:
modelPath = homePath + "/w2v/models/"
modelName = "model-%s"%(datetime.datetime.now().strftime("%Y-%b-%d-%H%M%S"))
ds = datetime.datetime.now() 
w2v.save(modelPath + modelName)
de = datetime.datetime.now()
print ("W2V model %s is saved in binary format in %s\n"%(modelName, showTime(ds,de)))

W2V model model-2019-Feb-20-223316 is saved in binary format in 4 sec



#### Save Word2Vec model in text format (file of vectors)
Model saved in text format can be reloaded in the future using **gensim.models.KeyedVectors.load_word2vec_format()**.    
Then it can be used for word embedding.

In [6]:
vecPath = homePath + "/w2v/vectors/"
ds = datetime.datetime.now() 
w2v.wv.save_word2vec_format(vecPath + modelName + ".vec", binary=False)
de = datetime.datetime.now() 
print ("W2V model %s is saved in the text format in %s\n"%(modelName, showTime(ds,de)))

W2V model model-2019-Feb-20-223316 is saved in the text format in 57 sec

