#### Tokenization with Stanford POS Tagger
Source text files are split into sentences, and the sentences are tokenized by a wrapper around the Stanford POS Tagger (**ArabicStanfordPOSTagger.jar**), which uses an Arabic-specific model and properties.
It is assumed here that all source text files are placed in the folder ~/MLClassificationData.    
Original and resulting (tokenized) files have the same names and are placed in the same directory tree, under the same root folder:
- original files - under subfolder _source_, 
- tokenized files - under subfolder _target_.

_**Note:**_ change the value of the variable _root_ to the actual name of the root folder.


In [2]:
import os
import glob
import subprocess   
import threading
import datetime
import shutil
from pathlib import Path
from subprocess import Popen, PIPE

# Location of the tagger wrapper jar, under the current user's home directory.
taggerPath = "%s/MLClassification/tokenizer/arabic/stanford/taggers/ArabicStanfordPOSTagger.jar" % Path.home()

# Change value of this variable: data set folder, relative to homePath.
root = "train/rtanews"
homePath = "%s/MLClassificationData" % Path.home()
# Source files are read from the "source" subfolder of the data set folder.
inPath = "/".join((homePath, root, "source"))

# Running totals that are passed to, and returned from, tokenizeData.
lines = folders = files = 0

# False: single-line progress counter; True: one message per created file.
verbose = False

def _parseSentenceCount(report):
    """Return the sentence count announced in the tagger output, or 0.

    Looks for the first line of *report* whose third space-separated token is
    "contains" and returns the integer that follows it.
    """
    for line in report.split('\n'):
        tokens = line.split(' ')
        # Index 3 is read below, so at least four tokens are required.
        if len(tokens) > 3 and tokens[2] == "contains":
            return int(tokens[3])
    return 0


def tokenizeData(path, lines, folders, files):
    """Recursively tokenize every file found under *path*.

    Each non-hidden entry of *path* is handled as follows:
    - a directory: the matching target directory is (re)created empty and
      the directory is processed recursively;
    - a file: the tagger jar (module-level ``taggerPath``) is run with the
      file and its target path as arguments.

    Target paths are derived by replacing "source" with "target" in the
    source path.

    Args:
        path: directory to process.
        lines, folders, files: running totals accumulated so far.

    Returns:
        The updated ``(lines, folders, files)`` tuple.  When the tagger
        cannot be started, or its output contains "Error", a message is
        printed and ``lines`` is returned as -1 so that callers unpacking
        three values can test ``lines >= 0``.
    """
    curDir = os.getcwd()
    os.chdir(path)
    try:
        # NOTE(review): str.replace swaps EVERY occurrence of "source" in the
        # path, so a file or folder whose name contains "source" is renamed
        # in the target tree.
        # Make sure the directory that receives this level's files exists.
        os.makedirs(path.replace("source", "target"), exist_ok=True)
        for ff in glob.glob("*"):
            fPath = path + "/" + ff
            if os.path.isdir(fPath):
                tPath = fPath.replace("source", "target")
                # Start from an empty target directory on every run.
                if os.path.exists(tPath):
                    shutil.rmtree(tPath)
                os.makedirs(tPath)
                folders += 1
                if not verbose:
                    print ("Create folders: %d, files: %d, lines: %d"%(folders, files, lines), end='\r')
                lines, folders, files = tokenizeData(fPath, lines, folders, files)
                if lines < 0:
                    # Propagate a failure reported deeper in the tree.
                    return lines, folders, files
                continue
            outPath = fPath.replace("source", "target")
            # An argument list (no shell) keeps file names containing quotes,
            # spaces or shell metacharacters intact.
            command = ["java", "-Xmx2g", "-jar", taggerPath, fPath, outPath]
            try:
                # run() drains stdout while waiting, so a large report cannot
                # block the child on a full pipe.
                proc = subprocess.run(command, stdout=subprocess.PIPE)
            except OSError as err:
                print ("!!! When handle %s : %s"%(fPath, err))
                return -1, folders, files
            rep = proc.stdout.decode(errors="replace")
            if "Error" in rep:
                print ("!!! When handle %s : %s"%(fPath, rep))
                return -1, folders, files
            files += 1
            count = _parseSentenceCount(rep)
            lines += count
            if verbose:
                print ("Created file %s contains %d sentences."%(outPath, count))
            else:
                print ("Create folders: %d, files: %d, lines: %d"%(folders, files, lines), end='\r')
    finally:
        # Leave the caller's working directory untouched.
        os.chdir(curDir)
    return lines, folders, files
        
def showTime(ds,de):
    """Format the time elapsed from *ds* to *de* as "H h:M min:S sec".

    The hours and minutes parts are left out when they are zero; the seconds
    part is always present and is truncated to a whole number.
    """
    remaining = (de - ds).total_seconds()
    pieces = []

    hours = int(remaining / 3600)
    if hours > 0:
        pieces.append("%d h:" % hours)
    remaining -= hours * 3600

    minutes = int(remaining / 60)
    if minutes > 0:
        pieces.append("%d min:" % minutes)

    pieces.append("%d sec" % (remaining - minutes * 60))
    return "".join(pieces)
        
# Tokenize the whole source tree, timing the run from start to finish.
ds = datetime.datetime.now()
lines, folders, files = tokenizeData(inPath, lines, folders, files)

# The summary is skipped when the returned line count is negative.
if not lines < 0:
    de = datetime.datetime.now()
    print (
        "At all: create %d folders and %d files containing %d lines in %s"
        % (folders, files, lines, showTime(ds, de))
    )

At all: create 40 folders and 16610 files containing 18840 lines in 12 h:19 min:43 sec
