#### Tokenization with Stanford POS Tagger
Source text file(s) are splitted by sentences and sentences are tokenized by wrapper of Stanford POS Tagger (**ArabicStanfordPOSTagger.jar**), which uses Arabic specific model and properties.
It is assumed here, that all source text files will be placed in the folder ~/MLClassificationData.    
Original and resulting (tokenized) files will have the same names and will be placed in the same file's tree, under the same root folder:
- original files - under subfolder _source_, 
- tokenized files - under subfolder _target_.

_**Note:**_ you should change value of variable _root_ to set actual name of the root folder.


In [8]:
import os
import glob
import subprocess   
import threading
import datetime
import shutil
from pathlib import Path
from subprocess import Popen, PIPE

taggerPath = str(Path.home()) + "/MLClassification/tokenizer/arabic/stanford/taggers/ArabicStanfordPOSTagger.jar" 

# Change value of this variable 
root = "test"
homePath = str(Path.home()) + "/MLClassificationData"
inPath = homePath + "/" + root + "/source"
lines = 0

def tokenizeData(path):
    global lines
    curDir = os.getcwd()
    os.chdir(path);
    for ff in glob.glob("*"):
        if os.path.isdir(ff):
            dPath = path + "/" + ff
            tPath = dPath.replace("source","target")
            if os.path.exists(tPath):
                shutil.rmtree(tPath)
            os.mkdir(tPath)
            tokenizeData(dPath)
            continue
        fPath = path + "/" + ff
        ds = datetime.datetime.now()
        outPath = fPath.replace("source", "target")
        srv = subprocess.Popen("java -Xmx2g -jar " + taggerPath + " " + fPath + " " + outPath, 
                stdout=subprocess.PIPE, shell=True) 
        srv.wait();
        reply = srv.communicate()
        rArr = reply[0].decode().split('\n');
        count = 0
        for i in range(len(rArr)):
            arr = rArr[i].split(' ')
            if len(arr) > 2 and arr[2] == "contains":
                count = int(arr[3]);
                break
        lines += count
        print ("Created file %s contains %d sentences."%(fPath, count))
    return lines
        
def showTime(ds,de):
    result = ''
    seconds = (de-ds).total_seconds()
    hh = int(seconds/(60*24));
    if hh > 0:
        result = "%d h:"%(hh);
    seconds -= hh*60*24
    mm = int(seconds/60);
    if mm > 0:
        result += "%d min:"%(mm)
    ss = seconds - mm*60;
    result += "%d sec"%(ss)
    return result
        
ds = datetime.datetime.now()        
tokenizeData(inPath);
de = datetime.datetime.now()
print ("At all: wrote %d lines in %s"%(lines, showTime(ds,de)))

Created file /home/user/MLClassificationData/test/source/docs.txt contains 34 sentences.
Created file /home/user/MLClassificationData/test/source/Fold1/docs.txt contains 34 sentences.
Created file /home/user/MLClassificationData/test/source/Fold1/Fold2/docs.txt contains 34 sentences.
Created file /home/user/MLClassificationData/test/source/Fold1/zdocs.txt contains 34 sentences.
Created file /home/user/MLClassificationData/test/source/zdocs.txt contains 34 sentences.
At all: wrote 170 lines in 7 sec
