In [1]:
import os
import time
from multiprocessing import Pool
from functools import partial
import pandas as pd
import json
import pyspark

from pyspark import SparkContext
from py4j.java_gateway import java_import
from pyspark.mllib.common import _to_java_object_rdd

In [2]:
# Import vnTokenizer from Java
java_import(sc._gateway.jvm, "vn.vitk.tok.Tokenizer")
Tokenizer = sc._jvm.vn.vitk.tok.Tokenizer
dataFolder = os.getcwd() + '/dat/tok'
token = Tokenizer(sc._jsc, dataFolder + "/lexicon.xml", dataFolder + "/regexp.txt")

In [36]:
topics = ['GiaoDuc', 'PhapLuat', 'TheGioi', 'TheThao', 'ThoiSu']

In [38]:
BATCH = 100

In [44]:
start = time.time()
for topic in topics:
    read_path = os.getcwd() + '/Data/Raw/' + topic
    write_path = os.getcwd() + '/Data/Tokenized/' + topic

    # resume
    list_tokened = os.listdir(write_path)
    list_raw = os.listdir(read_path)

    if(len(list_tokened)):
        last_file = list_tokened[-1]
        ind = list_tokened.index(last_file)
    else:
        ind = 0

    print(topic + ' Start at ' + str(ind))
    list_raw = list_raw[ind:]
    
    # Batching
    batch_indices = list_raw[0::BATCH]
    print('Number of remaining batches ', len(batch_indices))
    print('Each batch is ', BATCH)
    
    count_null = 0
    count_write = 0
    count = 0
    
    for b in range(len(batch_indices)): # in each batch
        
        # get all filename in its batch
        if(b < len(batch_indices)-1):
            start_index = list_raw.index(batch_indices[b])
            end_index = list_raw.index(batch_indices[b+1])
            list_file_name = list_raw[start_index:end_index]
        else:
            start_index = list_raw.index(batch_indices[b])
            list_file_name = list_raw[start_index:]            
        
        string_batch = ''
        for filename in list_file_name:
            # read data
            # read_time = time.time()

            with open(read_path+ '/'+ filename,'r') as fd:
                json_data = json.load(fd)

            title = json_data['title']
            title = title.replace('\t', ' ')
            title = title.replace('\n', ' ')

            content = json_data['content']
            content = content.replace('\t', ' ')
            content = content.replace('\n', ' ')

            title = ' '.join(title.split())
            content = ' '.join(content.split())

            if(content==title==""):
                count_null+=1
                bug_path = os.getcwd() + '/Data/log/null.txt' 
                with open(bug_path,'a') as fd:
                    fd.write(read_path+ '/'+ filename)
                    fd.write('\n')
                continue

            string_batch += filename + ' =========-----=========== '
            string_batch += title + ' '
            string_batch += content
            string_batch += ' ------=====-------------'
        
#         print(len(string_batch))
        string_batch_toked = token.tokenizeOneLine(string_batch)
        articles = string_batch_toked.split(' ------=====-------------')
#         print(len(articles))
        for article in articles:
            if(article):
#                 try:
                article_filename = article.split(' =========-----=========== ')[0].strip()
                article_content = article.split(' =========-----=========== ')[1].strip()
                # write data
                count_write += 1
                with open(write_path+ '/' + article_filename, 'w') as fd:
                    fd.write(article_content)
#                 except:
#                     pass
#                     print(string_batch)
#                     print('xxx----xxxx-x-x-xx-x-xxx-x-x-x-')
#                     print(article)
        count += BATCH
        if(count%1000==0):
            print('percent of topic', len(os.listdir(write_path))*100/len(os.listdir(read_path)))
            end = time.time()
            print('count null', count_null)
            print('count write', count_write)
            print('time', end - start)
            print('================')
            count_null = 0
            count_write = 0
            count = 0

GiaoDuc Start at 72918
Number of remaining batches  21
Each batch is  100
percent of topic 97.3122656239574
count null 225
count write 775
time 42.85903477668762
percent of topic 97.3122656239574
count null 235
count write 765
time 85.32816886901855
PhapLuat Start at 53291
Number of remaining batches  7
Each batch is  100
TheGioi Start at 2671
Number of remaining batches  742
Each batch is  100
percent of topic 4.7500032543184805
count null 6
count write 994
time 176.7807195186615
percent of topic 6.041316827429999
count null 8
count write 992
time 220.9223177433014
percent of topic 7.333932127933768
count null 7
count write 993
time 263.98820090293884
percent of topic 8.618737064084039
count null 13
count write 987
time 309.3694727420807
percent of topic 9.911352364587808
count null 7
count write 993
time 358.8764612674713
percent of topic 11.19355384595358
count null 15
count write 985
time 409.04786491394043
percent of topic 12.436703505551867
count null 45
count write 955
time 458.

percent of topic 45.720257559923134
count null 53
count write 947
time 5013.5677921772
percent of topic 48.91952938003573
count null 51
count write 949
time 5065.631062269211
percent of topic 52.13565721606041
count null 46
count write 954
time 5117.88457608223
percent of topic 55.405724303003744
count null 30
count write 970
time 5184.049728393555
percent of topic 58.51734484037353
count null 77
count write 923
time 5239.2453553676605
percent of topic 60.82324781714594
count null 316
count write 684
time 5282.905552387238
percent of topic 63.53369517580825
count null 196
count write 804
time 5327.03117275238
percent of topic 66.81387587229882
count null 27
count write 973
time 5379.323719739914
percent of topic 70.14462461652563
count null 12
count write 988
time 5436.795388460159
percent of topic 73.44840373529313
count null 20
count write 980
time 5491.370729207993
percent of topic 76.7049860095068
count null 34
count write 966
time 5544.164760351181
percent of topic 79.954825877355