In [1]:
# Mount Google Drive inside the Colab VM at /content/gdrive so the project
# files stored on Drive are reachable from this notebook.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
# Make the project folder on the mounted Drive the working directory; the
# './checkpoints/...' and './data/...' paths used later are relative to it.
%cd '/content/gdrive/My Drive/UITws_v1'

/content/gdrive/My Drive/UITws_v1


In [3]:
import sklearn.svm as svm
import multiprocessing as mp
from UITws_v1 import WSUtils, WSCountVectorizer

import time
import datetime
import json
import pickle

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [4]:
# Print the current local date and time as a record of when this cell was executed.
print(datetime.datetime.now())

2022-04-15 05:45:41.657528


In [5]:
# CPU count as reported by multiprocessing; it is passed as NUM_PROCESSES to
# the multiprocessing prediction run further down.
cpu_count = mp.cpu_count() # Number of CPUs available for parallel prediction
print(cpu_count)

2


In [6]:
# Testing with multiprocessing
#
# For each feature-set checkpoint: restore the vectorizer and the model from
# the pickle, run prediction on the VLSP2013 gold test file with
# NUM_PROCESSES=cpu_count, and print precision / recall / F1 (in percent).
# Only the prediction call is timed; loading the checkpoint and reading the
# corpus are excluded from `total_time`.

total_time = 0

for feature in [['base'],
                ['base', 'long'],
                ['base', 'sep'],
                ['base', 'sfx'],
                ['base', 'long', 'sep'],
                ['base', 'long', 'sfx'],
                ['base', 'sep', 'sfx'],
                ['base', 'long', 'sep', 'sfx']]:
    # Checkpoint files are named after the feature names joined by '_'.
    feature_name = '_'.join(feature)
    print(f'feature_name: {feature_name}')

    # NOTE(security): pickle.load can execute arbitrary code while unpickling;
    # only load checkpoint files that come from a trusted source.
    with open(f'./checkpoints/{feature_name}.pkl', 'rb') as handle:
        checkpoint = pickle.load(handle)

    # Utils object for Vietnamese Word Segmentation (RDRVNDict)
    w_utils = WSUtils(checkpoint['utils_args'])

    # Rebuild the vectorizer and restore its fitted state from the checkpoint.
    vectorizer_WS = WSCountVectorizer(utils=w_utils,
                                      ratios=checkpoint['ratios'],
                                      feature_set=checkpoint['feature_set'])
    vectorizer_WS.vocabulary_       = checkpoint['vocabulary_']
    vectorizer_WS.fixed_vocabulary_ = checkpoint['fixed_vocabulary_']
    vectorizer_WS.stop_words_       = checkpoint['stop_words_']
    model_WS                        = checkpoint['model_WS']

    # Everything needed is bound to its own name now; drop the dict reference.
    del checkpoint

    # Read sentences for testing
    test_sentences = w_utils.read_ws_corpus('./data/VLSP2013_WS_test_gold.txt')

    # time.perf_counter() is monotonic, so the measured interval cannot be
    # distorted by system clock adjustments the way time.time() can.
    start = time.perf_counter()
    pred_stns_test, prf_t = w_utils.predict_list_of_sentence_ws(
        model=model_WS,
        vectorizer=vectorizer_WS,
        NUM_PROCESSES=cpu_count,
        list_stn=test_sentences,
        get_support=False,
        has_underscore=True,
    ) # Predict
    end = time.perf_counter()
    total_time += (end - start)

    print([round(100*i, 2) for i in prf_t]) # Precision, Recall, F1-score
    print('')


print('total_time:', total_time)

feature_name: base
[97.5, 98.19, 97.85]

feature_name: base_long
[97.63, 98.32, 97.98]

feature_name: base_sep
[97.61, 98.46, 98.04]

feature_name: base_sfx
[97.59, 98.24, 97.92]

feature_name: base_long_sep
[97.71, 98.51, 98.11]

feature_name: base_long_sfx
[97.74, 98.38, 98.06]

feature_name: base_sep_sfx
[97.76, 98.53, 98.14]

feature_name: base_long_sep_sfx
[97.81, 98.57, 98.19]

total_time: 120.67523312568665


In [7]:
# Testing with batchifying
#
# Same evaluation as the multiprocessing run, but prediction goes through
# `segment_words` with batch_size=512. That call returns only the predicted
# sentences, so precision / recall / F1 are computed here from the per-sentence
# support counts. Only the prediction call is timed.

total_time = 0

for feature in [['base'],
                ['base', 'long'],
                ['base', 'sep'],
                ['base', 'sfx'],
                ['base', 'long', 'sep'],
                ['base', 'long', 'sfx'],
                ['base', 'sep', 'sfx'],
                ['base', 'long', 'sep', 'sfx']]:
    # Checkpoint files are named after the feature names joined by '_'.
    feature_name = '_'.join(feature)
    print(f'feature_name: {feature_name}')

    # NOTE(security): pickle.load can execute arbitrary code while unpickling;
    # only load checkpoint files that come from a trusted source.
    with open(f'./checkpoints/{feature_name}.pkl', 'rb') as handle:
        checkpoint = pickle.load(handle)

    # Utils object for Vietnamese Word Segmentation (RDRVNDict)
    w_utils = WSUtils(checkpoint['utils_args'])

    # Rebuild the vectorizer and restore its fitted state from the checkpoint.
    vectorizer_WS = WSCountVectorizer(utils=w_utils,
                                      ratios=checkpoint['ratios'],
                                      feature_set=checkpoint['feature_set'])
    vectorizer_WS.vocabulary_       = checkpoint['vocabulary_']
    vectorizer_WS.fixed_vocabulary_ = checkpoint['fixed_vocabulary_']
    vectorizer_WS.stop_words_       = checkpoint['stop_words_']
    model_WS                        = checkpoint['model_WS']

    # Everything needed is bound to its own name now; drop the dict reference.
    del checkpoint

    # Read sentences for testing
    test_sentences = w_utils.read_ws_corpus('./data/VLSP2013_WS_test_gold.txt')

    # time.perf_counter() is monotonic, so the measured interval cannot be
    # distorted by system clock adjustments the way time.time() can.
    start = time.perf_counter()
    # The gold sentences have their '_' replaced by spaces before being passed
    # in as the input texts.
    pred_stns_test = w_utils.segment_words(
        model=model_WS,
        vectorizer=vectorizer_WS,
        texts=[i.replace('_', ' ') for i in test_sentences],
        pre_tokenized=True,
        batch_size=512,
    ) # Predict with batchifying
    end = time.perf_counter()
    total_time += (end - start)

    # Accumulate the three support counts over all sentences; each predicted
    # sentence is compared with the gold sentence at the same index.
    nb_correct, nb_output, nb_ref = 0, 0, 0
    for idx, p_sentence in enumerate(pred_stns_test):
        n_c, n_p, n_r = w_utils.get_support(w_utils.exact_wordboundary(test_sentences[idx]),
                                            w_utils.exact_wordboundary(p_sentence))
        nb_correct += n_c
        nb_output  += n_p
        nb_ref     += n_r

    # Guard every ratio against a zero denominator (e.g. an empty corpus or no
    # predicted words) instead of raising ZeroDivisionError.
    precision = nb_correct/nb_output if nb_output > 0 else 0
    recall    = nb_correct/nb_ref if nb_ref > 0 else 0
    f1_score  = 2*precision*recall/(precision+recall) if precision+recall > 0 else 0

    print([round(precision*100,2), round(recall*100,2), round(f1_score*100,2)]) # Precision, Recall, F1-score
    print('')


print('total_time:', total_time)

feature_name: base
[97.5, 98.19, 97.85]

feature_name: base_long
[97.63, 98.32, 97.98]

feature_name: base_sep
[97.61, 98.46, 98.04]

feature_name: base_sfx
[97.59, 98.24, 97.92]

feature_name: base_long_sep
[97.71, 98.51, 98.11]

feature_name: base_long_sfx
[97.74, 98.38, 98.06]

feature_name: base_sep_sfx
[97.76, 98.53, 98.14]

feature_name: base_long_sep_sfx
[97.81, 98.57, 98.19]

total_time: 64.51355910301208
