In [1]:
# Mount Google Drive into the Colab filesystem at /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
# Switch the working directory to the project folder on the mounted Drive;
# the relative paths used later (./dict, ./data, ./checkpoints) resolve from here.
%cd '/content/gdrive/My Drive/UITws_v1'

/content/gdrive/My Drive/UITws_v1


In [3]:
import sklearn.svm as svm
import multiprocessing as mp
from UITws_v1 import WSUtils, WSCountVectorizer

import time
import datetime
import json
import pickle

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [4]:
# Record the wall-clock start time; the final cell subtracts it to report total runtime.
start = time.time()

# Log the timestamp at which the run began.
print(datetime.datetime.now())

2022-04-15 04:36:11.570422


In [5]:
# The value is later passed as NUM_PROCESSES to the prediction step.
cpu_count = mp.cpu_count() # Number of CPUs available for parallel prediction
print(cpu_count)

2


In [6]:
# Feature-set ablation: for every combination below, train a linear SVM word-segmentation
# model on the VLSP2013 training corpus, save a checkpoint named after the feature set,
# and print precision / recall / F1 (in percent) on the VLSP2013 test corpus.
for feature in [['base'],
                ['base', 'long'],
                ['base', 'sep'],
                ['base', 'sfx'],
                ['base', 'long', 'sep'],
                ['base', 'long', 'sfx'],
                ['base', 'sep', 'sfx'],
                ['base', 'long', 'sep', 'sfx']]:
    feature_name = '_'.join(feature)  # Also used as the checkpoint file stem
    print(f'feature_name: {feature_name}')

    # Reload the dictionary arguments on every iteration so each run starts from a fresh
    # dict. The context manager closes the file handle instead of leaking one per iteration.
    with open("./dict/vi_dict.json", "r", encoding="utf-8") as dict_file:
        utils_args = json.load(dict_file)
    # The 'long' feature uses a wider syllable window (8 instead of 5).
    utils_args["window_size"] = 8 if 'long' in feature else 5

    w_utils               = WSUtils(utils_args) # Utils object for Vietnamese Word Segmentation (RDRVNDict)
    training_sentences    = w_utils.read_ws_corpus('./data/VLSP2013_WS_train_gold.txt') # Read sentences for training

    X_syls_train, Y_train = w_utils.extract_training_pairs(training_sentences) # Extract training windows of syllables
    ratios                = w_utils.compute_ratios(training_sentences) # Extract separable syllables
    seen_words, seen_sfx  = w_utils.pop_seen_words_sfx(training_sentences, ratios) # Extract known words and unknown words containing suffixes

    vectorizer_WS         = WSCountVectorizer(utils=w_utils, ratios=ratios, feature_set=feature)
    X_train               = vectorizer_WS.fit_transform(X_syls_train) # Fit the vectorizer and transform the training windows

    # NOTE(review): no random_state is passed, so scores may differ slightly between runs - confirm.
    model_WS              = svm.LinearSVC(C=0.1) # Linear SVM model
    model_WS.fit(X_train, Y_train) # Train Vietnamese Word Segmentation model

    # Save everything needed to rebuild the model and vectorizer for inference.
    with open(f'./checkpoints/{feature_name}.pkl', 'wb') as handle:
        pickle.dump({'model_WS': model_WS,
                     'utils_args': w_utils.get_utils_args(),
                     'ratios': vectorizer_WS.ratios,
                     'feature_set': vectorizer_WS.feature_set,
                     'vocabulary_': vectorizer_WS.vocabulary_,
                     'fixed_vocabulary_': vectorizer_WS.fixed_vocabulary_,
                     'stop_words_': vectorizer_WS.stop_words_,
                     'seen_words': seen_words,
                     'seen_sfx': seen_sfx}, handle, protocol=pickle.HIGHEST_PROTOCOL)

    test_sentences        = w_utils.read_ws_corpus('./data/VLSP2013_WS_test_gold.txt') # Read sentences for testing

    # Predict on the test set using `cpu_count` worker processes.
    pred_stns_test, prf_t = w_utils.predict_list_of_sentence_ws(
        model=model_WS,
        vectorizer=vectorizer_WS,
        NUM_PROCESSES=cpu_count,
        list_stn=test_sentences,
        get_support=False,
        has_underscore=True,
    )

    print([round(100*i, 2) for i in prf_t]) # Precision, Recall, F1-score
    print('')

feature_name: base
[97.5, 98.19, 97.85]

feature_name: base_long
[97.63, 98.32, 97.98]

feature_name: base_sep
[97.61, 98.46, 98.04]

feature_name: base_sfx
[97.59, 98.24, 97.92]

feature_name: base_long_sep
[97.71, 98.51, 98.11]

feature_name: base_long_sfx
[97.74, 98.38, 98.06]

feature_name: base_sep_sfx
[97.76, 98.53, 98.14]

feature_name: base_long_sep_sfx
[97.81, 98.57, 98.19]



In [7]:
# Total elapsed wall-clock time, in seconds, since `start` was recorded.
end = time.time()
print(end - start)

# Log the timestamp at which the run finished.
print(datetime.datetime.now())

1803.8046896457672
2022-04-15 05:06:15.375803
