In [1]:
import warnings
warnings.filterwarnings('ignore')
import copy as cp
import math
import operator
import os

import numpy as np
import sefr_cut
from sefr_cut.preprocessing import preprocess
prepro = preprocess()
import sefr_cut.extract_features as extract_features
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support

import matplotlib.pyplot as plt

'''
read more about pycrfsuite: https://python-crfsuite.readthedocs.io/en/latest/
'''
import pycrfsuite

# Prepare dataset

In [2]:
'''
Variables produced by this cell:
path_corpus : list of corpus folders; put your training corpus in corpus/ and put the folder name here
y_pred : y from deepcut, shape (#sentence, #character in sentence) ex. [[1,0,0,0,.....,0],[1,0,0,1,....,0]]
y_entropy : entropy calculated from y_prob, shape (#sentence, #character in sentence) ex. [[0.01,0.1,0.15,.....,0],[0.01,0.2,0.45,.....,0]]
y_prob : probability from the softmax layer, shape (#sentence, #character in sentence) ex. [[0.01,0.1,0.15,.....,0],[0.01,0.2,0.45,.....,0]]
'''

# Template: path_corpus = ['CORPUS_FOLDER_NAME']
path_corpus = ['corpus/TH_lyrics']

# Read the corpus into nested inputs (x) and gold labels (y_true).
x, y_true = prepro.preprocess_x_y(path_corpus)

# Flatten one nesting level (2D -> 1D), keeping only entries longer than one element.
y_true = [
    labels
    for group in y_true
    for labels in group
    if len(labels) > 1
]
x = [
    sentence
    for group in x
    for sentence in group
    if len(sentence) > 1
]

# Run the base tokenizer on every sentence (DeepCut Baseline/BEST+WS/WS).
y_pred, y_entropy, y_prob = prepro.predict_(x)



In [4]:
# Extract the CRF features of every sentence; the sentence index is passed along
# with the full entropy/probability lists.
X_data = [
    extract_features.extract_features_crf(sentence, sent_idx, y_entropy, y_prob)
    for sent_idx, sentence in enumerate(x)
]

# Convert every gold label to its string form.
y_data = [[str(label) for label in sent_labels] for sent_labels in y_true]

# Flatten 2D -> 1D: one entry per character instead of one list per sentence.
X_data_1d = [char_feats for sent_feats in X_data for char_feats in sent_feats]
y_data_1d = [label for sent_labels in y_data for label in sent_labels]


In [9]:
# Sample data: display the entry at index 21 of the flattened feature list.
# Character type from DeepCut
X_data_1d[21]

[{'bias': 'b',
  'char': 'ล',
  'entropy': 2.1099696347570087e-09,
  'prob': 8.732831124902418e-11,
  'start': False,
  'end': False,
  'char_[-1]': 'อ',
  'ctype[-1]': 'c',
  'char_[-2]': 'ม',
  'ctype[-2]': 'c',
  'char_[-3]': 'ส',
  'ctype[-3]': 'c',
  'char_[-4]': 'ะ',
  'ctype[-4]': 'v',
  'char_[+1]': 'ท',
  'ctype[+1]': 'c',
  'char_[+2]': 'า',
  'ctype[+2]': 'v',
  'dict_start': False,
  'dict_end': False}]

In [16]:
# Report the dataset size: sentences before flattening, entries after flattening.
n_sentences = len(x)
n_characters = len(X_data_1d)
print("Number of sentences in dataset:", n_sentences)
print("Number of characters in dataset:", n_characters)

Number of sentences in dataset: 2844
Number of characters in dataset: 70658


In [17]:
# Use the whole dataset for training (no hold-out split).
# To hold out 10% of the data as a test set instead, use:
# X_train, X_test, y_train, y_test = train_test_split(X_data_1d, y_data_1d, test_size=0.1, random_state=99)
X_train, y_train = X_data_1d, y_data_1d

In [18]:
# File name of the CRF model; the training cell writes it to model/<CRF_model_name>.
CRF_model_name = 'my_model.model'

# Train model

In [19]:
# Train model
# Create the output directory up front so the run cannot fail on a missing
# 'model/' folder when the trained model is written to disk.
os.makedirs('model', exist_ok=True)

# verbose=True prints the optimizer log for every iteration;
# use pycrfsuite.Trainer(verbose=False) to silence it.
trainer = pycrfsuite.Trainer(verbose=True)

# Register every (features, label) pair as one training instance.
for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

trainer.set_params({
    'c1': 0.01,  # L1 regularization coefficient
    'c2': 0.01,  # L2 regularization coefficient
    'max_iterations': 1000,
    'feature.possible_transitions': True,
})

# Save the trained model under model/ using the name chosen in CRF_model_name.
trainer.train(f'model/{CRF_model_name}')

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 789
Seconds required: 0.062

L-BFGS optimization
c1: 0.010000
c2: 0.010000
num_memories: 6
max_iterations: 1000
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 48702.956978
Feature norm: 1.000000
Error norm: 49019.854233
Active features: 771
Line search trials: 1
Line search step: 0.000019
Seconds required for this iteration: 0.040

***** Iteration #2 *****
Loss: 34844.344130
Feature norm: 0.768047
Error norm: 29443.012769
Active features: 727
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.022

***** Iteration #3 *****
Loss: 27607.624424
Feature norm: 0.979594
Error norm: 19668.471971
Active features: 701
Line search trials: 1
Line search step: 1.000000
Seconds required for this i

# Test model

In [26]:
# Load the model written by the training cell. The path is derived from
# CRF_model_name so that renaming the model cannot leave this cell pointing
# at a stale or missing file.
model_path = f'model/{CRF_model_name}'
sefr_cut.SEFR_CUT.load_model(engine='my_model', model_path=model_path)

loading model.....
Success


<contextlib.closing at 0x221eb580430>

In [22]:
# Tokenize a few sample strings with the loaded model and print the result.
sample_sentences = ['อเวย์ฟรอมโฮม', 'แด๊ดเวย์อิสมายช้อย', 'เดอะซันบีตติ่งดาว', 'เทกคอนโทลออฟยัวไลฟ์']
sample_tokens = sefr_cut.tokenize(sample_sentences, k=40)
print(sample_tokens)

[['อเวย์', 'ฟรอม', 'โฮม'], ['แด๊ด', 'เวย์', 'อิส', 'มาย', 'ช้อย'], ['เดอะ', 'ซัน', 'บีตติ่ง', 'ดาว'], ['เทกคอน', 'โทล', 'ออฟ', 'ยัว', 'ไลฟ์']]
