In [1]:
import pycrfsuite
from sklearn.model_selection import train_test_split


In [2]:
def load_data(filename):
    sentences = []
    sentence = []
    labels = []
    label = []
    with open(filename, 'r') as file:
        for line in file:
            line = line.strip()
            if line:
                word, pos_tag, chunk_tag = line.split()
                sentence.append((word, pos_tag))
                label.append(chunk_tag)
            else:
                sentences.append(sentence)
                labels.append(label)
                sentence = []
                label = []
    return sentences, labels

In [3]:
training_file = "./train.txt/train.txt"
test_file = "./test.txt/test.txt"

# Load and preprocess data
sentences_train, labels_train = load_data(training_file)
sentences_test, labels_test = load_data(test_file)
print(sentences_train[0])
print(labels_train[0])

[('Confidence', 'NN'), ('in', 'IN'), ('the', 'DT'), ('pound', 'NN'), ('is', 'VBZ'), ('widely', 'RB'), ('expected', 'VBN'), ('to', 'TO'), ('take', 'VB'), ('another', 'DT'), ('sharp', 'JJ'), ('dive', 'NN'), ('if', 'IN'), ('trade', 'NN'), ('figures', 'NNS'), ('for', 'IN'), ('September', 'NNP'), (',', ','), ('due', 'JJ'), ('for', 'IN'), ('release', 'NN'), ('tomorrow', 'NN'), (',', ','), ('fail', 'VB'), ('to', 'TO'), ('show', 'VB'), ('a', 'DT'), ('substantial', 'JJ'), ('improvement', 'NN'), ('from', 'IN'), ('July', 'NNP'), ('and', 'CC'), ('August', 'NNP'), ("'s", 'POS'), ('near-record', 'JJ'), ('deficits', 'NNS'), ('.', '.')]
['B-NP', 'B-PP', 'B-NP', 'I-NP', 'B-VP', 'I-VP', 'I-VP', 'I-VP', 'I-VP', 'B-NP', 'I-NP', 'I-NP', 'B-SBAR', 'B-NP', 'I-NP', 'B-PP', 'B-NP', 'O', 'B-ADJP', 'B-PP', 'B-NP', 'B-NP', 'O', 'B-VP', 'I-VP', 'I-VP', 'B-NP', 'I-NP', 'I-NP', 'B-PP', 'B-NP', 'I-NP', 'I-NP', 'B-NP', 'I-NP', 'I-NP', 'O']


In [4]:
def transform_to_crf_format(sentences):
    X = []
    for sentence in sentences:
        sentence_features = []
        for word, pos_tag in sentence:
            features = {
                'bias': 1.0,
                'word.lower()': word.lower(),
                'word[-3:]': word[-3:],
                'word[-2:]': word[-2:],
                'word.isupper()': word.isupper(),
                'word.istitle()': word.istitle(),
                'word.isdigit()': word.isdigit(),
                'pos_tag': pos_tag
            }
            sentence_features.append(features)
        X.append(sentence_features)
    return X


In [5]:
def convert_labels(labels):
    new_labels = []
    for sentence_labels in labels:
        new_sentence_labels = []
        for label in sentence_labels:
            if label.startswith('B-'):
                new_sentence_labels.append('B')
            elif label.startswith('I-'):
                new_sentence_labels.append('I')
            else:
                new_sentence_labels.append(label)
        new_labels.append(new_sentence_labels)
    return new_labels


In [6]:
x_train = transform_to_crf_format(sentences_train)
y_train = convert_labels(labels_train)
print(x_train[0])
print(y_train[0])

[{'bias': 1.0, 'word.lower()': 'confidence', 'word[-3:]': 'nce', 'word[-2:]': 'ce', 'word.isupper()': False, 'word.istitle()': True, 'word.isdigit()': False, 'pos_tag': 'NN'}, {'bias': 1.0, 'word.lower()': 'in', 'word[-3:]': 'in', 'word[-2:]': 'in', 'word.isupper()': False, 'word.istitle()': False, 'word.isdigit()': False, 'pos_tag': 'IN'}, {'bias': 1.0, 'word.lower()': 'the', 'word[-3:]': 'the', 'word[-2:]': 'he', 'word.isupper()': False, 'word.istitle()': False, 'word.isdigit()': False, 'pos_tag': 'DT'}, {'bias': 1.0, 'word.lower()': 'pound', 'word[-3:]': 'und', 'word[-2:]': 'nd', 'word.isupper()': False, 'word.istitle()': False, 'word.isdigit()': False, 'pos_tag': 'NN'}, {'bias': 1.0, 'word.lower()': 'is', 'word[-3:]': 'is', 'word[-2:]': 'is', 'word.isupper()': False, 'word.istitle()': False, 'word.isdigit()': False, 'pos_tag': 'VBZ'}, {'bias': 1.0, 'word.lower()': 'widely', 'word[-3:]': 'ely', 'word[-2:]': 'ly', 'word.isupper()': False, 'word.istitle()': False, 'word.isdigit()': Fa

In [7]:
x_test = transform_to_crf_format(sentences_test)
y_test = convert_labels(labels_test)

In [8]:
# Đánh giá độ chính xác

def evaluate_accuracy(y_true, y_pred):
    correct = 0
    total = 0
    for true_labels, pred_labels in zip(y_true, y_pred):
        correct += sum(1 for true, pred in zip(true_labels,
                       pred_labels) if true == pred)
        total += len(true_labels)
    accuracy = correct / total
    return accuracy


In [9]:
# Tính toán các thống kê

def compute_statistics(y_true, y_pred):
    tp = 0  # True Positive
    fp = 0  # False Positive
    fn = 0  # False Negative

    for true_labels, pred_labels in zip(y_true, y_pred):
        for true, pred in zip(true_labels, pred_labels):
            if true == 'B' and pred == 'B':
                tp += 1
            elif true == 'B' and pred == 'I':
                fn += 1
            elif true == 'I' and pred == 'B':
                fp += 1

    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f1_score = 2 * (precision * recall) / (precision + recall)

    return precision, recall, f1_score

In [10]:
crf_model = pycrfsuite.Trainer()
for x, y in zip(x_train, y_train):
    crf_model.append(x, y)
crf_model.train('model.crfsuite')

# Dự đoán nhãn cho dữ liệu kiểm tra
tagger = pycrfsuite.Tagger()
tagger.open('model.crfsuite')
y_pred = [tagger.tag(x) for x in x_test]


Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 0
0....1....2....3....4....5....6....7....8....9....10
Number of features: 29190
Seconds required: 0.163

L-BFGS optimization
c1: 0.000000
c2: 1.000000
num_memories: 6
max_iterations: 2147483647
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 187586.460751
Feature norm: 1.000000
Error norm: 48772.013510
Active features: 29190
Line search trials: 1
Line search step: 0.000011
Seconds required for this iteration: 0.125

***** Iteration #2 *****
Loss: 166690.259729
Feature norm: 1.451927
Error norm: 36913.118222
Active features: 29190
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.059

***** Iteration #3 *****
Loss: 122995.999235
Feature norm: 3.448444
Error norm: 27734.932802
Active features: 29190
Line search trials: 1
Line search step: 1.000000
Seconds re

In [11]:
# Đánh giá độ chính xác
accuracy = evaluate_accuracy(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8664541866306436


In [12]:
# Tính toán các thống kê
precision, recall, f1_score = compute_statistics(y_test, y_pred)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1_score)

Precision: 0.9100036179450073
Recall: 0.8458531253940897
F1 Score: 0.8767564976797892
