In [1]:
import os
from bs4 import BeautifulSoup
from pathlib import Path
import csv
import pandas as pd

import nltk

nltk.download('punkt')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
def convert_to_conll(text):
    soup = BeautifulSoup(text, "html.parser")
    tokens_with_labels = []
    
    for elem in soup.find_all(string=True):
        if elem.strip() == "":
            continue
        parent = elem.parent
        entity_type = parent.get("type")
        words = word_tokenize(elem.strip())  # Dùng nltk để tách từ
        
        if entity_type:
            tokens_with_labels.append((words[0], f"B-{entity_type}"))  # B- tag
            for word in words[1:]:
                tokens_with_labels.append((word, f"I-{entity_type}"))  # I- tag
        else:
            for word in words:
                tokens_with_labels.append((word, "O"))  # Outside entity (O)
    
    return tokens_with_labels


In [4]:
convert_to_conll("""<ENAMEX TYPE="LOCATION">Trung Quốc</ENAMEX>, mạnh tay hạ cát-xê đóng phim dàn sao hạng A""")

[('Trung', 'B-LOCATION'),
 ('Quốc', 'I-LOCATION'),
 (',', 'O'),
 ('mạnh', 'O'),
 ('tay', 'O'),
 ('hạ', 'O'),
 ('cát-xê', 'O'),
 ('đóng', 'O'),
 ('phim', 'O'),
 ('dàn', 'O'),
 ('sao', 'O'),
 ('hạng', 'O'),
 ('A', 'O')]

In [6]:
root_folder = Path(r"F:\Năm_3-HK2\CS321\server\src\data\NER-20211011T010131Z-001\NER\nervlsp2018\nervlsp2018\VLSP2018-NER-train\VLSP2018-NER-train-Jan14")

In [None]:
data = []

for index, file_path in enumerate(root_folder.rglob("*.muc")):
     try:
          with open(file_path,"r", encoding="utf-8") as file:
               raw_text = file.read()
          data.append([index, file_path, raw_text])
     except Exception as e:
          print(f"Error reading file {file_path}: {e}")

csv_path = root_folder / "train.csv"
with open(csv_path, "w", newline="", encoding="utf-8") as file:
     writer = csv.writer(file)
     writer.writerow(["index", "file_path", "text"])
     writer.writerows(data)

print(f"Saved {len(data)} records to {csv_path}")


Saved 781 records to F:\Năm_3-HK2\CS321\server\src\data\NER-20211011T010131Z-001\NER\nervlsp2018\nervlsp2018\VLSP2018-NER-train\VLSP2018-NER-train-Jan14\train.csv


In [7]:
from underthesea import sent_tokenize
csv_path = root_folder / "train.csv"
df = pd.read_csv(csv_path)

train_data = df['text']
train_data


0      Chuyện đau khi tiêm\n\nNỗi đau của người bệnh ...
1      10 hiện tượng kỳ lạ xảy ra trong khi bạn ngủ\n...
2      Chất coumarin trong thuốc lá nhập lậu gây hại ...
3      <ENAMEX TYPE="PERSON">Hòa Minzy</ENAMEX> tố tr...
4      Quà tặng cuộc sống: Yêu thương không chờ đợi\n...
                             ...                        
776    2,2 tỷ đồng ủng hộ đồng bào <ENAMEX TYPE="LOCA...
777    Tăng cường hợp tác giữa các tổ chức phi Chính ...
778    Cơ sở chế biến thủy sản gây ô nhiễm\n\nDù cơ s...
779    Trạm xe buýt nhếch nhác\n\nTrạm xe buýt gần ng...
780    Sẽ giảm phí hàng loạt trạm BOT\n\n<ENAMEX TYPE...
Name: text, Length: 781, dtype: object

In [8]:
preprocessed_train_data = []
for text in train_data:
     sentences = sent_tokenize(text)
     sentences = [convert_to_conll(sentence) for sentence in sentences]
     preprocessed_train_data+=sentences

In [10]:
preprocessed_train_data[15]

[('Một', 'O'),
 ('dược', 'O'),
 ('sĩ', 'O'),
 ('có', 'O'),
 ('kinh', 'O'),
 ('nghiệm', 'O'),
 ('lâm', 'O'),
 ('sàng', 'O'),
 ('mấy', 'O'),
 ('chục', 'O'),
 ('năm', 'O'),
 ('trong', 'O'),
 ('dùng', 'O'),
 ('thuốc', 'O'),
 ('giảm', 'O'),
 ('đau', 'O'),
 ('cho', 'O'),
 ('bệnh', 'O'),
 ('nhân', 'O'),
 ('ung', 'O'),
 ('thư', 'O'),
 ('giai', 'O'),
 ('đoạn', 'O'),
 ('cuối', 'O'),
 (',', 'O'),
 ('chuyên', 'O'),
 ('gia', 'O'),
 ('của', 'O'),
 ('Tổ', 'B-ORGANIZATION'),
 ('chức', 'I-ORGANIZATION'),
 ('y', 'I-ORGANIZATION'),
 ('tế', 'I-ORGANIZATION'),
 ('Thế', 'I-ORGANIZATION'),
 ('giới', 'I-ORGANIZATION'),
 ('về', 'O'),
 ('chăm', 'O'),
 ('sóc', 'O'),
 ('giảm', 'O'),
 ('nhẹ', 'O'),
 ('có', 'O'),
 ('kể', 'O'),
 (':', 'O'),
 ('bệnh', 'O'),
 ('nhân', 'O'),
 ('ung', 'O'),
 ('thư', 'O'),
 (',', 'O'),
 ('do', 'O'),
 ('bị', 'O'),
 ('đau', 'O'),
 ('triền', 'O'),
 ('miên', 'O'),
 (',', 'O'),
 ('kéo', 'O'),
 ('dài', 'O'),
 (',', 'O'),
 ('họ', 'O'),
 ('hiểu', 'O'),
 ('về', 'O'),
 ('cơn', 'O'),
 ('đau', 'O'),

In [11]:
def word2features(sent, i):
    word = sent[i][0]
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
    }
    if i > 0:
        word1 = sent[i-1][0]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
        })
    else:
        features['BOS'] = True
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
        })
    else:
        features['EOS'] = True
    return features
def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, label in sent]

def sent2tokens(sent):
    return [token for token, label in sent]

In [50]:
from sklearn.model_selection import train_test_split

# Tạo tập feature và label
X = [sent2features(s) for s in preprocessed_train_data]
y = [sent2labels(s) for s in preprocessed_train_data]

# Chia dữ liệu train = 1000, test = 1000
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=300, test_size=100, random_state=42
)

# Kiểm tra kích thước tập dữ liệu
print(f"Train size: {len(X_train)}, Test size: {len(X_test)}")


Train size: 300, Test size: 100


In [51]:
X_train[2]

[{'bias': 1.0,
  'word.lower()': 'ảnh',
  'word[-3:]': 'Ảnh',
  'word[-2:]': 'nh',
  'word.isupper()': False,
  'word.istitle()': True,
  'word.isdigit()': False,
  'BOS': True,
  '+1:word.lower()': ':',
  '+1:word.istitle()': False,
  '+1:word.isupper()': False},
 {'bias': 1.0,
  'word.lower()': ':',
  'word[-3:]': ':',
  'word[-2:]': ':',
  'word.isupper()': False,
  'word.istitle()': False,
  'word.isdigit()': False,
  '-1:word.lower()': 'ảnh',
  '-1:word.istitle()': True,
  '-1:word.isupper()': False,
  '+1:word.lower()': 'foody.vn',
  '+1:word.istitle()': False,
  '+1:word.isupper()': False},
 {'bias': 1.0,
  'word.lower()': 'foody.vn',
  'word[-3:]': '.vn',
  'word[-2:]': 'vn',
  'word.isupper()': False,
  'word.istitle()': False,
  'word.isdigit()': False,
  '-1:word.lower()': ':',
  '-1:word.istitle()': False,
  '-1:word.isupper()': False,
  '+1:word.lower()': '.',
  '+1:word.istitle()': False,
  '+1:word.isupper()': False},
 {'bias': 1.0,
  'word.lower()': '.',
  'word[-3:]': 

In [52]:
%%time
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: total: 859 ms
Wall time: 870 ms


In [53]:
labels = list(crf.classes_)
labels.remove('O')
labels

['B-ORGANIZATION',
 'I-ORGANIZATION',
 'B-LOCATION',
 'I-LOCATION',
 'B-PERSON',
 'I-PERSON',
 'B-MISCELLANEOUS',
 'I-MISCELLANEOUS']

In [54]:
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels)

0.5324911249667502

In [55]:
# group B and I results
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

                 precision    recall  f1-score   support

     B-LOCATION      0.721     0.500     0.591        88
     I-LOCATION      0.711     0.607     0.655        89
B-MISCELLANEOUS      1.000     0.167     0.286         6
I-MISCELLANEOUS      1.000     0.083     0.154        12
 B-ORGANIZATION      0.600     0.279     0.381        43
 I-ORGANIZATION      0.488     0.260     0.339        77
       B-PERSON      0.731     0.543     0.623        35
       I-PERSON      0.757     0.757     0.757        37

      micro avg      0.681     0.463     0.551       387
      macro avg      0.751     0.399     0.473       387
   weighted avg      0.676     0.463     0.532       387



In [33]:
import pickle

with open("crf_model.pkl", "wb") as f:
     pickle.dump(crf, f)

with open("crf_model.pkl", "rb") as f:
     loaded_model = pickle.load(f)

In [35]:
test_sent = """Học sinh trường, Nguyễn Huệ. Tôi ở Huế"""

test_sent = sent_tokenize(test_sent)
pre_sent = [convert_to_conll(sent) for sent in test_sent]
pre_sent = [sent2features(s) for s in pre_sent]

crf.predict(pre_sent)

array([list(['O', 'O', 'O', 'O', 'B-PERSON', 'I-PERSON', 'O']),
       list(['O', 'O', 'B-LOCATION'])], dtype=object)

In [57]:
from collections import Counter

def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

print("Top likely transitions:")
print_transitions(Counter(crf.transition_features_).most_common(20))

print("\nTop unlikely transitions:")
print_transitions(Counter(crf.transition_features_).most_common()[-20:])

Top likely transitions:
B-LOCATION -> I-LOCATION 6.159420
I-ORGANIZATION -> I-ORGANIZATION 5.090562
B-ORGANIZATION -> I-ORGANIZATION 4.985935
I-MISCELLANEOUS -> I-MISCELLANEOUS 4.475509
B-MISCELLANEOUS -> I-MISCELLANEOUS 4.395166
I-LOCATION -> I-LOCATION 4.329938
B-PERSON -> I-PERSON 4.175034
I-PERSON -> I-PERSON 3.392488
O      -> O       2.526575
B-ORGANIZATION -> B-LOCATION 1.757095
I-ORGANIZATION -> B-LOCATION 1.370370
O      -> B-PERSON 0.900135
O      -> B-MISCELLANEOUS 0.837638
O      -> B-ORGANIZATION 0.747948
O      -> B-LOCATION 0.602809
I-LOCATION -> B-ORGANIZATION 0.585503
I-ORGANIZATION -> B-ORGANIZATION 0.450746
I-ORGANIZATION -> B-PERSON 0.203053
B-MISCELLANEOUS -> I-PERSON -0.000017
I-MISCELLANEOUS -> B-PERSON -0.012799

Top unlikely transitions:
B-ORGANIZATION -> I-PERSON -0.562853
B-PERSON -> I-ORGANIZATION -0.605317
B-LOCATION -> I-PERSON -0.620580
B-PERSON -> O       -0.644018
I-LOCATION -> O       -0.645291
I-PERSON -> B-PERSON -0.810470
B-PERSON -> B-PERSON -0.821

In [22]:
def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-8s %s" % (weight, label, attr))

print("Top positive:")
print_state_features(Counter(crf.state_features_).most_common(30))

print("\nTop negative:")
print_state_features(Counter(crf.state_features_).most_common()[-30:])

Top positive:
5.501903 O        BOS
4.633844 O        bias
3.768639 I-LOCATION +1:word.lower():,
3.535465 O        EOS
3.531515 B-ORGANIZATION -1:word.lower():(
3.491343 B-PERSON -1:word.lower():ông
3.461747 B-ORGANIZATION word[-3:]:ank
3.461747 B-ORGANIZATION word[-2:]:nk
3.286244 B-LOCATION -1:word.lower():ở
3.264489 B-PERSON -1:word.lower():bà
3.083158 B-LOCATION +1:word.lower():,
3.048917 B-LOCATION word[-2:]:ia
2.954745 B-ORGANIZATION word[-2:]:le
2.934610 I-ORGANIZATION +1:word.lower():,
2.906658 B-PERSON word.istitle()
2.771158 B-ORGANIZATION +1:word.lower():,
2.763061 B-LOCATION word[-3:]:HCM
2.763061 B-LOCATION word[-2:]:CM
2.728897 B-ORGANIZATION word[-2:]:NA
2.726796 B-ORGANIZATION +1:word.lower():)
2.666674 B-PERSON +1:word.lower():,
2.664998 I-PERSON +1:word.lower():,
2.605291 O        -1:word.lower():iphone
2.578221 B-PERSON -1:word.lower():anh
2.568096 B-LOCATION word.lower():lào
2.568096 B-LOCATION word[-3:]:Lào
2.523456 B-ORGANIZATION word.lower():pvtex
2.523456 B-ORGA