In [2]:
import os
from bs4 import BeautifulSoup
from pathlib import Path
import csv
import pandas as pd

In [3]:
def convert_to_conll(text):
    """Convert inline entity markup into a list of ``(token, BIO-tag)`` pairs.

    The text is parsed with BeautifulSoup's ``html.parser`` and every
    non-blank text node is split on whitespace. Tokens whose immediate parent
    tag has a truthy ``type`` attribute are tagged ``B-<type>`` (first token
    of the node) and ``I-<type>`` (remaining tokens); all other tokens are
    tagged ``O``.

    Notes:
        * Only whitespace separates tokens, so punctuation stays glued to the
          neighbouring word.
        * Every text node opens its own ``B-`` tag; only the *immediate*
          parent of a node is consulted for the entity type.

    Args:
        text: Markup string (e.g. one sentence with ``<ENAMEX TYPE="...">``
            spans).

    Returns:
        list[tuple[str, str]]: Token/tag pairs in document order.
    """
    labelled_tokens = []
    markup = BeautifulSoup(text, "html.parser")

    for node in markup.find_all(string=True):
        content = node.strip()
        if not content:
            # Whitespace-only nodes carry no tokens.
            continue

        entity_type = node.parent.get("type")
        # `content` is non-empty after stripping, so there is at least one word.
        words = content.split()

        if entity_type:
            tags = [f"B-{entity_type}"] + [f"I-{entity_type}"] * (len(words) - 1)
        else:
            tags = ["O"] * len(words)

        labelled_tokens.extend(zip(words, tags))

    return labelled_tokens


In [7]:
# Collect every annotated *.muc document under the training folder into a
# single CSV with columns (index, file_path, text).
# NOTE(review): this is an absolute, machine-specific path; consider deriving
# it from a configurable data directory so the notebook runs elsewhere.
root_folder = Path(r"F:\Năm_3-HK2\CS321\server\src\data\NER-20211011T010131Z-001\NER\nervlsp2018\nervlsp2018\VLSP2018-NER-train\VLSP2018-NER-train-Jan14")

data = []

# Directory enumeration order is not guaranteed, so sort the matches: this
# keeps `index`, the CSV row order, and therefore the later seeded
# train/test split identical from run to run.
for index, file_path in enumerate(sorted(root_folder.rglob("*.muc"))):
    try:
        raw_text = file_path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError) as e:
        # Best effort: report unreadable / badly encoded files and move on.
        # Anything else is a programming error and should surface.
        print(f"Error reading file {file_path}: {e}")
        continue
    data.append([index, file_path, raw_text])

csv_path = root_folder / "train.csv"
# newline="" lets the csv module control line endings, which matters because
# the document texts themselves contain embedded newlines.
with open(csv_path, "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["index", "file_path", "text"])
    writer.writerows(data)

print(f"Saved {len(data)} records to {csv_path}")


Saved 781 records to F:\Năm_3-HK2\CS321\server\src\data\NER-20211011T010131Z-001\NER\nervlsp2018\nervlsp2018\VLSP2018-NER-train\VLSP2018-NER-train-Jan14\train.csv


In [9]:
from underthesea import sent_tokenize

# Read back the CSV produced by the previous cell (relies on its `csv_path`).
df = pd.read_csv(csv_path)

# Only the raw document text is needed from here on; the bare name on the
# last line makes the notebook preview the column.
train_data = df.loc[:, "text"]
train_data


0      Chuyện đau khi tiêm\n\nNỗi đau của người bệnh ...
1      10 hiện tượng kỳ lạ xảy ra trong khi bạn ngủ\n...
2      Chất coumarin trong thuốc lá nhập lậu gây hại ...
3      <ENAMEX TYPE="PERSON">Hòa Minzy</ENAMEX> tố tr...
4      Quà tặng cuộc sống: Yêu thương không chờ đợi\n...
                             ...                        
776    2,2 tỷ đồng ủng hộ đồng bào <ENAMEX TYPE="LOCA...
777    Tăng cường hợp tác giữa các tổ chức phi Chính ...
778    Cơ sở chế biến thủy sản gây ô nhiễm\n\nDù cơ s...
779    Trạm xe buýt nhếch nhác\n\nTrạm xe buýt gần ng...
780    Sẽ giảm phí hàng loạt trạm BOT\n\n<ENAMEX TYPE...
Name: text, Length: 781, dtype: object

In [10]:
# Flatten the corpus into one list of sentences, where each sentence is the
# list of (token, BIO-tag) pairs returned by convert_to_conll.
# Sentence splitting runs on the raw markup, before the tags are parsed.
preprocessed_train_data = []
for text in train_data:
    sentences = list(map(convert_to_conll, sent_tokenize(text)))
    preprocessed_train_data.extend(sentences)

In [15]:
# Spot-check: display converted sentences 15..19 to eyeball the (token, tag) pairs.
preprocessed_train_data[15:20]

[[('Một', 'O'),
  ('dược', 'O'),
  ('sĩ', 'O'),
  ('có', 'O'),
  ('kinh', 'O'),
  ('nghiệm', 'O'),
  ('lâm', 'O'),
  ('sàng', 'O'),
  ('mấy', 'O'),
  ('chục', 'O'),
  ('năm', 'O'),
  ('trong', 'O'),
  ('dùng', 'O'),
  ('thuốc', 'O'),
  ('giảm', 'O'),
  ('đau', 'O'),
  ('cho', 'O'),
  ('bệnh', 'O'),
  ('nhân', 'O'),
  ('ung', 'O'),
  ('thư', 'O'),
  ('giai', 'O'),
  ('đoạn', 'O'),
  ('cuối,', 'O'),
  ('chuyên', 'O'),
  ('gia', 'O'),
  ('của', 'O'),
  ('Tổ', 'B-ORGANIZATION'),
  ('chức', 'I-ORGANIZATION'),
  ('y', 'I-ORGANIZATION'),
  ('tế', 'I-ORGANIZATION'),
  ('Thế', 'I-ORGANIZATION'),
  ('giới', 'I-ORGANIZATION'),
  ('về', 'O'),
  ('chăm', 'O'),
  ('sóc', 'O'),
  ('giảm', 'O'),
  ('nhẹ', 'O'),
  ('có', 'O'),
  ('kể:', 'O'),
  ('bệnh', 'O'),
  ('nhân', 'O'),
  ('ung', 'O'),
  ('thư,', 'O'),
  ('do', 'O'),
  ('bị', 'O'),
  ('đau', 'O'),
  ('triền', 'O'),
  ('miên,', 'O'),
  ('kéo', 'O'),
  ('dài,', 'O'),
  ('họ', 'O'),
  ('hiểu', 'O'),
  ('về', 'O'),
  ('cơn', 'O'),
  ('đau', 'O'),
  (

In [16]:
def word2features(sent, i):
    """Build the feature dict for the token at position ``i`` of ``sent``.

    Args:
        sent: Sequence of items whose first element is the token string
            (for example ``(token, label)`` pairs).
        i: Index of the token to describe.

    Returns:
        dict: A constant bias, the token's lower-cased form, its last three
        and last two characters, and its upper/title/digit flags, followed by
        the lower-cased form and title/upper flags of the previous and next
        tokens. When there is no previous (next) token, the single flag
        ``'BOS'`` (``'EOS'``) is set instead of that neighbour's features.
    """
    token = sent[i][0]

    features = {'bias': 1.0}
    features['word.lower()'] = token.lower()
    features['word[-3:]'] = token[-3:]
    features['word[-2:]'] = token[-2:]
    features['word.isupper()'] = token.isupper()
    features['word.istitle()'] = token.istitle()
    features['word.isdigit()'] = token.isdigit()

    # Left neighbour first, then right neighbour, so the key order of the
    # resulting dict is stable.
    context = (
        (i > 0, i - 1, 'BOS',
         ('-1:word.lower()', '-1:word.istitle()', '-1:word.isupper()')),
        (i < len(sent) - 1, i + 1, 'EOS',
         ('+1:word.lower()', '+1:word.istitle()', '+1:word.isupper()')),
    )
    for has_neighbour, position, boundary_key, keys in context:
        if not has_neighbour:
            # Sentence boundary: mark it rather than describing a neighbour.
            features[boundary_key] = True
            continue
        key_lower, key_title, key_upper = keys
        neighbour = sent[position][0]
        features[key_lower] = neighbour.lower()
        features[key_title] = neighbour.istitle()
        features[key_upper] = neighbour.isupper()

    return features
def sent2features(sent):
    """Return the list of per-token feature dicts for every position in ``sent``."""
    features = []
    for position in range(len(sent)):
        features.append(word2features(sent, position))
    return features

def sent2labels(sent):
    """Return the labels of a sentence given as ``(token, label)`` pairs, in order."""
    labels = []
    for _token, label in sent:
        labels.append(label)
    return labels

def sent2tokens(sent):
    """Return the tokens of a sentence given as ``(token, label)`` pairs, in order."""
    tokens = []
    for token, _label in sent:
        tokens.append(token)
    return tokens

In [22]:
from sklearn.model_selection import train_test_split

# Build the feature and label sequences, one entry per sentence
X = list(map(sent2features, preprocessed_train_data))
y = list(map(sent2labels, preprocessed_train_data))

# Split the data: 1000 sentences for training, 1000 for testing
# (fixed random_state so the shuffle is seeded)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=1000,
    train_size=1000,
    random_state=42,
)

# Check the size of each split
print(f"Train size: {len(X_train)}, Test size: {len(X_test)}")


Train size: 1000, Test size: 1000


In [25]:
# Inspect the per-token feature dicts of one training sentence (index 2).
X_train[2]

[{'bias': 1.0,
  'word.lower()': 'kodak',
  'word[-3:]': 'dak',
  'word[-2:]': 'ak',
  'word.isupper()': False,
  'word.istitle()': True,
  'word.isdigit()': False,
  'BOS': True,
  '+1:word.lower()': 'phát',
  '+1:word.istitle()': False,
  '+1:word.isupper()': False},
 {'bias': 1.0,
  'word.lower()': 'phát',
  'word[-3:]': 'hát',
  'word[-2:]': 'át',
  'word.isupper()': False,
  'word.istitle()': False,
  'word.isdigit()': False,
  '-1:word.lower()': 'kodak',
  '-1:word.istitle()': True,
  '-1:word.isupper()': False,
  '+1:word.lower()': 'triển',
  '+1:word.istitle()': False,
  '+1:word.isupper()': False},
 {'bias': 1.0,
  'word.lower()': 'triển',
  'word[-3:]': 'iển',
  'word[-2:]': 'ển',
  'word.isupper()': False,
  'word.istitle()': False,
  'word.isdigit()': False,
  '-1:word.lower()': 'phát',
  '-1:word.istitle()': False,
  '-1:word.isupper()': False,
  '+1:word.lower()': 'ứng',
  '+1:word.istitle()': False,
  '+1:word.isupper()': False},
 {'bias': 1.0,
  'word.lower()': 'ứng',
 

In [15]:
%%time
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
# Configure the CRF: L-BFGS training capped at 100 iterations, with c1 / c2
# as the L1 / L2 regularisation coefficients. all_possible_transitions=True
# also generates transition features for label pairs not seen in training.
crf = sklearn_crfsuite.CRF(
    all_possible_transitions=True,
    max_iterations=100,
    c2=0.1,
    c1=0.1,
    algorithm="lbfgs",
)

# Train on the 1000-sentence training split.
crf.fit(X_train, y_train)

CPU times: total: 2.69 s
Wall time: 2.69 s


In [16]:
# Tags the model learned, minus the outside tag 'O', so that the metrics
# below score entity tags only. Raises ValueError if 'O' was never seen.
labels = list(crf.classes_)
del labels[labels.index('O')]
labels

['B-LOCATION',
 'I-LOCATION',
 'B-ORGANIZATION',
 'B-PERSON',
 'I-PERSON',
 'B-MISCELLANEOUS',
 'I-MISCELLANEOUS',
 'I-ORGANIZATION']

In [17]:
# Predict a tag sequence for every held-out sentence.
y_pred = crf.predict(X_test)

# Support-weighted F1 over the tags listed in `labels` (the 'O' tag was
# removed from that list in the previous cell).
metrics.flat_f1_score(
    y_test,
    y_pred,
    labels=labels,
    average='weighted',
)

0.7084953067068765

In [18]:
# Sort by entity type first and B/I prefix second, so that the B- and I- rows
# of the same entity type end up next to each other in the report.
sorted_labels = sorted(labels, key=lambda tag: (tag[1:], tag[0]))

print(
    metrics.flat_classification_report(
        y_test,
        y_pred,
        labels=sorted_labels,
        digits=3,
    )
)

                 precision    recall  f1-score   support

     B-LOCATION      0.801     0.714     0.755       686
     I-LOCATION      0.808     0.711     0.756       691
B-MISCELLANEOUS      0.741     0.317     0.444        63
I-MISCELLANEOUS      0.597     0.272     0.374       136
 B-ORGANIZATION      0.702     0.555     0.620       373
 I-ORGANIZATION      0.684     0.614     0.647       554
       B-PERSON      0.766     0.724     0.745       479
       I-PERSON      0.750     0.889     0.813       451

      micro avg      0.755     0.680     0.715      3433
      macro avg      0.731     0.600     0.644      3433
   weighted avg      0.752     0.680     0.708      3433



In [19]:
import pickle

# Persist the trained CRF to the current working directory ...
with open("crf_model.pkl", "wb") as model_file:
    pickle.dump(crf, model_file)

# ... and load it straight back. The file was written just above, so it is
# trusted input; never unpickle files from an untrusted source.
with open("crf_model.pkl", "rb") as model_file:
    loaded_model = pickle.load(model_file)

In [20]:
# Run the same pipeline as training on a small hand-written example:
# sentence split -> (token, tag) conversion -> feature extraction.
test_sent = """Học sinh trường Nguyễn Huệ. Tôi ở Huế"""

test_sent = sent_tokenize(test_sent)
pre_sent = [sent2features(convert_to_conll(sent)) for sent in test_sent]

# Prediction uses the in-memory `crf`, not the `loaded_model` read from disk.
crf.predict(pre_sent)

array([list(['O', 'O', 'O', 'B-PERSON', 'I-PERSON']),
       list(['O', 'O', 'B-LOCATION'])], dtype=object)

In [21]:
from collections import Counter

def print_transitions(trans_features):
    """Print one ``from -> to weight`` line per transition feature.

    Args:
        trans_features: Iterable of ``((label_from, label_to), weight)`` items.
    """
    row_format = "%-6s -> %-7s %0.6f"
    for transition, weight in trans_features:
        label_from, label_to = transition
        print(row_format % (label_from, label_to, weight))

# Rank the learned label-to-label transition weights: the 20 largest first,
# then the last 20 entries of the full descending ranking.
print("Top likely transitions:")
transition_weights = Counter(crf.transition_features_)
print_transitions(transition_weights.most_common(20))

print("\nTop unlikely transitions:")
print_transitions(transition_weights.most_common()[-20:])

Top likely transitions:
B-MISCELLANEOUS -> I-MISCELLANEOUS 5.818131
I-ORGANIZATION -> I-ORGANIZATION 5.334325
B-LOCATION -> I-LOCATION 5.295289
I-MISCELLANEOUS -> I-MISCELLANEOUS 5.189214
B-ORGANIZATION -> I-ORGANIZATION 4.900042
I-LOCATION -> I-LOCATION 4.623126
B-PERSON -> I-PERSON 4.442208
I-PERSON -> I-PERSON 3.016830
O      -> O       2.962851
I-ORGANIZATION -> B-LOCATION 2.349885
O      -> B-ORGANIZATION 1.217987
O      -> B-PERSON 1.094454
B-ORGANIZATION -> B-LOCATION 1.083474
O      -> B-MISCELLANEOUS 1.072854
I-ORGANIZATION -> B-ORGANIZATION 1.039318
O      -> B-LOCATION 1.030987
B-LOCATION -> B-ORGANIZATION 0.923479
I-LOCATION -> O       0.383667
I-MISCELLANEOUS -> O       0.254014
I-LOCATION -> B-ORGANIZATION 0.023042

Top unlikely transitions:
B-LOCATION -> B-LOCATION -0.921841
I-PERSON -> B-LOCATION -1.031983
I-LOCATION -> B-PERSON -1.064399
I-PERSON -> I-ORGANIZATION -1.076886
I-ORGANIZATION -> I-LOCATION -1.174304
I-MISCELLANEOUS -> B-LOCATION -1.197735
I-PERSON -> B-ORG

In [22]:
def print_state_features(state_features):
    """Print one ``weight label attribute`` line per state feature.

    Args:
        state_features: Iterable of ``((attr, label), weight)`` items.
    """
    row_format = "%0.6f %-8s %s"
    for feature, weight in state_features:
        attr, label = feature
        print(row_format % (weight, label, attr))

# Rank the learned (attribute, label) state weights: the 30 largest first,
# then the last 30 entries of the full descending ranking.
print("Top positive:")
state_weights = Counter(crf.state_features_)
print_state_features(state_weights.most_common(30))

print("\nTop negative:")
print_state_features(state_weights.most_common()[-30:])

Top positive:
5.501903 O        BOS
4.633844 O        bias
3.768639 I-LOCATION +1:word.lower():,
3.535465 O        EOS
3.531515 B-ORGANIZATION -1:word.lower():(
3.491343 B-PERSON -1:word.lower():ông
3.461747 B-ORGANIZATION word[-3:]:ank
3.461747 B-ORGANIZATION word[-2:]:nk
3.286244 B-LOCATION -1:word.lower():ở
3.264489 B-PERSON -1:word.lower():bà
3.083158 B-LOCATION +1:word.lower():,
3.048917 B-LOCATION word[-2:]:ia
2.954745 B-ORGANIZATION word[-2:]:le
2.934610 I-ORGANIZATION +1:word.lower():,
2.906658 B-PERSON word.istitle()
2.771158 B-ORGANIZATION +1:word.lower():,
2.763061 B-LOCATION word[-3:]:HCM
2.763061 B-LOCATION word[-2:]:CM
2.728897 B-ORGANIZATION word[-2:]:NA
2.726796 B-ORGANIZATION +1:word.lower():)
2.666674 B-PERSON +1:word.lower():,
2.664998 I-PERSON +1:word.lower():,
2.605291 O        -1:word.lower():iphone
2.578221 B-PERSON -1:word.lower():anh
2.568096 B-LOCATION word.lower():lào
2.568096 B-LOCATION word[-3:]:Lào
2.523456 B-ORGANIZATION word.lower():pvtex
2.523456 B-ORGA