In [1]:
!pip install spacy



In [4]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m80.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
import spacy
from spacy.tokens import DocBin

In [4]:
# Đọc dữ liệu
file_path = '/kaggle/input/ner-data/ner_datasetreference.csv'
dataset = pd.read_csv(file_path, encoding='ISO-8859-1')

# Điền các giá trị NaN trong cột "Sentence #" với giá trị trước đ
dataset['Sentence #'] = dataset['Sentence #'].ffill()

# Tách dữ liệu thành các câu và các nhãn tương ứng
sentences = dataset.groupby("Sentence #")["Word"].apply(list).values
tags = dataset.groupby("Sentence #")["Tag"].apply(list).values


In [5]:
def filter_invalid(sentences, tags):
    filtered_sentences = []
    filtered_tags = []
    for sent, tag in zip(sentences, tags):
        if all(isinstance(word, str) for word in sent):
            filtered_sentences.append(sent)
            filtered_tags.append(tag)
    return filtered_sentences, filtered_tags

sentences, tags = filter_invalid(sentences, tags)

In [6]:
train_sentences, test_sentences, train_tags, test_tags = train_test_split(
    sentences, tags, test_size=0.2, random_state=42
)


In [7]:
def convert_to_spacy_format(sentences, tags):
    nlp = spacy.blank("en")
    db = DocBin()
    for sent, tag in zip(sentences, tags):
        doc = nlp.make_doc(" ".join(sent))
        ents = []
        for i, label in enumerate(tag):
            if label != "O":
                start = len(" ".join(sent[:i])) + (1 if i > 0 else 0)
                end = start + len(sent[i])
                span = doc.char_span(start, end, label=label)
                if span is not None:
                    ents.append(span)
        doc.ents = ents
        db.add(doc)
    return db

In [8]:
train_data_spacy = convert_to_spacy_format(train_sentences, train_tags)
test_data_spacy = convert_to_spacy_format(test_sentences, test_tags)

In [9]:
train_data_spacy.to_disk("/kaggle/working/train.spacy")
test_data_spacy.to_disk("/kaggle/working/test.spacy")

In [15]:

config_content = """
# This is an auto-generated partial config. To use it with 'spacy train'
# you can run spacy init fill-config to auto-fill all default settings:
# python -m spacy init fill-config ./base_config.cfg ./config.cfg
[paths]
train = /kaggle/working/train.spacy
dev = /kaggle/working/test.spacy
vectors = null
[system]
gpu_allocator = null

[nlp]
lang = "en"
pipeline = ["tok2vec","ner"]
batch_size = 1000

[components]

[components.tok2vec]
factory = "tok2vec"

[components.tok2vec.model]
@architectures = "spacy.Tok2Vec.v2"

[components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v2"
width = ${components.tok2vec.model.encode.width}
attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
rows = [5000, 1000, 2500, 2500]
include_static_vectors = false

[components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v2"
width = 96
depth = 4
window_size = 1
maxout_pieces = 3

[components.ner]
factory = "ner"

[components.ner.model]
@architectures = "spacy.TransitionBasedParser.v2"
state_type = "ner"
extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2
use_upper = true
nO = null

[components.ner.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}

[corpora]

[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}
max_length = 0

[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}
max_length = 0

[training]
dev_corpus = "corpora.dev"
train_corpus = "corpora.train"

[training.optimizer]
@optimizers = "Adam.v1"

[training.batcher]
@batchers = "spacy.batch_by_words.v1"
discard_oversize = false
tolerance = 0.2

[training.batcher.size]
@schedules = "compounding.v1"
start = 100
stop = 1000
compound = 1.001

[initialize]
vectors = ${paths.vectors}
"""
with open("base_config.cfg", "w") as file:
    file.write(config_content)

In [16]:
!python -m spacy init fill-config /kaggle/working/base_config.cfg /kaggle/working/output/config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
/kaggle/working/output/config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [17]:
!python -m spacy train /kaggle/working/output/config.cfg --output /kaggle/working/output

[38;5;4mℹ Saving to output directory: /kaggle/working/output[0m
[38;5;4mℹ Using CPU[0m
[38;5;4mℹ To switch to GPU 0, use the option: --gpu-id 0[0m
[1m
[2024-06-23 05:54:32,418] [INFO] Set up nlp object from config
[2024-06-23 05:54:32,458] [INFO] Pipeline: ['tok2vec', 'ner']
[2024-06-23 05:54:32,464] [INFO] Created vocabulary
[2024-06-23 05:54:32,464] [INFO] Finished initializing nlp object
[2024-06-23 05:54:58,571] [INFO] Initialized pipeline components: ['tok2vec', 'ner']
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     55.14    0.08    0.97    0.04    0.00
  0     200        126.33   3556.87   60.13   60.14   60.13    0.60
  0     400        349.80   2310.08   71.33   72.47   70.23    0.71
  0     600        229.11   2326.24   75.28   77

In [18]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import spacy

# Tải mô hình đã huấn luyện
nlp = spacy.load("/kaggle/working/output/model-best")

def evaluate(ner_model, test_sentences, test_tags):
    y_true = []
    y_pred = []

    for sent, tags in zip(test_sentences, test_tags):
        doc = ner_model(" ".join(sent))
        pred_tags = ["O"] * len(tags)
        for ent in doc.ents:
            start_idx = len(doc[:ent.start].text.split())
            end_idx = start_idx + len(ent.text.split())
            for i in range(start_idx, end_idx):
                if i == start_idx:
                    pred_tags[i] = f"B-{ent.label_}"
                else:
                    pred_tags[i] = f"I-{ent.label_}"
        y_true.extend(tags)
        y_pred.extend(pred_tags)

    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_true, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_true, y_pred, average='weighted', zero_division=0)

    return accuracy, precision, recall, f1

accuracy, precision, recall, f1 = evaluate(nlp, test_sentences, test_tags)
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")


Accuracy: 0.8417079184266356
Precision: 0.8419973379607162
Recall: 0.8417079184266356
F1-score: 0.8418526033188747


In [20]:
def predict_entities(nlp, sentences):
    predictions = []
    for sent in sentences:
        doc = nlp(sent)
        entities = [(ent.text, ent.label_) for ent in doc.ents]
        predictions.append(entities)
    return predictions

# Các câu mới để dự đoán
new_sentences = [
      "Barack Obama was born in Hawaii.",
    "Apple Inc. is a technology company based in Cupertino.",
]

# Dự đoán nhãn thực thể
predictions = predict_entities(nlp, new_sentences)

# In kết quả dự đoán
for sent, ents in zip(new_sentences, predictions):
    print(f"Câu: {sent}")
    for ent in ents:
        print(f"  - Thực thể: {ent[0]}, Nhãn: {ent[1]}")


Câu: Barack Obama was born in Hawaii.
  - Thực thể: Barack, Nhãn: B-per
  - Thực thể: Obama, Nhãn: I-per
  - Thực thể: Hawaii, Nhãn: B-geo
Câu: Apple Inc. is a technology company based in Cupertino.
  - Thực thể: Apple, Nhãn: B-org
  - Thực thể: Inc., Nhãn: I-org
  - Thực thể: Cupertino, Nhãn: B-geo
