In [1]:
!git clone https://github.com/stockmarkteam/ner-wikipedia-dataset.git

fatal: destination path 'ner-wikipedia-dataset' already exists and is not an empty directory.


In [3]:
!pip install torch transformers fugashi ipadic sklearn

Defaulting to user installation because normal site-packages is not writeable
Collecting sklearn
  Using cached sklearn-0.0-py2.py3-none-any.whl
Installing collected packages: sklearn
Successfully installed sklearn-0.0
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m


In [1]:
from collections import defaultdict
import json
import pandas as pd
import openpyxl

from sklearn.model_selection import train_test_split
import torch
from transformers import BertConfig, BertJapaneseTokenizer
from transformers import BertModel
from transformers import BertForTokenClassification
from transformers import pipeline
from transformers import Trainer, TrainingArguments

In [2]:
MAX_LENGTH = 256  # 最大文長 
BERT_MODEL = "cl-tohoku/bert-base-japanese"  # 使用する学習済みモデル
DATASET_PATH = "ner-wikipedia-dataset/ner.json"

In [3]:
tokenizer = BertJapaneseTokenizer.from_pretrained('cl-tohoku/bert-base-japanese-whole-word-masking')

In [4]:
with open(DATASET_PATH) as json_file:  # データセット読み込み
    ner_data_dict_list = json.load(json_file)
text_list = [ner_data_dict["text"] for ner_data_dict in ner_data_dict_list]
encoded_text_list = []  # エンコードされた文のリスト
split_text_list = []  # 単語ごとに分割された文のリスト

for text in text_list:
    encoded_text = tokenizer(text, max_length=MAX_LENGTH, pad_to_max_length=True)
    encoded_text_list.append(encoded_text)
    split_text = tokenizer.decode(encoded_text["input_ids"]).split()
    split_text_list.append(split_text)

entities_list = [ner_data_dict["entities"] for ner_data_dict in ner_data_dict_list]
labels_list = [["O"] * MAX_LENGTH for _ in range(len(text_list))]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [5]:
unique_labels = set("O")  # 付与された固有表現抽出ラベルの集合
found_named_entity = False

for sample_idx, (encoded_text, split_text, entities) in enumerate(zip(encoded_text_list, split_text_list, entities_list)):
    if len(entities) == 0:
        continue
    target_entity = entities.pop(0)
    entity_name = target_entity["name"]
    entity_label = target_entity["type"]
    for word_idx, word in enumerate(split_text):
        if word == "[CLS]" or word == "[SEP]" or word == "[PAD]":  # 特殊トークンにIGNOREラベルの付与
            labels_list[sample_idx][word_idx] = "IGNORE"
            continue
        
        if len(entities) == 0:
            continue
            
        if entity_name.startswith(word):
            if entity_name.endswith(word):
                label = f"B-{entity_label}"
                labels_list[sample_idx][word_idx] = label
                unique_labels.add(label)
                
                if len(entities) >= 1:
                    target_entity = entities.pop(0)
                    entity_name = target_entity["name"]
                    entity_label = target_entity["type"]
                
            else:
                for word_idx_2 in range(word_idx + 1, len(split_text)):
                    if "".join(split_text[word_idx : word_idx_2 + 1]) not in entity_name.replace(" ", "").replace("　",""):
                        found_named_entity = False
                        break
                    if entity_name.endswith(split_text[word_idx_2]):
                        labels = [f"B-{entity_label}"] + [f"I-{entity_label}"] * (word_idx_2 - word_idx) 
                        unique_labels |= set(labels)
                        labels_list[sample_idx][word_idx : word_idx_2 + 1] = labels
                        found_named_entity = True
                        break
                if found_named_entity:
                    if len(entities) >= 1:
                        target_entity = entities.pop(0)
                        entity_name = target_entity["name"]
                        entity_label = target_entity["type"]
                    

In [6]:
unique_labels = list(unique_labels)
label2id = {label : label_id for label_id, label in enumerate(unique_labels)}  # ラベルにIDを付与
label2id["IGNORE"] = -100  # IGNORE ラベル に ID -100 を付与
encoded_labels_list = [[label2id[label] for label in labels] for labels in labels_list]  # 各文に付与されたラベルをIDで置き換えたもののリストを作成
del label2id["IGNORE"]  # configに渡すときには消す必要あり
id2label= {id: label for label, id in label2id.items()}  # IDをkey、ラベル名をvalueとした逆の辞書を作成

In [7]:
class NERDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = defaultdict(list)
        for encoding_dict in encodings:
            for key, value in encoding_dict.items():
                self.encodings[key].append(value)
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], device=torch.device('cuda'))
        return item

    def __len__(self):
        return len(self.labels)

In [8]:
train_encoded_text_list, test_encoded_text_list, train_encoded_labels_list, test_encoded_labels_list = \
    train_test_split(encoded_text_list, encoded_labels_list, test_size=0.1, random_state=0)
train_encoded_text_list, val_encoded_text_list, train_encoded_labels_list, val_encoded_labels_list = \
    train_test_split(encoded_text_list, encoded_labels_list, test_size=0.2, random_state=0)

train_dataset = NERDataset(train_encoded_text_list, train_encoded_labels_list)
val_dataset = NERDataset(val_encoded_text_list, val_encoded_labels_list)
test_dataset = NERDataset(test_encoded_text_list, test_encoded_labels_list)

In [9]:
config = BertConfig.from_pretrained(BERT_MODEL, id2label=id2label, label2id=label2id)
model = BertForTokenClassification.from_pretrained('cl-tohoku/bert-base-japanese', config=config)

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at 

In [10]:
training_args = TrainingArguments(
    output_dir='./results',          
    num_train_epochs=3,              
    per_device_train_batch_size=16,  
    per_device_eval_batch_size=64,   
    warmup_steps=500, 
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=50,
    do_eval=True,
    eval_steps=50
)

In [14]:
trainer = Trainer(
    model=model,                         
    args=training_args, 
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

trainer.train()

Step,Training Loss
10,2.734
20,2.6397
30,2.4351
40,2.1295
50,1.6842
60,1.0815
70,0.6586
80,0.5713
90,0.6168
100,0.4753


TrainOutput(global_step=804, training_loss=0.38880041608614707, metrics={'train_runtime': 719.8049, 'train_samples_per_second': 1.117, 'total_flos': 2167189457126400.0, 'epoch': 3.0, 'init_mem_cpu_alloc_delta': 1598495, 'init_mem_gpu_alloc_delta': 440950272, 'init_mem_cpu_peaked_delta': 58137, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 345076, 'train_mem_gpu_alloc_delta': 1391537152, 'train_mem_cpu_peaked_delta': 99520322, 'train_mem_gpu_peaked_delta': 4541236224})

In [None]:
model_path = 'model.pth'
torch.save(model.state_dict(), model_path)

In [None]:
print(encoded_labels_list)

In [13]:
df_test = pd.read_csv('textdata_from_twitter.csv', names=["text", "label1", "label2", "label3", "label4", "label5"])

In [14]:
df_test

Unnamed: 0,text,label1,label2,label3,label4,label5
0,ブログを更新しました 若林 愛 「木管五重奏カラフルアウトリーチ」 ⇒ https://am...,若林 愛,木管五重奏,,,
1,滝巡り④\n亀田不動の滝\n\n#ninja1000\n#滝沢,亀田不動の滝,,,,
2,『小岩井農場』日本最大の総合農場に行ってきましたの！【前編】 #小岩井農場 #上丸牛舎 #牛...,小岩井農場,,,,
3,フードアトリエです！\n本日12時～12時半頃まで滝沢市役所入口にて出張販売始めました！\n...,滝沢市役所,フードアトリエ,お惣菜,出張販売,
4,『ゆいてとて』 １０月２日(土)・３日(日) 10:00〜16:00 ゆい工房 結の蔵にて開...,ゆい工房,ゆいてとて,,,
...,...,...,...,...,...,...
143,盛岡市中心部からちょっと走れば…\n#盛岡市\n#ダム\n#コンビニ弁当冷めた,,,,,
144,近ツーの東北のホテル・旅館 https://tabitabilink.com/knt/eri...,,,,,
145,@tabi317,,,,,
146,より,,,,,


In [15]:
df_val = df_test['text']

In [11]:
model_path = 'model.pth'
model.load_state_dict(torch.load(model_path))

<All keys matched successfully>

In [12]:
raw_inputs = df_val.values.tolist()
print(raw_inputs[0:5])

NameError: name 'df_val' is not defined

In [None]:
raw_inputs = [str(t) for t in raw_inputs]

In [None]:
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, config=config)

In [None]:
import re

def clean_text(text):
    replaced_text = text.lower()
    replaced_text = re.sub(r'[【】]', ' ', replaced_text)       # 【】の除去
    replaced_text = re.sub(r'[（）()]', ' ', replaced_text)     # （）の除去
    replaced_text = re.sub(r'[［］\[\]]', ' ', replaced_text)   # ［］の除去
    replaced_text = re.sub(r'[@＠]\w+', '', replaced_text)  # メンションの除去
    replaced_text = re.sub(
        r'https?:\/\/.*?[\r\n ]', '', replaced_text)  # URLの除去
    replaced_text = re.sub(r'　', ' ', replaced_text)  # 全角空白の除去
    return replaced_text


def clean_html_tags(html_text):
    soup = BeautifulSoup(html_text, 'html.parser')
    cleaned_text = soup.get_text()
    cleaned_text = ''.join(cleaned_text.splitlines())
    return cleaned_text


def clean_html_and_js_tags(html_text):
    soup = BeautifulSoup(html_text, 'html.parser')
    [x.extract() for x in soup.findAll(['script', 'style'])]
    cleaned_text = soup.get_text()
    cleaned_text = ''.join(cleaned_text.splitlines())
    return cleaned_text


def clean_url(html_text):
    cleaned_text = re.sub(r'http\S+', '', html_text)
    return cleaned_text

In [None]:
for i, text in enumerate(raw_inputs):
    raw_inputs[i] = clean_text(text)
    raw_inputs[i] = clean_url(text)

In [None]:
raw_inputs

In [None]:
predictions = []
for text in raw_inputs:
    predictions.append(ner_pipeline(text))

In [28]:
ner_pipeline("盛岡寿司懐石 重兵衛")

[{'entity': 'I-政治的組織名',
  'score': 0.08448078,
  'index': 2,
  'word': '寿司',
  'start': None,
  'end': None},
 {'entity': 'I-政治的組織名',
  'score': 0.09784629,
  'index': 3,
  'word': '懐',
  'start': None,
  'end': None},
 {'entity': 'I-政治的組織名',
  'score': 0.11476692,
  'index': 4,
  'word': '##石',
  'start': None,
  'end': None},
 {'entity': 'I-政治的組織名',
  'score': 0.100545764,
  'index': 5,
  'word': '重',
  'start': None,
  'end': None},
 {'entity': 'I-製品名',
  'score': 0.09412962,
  'index': 6,
  'word': '兵衛',
  'start': None,
  'end': None}]

In [29]:
predictions

[[{'entity': 'I-人名',
   'score': 0.0818402,
   'index': 1,
   'word': 'ブログ',
   'start': None,
   'end': None},
  {'entity': 'B-製品名',
   'score': 0.10631643,
   'index': 2,
   'word': 'を',
   'start': None,
   'end': None},
  {'entity': 'B-製品名',
   'score': 0.0865674,
   'index': 3,
   'word': '更新',
   'start': None,
   'end': None},
  {'entity': 'I-人名',
   'score': 0.08585945,
   'index': 4,
   'word': 'し',
   'start': None,
   'end': None},
  {'entity': 'I-製品名',
   'score': 0.09210798,
   'index': 5,
   'word': 'まし',
   'start': None,
   'end': None},
  {'entity': 'I-法人名',
   'score': 0.075507015,
   'index': 7,
   'word': '若林',
   'start': None,
   'end': None},
  {'entity': 'I-人名',
   'score': 0.105016336,
   'index': 9,
   'word': '「',
   'start': None,
   'end': None},
  {'entity': 'B-製品名',
   'score': 0.13268644,
   'index': 11,
   'word': '##管',
   'start': None,
   'end': None},
  {'entity': 'B-製品名',
   'score': 0.1284447,
   'index': 12,
   'word': '五',
   'start': None,
   '