<a href="https://colab.research.google.com/github/verypluming/JapaneseNLI/blob/master/JapaneseXLM_NLI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

transformersライブラリのXLM: https://github.com/huggingface/transformers をファインチューニングして日本語テキスト推論を試すコード

In [0]:
# 必要なモジュールのインストール
! pip install transformers==2.6.0 mecab-python3==0.996.5 tensorflow scikit-learn pandas lxml
%tensorflow_version 2.x. 
!mkdir data
!mkdir models
# Google Driveのマウント
from google.colab import drive
drive.mount('/content/drive')

In [0]:
# 必要なモジュール・関数の読み込み
import codecs
import os
import re
import sys
import glob
from collections import Counter
import pandas as pd
import json
import numpy as np
import tensorflow as tf
from transformers import XLMConfig, TFXLMForSequenceClassification, XLMTokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping

class Vocab:
    """Bidirectional mapping between NLI gold labels and integer class ids.

    The label set is fixed to the three values contradiction, entailment
    and neutral. Ids are assigned from a fixed label order so that the
    mapping is identical in every Python process; enumerating a ``set`` of
    strings is subject to hash randomization, which could make a model
    trained in one session be decoded with a different mapping in another.

    Attributes:
        token_index: dict mapping label string -> integer id.
        index_token: dict mapping integer id -> label string.
    """

    # Fixed (alphabetical) order that defines the label ids.
    LABELS = ("contradiction", "entailment", "neutral")

    def __init__(self):
        self.token_index = {label: i for i, label in enumerate(self.LABELS)}
        self.index_token = {i: label for label, i in self.token_index.items()}

    def encode(self, labels):
        """Return the ids for ``labels``; an unknown label maps to ``None``."""
        return [self.token_index.get(label) for label in labels]

    def decode(self, label_ids):
        """Return the labels for ``label_ids``; an unknown id maps to ``None``."""
        return [self.index_token.get(label_id) for label_id in label_ids]

    @property
    def size(self):
        """Number of distinct labels."""
        return len(self.token_index)

    def save(self, file_path):
        """Write both mappings to ``file_path`` as a JSON object."""
        config = {
            'token_index': self.token_index,
            'index_token': self.index_token
        }
        with open(file_path, 'w', encoding='utf-8') as f:
            json.dump(config, f)

    @classmethod
    def load(cls, file_path):
        """Rebuild a vocabulary from a file written by :meth:`save`.

        Raises:
            KeyError: if the JSON object lacks ``token_index`` or
                ``index_token``.
        """
        with open(file_path, encoding='utf-8') as f:
            config = json.load(f)
        vocab = cls()
        vocab.token_index = dict(config['token_index'])
        # JSON object keys are always strings; restore the integer ids so
        # that decode() keeps working on a loaded vocabulary.
        vocab.index_token = {
            int(label_id): label
            for label_id, label in config['index_token'].items()
        }
        return vocab

def convert_examples_to_features(x, y, vocab, max_seq_length, tokenizer):
    """Turn sentence pairs and gold labels into padded model inputs.

    Each example is laid out as
    ``cls_token, <sentence 0 tokens>, sep_token, <sentence 1 tokens>, sep_token``.

    Args:
        x: iterable of sentence tuples, e.g. ``(premise, hypothesis)``.
        y: iterable of gold label strings, one per example.
        vocab: object whose ``encode`` maps label strings to ids.
        max_seq_length: length every sequence is padded to.
        tokenizer: object providing ``cls_token``, ``sep_token``,
            ``tokenize`` and ``convert_tokens_to_ids``.

    Returns:
        A pair ``(x, y)`` where ``x`` is the list
        ``[input_ids, attention_mask, token_type_ids]`` of padded arrays and
        ``y`` is the array of encoded label ids.
    """
    features = {
        'input_ids': [],
        'attention_mask': [],
        'token_type_ids': [],
        'label_ids': np.asarray(vocab.encode(y))
    }
    for pairs in x:
        tokens = [tokenizer.cls_token]
        # The leading special token needs its own segment id; without it
        # token_type_ids is one element shorter than input_ids and every
        # segment id is shifted one position to the left.
        token_type_ids = [0]
        for segment_id, sent in enumerate(pairs):
            word_tokens = tokenizer.tokenize(sent)
            tokens.extend(word_tokens)
            tokens.append(tokenizer.sep_token)
            # +1 covers the separator that closes this sentence.
            token_type_ids.extend([segment_id] * (len(word_tokens) + 1))

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        features['input_ids'].append(input_ids)
        features['attention_mask'].append([1] * len(input_ids))
        features['token_type_ids'].append(token_type_ids)

    # Pad with zeros at the end of each sequence.
    # NOTE(review): pad_sequences is called without `truncating`, so
    # sequences longer than max_seq_length use its default truncation
    # side - confirm that this keeps the tokens you want.
    for name in ['input_ids', 'attention_mask', 'token_type_ids']:
        features[name] = pad_sequences(features[name], padding='post', maxlen=max_seq_length)

    # NOTE(review): the model receives these arrays positionally; confirm
    # that this order matches the input order the model class expects.
    x = [features['input_ids'], features['attention_mask'], features['token_type_ids']]
    y = features['label_ids']
    return x, y

def build_model(pretrained_model_name_or_path, num_labels):
    """Load a pretrained XLM sequence classifier with ``num_labels`` outputs.

    Args:
        pretrained_model_name_or_path: model name or path handed to
            ``from_pretrained`` for both the config and the weights.
        num_labels: number of target classes stored in the config.

    Returns:
        The ``TFXLMForSequenceClassification`` instance.
    """
    xlm_config = XLMConfig.from_pretrained(pretrained_model_name_or_path, num_labels=num_labels)
    classifier = TFXLMForSequenceClassification.from_pretrained(
        pretrained_model_name_or_path, config=xlm_config
    )
    # Ask the final layer to emit softmax probabilities.
    # NOTE(review): this only has an effect if the last layer actually applies
    # its `activation` attribute in call() - confirm for this model class,
    # otherwise the model still returns raw logits.
    output_layer = classifier.layers[-1]
    output_layer.activation = tf.keras.activations.softmax
    return classifier

def evaluate(model, target_vocab, features, labels):
    """Print a classification report comparing predictions with ``labels``.

    Args:
        model: object whose ``predict(features)`` returns per-class scores.
        target_vocab: vocabulary used to decode ids into label strings.
        features: model inputs passed to ``model.predict``.
        labels: gold label ids.
    """
    scores = model.predict(features)
    # The predicted class is the one with the highest score.
    predicted_ids = np.argmax(scores, axis=-1)
    predicted = target_vocab.decode(predicted_ids)
    expected = target_vocab.decode(labels)
    report = classification_report(expected, predicted, digits=4)
    print(report)

In [0]:
# Hyperparameters
pretrained_model_name_or_path = 'xlm-mlm-100-1280'
model_path = 'models/'
batch_size = 100
epochs = 50
maxlen = 250

# Tokenizer of the pretrained model and the label vocabulary
tokenizer = XLMTokenizer.from_pretrained(pretrained_model_name_or_path)
target_vocab = Vocab()

# Build the model
model = build_model(pretrained_model_name_or_path, num_labels=target_vocab.size)
model.compile(loss='sparse_categorical_crossentropy', optimizer='sgd')

# Callbacks
callbacks = [EarlyStopping(patience=3)]

In [0]:
# ファインチューニングに用いるデータの読み込み
# 各行にタブ区切りでpremise（前提文）、hypothesis(仮説文)、gold_label（正解ラベル）が書かれたtrain.tsvファイルを用意し、Google Driveにアップロード
# （一行目はpremise, hypothesis, gold_labelと記述）
# データの例
# premise hypothesis  gold_label
# 太郎は花子が山頂まで登っている間に、山頂まで登った。  太郎は花子が山頂まで登る前に、山頂まで登った。  entailment
!cp /content/drive/My\ Drive/train.tsv data/.
df = pd.read_csv("data/train.tsv", sep="\t")
premises = list(df['premise'])
hypotheses = list(df['hypothesis'])
x = [(premise, hypothesis) for (premise, hypothesis) in zip(premises, hypotheses)]
y = list(df['gold_label'])

In [0]:
# Use all of the data for fine-tuning
x_train, y_train = x, y
# To evaluate on a 9:1 train:test split instead, use:
#x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=42)
features_train, labels_train = convert_examples_to_features(
    x_train, y_train, target_vocab, max_seq_length=maxlen, tokenizer=tokenizer
)

# Fine-tune the model; 10% of the training data is held out for validation
model.fit(
    x=features_train,
    y=labels_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
    callbacks=callbacks,
)
model.save_pretrained(model_path)

In [0]:
# ファインチューニングしたモデルをMyDriveに保存
!cp models/tf_model.h5 /content/drive/My\ Drive/.
!cp models/config.json /content/drive/My\ Drive/.

In [0]:
# MyDriveに保存したモデルの読み込み（モデルがある場合に使用）
!cp /content/drive/My\ Drive/tf_model.h5 models/.
!cp /content/drive/My\ Drive/config.json models/.
config = XLMConfig.from_json_file('models/config.json')
model = TFXLMForSequenceClassification.from_pretrained('models/tf_model.h5', config=config)

In [0]:
# Test on arbitrary data
# x_test: list of (premise, hypothesis) tuples
# y_test: list of gold labels (entailment, neutral or contradiction)
x_test = [('太郎は花子が山頂まで登っている間に、山頂まで登った。', '太郎は花子が山頂まで登る前に、山頂まで登った。')]
y_test = ['entailment']
features_test, labels_test = convert_examples_to_features(
    x_test,
    y_test,
    target_vocab,
    max_seq_length=maxlen,
    tokenizer=tokenizer,
)

# Predict the labels: take the highest-scoring class of each example
label_ids = np.argmax(model.predict(features_test), axis=-1)
y_pred = target_vocab.decode(label_ids)
y_true = target_vocab.decode(labels_test)
print(y_pred, y_true)

# Print the classification report
evaluate(model, target_vocab, features_test, labels_test)