In [1]:
import json
import os
import pandas as pd
import numpy as np
from pathlib import Path
import collections
from sklearn.model_selection import train_test_split
from sklearn import metrics

import sys
sys.path.append("../")
from datatools.analyzer import *
from utterance.error_tools import *

from datatools.maneger import DataManager
from datatools.preproc import Preprocessor



In [2]:
corpus_path = "../../corpus/twitter/"
data_name = "hate_labeled.csv"

In [3]:
df = pd.read_csv(corpus_path+data_name)

In [4]:
df

Unnamed: 0,label,txt
0,2,マスゴミってエリートも糞もないんだってねW
1,2,ランサーぶっぱするな死ね。
2,2,ジエンまじでクソ死ねばいいのに(殺せない)
3,3,ねぇ〜〜バカー！！！！！！
4,1,ゴミコントローラー引いてCSだせまんでした御免なさい...
...,...,...
1133,2,与野党マスゴミ茶番芸能人言論人、全部グル。
1134,2,非実在児童ポルノ規制とホラー映画規制は等しく馬鹿
1135,1,そんな事ないです…自分全然ゴミカスっす
1136,3,マジでキモすぎ死ぬんやったら勝手に死ねよ


In [5]:
df['label'].value_counts()

3    465
2    351
0    181
1    141
Name: label, dtype: int64

In [None]:
import transformers
transformers.BertTokenizer = transformers.BertJapaneseTokenizer

In [8]:
from sentence_transformers import SentenceTransformer
from sentence_transformers import models

transformer = models.Transformer('cl-tohoku/bert-base-japanese-whole-word-masking')
pooling = models.Pooling(transformer.get_word_embedding_dimension(),    
  pooling_mode_mean_tokens=True,
  pooling_mode_cls_token=False, 
  pooling_mode_max_tokens=False
)
model = SentenceTransformer(modules=[transformer, pooling])

sentences = ['吾輩は猫である',  '本日は晴天なり']
embeddings = model.encode(sentences)

for i, embedding in enumerate(embeddings):
  print("[%d] : %s" % (i, embedding.shape, ))

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[0] : (768,)
[1] : (768,)


In [9]:
from sentence_transformers import SentenceTransformer
from sentence_transformers import models ,losses
from sentence_transformers.readers import InputExample
from sentence_transformers.losses import TripletDistanceMetric, BatchAllTripletLoss
from sentence_transformers.evaluation import TripletEvaluator
from sentence_transformers.readers import TripletReader
from sentence_transformers.datasets import SentenceLabelDataset
from torch.utils.data import DataLoader

In [10]:
def make_dataset(df, mode="All"):
    X = []
    y = []
    path = "../hand_labeled/"
    datalist = ['DCM', 'DIT', 'IRS']
    convs = read_conv(path, datalist)

    usr_utt = []
    for conv in convs:
        for i, ut in enumerate(conv):
            if not ut.is_system():
                usr_utt.append(clean_text(ut.utt))
    import random

    if mode=="All":
        for la, txt in zip(df.label, df.txt):
            X.append( InputExample(guid="", texts=[txt], label=float(la) ) )
            # y.append(la)
        print("length of X", len(X))
        # 0 の要素を増やしておきますわよ
        sampled = random.sample(usr_utt, len(X)//3)
        for sample in sampled:
            X.append( InputExample(guid="" , texts=[sample], label=0.0 ) )
            # y.append(0)
        print("added length of X", len(X))

    return X


In [11]:
X = make_dataset(df)

length of X 1138
added length of X 1517


In [12]:
X_train, X_test = train_test_split(X, train_size=0.8, random_state=4)

In [18]:
BATCH_SIZE = 32
NUM_EPOCHS = 3
EVAL_STEPS = 1000
WARMUP_STEPS = int(len(X_train) // BATCH_SIZE * 0.1) 
OUTPUT_PATH = "../../corpus/sbert_stair"

In [19]:
train_data = SentenceLabelDataset(X_train, samples_per_label=BATCH_SIZE//2)
train_dataloader = DataLoader(train_data, batch_size=BATCH_SIZE)
train_loss = losses.BatchAllTripletLoss(model)

In [20]:
model.fit(train_objectives=[(train_dataloader, train_loss)],
         epochs=NUM_EPOCHS,
         evaluation_steps=EVAL_STEPS,
         warmup_steps=WARMUP_STEPS,
         output_path=OUTPUT_PATH
         )

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Iteration:   0%|          | 0/38 [00:00<?, ?it/s]

Iteration:   0%|          | 0/38 [00:00<?, ?it/s]

Iteration:   0%|          | 0/38 [00:00<?, ?it/s]

In [21]:
os.listdir(OUTPUT_PATH)

['eval',
 '1_Pooling',
 'tokenizer_config.json',
 'sentence_bert_config.json',
 'README.md',
 'config.json',
 'modules.json',
 'vocab.txt',
 'config_sentence_transformers.json',
 'pytorch_model.bin',
 'special_tokens_map.json']

参考

1. Sentence transformerで日本語モデルを学習して文章の分散表現を得る方法  
https://www.subcul-science.com/post/20210203sbert/#%E5%88%86%E6%95%A3%E8%A1%A8%E7%8F%BE%E3%81%AE%E5%9F%BA%E7%A4%8E%E3%82%92%E5%AD%A6%E7%BF%92%E3%81%99%E3%82%8B

2.  第9回 Sentence BERT による類似文章検索の検証  
https://www.ogis-ri.co.jp/otc/hiroba/technical/similar-document-search/part9.html

3. Sentence BERT論文-和訳
https://www.vareal.co.jp/column/sentence-bert%E8%AB%96%E6%96%87-%E5%92%8C%E8%A8%B3/