In [13]:
import json
import os
import pandas as pd
import numpy as np
from pathlib import Path
import collections
from sklearn.model_selection import train_test_split
from sklearn import metrics

import sys
sys.path.append("../")
from datatools.analyzer import *
from utterance.error_tools import *

from datatools.maneger import DataManager
from datatools.preproc import Preprocessor

In [14]:
# Read the conversations found under `path` for each sub-corpus named in
# `datalist`.
# NOTE(review): `read_conv` is not defined in this notebook; presumably it
# comes from one of the star-imports in the first cell — confirm.
path = '../hand_labeled/'
datalist = ["DCM", "DIT", "IRS"]
convs = read_conv(path, datalist)

In [15]:
def make_Xy_4test(convs, N=4, errors=None):
    """Build (context window, binary label) samples from annotated conversations.

    Every utterance is cleaned with ``clean_text`` and pushed into a sliding
    window that holds the ``N`` most recent utterances of the current
    conversation (left-padded with empty strings at the start of each
    conversation).  A sample is emitted only for utterances for which
    ``ut.is_exist_error()`` is truthy; its label is 1 when
    ``ut.is_error_included(errors)`` is truthy and 0 otherwise.

    Args:
        convs: Iterable of conversations, each an iterable of utterance
            objects exposing ``utt``, ``is_exist_error()`` and
            ``is_error_included(errors)``.
        N: Size of the context window, i.e. the number of utterances per
            sample including the current one.  Must be >= 0.
        errors: Error names that make a sample positive.  When ``None``
            (default) the three error names originally hard-coded in this
            function are used.

    Returns:
        Tuple ``(X, y)`` where ``X`` is a list of ``N``-element lists of
        cleaned utterance strings and ``y`` is the parallel list of 0/1 labels.

    Raises:
        ValueError: If ``N`` is negative.
    """
    if N < 0:
        raise ValueError("N must be non-negative")
    if errors is None:
        errors = ["Topic transition error", 'Lack of information', 'Unclear intention']

    X = []
    y = []
    for conv in convs:
        # Bounded window: deque(maxlen=N) discards the oldest utterance by
        # itself, so the history no longer grows with the conversation, and
        # N == 0 yields an empty context (the slice `history[-0:]` would
        # instead return the whole history).
        window = collections.deque([""] * N, maxlen=N)
        for ut in conv:
            # NOTE(review): the original comment here read "user-utterance
            # driven", yet every utterance is added to the context regardless
            # of who said it — confirm that this is intended.
            window.append(clean_text(ut.utt))
            if ut.is_exist_error():
                X.append(list(window))
                y.append(1 if ut.is_error_included(errors) else 0)

    return X, y

In [17]:
# Context window size: each sample holds the N most recent utterances.
N = 2
X_str, y = make_Xy_4test(convs, N)
# Display how many samples carry the positive label (1).
y.count(1)

715

In [18]:
import transformers
# Monkey-patch: rebind the `BertTokenizer` attribute of the `transformers`
# package to `BertJapaneseTokenizer`, so any later lookup of
# `transformers.BertTokenizer` yields the Japanese tokenizer class.
# NOTE(review): presumably this is needed so that sentence_transformers
# tokenizes the Japanese BERT checkpoint correctly, and presumably it has to
# run before the model is built — confirm it is still required with the
# installed sentence-transformers version.
transformers.BertTokenizer = transformers.BertJapaneseTokenizer

In [19]:
from sentence_transformers import SentenceTransformer
from sentence_transformers import models

# Prefix prepended to the model identifier below.
download_path = "../../corpus/"
# download_path = ""  # NOTE(review): presumably resolves the model by its hub name instead — confirm

# Sentence encoder: BERT word embeddings followed by mean pooling over tokens.
transformer = models.Transformer(
    download_path + "cl-tohoku/bert-base-japanese-whole-word-masking"
)
pooling = models.Pooling(
    transformer.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False,
)
model = SentenceTransformer(modules=[transformer, pooling])

# Smoke test: encode two sentences and print the shape of each embedding.
sentences = ["吾輩は猫である", "本日は晴天なり"]
embeddings = model.encode(sentences)

for i, embedding in enumerate(embeddings):
    print(f"[{i}] : {embedding.shape}")

Some weights of the model checkpoint at ../../corpus/cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[1975] 2022-01-13 13:07:31,441 Info sentence_transformers.SentenceTransformer :Use pytorch device: cuda


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[0] : (768,)
[1] : (768,)


In [20]:
from sentence_transformers import SentenceTransformer
from sentence_transformers import models ,losses
from sentence_transformers.readers import InputExample
from sentence_transformers.losses import TripletDistanceMetric, SoftmaxLoss
from sentence_transformers.evaluation import TripletEvaluator
from sentence_transformers.readers import TripletReader
from sentence_transformers.datasets import SentencesDataset
from torch.utils.data import DataLoader

In [21]:
# Wrap every (context window, label) sample in an InputExample.
X = [
    InputExample(texts=context, label=label)
    for context, label in zip(X_str, y)
]

In [22]:
# 70/30 train/test split with a fixed random_state so the split is repeatable.
X_train, X_test = train_test_split(
    X,
    train_size=0.7,
    random_state=4,
)

In [23]:
# Training hyper-parameters and output location.
BATCH_SIZE = 32
NUM_EPOCHS = 3
EVAL_STEPS = 1000
# 10% of the number of full batches in X_train, truncated to an integer.
# NOTE(review): this is based on one epoch of full batches only, not on
# NUM_EPOCHS * batches — confirm that is the intended warm-up length.
WARMUP_STEPS = int((len(X_train) // BATCH_SIZE) * 0.1)
# Passed as `output_path` to `model.fit` in the training cell.
OUTPUT_PATH = "../../corpus/pretrained/sbert_context_form2"

In [24]:
# Dataset and shuffling batch loader over the training examples.
train_data = SentencesDataset(X_train, model=model)
train_dataloader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
# Two-class softmax objective built on the model's sentence embeddings.
train_loss = losses.SoftmaxLoss(
    model=model,
    sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
    num_labels=2,
)

[1975] 2022-01-13 13:07:31,787 Info sentence_transformers.losses.SoftmaxLoss :Softmax loss: #Vectors concatenated: 3


In [25]:
# Fine-tune the encoder on the softmax objective; the model is saved to
# OUTPUT_PATH.
# NOTE(review): no `evaluator` is passed, so `evaluation_steps` presumably has
# no effect here, and X_test is not used during training — confirm.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=NUM_EPOCHS,
    evaluation_steps=EVAL_STEPS,
    warmup_steps=WARMUP_STEPS,
    output_path=OUTPUT_PATH,
)

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Iteration:   0%|          | 0/30 [00:00<?, ?it/s]

Iteration:   0%|          | 0/30 [00:00<?, ?it/s]

Iteration:   0%|          | 0/30 [00:00<?, ?it/s]

[1975] 2022-01-13 13:08:07,226 Info sentence_transformers.SentenceTransformer :Save model to ../../corpus/pretrained/sbert_context_form2
