In [1]:
import json
import os
import pandas as pd
import numpy as np
from pathlib import Path
import collections
from sklearn.model_selection import train_test_split
from sklearn import metrics

import sys
# Extend the module search path with the parent directory before the imports
# that follow (datatools, utterance) — presumably those packages live there;
# TODO confirm against the repository layout.
sys.path.append("../")
from datatools.analyzer import *
from utterance.error_tools import *

from datatools.maneger import DataManager
from datatools.preproc import Preprocessor

In [2]:
# Directory containing the SNLI corpus files, relative to this notebook.
corpus_path = "../../corpus/SNLI/"
# Exactly one data file is active. The alternatives are kept as comments so the
# selection is explicit rather than relying on a later assignment silently
# overwriting an earlier one.
# data_name = "dev_sentence.csv"
data_name = "train_sentence.csv"
# data_name = "train_wo_filtering.tsv"

In [3]:
# Build the full path of the selected corpus file, then load it.
csv_path = f"{corpus_path}{data_name}"
df = pd.read_csv(csv_path)

In [4]:
# Inspect the raw values of the `pre` column as an array.
# NOTE(review): the recorded output begins with the literal string 'pre', which
# suggests a header line is duplicated inside the CSV data — TODO confirm.
df.pre.values

array(['pre', 'ガレージで、壁にナイフを投げる男。', '茶色のドレスを着た女性がベンチに座っています。', ...,
       '口ひげ、フェイスペイント、それに合った赤い帽子とシャツを着た男性がバイオリンを弾いています。',
       'マヤ遺跡を訪れている間、ガイドはより多くのアトラクションの方向を指し示します。',
       'ハト、モーターサイクリスト、ベンチの上の女性のいる通り。'], dtype=object)

In [5]:
# Count how many rows carry each distinct value of the `label` column.
df['label'].value_counts()

contradiction    182964
entailment       182583
neutral          182467
label                 1
Name: label, dtype: int64

In [6]:
# Drop every row that has a missing value in any column; `df` itself is left
# unchanged and the filtered rows are bound to `df2`.
df2 = df.dropna(axis=0, how="any")

In [7]:
# Display the unfiltered frame (`df`, not the NaN-filtered `df2`).
df

Unnamed: 0,label,pre,hypo
0,label,pre,hypo
1,neutral,ガレージで、壁にナイフを投げる男。,男は魔法のショーのためにナイフを投げる行為を練習しています。
2,contradiction,茶色のドレスを着た女性がベンチに座っています。,女性が畑で踊っています。
3,contradiction,ラップトップコンピューターを使用して机に座っている若い白人男。,黒人はデスクトップコンピューターを使用します。
4,entailment,海の波に倒れる男,海に転がる男。
...,...,...,...
548010,entailment,犬がパドリングプールに飛び込むと、ホースで潮吹きされます。,犬がジャンプします。
548011,entailment,ブルゴーニュのシャツと黒いズボンを着た男が、楽器を持っている人形を操っています。,男は人形と楽器を使って仕事をしています。
548012,contradiction,口ひげ、フェイスペイント、それに合った赤い帽子とシャツを着た男性がバイオリンを弾いています。,男は仕事を失った後、口ひげを剃ります。
548013,neutral,マヤ遺跡を訪れている間、ガイドはより多くのアトラクションの方向を指し示します。,人々はもっとやることを探している観光客です。


In [8]:
# Per-label row counts of the NaN-filtered frame `df2`.
df2['label'].value_counts()

contradiction    182963
entailment       182583
neutral          182467
label                 1
Name: label, dtype: int64

In [9]:
from sentence_transformers import SentenceTransformer
from sentence_transformers import models ,losses
from sentence_transformers.readers import InputExample
from sentence_transformers.losses import TripletDistanceMetric, SoftmaxLoss
from sentence_transformers.evaluation import TripletEvaluator
from sentence_transformers.readers import TripletReader
from sentence_transformers.datasets import SentencesDataset
from torch.utils.data import DataLoader

In [10]:
# Class names in id order: position in this list is the integer class id.
label = ["entailment", "neutral", "contradiction"]
label2id = dict( zip(label, range(len(label))) )

def make_dataset_snli(df, max_n = 300):
    """Convert sentence-pair rows into a list of ``InputExample`` objects.

    Rows are read in order from ``df.label``, ``df.pre`` and ``df.hypo``.
    A row is skipped when its label is not a key of ``label2id`` or when
    ``max_n`` examples with that label have already been collected.

    Args:
        df: Object exposing iterable ``label``, ``pre`` and ``hypo``
            attributes of equal length (e.g. a pandas DataFrame).
        max_n: Maximum number of examples to keep per label.

    Returns:
        List of ``InputExample(texts=[pre, hypo], label=<class id>)`` in the
        order the accepted rows appear in ``df``.
    """
    X = []
    # One counter per known label, sized from the mapping instead of being
    # hard-coded so it stays in sync with `label`.
    each_label_num = [0] * len(label2id)
    for la, pre, hypo in zip(df.label, df.pre, df.hypo):
        label_id = label2id.get(la)
        if label_id is None:
            # Unknown label value: not one of the classes in `label`.
            continue
        if each_label_num[label_id] >= max_n:
            continue
        X.append( InputExample(texts=[pre, hypo], label=label_id ) )
        each_label_num[label_id] += 1
        # Once every label has reached its quota no later row can be accepted,
        # so stop instead of scanning the rest of the frame.
        if all(count >= max_n for count in each_label_num):
            break
    return X


In [11]:
# Build the example list from the NaN-filtered frame with a per-label cap of 500,000.
X = make_dataset_snli(df2, max_n=500_000)

In [12]:
# Total number of examples that were built.
len(X)

548013

In [13]:
# Split the examples with train_size=0.8; random_state is fixed so the split
# is repeatable.
X_train, X_test = train_test_split(
    X,
    train_size=0.8,
    random_state=4,
)

In [14]:
from sentence_transformers import SentenceTransformer
from sentence_transformers import models

# Prefix prepended to the checkpoint name when loading the transformer.
download_path = "../../corpus/"
# download_path = ""

# Token-level encoder loaded from the checkpoint path.
transformer = models.Transformer(
    download_path + 'cl-tohoku/bert-base-japanese-whole-word-masking'
)

# Pooling layer with only the mean-token mode enabled; CLS and max modes are off.
pooling = models.Pooling(
    transformer.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False,
)

# Sentence encoder = transformer followed by the pooling layer.
model = SentenceTransformer(modules=[transformer, pooling])

# Smoke test: encode two sentences and print the shape of each embedding.
sentences = ['吾輩は猫である', '本日は晴天なり']
embeddings = model.encode(sentences)

for i, embedding in enumerate(embeddings):
    print("[%d] : %s" % (i, embedding.shape))

Some weights of the model checkpoint at ../../corpus/cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[15263] 2022-01-20 11:43:26,037 Info sentence_transformers.SentenceTransformer :Use pytorch device: cuda


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[0] : (768,)
[1] : (768,)


In [15]:
# Example count again (same expression as an earlier cell).
len(X)

548013

In [16]:
# Training hyperparameters and the directory the trained model is saved to.
BATCH_SIZE = 64
NUM_EPOCHS = 2
EVAL_STEPS = 1000
OUTPUT_PATH = "../../corpus/sbert_snli2"

# Warm-up length: 10% of the number of full batches in X_train.
# NOTE(review): this counts a single pass over X_train and is not multiplied
# by NUM_EPOCHS — confirm that is the intended warm-up length.
full_batches = len(X_train) // BATCH_SIZE
WARMUP_STEPS = int(full_batches * 0.1)

In [17]:
# Wrap the training examples and batch them in shuffled order.
train_data = SentencesDataset(X_train, model=model)
train_dataloader = DataLoader(
    train_data,
    batch_size=BATCH_SIZE,
    shuffle=True,
)

# Softmax loss with one output class per entry in label2id.
embedding_dim = model.get_sentence_embedding_dimension()
train_loss = losses.SoftmaxLoss(
    model=model,
    sentence_embedding_dimension=embedding_dim,
    num_labels=len(label2id),
)

[15263] 2022-01-20 11:43:30,309 Info sentence_transformers.losses.SoftmaxLoss :Softmax loss: #Vectors concatenated: 3


In [18]:
# Train on the single (dataloader, loss) objective and save to OUTPUT_PATH.
# NOTE(review): no `evaluator` argument is passed in this call — confirm
# whether `evaluation_steps` has any effect without one.
train_objectives = [(train_dataloader, train_loss)]
model.fit(
    train_objectives=train_objectives,
    epochs=NUM_EPOCHS,
    evaluation_steps=EVAL_STEPS,
    warmup_steps=WARMUP_STEPS,
    output_path=OUTPUT_PATH,
)

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6851 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6851 [00:00<?, ?it/s]

[15263] 2022-01-20 13:10:52,530 Info sentence_transformers.SentenceTransformer :Save model to ../../corpus/sbert_snli2


In [19]:
import torch

# Report whether PyTorch can see a CUDA device.
torch.cuda.is_available()

True