https://www.ogis-ri.co.jp/otc/hiroba/technical/similar-document-search/part9.html

In [14]:
import transformers
from sentence_transformers import SentenceTransformer
from sentence_transformers import models

# SentenceBERT uses BertTokenizer as a hard-coded tokenizer class, so it is
# replaced here with the Japanese tokenizer.
transformers.BertTokenizer = transformers.BertJapaneseTokenizer

transformer = models.Transformer('cl-tohoku/bert-base-japanese-whole-word-masking')
# Pool by averaging the token embeddings; CLS and max pooling are switched off.
pooling = models.Pooling(
    transformer.get_word_embedding_dimension(),
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False,
    pooling_mode_mean_tokens=True,
)
model = SentenceTransformer(modules=[transformer, pooling])

# Convert sentences into fixed-length vectors.
sentences = ["吾輩は猫である", "本日は晴天なり"]
embeddings = model.encode(sentences)

# Show the shape of each sentence embedding.
for i, embedding in enumerate(embeddings):
    print("[%d]:%s" % (i, embedding.shape))

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[0]:(768,)
[1]:(768,)


In [15]:
#学習データセットの準備
!git clone https://github.com/STAIR-Lab-CIT/STAIR-captions
!tar zxvf STAIR-captions/stair_captions_v1.2.tar.gz
!ls -lh *.json

import json
with open("stair_captions_v1.2_val.json", "r") as f:
  json_data_val = json.load(f)
with open("stair_captions_v1.2_train.json", "r") as f:
  json_data_train = json.load(f)

fatal: destination path 'STAIR-captions' already exists and is not an empty directory.
x stair_captions_v1.2_train.json
x stair_captions_v1.2_train_tokenized.json
x stair_captions_v1.2_val.json
x stair_captions_v1.2_val_tokenized.json
-rw-r--r--  1 gentle  staff    66M Jan 26  2018 stair_captions_v1.2_train.json
-rw-r--r--  1 gentle  staff   105M Jan 26  2018 stair_captions_v1.2_train_tokenized.json
-rw-r--r--  1 gentle  staff    32M Jan 26  2018 stair_captions_v1.2_val.json
-rw-r--r--  1 gentle  staff    51M Jan 26  2018 stair_captions_v1.2_val_tokenized.json


In [16]:
# dataset:  dict keyed by image ID; each value is the list of
#           (caption ID, caption string) tuples for the five captions
#           attached to that image.
# ids:      list of caption IDs.
# captions: list of caption strings.

dataset = {}
ids, captions = [], []


def build_dataset(dataset, json_data):
    """Merge the annotations of one JSON split into the caption containers.

    Every entry of ``json_data["annotations"]`` is appended, in order, to the
    per-image list in ``dataset`` and to the module-level ``ids`` and
    ``captions`` lists (which therefore stay index-aligned with each other).

    Args:
        dataset: dict mapping image ID -> list of ``(caption_id, caption)``
            tuples; updated in place.
        json_data: dict with an ``"annotations"`` list whose items provide
            the keys ``"image_id"``, ``"id"`` and ``"caption"``.

    Returns:
        None. ``dataset``, ``ids`` and ``captions`` are mutated.
    """
    for anno in json_data["annotations"]:
        image_id = anno["image_id"]
        caption_id, text = anno["id"], anno["caption"]
        # Create the per-image list on first sight of the image, then extend it.
        dataset.setdefault(image_id, []).append((caption_id, text))
        ids.append(caption_id)
        captions.append(text)

# Register the validation split first, then the training split; ids and
# captions are filled in that order.
build_dataset(dataset, json_data=json_data_val)
build_dataset(dataset, json_data=json_data_train)

# Conversion table from a caption ID to its index in ids / captions.
id2idx = dict(zip(ids, range(len(ids))))

In [17]:
import numpy as np
import spacy
import pkg_resources, imp

# Convert each caption string into a sentence vector by taking the average of
# its word vectors (doc.vector).
import importlib

# importlib.reload replaces the deprecated imp.reload (the imp module was
# removed in Python 3.12).
# NOTE(review): the reload is presumably needed so a freshly installed
# ja_ginza package is discovered without restarting the kernel - confirm.
importlib.reload(pkg_resources)
nlp = spacy.load("ja_ginza")

# Stream the captions through nlp.pipe, which processes texts in batches,
# instead of invoking the pipeline once per caption. The "ner" component is
# disabled exactly as before; one vector is produced per caption, in order.
vectors = [doc.vector for doc in nlp.pipe(captions, disable=["ner"])]
del nlp

KeyboardInterrupt: 

In [None]:
# Fixed: the module name was misspelled "skleran", which made this cell fail
# with ModuleNotFoundError before anything else ran.
from sklearn.metrics.pairwise import cosine_similarity
import random
# Cosine similarity


def similarity(id1, id2):
    """Return the cosine similarity between the vectors of two captions.

    Both caption IDs are translated to list positions through ``id2idx`` and
    the corresponding entries of ``vectors`` are compared.

    Args:
        id1: caption ID of the first caption.
        id2: caption ID of the second caption.

    Returns:
        The single element of the 1x1 matrix produced by
        ``cosine_similarity``.

    Raises:
        KeyError: if either ID is missing from ``id2idx``.
    """
    vec1 = vectors[id2idx[id1]]
    vec2 = vectors[id2idx[id2]]
    # cosine_similarity works on 2-D inputs, hence the one-row wrappers.
    return cosine_similarity([vec1], [vec2])[0][0]


def make_triplets(dataset, threshold=0.85, seed=7, max_tries=25):
    """Build (anchor, positive, negative) caption triplets, at most one per image.

    For every image, two of its captions are drawn at random until their
    ``similarity`` reaches ``threshold``; these become the anchor and the
    positive. The negative is the next caption popped from a shuffled pool of
    all caption indices that does not belong to the same image.

    Args:
        dataset: dict mapping image ID -> list of ``(caption_id, caption)``
            tuples. Images with fewer than two captions are skipped, because
            no pair can be drawn from them.
        threshold: minimum similarity for a pair to be accepted as
            anchor/positive.
        seed: value passed to ``random.seed``; this reseeds the global
            ``random`` generator.
        max_tries: retry budget for the positive pair. The search stops once
            the number of draws exceeds this value, i.e. after
            ``max_tries + 1`` draws; the image is then skipped.

    Returns:
        list of dicts with the keys ``image_id``, ``id``, ``id_pos``,
        ``id_neg``, ``caption``, ``caption_pos`` and ``caption_neg``.

    Raises:
        IndexError: if the pool of negative candidates runs empty.
    """
    log_every = 5000  # progress is printed for every 5000th image
    triplets = []
    random.seed(seed)
    neg_candidate_indices = list(range(len(ids)))
    random.shuffle(neg_candidate_indices)

    for i, image_id in enumerate(list(dataset.keys())):
        should_log = i % log_every == 0
        if should_log:
            print("### %d ###" % (image_id))

        image_captions = dataset[image_id]
        if len(image_captions) < 2:
            # random.sample(..., 2) would raise ValueError here.
            continue

        # Pick up the positive pair: draw two captions of this image at random
        # and accept them once their similarity reaches the threshold.
        score = 0.0
        tries = 0
        while score < threshold:
            (anchor_id, caption), (id_pos, caption_pos) = random.sample(image_captions, 2)
            score = similarity(anchor_id, id_pos)
            tries += 1
            if tries > max_tries:
                break
        if score < threshold:
            continue

        # Pick up the negative: pop candidates until one is found that is not
        # a caption of the current image. Popped candidates are never reused.
        own_ids = {cap_id for cap_id, _ in image_captions}
        id_neg = anchor_id
        while id_neg in own_ids:
            id_neg = ids[neg_candidate_indices.pop()]
        caption_neg = captions[id2idx[id_neg]]

        if should_log:
            # The negative similarity is needed only for this message, so it
            # is computed only when the message is actually printed.
            print("  pos:  score: %4.2f [%s]:[%s]" % (score, caption, caption_pos))
            print("  neg:  score: %4.2f [%s]:[%s]" % (similarity(anchor_id, id_neg), caption, caption_neg))

        triplets.append({
            "image_id": image_id,
            "id": anchor_id,
            "id_pos": id_pos,
            "id_neg": id_neg,
            "caption": caption,
            "caption_pos": caption_pos,
            "caption_neg": caption_neg,
        })
    return triplets

In [None]:
from sklearn.model_selection import train_test_split

# Generate the triplets, keep 80% for training and divide the remainder
# evenly into dev and test.
triplets = make_triplets(dataset)
train, dev_test = train_test_split(triplets, random_state=4, train_size=0.8)
dev, test = train_test_split(dev_test, random_state=7, train_size=0.5)

def to_tsv(fname, triplet):
    """Write triplets to ``fname`` as tab-separated text, one triplet per line.

    Each line holds the ``caption``, ``caption_pos`` and ``caption_neg``
    fields of one example, in that order, followed by a newline. The file is
    always encoded as UTF-8 so the output does not depend on the platform
    default encoding. An empty ``triplet`` produces an empty file.

    Args:
        fname: path of the file to create or overwrite.
        triplet: iterable of dicts providing the keys ``caption``,
            ``caption_pos`` and ``caption_neg``.
    """
    # NOTE: fields are written verbatim; a tab or newline inside a caption
    # would break the row layout.
    with open(fname, "w", encoding="utf-8") as f:
        for example in triplet:
            f.write("%s\t%s\t%s\n" % (example["caption"], example["caption_pos"], example["caption_neg"]))

# Write each split to its own TSV file.
to_tsv(fname="triplet_train.tsv", triplet=train)
to_tsv(fname="triplet_dev.tsv", triplet=dev)
to_tsv(fname="triplet_test.tsv", triplet=test)

# Drop the large intermediates now that the splits are on disk.
del triplets, ids, captions, dataset