In [1]:
from parlai.core.params import ParlaiParser
from parlai.core.dict import DictionaryAgent
from parlai.scripts.train_model import setup_args as tm_setupargs
from parlai.scripts.train_model import TrainLoop
from parlai.core.agents import create_agent
from parlai.core.worlds import create_task
from itertools import islice
from ja_sentpiece_tokenizer import FullTokenizer
import json
import time
import os

In [2]:
SHARED = {}


def setup_interactive(shared):
    """Create the options, tokenizer, ranker agent and world for interactive use.

    Everything is stored in ``shared`` (the notebook passes the module-level
    ``SHARED`` dict) under the keys ``'opt'``, ``'tokenizer'``, ``'agent'``
    and ``'world'``.

    Args:
        shared: mutable mapping that receives the objects listed above.

    Returns:
        None. The function is called for its effect on ``shared``.
    """
    candidates_path = '/installation/~/ParlAI/data/rachel/topical_cands.json'

    parser = tm_setupargs()
    parser.set_params(batchsize=5, fixed_candidates_path=candidates_path)
    # print_args=True echoes the options as parsed here, i.e. *before* the
    # overrides below are applied (the recorded output shows task/model: None).
    opt = parser.parse_args([], print_args=True)
    shared['opt'] = opt

    # Options forced on top of the parsed defaults. Commented-out entries are
    # earlier experiments kept for reference.
    overrides = {
        # 'task': 'projects.jp_dialogue.tasks.agents',
        'task': 'parlai.agents.local_human.local_human:LocalHumanAgent',
        'model': 'projects.jp_dialogue.jp_retrieval.retrieval_agents:BertJPRanker',
        # 'dict_class': 'projects.jp_dialogue.jp_retrieval.retrieval_agents:UniBiDictionaryAgent',
        # 'dict_build_first': True,
        # 'dict_minfreq': 5,
        'model_file': '/installation/~/ParlAI/data/models/rachel/bibert_poly_ranker',
        # NOTE(review): the recorded run still printed "[ Using CUDA ]" and
        # produced cuda:0 tensors, so this value may be superseded by options
        # stored alongside the model file -- confirm.
        'no_cuda': True,
        'history_size': 1,
        'truncate': -1,
        'interactive_mode': True,
        'candidates': 'batch',
        'eval_candidates': 'fixed',
        'encode_candidate_vecs': True,
        'fixed_candidates_path': candidates_path,
        'fixed_candidate_vecs': 'fixed',
        # 'scoring_func': 'scaled',
        # 'embedding_size': 300,
        # 'h_layer_num': 3,
        # 'h_dim': 512,
        # 'h_act_func': 'swish',
        # 'linear_dim': 1024,
        # 'n_heads': 12,
        # 'n_layers': 1,
        # 'ffn_size': 3072,
        # 'dropout': 0.1,
        # 'attention_dropout': 0.1,
        # 'relu_dropout': 0.0,
        # 'learn_positional_embeddings': True,
        # 'embeddings_scale': False,
        # 'activation': 'gelu',
        # 'variant': 'xlm',
        # 'output_scaling': 1.0,
        # 'eval_candidates': '',
        # 'learningrate': 5e-05,
        # 'momentum': 0,
        # 'optimizer': 'sgd',
        'lr_scheduler': 'reduceonplateau',
        'out_dim': 768,
        'add_transformer_layer': False,
        'pull_from_layer': -1,
        'bert_aggregation': 'mean',
        'dict_maxexs': 0,
        'train_folder': 'rachel',
    }
    # Item assignment (rather than dict.update) mirrors how the options were
    # originally set, one key at a time.
    for key, value in overrides.items():
        opt[key] = value

    # Create model and assign it to the specified task
    shared['tokenizer'] = FullTokenizer(opt['datapath'] + '/models/')
    shared['agent'] = create_agent(opt, requireModelExists=True)
    shared['world'] = create_task(opt, [shared['agent']])

In [3]:
def get_responses(agent, text, topic):
    """Rank the fixed candidates stored for ``topic`` against ``text``.

    The utterance is observed by ``agent``, batchified, and scored against the
    agent's per-topic fixed candidates. Scoring runs under ``torch.no_grad()``
    because this is inference only.

    Args:
        agent: ranker agent exposing ``observe``, ``batchify``, ``observation``,
            ``model``, ``score_candidates``, ``block_repeats``, ``rank_top_k``,
            ``opt``, ``eval_candidates`` and the topic-indexed containers
            ``fixed_candidates``, ``fixed_candidate_vecs`` and
            ``fixed_candidate_encs``.
        text: utterance placed in the observation's ``'text'`` field.
        topic: key used to index the agent's per-topic candidate containers;
            it is also stored in the observation.

    Returns:
        ``(preds, cand_preds)``: ``cand_preds`` holds, per batch row, the
        ranked candidates capped at ``agent.opt['cap_num_predictions']``;
        ``preds`` holds the first entry of each of those lists.

    Raises:
        ValueError: if the topic's candidate vectors are neither 2-D nor 3-D.
        IndexError: if a batch row ends up with no candidate at all.
    """
    # Local import: the notebook's import cell does not import torch itself.
    import torch

    # Topic is supplied by the caller; automatic detection is disabled:
    # topic = classify(text)
    obs = {"text": text, 'episode_done': False, 'topic': topic}
    agent.observe(obs)
    batch = agent.batchify([agent.observation])
    batchsize = (
        batch.text_vec.size(0)
        if batch.text_vec is not None
        else batch.image.size(0)
    )
    agent.model.eval()
    cands = agent.fixed_candidates[topic]
    cand_vecs = agent.fixed_candidate_vecs[topic]
    cand_encs = agent.fixed_candidate_encs[topic]

    # Inference only: avoid recording an autograd graph for the scores.
    with torch.no_grad():
        scores = agent.score_candidates(batch, cand_vecs, cand_encs=cand_encs)

        if agent.rank_top_k > 0:
            _, ranks = scores.topk(
                min(agent.rank_top_k, scores.size(1)), 1, largest=True
            )
        else:
            _, ranks = scores.sort(1, descending=True)

    # sorted_scores = F.softmax(sorted_scores, dim=-1)

    ranks = ranks.cpu()
    max_preds = agent.opt['cap_num_predictions']
    vec_dims = cand_vecs.dim()
    cand_preds = []
    for i, ordering in enumerate(ranks):
        if vec_dims == 2:
            # One candidate list shared by every batch row.
            cand_list = cands
        elif vec_dims == 3:
            # One candidate list per batch row.
            cand_list = cands[i]
        else:
            raise ValueError(
                'candidate vectors for topic {!r} must be 2-D or 3-D, got '
                '{}-D'.format(topic, vec_dims)
            )
        # using a generator instead of a list comprehension allows
        # to cap the number of elements.
        cand_preds_generator = (
            cand_list[rank] for rank in ordering.tolist() if rank < len(cand_list)
        )
        cand_preds.append(list(islice(cand_preds_generator, max_preds)))

    if (
        agent.opt.get('repeat_blocking_heuristic', True)
        and agent.eval_candidates == 'fixed'
    ):
        cand_preds = agent.block_repeats(cand_preds)

    preds = [cand_preds[i][0] for i in range(batchsize)]
    return preds, cand_preds

In [4]:
# Populate SHARED with the parsed options ('opt'), the tokenizer, the ranker
# agent and the world. The parsed arguments are printed as a side effect.
setup_interactive(SHARED)

[ Main ParlAI Arguments: ] 
[  batchsize: 5 ]
[  datapath: d:\installation\~\parlai\data ]
[  datatype: train ]
[  download_path: d:\installation\~\parlai\downloads ]
[  hide_labels: False ]
[  image_mode: raw ]
[  init_opt: None ]
[  multitask_weights: [1] ]
[  numthreads: 1 ]
[  show_advanced_args: False ]
[  task: None ]
[ ParlAI Model Arguments: ] 
[  dict_class: None ]
[  init_model: None ]
[  model: None ]
[  model_file: None ]
[ Training Loop Arguments: ] 
[  aggregate_micro: False ]
[  display_examples: False ]
[  eval_batchsize: None ]
[  evaltask: None ]
[  load_from_checkpoint: False ]
[  max_train_time: -1 ]
[  metrics: default ]
[  num_epochs: -1 ]
[  save_after_valid: False ]
[  save_every_n_secs: -1 ]
[  short_final_eval: False ]
[  validation_cutoff: 1.0 ]
[  validation_every_n_epochs: -1 ]
[  validation_every_n_secs: -1 ]
[  validation_max_exs: -1 ]
[  validation_metric: accuracy ]
[  validation_metric_mode: None ]
[  validation_patience: 10 ]
[  validation_share_agent

  warn_once("Installing APEX can give a significant speed boost.")


[ Using CUDA ]
Dictionary: loading dictionary from /installation/~/ParlAI/data/models/rachel/bibert_poly_ranker.dict
[ num words =  9 ]
[BertJPRanker: full interactive mode on.]
Total parameters: 222415872
Trainable parameters:  222415872
Loading existing model parameters from /installation/~/ParlAI/data/models/rachel/bibert_poly_ranker
[ Loading fixed candidate set from /installation/~/ParlAI/data/rachel/topical_cands.json ]
[ Loading fixed candidate set vectors from /installation/~/ParlAI/data/models/rachel\bibert_poly_ranker.topical_cands.vecs ]
[ Loading fixed candidate set encodings from /installation/~/ParlAI/data/models/rachel\bibert_poly_ranker.topical_cands.encs ]




[creating task(s): parlai.agents.local_human.local_human:LocalHumanAgent]
Enter [DONE] if you want to end the episode.

Enter [DONE] if you want to end the episode.

Enter [DONE] if you want to end the episode.

Enter [DONE] if you want to end the episode.

Enter [DONE] if you want to end the episode.

Enter [DONE] if you want to end the episode.



In [5]:
# Run the raw utterance through the tokenizer's parse(), then rank the fixed
# candidates stored under the "dream" topic against the result.
text = SHARED['tokenizer'].parse("夢のこと話しましょう")
preds, cand_preds = get_responses(SHARED['agent'], text, "dream")
# preds: first-ranked candidate per batch row.
print(preds)
# cand_preds: the ranked candidate list per batch row (can be very long).
print(cand_preds)

['▁ 夢 について ですか 。 いい です ね 。 あなた の 夢 が 知り たい です 。']
[['▁ 夢 について ですか 。 いい です ね 。 あなた の 夢 が 知り たい です 。', '▁ は い 。 宇宙 に行く の が 私の 夢 です 。', '▁今 は お金 がある 人 しか 行 け ません が 、 後 何 年 か で 自由に 宇宙 に 行 ける 日 が 来 そう です 。', '▁ は い 。 宇宙 に行く ことは 私の 夢 です 。', '▁ は い 。 とにかく どんな 世界 なのか 見て みたい です 。', '▁ いい です ね 。 そういう 話 好き です 。', '▁それは すごい です ね 。', '▁ そう ですか 。 私の 夢 は 宇宙 に行く ことで す 。', '▁ 解明 されて ないこと が 、 たくさん あって お もし ろう そう じゃない ですか 。', '▁ は い 。 宇宙 に行く ことが 私の 夢 です 。', '▁ そう です ね 。 お金 があれば もう 行 け ます し 。', '▁それは 行って みて から 考え ます 。 とにかく 行って みたい です ね 。', '▁ あ ー 。 私 も 宇宙 は大 好き です 。', '▁ すごい 。 か っこ いい です ね 。', '▁ そう ですか 。 私は 新しい もの を見て みたい です 。', '▁ え 。 言う の 恥 ず か しい です 。 先に あなた の 夢 を教え て ください 。', '▁韓国 が お 好きな んです ね 。', '▁ 分かり ました 。 私の 夢 はず ば り 宇宙 に行く ことで す 。', '▁ ふ ふ ふ 。 私は 宇宙 好きな んです 。', '▁ 宇宙 から 地球 を見て みたい です 。', '▁ 言語 がある のか も 謎 です ね 。', '▁それは ずる い です ね 。', '▁ は い 、 聞き たい です 。', '▁ そうな んです か 。 自分 探し 中 です ね 。', '▁ ない です ね 。 いつか 行って みたい です 。', '▁ すごい です ね 。 山 登り が 好きな んです ね 。', '▁ すごい です ね 。 その 話 もっと 聞かせ て ください 。', '▁ は

In [9]:
# Inspect the value of the 'person_tokens' option on the created agent.
SHARED['agent'].opt['person_tokens']

True

In [7]:
# Replay the first steps of get_responses by hand so the intermediate batch and
# candidate tensors can be inspected. The text here is already space-separated
# and is not passed through SHARED['tokenizer'].parse().
# NOTE(review): the recorded batch's 'full_text' also contains the earlier
# utterance, so the agent appears to prepend dialogue history -- confirm this is
# intended.
SHARED['agent'].observe({"text": "夢 は 何 です か", 'episode_done': False, 'topic': "dream"})
batch = SHARED['agent'].batchify([SHARED['agent'].observation])
SHARED['agent'].model.eval()
# Candidate vectors / encodings stored for the 'dream' topic.
vecs = SHARED['agent'].fixed_candidate_vecs['dream']
encs = SHARED['agent'].fixed_candidate_encs['dream']

In [8]:
# Display the batch object built by batchify.
batch

Batch(text_vec=tensor([[    4,  4764,  4764,     0,  4764,  4764,     9,  1392,  3361,  9016,
         24438,     5,  4764,  4764,     0,  4764,  4764,  1392,    11,  1059,
             0,    95,     5]], device='cuda:0'), text_lengths=[23], label_vec=None, label_lengths=None, labels=None, valid_indices=[0], candidates=None, candidate_vecs=None, image=None, observations=[{'text': '夢 は 何 です か', 'episode_done': False, 'topic': 'dream', 'full_text': '__p1__ ▁ 夢 のこと 話し ましょう\n__p1__ 夢 は 何 です か', 'text_vec': tensor([    4,  4764,  4764,     0,  4764,  4764,     9,  1392,  3361,  9016,
        24438,     5,  4764,  4764,     0,  4764,  4764,  1392,    11,  1059,
            0,    95,     5]), 'added_start_end_tokens': True}], topics=['dream'])

In [None]:
# NOTE(review): `world` is never assigned in the visible notebook (setup stores
# the world under SHARED['world']), so this unexecuted cell fails on a fresh
# kernel as written -- confirm the intended object.
# Request the batched act of agent index 0, passing None as its observation.
batch_act = world.batch_act(0, None)

In [None]:
# Display the agents of the wrapped world (`world.world`).
# NOTE(review): relies on a top-level `world` that the visible notebook never
# assigns -- confirm.
world.world.get_agents()

In [None]:
# Display the batched act.
batch_act

In [None]:
# Hand the batched act to agent index 1 and keep what batch_observe returns.
# Presumably the arguments are (observing index, acts, acting index) -- verify
# against the world implementation. Relies on the unassigned top-level `world`.
batch_obs = world.batch_observe(1, batch_act, 0)

In [None]:
# Display the batched observations.
batch_obs

In [None]:
# Take the second agent of the wrapped world; the following cells treat it as
# the model agent. Relies on the unassigned top-level `world` -- confirm.
model_agent = world.world.get_agents()[1]

In [None]:
# Keep a handle on the agent's `dict` attribute. Despite the variable name this
# is presumably a dictionary instance rather than a class -- confirm.
dict_class = model_agent.dict

In [None]:
# Look up each id of a hand-written vector in the dictionary's `ind2bi` table
# and print the entry. `ind2bi` presumably maps index -> bigram token; verify.
v =[   1, 1869,  378,    5,    9,    2]
for t in v:
    print(dict_class.ind2bi[t])

In [None]:
# Rebind `batch` (also assigned in an earlier cell) to the batch built from the
# world's batched observations.
batch = model_agent.batchify(batch_obs)

In [None]:
# Display the current batch.
batch

In [None]:
# Build the candidate set from the batch itself in train mode. This calls a
# private agent method (leading underscore), so it may change without notice.
cands, cand_vecs, label_inds = model_agent._build_candidates(
            batch, source='batch', mode='train'
        )

In [None]:
# Display the label indices returned by _build_candidates.
label_inds

In [None]:
# Display the candidate vectors returned by _build_candidates.
cand_vecs

In [None]:
# Size of text_vec's first dimension (get_responses uses this as the batch size).
batch.text_vec.size(0)

In [None]:
# Call the model directly on the text vectors and candidate vectors.
# NOTE(review): this expects `bi_text_vec` on the batch and 'uni' / 'bi' keys in
# cand_vecs; the Batch displayed earlier in this notebook has no bi_text_vec
# field, so this cell appears to target a different (unigram + bigram) model --
# confirm before running.
scores = model_agent.model(batch.text_vec, batch.bi_text_vec, cand_vecs['uni'], cand_vecs['bi'])

In [None]:
# Display the candidate vectors again.
cand_vecs

In [None]:
# Short aliases for the model inputs: x_* are the context vectors from the
# batch, y_* the candidate vectors. The names suggest unigram / bigram variants
# -- presumably; verify against the model.
x_uni = batch.text_vec
x_bi = batch.bi_text_vec
y_uni = cand_vecs['uni']
y_bi = cand_vecs['bi']

In [None]:
# Encode contexts and candidates; unsqueeze(1) inserts a size-1 dimension at
# index 1 of each candidate tensor before it is passed to encode().
x_emb, y_emb = model_agent.model.encode(x_uni, x_bi, y_uni.unsqueeze(1), y_bi.unsqueeze(1))

In [None]:
# Display the candidate embeddings.
y_emb

In [None]:
# Shape experiment: expand y_emb to (y_emb.size(0), x_uni.size(0), <last dim>),
# swap the first two dimensions and materialise the result. Nothing is assigned.
y_emb.expand(y_emb.size(0), x_uni.size(0), -1).transpose(0,1).contiguous()

In [None]:
# Unpacking three sizes after unsqueeze(1) means y_uni is expected to be 2-D
# here; cand_num is therefore always 1 (the inserted dimension).
bsz, cand_num, seq_len = y_uni.unsqueeze(1).shape

In [None]:
# Same shape experiment as before, but first reshaping y_emb to
# (bsz, cand_num, <rest>) with view(). Nothing is assigned.
y_emb.view(bsz, cand_num, -1).expand(y_emb.size(0), x_uni.size(0), -1).transpose(0,1).contiguous()

In [None]:
# Run the candidate-side encoders directly; each input is flattened to
# (bsz * cand_num, <rest>) before being passed in.
y_uni_emb = model_agent.model.y_unigram_encoder(y_uni.unsqueeze(1).view(bsz*cand_num, -1))
y_bi_emb = model_agent.model.y_bigram_encoder(y_bi.unsqueeze(1).view(bsz*cand_num, -1))

In [None]:
# Display the shape of the y_unigram_encoder output.
y_uni_emb.shape

In [None]:
# Display the model's `C` attribute (its meaning is not visible in this
# notebook -- TODO document).
model_agent.model.C

In [None]:
# Shape experiment: give x_emb a size-1 dimension at index 1 and expand it to
# the shape of the reshaped / expanded / transposed candidate embeddings.
x_emb.unsqueeze(1).expand(y_emb.view(bsz, cand_num, -1).expand(y_emb.size(0), x_uni.size(0), -1).transpose(0,1).contiguous().shape)

In [None]:
# Rebinds y_emb in place of the encoder output, so re-running this cell operates
# on the already-transformed tensor (not idempotent).
y_emb = y_emb.expand(cand_num, bsz, -1).transpose(0,1).contiguous()

In [None]:
# Display the shape of the rebound y_emb.
y_emb.shape

In [1]:
from transformers import BertTokenizer

In [2]:
# Load a BertTokenizer from a local vocabulary file, with the
# tokenize_chinese_chars option turned off.
tokenizer = BertTokenizer.from_pretrained('/installation/~/ParlAI/data/models/bert_models/bert-wiki-ja/vocab.txt', tokenize_chinese_chars=False)

In [3]:
# Tokenize a pre-segmented sentence and map the tokens to vocabulary ids.
# NOTE(review): the recorded output contains None entries, i.e. some tokens
# were not mapped to any id -- presumably the vocab file lacks an entry for the
# tokenizer's unknown token; verify.
tokenizer.convert_tokens_to_ids(tokenizer.tokenize('そっち かぁ 。 どっち に しろ あめ は ふる な だ な 。'))

[None, None, 8, None, 17, 9444, None, 11, 14045, 57, 40, 57, 8]