In [1]:
# Fetch the Elasticsearch 7.9.0 Linux tarball (quietly) and unpack it into the working directory.
!wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.0-linux-x86_64.tar.gz -q
!tar -xzf elasticsearch-7.9.0-linux-x86_64.tar.gz
# Give the unpacked tree to the `daemon` account; the server process is later started
# after an os.setuid(1) call (uid 1 is presumably `daemon` on this image -- confirm).
!chown -R daemon:daemon elasticsearch-7.9.0

In [2]:
!pip install elasticsearch -q
!pip install transformers -q

[K     |████████████████████████████████| 225kB 4.8MB/s 
[K     |████████████████████████████████| 1.0MB 4.4MB/s 
[K     |████████████████████████████████| 3.0MB 12.7MB/s 
[K     |████████████████████████████████| 890kB 39.0MB/s 
[K     |████████████████████████████████| 1.1MB 34.1MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [3]:
from subprocess import Popen, PIPE, STDOUT
from elasticsearch import Elasticsearch
from elasticsearch import helpers
from transformers import BertTokenizer, TFBertModel

import os
import time
import pprint
import numpy as np

In [4]:
# Launch Elasticsearch as a background process.
# - Output goes to a log file instead of a PIPE: nothing in this notebook ever reads the
#   pipe, so the server could stall on a log write once the OS pipe buffer filled up.
# - preexec_fn drops the child to uid 1 before exec, matching the chown done on the
#   install tree earlier in the notebook.
with open('elasticsearch.log', 'wb') as es_log:
  es_server = Popen(
    ['elasticsearch-7.9.0/bin/elasticsearch'],
    stdout = es_log, stderr = STDOUT,
    preexec_fn = lambda: os.setuid(1))

# Wait until the node answers on localhost:9200 so that the following cells also work
# under "Run all". The client may log connection warnings while the node is still booting.
es_probe = Elasticsearch()
es_deadline = time.time() + 180
while not es_probe.ping():
  if es_server.poll() is not None:
    raise RuntimeError(
      'Elasticsearch exited with code %s during start-up; see elasticsearch.log'
      % es_server.returncode)
  if time.time() > es_deadline:
    raise TimeoutError(
      'Elasticsearch did not become reachable within 180 s; see elasticsearch.log')
  time.sleep(2)

In [5]:
# Smoke test: request the root endpoint of the local node; a JSON document with the node
# name and version details indicates that the server is up.
!curl -X GET "localhost:9200/"

{
  "name" : "5f31eadd2030",
  "cluster_name" : "elasticsearch",
  "cluster_uuid" : "7lBIjTG7R16HDVAuF4UA-Q",
  "version" : {
    "number" : "7.9.0",
    "build_flavor" : "default",
    "build_type" : "tar",
    "build_hash" : "a479a2a7fce0389512d6a9361301708b92dff667",
    "build_date" : "2020-08-11T21:36:48.204330Z",
    "build_snapshot" : false,
    "lucene_version" : "8.6.0",
    "minimum_wire_compatibility_version" : "6.8.0",
    "minimum_index_compatibility_version" : "6.0.0-beta1"
  },
  "tagline" : "You Know, for Search"
}


In [6]:
def gen_data(path='/content/gdrive/My Drive/finch/es/free_chat/data/basic.txt'):
  """Yield one Elasticsearch bulk action per question/answer pair stored in `path`.

  Each non-empty line of the file must have the form ``question<SEP>answer``.
  The question is split into single characters, wrapped in ``[CLS]`` / ``[SEP]``,
  mapped to vocabulary ids with the module-level `tokenizer` and passed through the
  module-level `encoder`; element ``[1][0]`` of the encoder result is stored as the
  question embedding.

  Args:
    path: UTF-8 text file to read. Defaults to the Drive location used by this notebook.

  Yields:
    dict: an action for ``helpers.bulk`` with the keys ``_index``, ``question``,
    ``answer`` and ``question_embedding`` (a plain list of floats).

  Raises:
    ValueError: if a non-empty line does not contain exactly one ``<SEP>`` separator.
  """
  # Explicit encoding: the corpus is Chinese text and must not depend on the locale default.
  with open(path, encoding='utf-8') as f:
    for line_no, line in enumerate(f, 1):
      line = line.rstrip()
      if not line:
        # Tolerate blank lines (e.g. a trailing newline block) instead of crashing mid-load.
        continue
      parts = line.split('<SEP>')
      if len(parts) != 2:
        raise ValueError(
          'line %d of %s: expected "question<SEP>answer", got %r' % (line_no, path, line))
      q, a = parts
      # Character-level tokens; convert_tokens_to_ids is a direct vocabulary lookup.
      bert_inp = ['[CLS]'] + list(q) + ['[SEP]']
      bert_inp = tokenizer.convert_tokens_to_ids(bert_inp)
      bert_seg = [0] * len(bert_inp)
      bert_mask = [1] * len(bert_inp)
      # Batch of one; inputs are passed in the order ids, attention mask, segment ids.
      res = encoder([np.asarray([bert_inp]),
                     np.asarray([bert_mask]),
                     np.asarray([bert_seg])])
      yield {
        '_index': 'chatbot',
        'question': q,
        'answer': a,
        # Plain list so the document is JSON-serialisable without relying on
        # ndarray support in the client's serializer.
        'question_embedding': res[1][0].numpy().tolist(),}

In [7]:
# Mount Google Drive at /content/gdrive; gen_data() reads its corpus from a path under
# this mount point, so this cell has to run before the bulk load.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Load the vocabulary and the encoder weights of the `bert-base-chinese` checkpoint.
# `do_lower_case` is the keyword BertTokenizer recognises for lower-casing (not
# `lowercase`); `add_special_tokens` is an encode-time option rather than a constructor
# option, so it is not passed here.
# NOTE(review): the rest of this notebook only calls convert_tokens_to_ids(), which looks
# tokens up as given, so the lower-casing option only matters if tokenize()/encode() is used.
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese',
                                           do_lower_case = True)

# The encoder is used for inference only, hence trainable = False.
encoder = TFBertModel.from_pretrained('bert-base-chinese', trainable = False)

In [9]:
# Connect to the local node and report whether it is reachable.
es = Elasticsearch()
print(es.ping())

# Index schema: the question text plus its 768-dimensional embedding.
# Other fields of the documents (e.g. `answer`) are left to dynamic mapping.
mapping = {
  'properties': {
    'question': {
      'type': 'text',
    },
    'question_embedding': {
      'type': 'dense_vector',
      'dims': 768,
    },
  }
}

# Make the cell re-runnable: drop any index left over from a previous run (a missing
# index is not an error), then create it together with its mapping in a single call.
# Without this, a second run fails because the index already exists, and the bulk load
# would otherwise append a duplicate copy of every document.
es.indices.delete(index='chatbot', ignore=[404])
es.indices.create(index='chatbot', body={'mappings': mapping})

# Stream the actions from gen_data(); the (success_count, errors) result is the cell output.
helpers.bulk(es, gen_data())

True


(366, [])

In [None]:
# Interactive retrieval loop: embed the typed question the same way as the indexed
# questions, rank every document by cosine similarity and print the best answer.
# Submitting an empty line (or interrupting / closing the input) ends the session.
while True:
  try:
    text_inp = input('Input:')
  except (EOFError, KeyboardInterrupt):
    break
  if not text_inp.strip():
    break
  t0 = time.time()
  # Character-level tokens wrapped in [CLS] / [SEP], mapped straight to vocabulary ids.
  bert_inp = ['[CLS]'] + list(text_inp) + ['[SEP]']
  bert_inp = tokenizer.convert_tokens_to_ids(bert_inp)
  bert_seg = [0] * len(bert_inp)
  bert_mask = [1] * len(bert_inp)
  res = encoder([np.asarray([bert_inp]),
                 np.asarray([bert_mask]),
                 np.asarray([bert_seg])])
  # Plain list so the script parameter is JSON-serialisable as-is.
  query_vector = res[1][0].numpy().tolist()
  script_query = {
    'script_score': {
      'query': {'match_all': {}},
      'script': {
        # "+ 1.0" shifts the cosine similarity so the score is never negative.
        'source': "cosineSimilarity(params.query_vector, doc['question_embedding']) + 1.0",
        'params': {'query_vector': query_vector},
      }
    }
  }
  dsl = {
    # Only the top hit is printed, so do not fetch more than one.
    'size': 1,
    'query': script_query,
    '_source': {'excludes': ['question_embedding']},
  }
  hits = es.search(index='chatbot', body=dsl)['hits']['hits']
  if hits:
    print('Output:', hits[0]['_source']['answer'])
  else:
    # E.g. the index is empty; avoid an IndexError on hits[0].
    print('Output:', '(no match)')
  print('%.2f sec' % (time.time() - t0))
  print()

Input:你好




Output: 你好呀
0.44 sec

Input:早上好
Output: 早上好
0.23 sec

Input:晚上好
Output: 晚上好
0.21 sec

Input:再见
Output: 再见
0.21 sec

Input:好久不见
Output: 好久不见
0.20 sec

Input:想死你了
Output: 我也想死你了
0.20 sec

Input:谢谢你
Output: 不用谢
0.21 sec

Input:爱你
Output: 爱你
0.20 sec

Input:我喜欢你
Output: 我也爱你
0.19 sec

Input:我爱你
Output: 我也爱你
0.19 sec

Input:天气
Output: 天气还行 要查具体的吗
0.20 sec

Input:几点了
Output: 要查一下现在的具体时间吗
0.20 sec

Input:几岁了
Output: 你猜呢
0.20 sec

Input:笑话
Output: 白天文明 但不精神 晚上精神 但不文明
0.20 sec

Input:天气
Output: 天气还行 要查具体的吗
0.21 sec

Input:天气好
Output: 适合出门走动走动
0.20 sec

Input:天气糟糕
Output: 最好不要出门
0.19 sec

Input:冷
Output: 多穿衣服哈
0.19 sec

Input:热
Output: 对啊, 热死人了
0.19 sec

Input:热死了
Output: 瞧把你开心的
0.20 sec

Input:激动死了
Output: 发生什么好事了 说来听听
0.20 sec

Input:我伤心了
Output: 怎么了 身体不舒服吗
0.19 sec

Input:厉害了
Output: 怎么了 别害怕 我陪你
0.20 sec

Input:我想回家
Output: 那就回呗
0.20 sec

