In [None]:
!pip install transformers elasticsearch 

import numpy as np 
from transformers import AutoTokenizer, AutoModel 
from elasticsearch import Elasticsearch 
import torch 

# Configure the Elasticsearch client: endpoint plus basic-auth credentials
# (the host and credentials below are placeholders to be replaced).
# NOTE: verify_certs=False turns off TLS certificate verification.
es = Elasticsearch(
    hosts=['https://host:port'],
    verify_certs=False,
    http_auth=('username', 'password'),
)
 

# Mapping for the dense vector field that holds each document's embedding.
mapping = {
    'properties': {
        'embedding': {
            'type': 'dense_vector',
            'dims': 768,  # number of dimensions of the dense vector field
            'index': True,  # a real boolean rather than the string 'true'
            'similarity': 'cosine',
        }
    }
}

# Create the 'jokes-index' index, passing the mapping as its 'mappings' body.
es.indices.create(
    index='jokes-index',
    body=dict(mappings=mapping),
)

# Joke dataset to be indexed. Each entry becomes a dict with a 'text' and a
# 'category' key, built from the (text, category) pairs listed below.
# NOTE: the list intentionally mirrors the original data, including repeated
# jokes (the "impasta" entry appears twice verbatim; the tomato and scarecrow
# jokes appear twice under different categories / wording).
jokes = [
    {'text': joke_text, 'category': joke_category}
    for joke_text, joke_category in (
        ('Why do cats make terrible storytellers? Because they only have one tail.', 'cat'),
        ('What did the cat say when he lost all his money? I am paw.', 'cat'),
        ('Why don\'t cats play poker in the jungle? Too many cheetahs.', 'cat'),
        ('Why did the tomato turn red? Because it saw the salad dressing!', 'vegetable'),
        ('Why did the scarecrow win an award? Because he was outstanding in his field.', 'farm'),
        ('Why did the hipster burn his tongue? Because he drank his coffee before it was cool.', 'hipster'),
        ('Why did the tomato turn red? Because it saw the salad dressing!', 'food'),
        ('Why did the scarecrow win an award? Because he was out-standing in his field!', 'puns'),
        ('What do you call a fake noodle? An impasta!', 'food'),
        ('What do you call a belt made out of watches? A waist of time!', 'puns'),
        ('Why did the math book look sad? Because it had too many problems!', 'math'),
        ('Why did the gym close down? It just didn\'t work out!', 'exercise'),
        ('Why don\'t scientists trust atoms? Because they make up everything!', 'science'),
        ('What do you call a fake noodle? An impasta!', 'food'),
        ('Why did the chicken cross the playground? To get to the other slide!', 'kids'),
        ('Why did the frog call his insurance company? He had a jump in his car!', 'puns'),
    )
]

# Load the BERT tokenizer and the BERT model, both from the
# 'bert-base-uncased' checkpoint (tokenizer first, then model).
tokenizer, model = (
    loader.from_pretrained('bert-base-uncased')
    for loader in (AutoTokenizer, AutoModel)
)

# Generate an embedding for every joke with BERT: the model's
# last_hidden_state is averaged over dim 1 (presumably the token axis),
# the leading batch dimension is dropped, and the result is stored on the
# joke under 'embedding' as a plain Python list.
with torch.no_grad():  # inference only, no gradients needed
    for joke in jokes:
        text = joke['text']
        inputs = tokenizer(text, truncation=True, padding=True, return_tensors='pt')
        output = torch.mean(model(**inputs).last_hidden_state, dim=1).squeeze(0).numpy()
        joke['embedding'] = output.tolist()

# Index every joke document into Elasticsearch.
for joke in jokes:
    es.index(index='jokes-index', body=joke)

# Newly indexed documents only become searchable after a refresh. Refresh
# explicitly so a search issued right after indexing sees all of them
# instead of racing the periodic background refresh.
es.indices.refresh(index='jokes-index')

# Build the query vector: define the query text, tokenize it, run it through
# BERT and average last_hidden_state over dim 1, yielding a NumPy array.
query = "What do you get when you cross a snowman and a shark?"
inputs = tokenizer(query, truncation=True, padding=True, return_tensors='pt')
with torch.no_grad():  # inference only, no gradients needed
    output = torch.mean(model(**inputs).last_hidden_state, dim=1).squeeze(0).numpy()
query_vector = output

# Elasticsearch kNN search request: look up the 3 nearest neighbours of the
# query vector in the 'embedding' field, considering 100 candidates, and
# request the 'text' field in the response.
search = dict(
    knn=dict(
        field="embedding",
        query_vector=query_vector.tolist(),
        k=3,
        num_candidates=100,
    ),
    fields=["text"],
)

# Run the kNN search against 'jokes-index' and print the text of each hit.
response = es.search(body=search, index='jokes-index')
for hit in response['hits']['hits']:
    print(f"Joke: {hit['_source']['text']}")
