# Elastic Cloud  
- [Elastic Cloud](https://cloud.elastic.co)
- [Elastic Cloud Deployments](https://cloud.elastic.co/deployments)

## Hidden Units  
- Each unit contains **weights** and **bias terms** that are updated during training.  
- These values determine how hidden units process inputs and generate outputs.  
- The number of hidden units is a **hyperparameter** that significantly impacts network performance.  
- **NLP Use Case**:  
  - A higher number of hidden units is beneficial as the network can learn more complex representations of input data.  
  - However, this increases computational costs for training and evaluation.

In [20]:
# !pip install transformers elasticsearch 

import os
from getpass import getpass

import numpy as np
import torch
from elasticsearch import Elasticsearch
from transformers import AutoTokenizer, AutoModel

# Define the Elasticsearch connection using credentials supplied at runtime.
#
# Secrets must not be stored in the notebook itself: the password is taken
# from the ES_PASSWORD environment variable and, when that is not set, asked
# for interactively. ES_HOST and ES_USER may override the endpoint and user.
# NOTE(review): the password that used to be hard-coded here should be treated
# as compromised and rotated.
es_host = os.environ.get(
    'ES_HOST',
    'https://ea9fe54e452f481793fff7cbe9f3a720.us-central1.gcp.cloud.es.io:443',
)
es_user = os.environ.get('ES_USER', 'elastic')
es_password = os.environ.get('ES_PASSWORD') or getpass('Elasticsearch password: ')

es = Elasticsearch(
    [es_host],
    http_auth=(es_user, es_password),
    # Keep TLS certificate verification on so the credentials are only ever
    # sent to a server that proves its identity.
    verify_certs=True,
)
 

  es = Elasticsearch(


In [None]:
# Define the mapping of the index that will store the data.
mapping = {
    'properties': {
        'embedding': {
            'type': 'dense_vector',
            'dims': 768,  # Number of dimensions of the dense vector field.
            'index': True,  # A real boolean rather than the string 'true'.
            'similarity': 'cosine',
        }
    }
}

# Create the index with the mapping defined above, but only when it does not
# exist yet: creating an index that already exists raises an error, which
# would make this cell fail every time it is re-run.
if not es.indices.exists(index='jokes-index'):
    es.indices.create(index='jokes-index', body={'mappings': mapping})


In [22]:
# Joke data set to be indexed.
# Every entry is written as a (text, category) pair and expanded into a dict
# with the keys 'text' and 'category'. A few jokes occur more than once in the
# list (for example the "fake noodle" joke); the repeats are kept as they are.
jokes = [
    {'text': joke_text, 'category': joke_category}
    for joke_text, joke_category in (
        ('Why do cats make terrible storytellers? Because they only have one tail.', 'cat'),
        ('What did the cat say when he lost all his money? I am paw.', 'cat'),
        ('Why don\'t cats play poker in the jungle? Too many cheetahs.', 'cat'),
        ('Why did the tomato turn red? Because it saw the salad dressing!', 'vegetable'),
        ('Why did the scarecrow win an award? Because he was outstanding in his field.', 'farm'),
        ('Why did the hipster burn his tongue? Because he drank his coffee before it was cool.', 'hipster'),
        ('Why did the tomato turn red? Because it saw the salad dressing!', 'food'),
        ('Why did the scarecrow win an award? Because he was out-standing in his field!', 'puns'),
        ('What do you call a fake noodle? An impasta!', 'food'),
        ('What do you call a belt made out of watches? A waist of time!', 'puns'),
        ('Why did the math book look sad? Because it had too many problems!', 'math'),
        ('Why did the gym close down? It just didn\'t work out!', 'exercise'),
        ('Why don\'t scientists trust atoms? Because they make up everything!', 'science'),
        ('What do you call a fake noodle? An impasta!', 'food'),
        ('Why did the chicken cross the playground? To get to the other slide!', 'kids'),
        ('Why did the frog call his insurance company? He had a jump in his car!', 'puns'),
    )
]

In [23]:
# Load the BERT tokenizer and model.
# The pretrained 'bert-base-uncased' checkpoint is used for both tokenization
# and embedding generation; 'uncased' means that upper and lower case are not
# distinguished.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModel.from_pretrained('bert-base-uncased')

# Use BERT to generate an embedding for every joke in the jokes list and
# store it on the joke itself.
for joke in jokes:
    text = joke['text']  # The joke text to embed.

    # Tokenize the text into tensors.
    # padding=True pads the input to a common length and truncation=True cuts
    # input that exceeds the maximum length.
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)

    # Run the model with gradient computation disabled to save memory.
    with torch.no_grad():
        token_states = model(**inputs).last_hidden_state

    # Mean pooling: average over dim 1, drop the leading dimension and
    # convert the result to a NumPy array.
    output = token_states.mean(dim=1).squeeze(0).numpy()

    # Keep the embedding as a plain list on the joke.
    joke['embedding'] = output.tolist()


In [None]:
# Index the joke data into Elasticsearch.
# The position in the list is used as the document id, so re-running this
# cell overwrites the documents already indexed instead of adding another
# copy of every joke.
for doc_id, joke in enumerate(jokes):
    es.index(index='jokes-index', id=str(doc_id), body=joke)

# Refresh the index so the documents just written are visible to the search
# that follows; otherwise they only become searchable after the next periodic
# refresh of the index.
es.indices.refresh(index='jokes-index')

# Build the query vector.
# The query text is defined here and converted into a vector with BERT, using
# the same tokenizer settings and mean pooling as for the indexed jokes.
query = "What do you get when you cross a snowman and a shark?"
inputs = tokenizer(query, return_tensors='pt', padding=True, truncation=True)

# Gradients are not needed for inference.
with torch.no_grad():
    query_states = model(**inputs).last_hidden_state

# Average over dim 1, drop the leading dimension and convert to NumPy.
output = query_states.mean(dim=1).squeeze(0).numpy()
query_vector = output



In [25]:
# Define the Elasticsearch kNN search query.
knn_clause = {
    "field": "embedding",
    "query_vector": query_vector.tolist(),
    "k": 3,
    "num_candidates": 100,
}
search = {"knn": knn_clause, "fields": ["text"]}

# Run the kNN search and print the text of every returned hit.
response = es.search(index='jokes-index', body=search)
hits = response['hits']['hits']
for hit in hits:
    print(f"Joke: {hit['_source']['text']}")



Joke: What did the cat say when he lost all his money? I am paw.
Joke: What do you call a fake noodle? An impasta!
Joke: What do you call a fake noodle? An impasta!
