## キーワード検索

In [None]:
import elasticsearch, json
import pandas as pd

# Connection and paging settings for the Elasticsearch index.
HOST = "192.168.1.31"
PORT = "9200"
INDEX_NAME = "mlhw"
SCROLL_TIME = '1m'  # passed as the `scroll` keep-alive on every request
SCROLL_SIZE = 10    # number of hits requested per page
pd.set_option("display.max_colwidth", 100)

search_word = "カバー"

client = elasticsearch.Elasticsearch(HOST + ":" + PORT)


def _hit_to_row(hit):
    """Flatten one search hit into the row shape used by the result table."""
    source = hit['_source']
    return {
        "id": hit['_id'],
        "category": source['業種(大分類)'],
        "sentence": source['文章'],
    }


# Full-text match on the sentence field, paged through with the scroll API.
results = []
scroll_response = client.search(
    index=INDEX_NAME,
    scroll=SCROLL_TIME,
    size=SCROLL_SIZE,
    body={"query": {"match": {"文章": search_word}}},
)
sid = scroll_response['_scroll_id']
try:
    scroll_results = scroll_response['hits']['hits']
    # An empty page marks the end of the result set.
    while scroll_results:
        results.extend(_hit_to_row(hit) for hit in scroll_results)
        scroll_response = client.scroll(scroll_id=sid, scroll=SCROLL_TIME)
        sid = scroll_response['_scroll_id']
        scroll_results = scroll_response['hits']['hits']
finally:
    # Release the scroll context explicitly instead of leaving it open
    # until the keep-alive expires.
    client.clear_scroll(scroll_id=sid)

df = pd.DataFrame(results)
df

## 類似検索したい文書を指定する

In [None]:
# Pick one row of the keyword-search result table to use as the query
# document for the similarity search. Rows are referenced by label,
# counted from 0.
target_id = 10
target = df.loc[target_id]
target

## 類似する文書を検索し、トップ10を表示する

In [None]:
# Fetch the stored document of the selected row to obtain its vector.
# The document id is already known, so it is retrieved directly by id
# rather than through a scored search.
target_doc_id = target['id']
target_doc = client.get(index=INDEX_NAME, id=target_doc_id)
target_vector = target_doc['_source']['scdv_vector']

# Score every document by cosine similarity to the target vector.
# Adding 1.0 shifts the cosine range so the score is never negative.
script_query = {
    "script_score": {
        "query": {"match_all": {}},
        "script": {
            "source": "cosineSimilarity(params.query_vector, doc['scdv_vector']) + 1.0",
            "params": {"query_vector": target_vector}
        }
    }
}

# Search for similar documents.
search_response = client.search(
    index=INDEX_NAME,
    body={
        "size": SCROLL_SIZE,
        "query": script_query,
        "_source": {"includes": ["_id", "業種(大分類)", "文章"]}
    }
)

# Reshape the hits for display. The first hit is expected to be the
# target document itself.
responses = search_response['hits']['hits']
similar = [
    {
        'ID': hit['_id'],
        'SCORE': hit['_score'],
        "分類": hit['_source']['業種(大分類)'],
        "文章": hit['_source']['文章'],
    }
    for hit in responses
]

sim_pd = pd.DataFrame(similar)

sim_pd