# Environment Setup

In [None]:
!pip install pandas numpy tqdm elasticsearch

In [24]:
# Information for Elasticsearch
# Each setting can be overridden through an environment variable of the same
# name, so credentials do not have to be edited into the notebook. The
# fallbacks are the values this notebook originally hard-coded.
import os

ELASTIC_USERNAME = os.environ.get("ELASTIC_USERNAME", "elastic")
ELASTIC_PASSWORD = os.environ.get("ELASTIC_PASSWORD", "skkudbp")
ELASTIC_PATH = os.environ.get("ELASTIC_PATH", "https://es01:9200")

In [26]:
import pandas as pd
import numpy as np
import json
from elasticsearch import Elasticsearch, helpers
from tqdm import tqdm

In [28]:
def connect_to_elastic() -> Elasticsearch:
    """Build an Elasticsearch client from the notebook-level settings.

    Uses ELASTIC_PATH as the host, ELASTIC_USERNAME / ELASTIC_PASSWORD for
    basic authentication, and the CA certificate at
    /home/jovyan/certs/ca/ca.crt.

    Returns:
        The constructed Elasticsearch client.
    """
    credentials = (ELASTIC_USERNAME, ELASTIC_PASSWORD)
    return Elasticsearch(
        ELASTIC_PATH,
        basic_auth=credentials,
        ca_certs="/home/jovyan/certs/ca/ca.crt",
    )


# Shared client used by every cell below.
client = connect_to_elastic()

In [29]:
# Name of the index that holds the movie-review documents.
INDEX_NAME = "week12_movie_review"

# Field mapping for the index: every field maps to {"type": <type>}.
# Built from (field, type) pairs; the pair order is the field order.
mapping = {
    field: {"type": field_type}
    for field, field_type in (
        ("movieId", "integer"),
        ("title", "text"),
        ("genres", "text"),
        ("imdbId", "integer"),
        ("tmdbId", "integer"),
        ("userId", "integer"),
        ("rating", "float"),
        ("timestamp", "date"),
    )
}

In [None]:
# Delete index if exists, then create it again with the mapping defined above.
# Ignored status codes are a transport option: in the 8.x client they are set
# through client.options(); passing `ignore=` to the API method is deprecated.
client.options(ignore_status=[400, 404]).indices.delete(index=INDEX_NAME)
if not client.indices.exists(index=INDEX_NAME):
    client.indices.create(index=INDEX_NAME, body={'mappings': {'properties': mapping}})

In [31]:
# Load data
# JSON text is UTF-8; state the encoding explicitly so that decoding does not
# depend on the locale default of the machine running the notebook.
with open("/home/jovyan/work/week12/data/movie_data.json", "r", encoding="utf-8") as f:
    movie_json = json.load(f)

In [32]:
# Insert data
# NOTE(review): exercise placeholder — the loop body below is only `pass`, so
# running this cell unchanged sends nothing to Elasticsearch. The section
# between the EDIT HERE markers is presumably meant to index each `document`
# into INDEX_NAME (possibly using `id_doc` as the document id) — confirm.
for id_doc, document in enumerate(tqdm(movie_json)):
######## EDIT HERE ########
    pass
######## EDIT HERE ########

100%|██████████| 1000/1000 [00:06<00:00, 164.53it/s]


### Multi match

In [34]:
# Search body for this exercise; empty until the section between the
# EDIT HERE markers is filled in.
query = {
######## EDIT HERE ########
}
######## EDIT HERE ########

# Run the search and print the movie id and user id stored in each hit.
response = client.search(index=INDEX_NAME, body=query)
for hit in response['hits']['hits']:
    # Label fixed: it previously read "movidId".
    print("movieId: ", hit['_source']['movieId'], "/ userId: ", hit['_source']['userId'])

movidId:  3256 / userId:  165
movidId:  165 / userId:  46


### Match phrase

In [37]:
# Search body for this exercise; empty until the section between the
# EDIT HERE markers is filled in.
query = {
######## EDIT HERE ########
}
######## EDIT HERE ########

# Execute the search, then print the title of every returned hit.
response = client.search(index=INDEX_NAME, body=query)
hits = response['hits']['hits']
for hit in hits:
    print(hit['_source']['title'])

Toy Story (1995)
Toy Story (1995)
Toy Story (1995)
Toy Story 2 (1999)


### Query String

In [None]:
# Search body for this exercise; empty until the section between the
# EDIT HERE markers is filled in.
query = {
######## EDIT HERE ########
}
######## EDIT HERE ########

# Execute the search, then print the title of every returned hit.
response = client.search(index=INDEX_NAME, body=query)
hits = response['hits']['hits']
for hit in hits:
    print(hit['_source']['title'])

### Should

In [38]:
# Search body for this exercise; empty until the section between the
# EDIT HERE markers is filled in.
query = {
######## EDIT HERE ########
}
######## EDIT HERE ########

# Execute the search, then print the title of every returned hit.
response = client.search(index=INDEX_NAME, body=query)
hits = response['hits']['hits']
for hit in hits:
    print(hit['_source']['title'])

Philadelphia Story, The (1940)
Straight Story, The (1999)
Everything or Nothing: The Untold Story of 007 (2012)


## Aggregations

### Avg

In [39]:
# Aggregation body for this exercise; empty until the section between the
# EDIT HERE markers is filled in.
query = {
######## EDIT HERE ########
}
######## EDIT HERE ########

# Run the request and print the value of the aggregation named "avg_rating".
response = client.search(index=INDEX_NAME, body=query)
avg_rating = response['aggregations']['avg_rating']
print(avg_rating['value'])

3.4835


### Max / min

In [40]:
# Aggregation body for this exercise; empty until the section between the
# EDIT HERE markers is filled in.
query = {
######## EDIT HERE ########
}
######## EDIT HERE ########

# Run the request; the final expression displays the "latest_review"
# aggregation result as the cell output.
response = client.search(index=INDEX_NAME, body=query)
aggregations = response['aggregations']
aggregations['latest_review']

{'value': 1537109198000.0, 'value_as_string': '2018-09-16T14:46:38.000Z'}

### Sum

In [42]:
# Query + aggregation body for this exercise; empty until the section between
# the EDIT HERE markers is filled in.
query = {
######## EDIT HERE ########
}
######## EDIT HERE ########

# Print title and rating of each hit, then display the value of the
# "sum_of_ratings" aggregation as the cell output.
response = client.search(index=INDEX_NAME, body=query)
for hit in response['hits']['hits']:
    source = hit['_source']
    print(source['title'], "/", source['rating'])
response['aggregations']['sum_of_ratings']['value']

Toy Story (1995) / 3.0
Toy Story (1995) / 3.0
Toy Story (1995) / 3.0
Toy Story 2 (1999) / 5.0


14.0

## BULK

In [57]:
# Load data for bulk
# JSON text is UTF-8; state the encoding explicitly so that decoding does not
# depend on the locale default of the machine running the notebook.
with open("/home/jovyan/work/week12/data/movie_data_big.json", "r", encoding="utf-8") as f:
    movie_json = json.load(f)

# Switch the following cells to the "_helper" index. Guarded so that
# re-running this cell does not append the suffix again ("..._helper_helper").
if not INDEX_NAME.endswith('_helper'):
    INDEX_NAME += '_helper'

In [58]:
# Collect the actions that are sent with a single helpers.bulk call below.
# NOTE(review): exercise placeholder — the loop body is only `pass`, so `docs`
# stays empty unless the section between the EDIT HERE markers is filled in.
# A later cell reads docs[0]['_source'], so each entry presumably needs a
# '_source' key — confirm.
docs = []
for id_doc, document in enumerate(tqdm(movie_json)):
######## EDIT HERE ########
    pass
######## EDIT HERE ########
res = helpers.bulk(client, docs)

100%|██████████| 100789/100789 [00:00<00:00, 2099109.72it/s]


In [59]:
# Index the first loaded record under the explicit id '0'. The recorded output
# shows result 'updated' with _version 2, i.e. a document with this id already
# existed in the index when the cell was run.
client.index(index=INDEX_NAME, body=movie_json[0], id='0')

ObjectApiResponse({'_index': 'week12_movie_review_helper', '_id': '0', '_version': 2, 'result': 'updated', '_shards': {'total': 2, 'successful': 2, 'failed': 0}, '_seq_no': 100789, '_primary_term': 1})

In [61]:
# `upd` is taken from docs[0]['_source'] without copying, so the assignment on
# the next line changes that same object in place (assuming '_source' holds a
# mutable mapping — confirm).
upd = docs[0]['_source']
upd['rating'] = 3.0

######## EDIT HERE ########
# NOTE(review): exercise placeholders — the four dicts below are empty and are
# presumably meant to hold one bulk action each (delete / create / index /
# update), as their names suggest — confirm.
delete_doc = {
}

create_doc = {
}

index_doc = {
}

update_doc = {
}
######## EDIT HERE ########

In [62]:
delete_actions = []
index_actions = []

# NOTE(review): exercise placeholder — the loop body is only `pass`, so both
# lists stay empty unless the section between the EDIT HERE markers is filled in.
for i in range(10):
######## EDIT HERE ########
    pass
######## EDIT HERE ########
# delete_actions is concatenated twice. The recorded output of the next cell
# reports 'not_found' (status 404) for delete items, which is consistent with
# the repeated deletes failing on purpose — confirm this is intended.
actions = delete_actions + delete_actions + index_actions

In [63]:
# Send all actions in one bulk call with raise_on_error disabled, then print
# the returned value (in the recorded run: a count followed by a list of the
# items that failed).
response = helpers.bulk(
    client,
    actions,
    raise_on_error=False,
)
print(response)

(20, [{'delete': {'_index': 'week12_movie_review_helper', '_id': '0', '_version': 4, 'result': 'not_found', '_shards': {'total': 2, 'successful': 2, 'failed': 0}, '_seq_no': 100800, '_primary_term': 1, 'status': 404}}, {'delete': {'_index': 'week12_movie_review_helper', '_id': '1', '_version': 3, 'result': 'not_found', '_shards': {'total': 2, 'successful': 2, 'failed': 0}, '_seq_no': 100801, '_primary_term': 1, 'status': 404}}, {'delete': {'_index': 'week12_movie_review_helper', '_id': '2', '_version': 3, 'result': 'not_found', '_shards': {'total': 2, 'successful': 2, 'failed': 0}, '_seq_no': 100802, '_primary_term': 1, 'status': 404}}, {'delete': {'_index': 'week12_movie_review_helper', '_id': '3', '_version': 3, 'result': 'not_found', '_shards': {'total': 2, 'successful': 2, 'failed': 0}, '_seq_no': 100803, '_primary_term': 1, 'status': 404}}, {'delete': {'_index': 'week12_movie_review_helper', '_id': '4', '_version': 3, 'result': 'not_found', '_shards': {'total': 2, 'successful': 2,

## SCAN

In [65]:
# Query body handed to helpers.scan in the next cell.
# NOTE(review): exercise placeholder — the dict is empty until the section
# between the EDIT HERE markers is filled in.
query = {
######## EDIT HERE ########
}
######## EDIT HERE ########

In [66]:
# Iterate over every document matching `query` via helpers.scan and count
# them; `page_size` is passed as the scan's `size` argument.
page_size = 10000
scroll = helpers.scan(client, query=query, index=INDEX_NAME, scroll='1m', size=page_size)
count = sum(1 for _ in scroll)
print(count)

6781
