# Exemplo de carga em massa para o Elasticsearch

### Instalando lib python

In [1]:
!pip install elasticsearch

Collecting elasticsearch
  Downloading elasticsearch-7.16.2-py2.py3-none-any.whl (385 kB)
[K     |████████████████████████████████| 385 kB 1.3 MB/s eta 0:00:01
Installing collected packages: elasticsearch
Successfully installed elasticsearch-7.16.2


### Importando dependências

In [3]:
import os
from getpass import getpass

import numpy as np
import pandas as pd
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

### Configurações de conexão com o host do Elasticsearch

In [5]:
# Connection settings for the Elasticsearch node this notebook loads into.
host = '127.0.0.1'
port = '9200'
user = 'elastic'

# Credentials must never be stored in the notebook. The password is taken from
# the ES_PASSWORD environment variable; when that is unset or empty the user is
# prompted for it (set the variable for non-interactive runs, where no prompt
# is available).
password = os.environ.get('ES_PASSWORD') or getpass(
    f"Password for Elasticsearch user '{user}': "
)

# Client shared by every cell below.
es = Elasticsearch(hosts=f"http://{user}:{password}@{host}:{port}/")

### Ler CSV
Fonte: https://www.kaggle.com/johnharshith/hollywood-theatrical-market-synopsis-1995-to-2021?select=HighestGrossers.csv

In [20]:
# Read the comma-separated dataset from the notebook's working directory and
# preview the first rows.
csv_path = 'HighestGrossers.csv'
df = pd.read_csv(csv_path, sep=',')
df.head()

Unnamed: 0,YEAR,MOVIE,GENRE,MPAA RATING,DISTRIBUTOR,TOTAL FOR YEAR,TOTAL IN 2019 DOLLARS,TICKETS SOLD
0,1995,Batman Forever,Drama,PG-13,Warner Bros.,"$184,031,112","$387,522,978",42306002
1,1996,Independence Day,Adventure,PG-13,20th Century Fox,"$306,169,255","$634,504,608",69269062
2,1997,Men in Black,Adventure,PG-13,Sony Pictures,"$250,650,052","$500,207,943",54607854
3,1998,Titanic,Adventure,PG-13,Paramount Pictures,"$443,319,081","$865,842,808",94524324
4,1999,Star Wars Ep. I: The Phantom Menace,Adventure,PG,20th Century Fox,"$430,443,350","$776,153,749",84732942


### Tratando dados

In [21]:
def clean_currency(x):
    """Strip currency formatting from a value.

    If ``x`` is a string, every ``$`` and ``,`` character is removed and the
    resulting string is returned (e.g. ``"$184,031,112"`` -> ``"184031112"``).
    Any non-string value is returned unchanged.
    """
    if not isinstance(x, str):
        return x
    # Delete both characters in a single pass over the string.
    return x.translate(str.maketrans('', '', '$,'))

# Columns to turn into floats. String values have their "$" and "," removed by
# clean_currency first; values that are not strings pass through untouched.
to_convert = ['TOTAL FOR YEAR', 'TOTAL IN 2019 DOLLARS', 'TICKETS SOLD']
for col in to_convert:
    cleaned = df[col].apply(clean_currency)
    df[col] = cleaned.astype('float')
df.head()

Unnamed: 0,YEAR,MOVIE,GENRE,MPAA RATING,DISTRIBUTOR,TOTAL FOR YEAR,TOTAL IN 2019 DOLLARS,TICKETS SOLD
0,1995,Batman Forever,Drama,PG-13,Warner Bros.,184031112.0,387522978.0,42306002.0
1,1996,Independence Day,Adventure,PG-13,20th Century Fox,306169255.0,634504608.0,69269062.0
2,1997,Men in Black,Adventure,PG-13,Sony Pictures,250650052.0,500207943.0,54607854.0
3,1998,Titanic,Adventure,PG-13,Paramount Pictures,443319081.0,865842808.0,94524324.0
4,1999,Star Wars Ep. I: The Phantom Menace,Adventure,PG,20th Century Fox,430443350.0,776153749.0,84732942.0


### Criando índice com mapping

In [24]:
INDEX = "index-hollywood"
# NOTE(review): TYPE is not referenced by any code visible in this notebook;
# kept in case something else relies on it.
TYPE = "highestgrossers"

# Drop the index if it already exists so the cell can be re-run from scratch;
# 400/404 responses (e.g. index not found) are ignored instead of raising.
es.indices.delete(index=INDEX, ignore=[400, 404])

# Explicit mapping. "dynamic": "strict" makes Elasticsearch reject documents
# that contain a field not declared here.
mappings = {
  "dynamic": "strict",
  "properties": {
    "YEAR": {
      "type": "integer"
    },
    # Full-text searchable title, plus an exact-match "keyword" sub-field.
    "MOVIE": {
      "type": "text",
      "fields": {
        "keyword": {
          "type": "keyword",
          "ignore_above": 256
        }
      }
    },
    "GENRE": {
      "type": "keyword"
    },
    "MPAA RATING": {
      "type": "keyword"
    },
    "DISTRIBUTOR": {
      "type": "keyword"
    },
    # The numeric columns hold values in the hundreds of millions. A 32-bit
    # "float" cannot represent integers above 2**24 exactly, so they are
    # mapped as 64-bit "double" to keep the indexed values precise.
    "TOTAL FOR YEAR": {
      "type": "double"
    },
    "TOTAL IN 2019 DOLLARS": {
      "type": "double"
    },
    "TICKETS SOLD": {
      "type": "double"
    },
  }
}

# Create the new index. A 400 response is returned as the cell output rather
# than raised.
es.indices.create(index=INDEX, ignore=400, mappings=mappings)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'index-hollywood'}

### Importando para o Elasticsearch

In [25]:
%%time
def rec_to_actions(df):
    """Lazily turn each row of ``df`` into a bulk-helper action.

    NaN cells are replaced with ``None`` before the frame is converted to a
    list of per-row dicts. Each row dict is then yielded wrapped as
    ``{'_source': row}``.
    """
    rows = df.replace({np.nan:None}).to_dict(orient="records")
    yield from ({'_source': row} for row in rows)

# Stream the generated actions to the cluster; INDEX is passed as the index
# for the bulk request. The helper's return value is shown as the cell output.
actions = rec_to_actions(df)
bulk(es, actions, index=INDEX)

CPU times: user 2.16 ms, sys: 668 µs, total: 2.83 ms
Wall time: 6.94 ms


(27, [])