## Export CSV files to Elasticsearch

Change input_file_name, elastic_endpoint, index_name, max_number, batch_size variables here:

To access notebook via ssh: `$ ssh -N -L 8888:localhost:8888 {user}@{server_ip}`

In [2]:
import os
import csv
import json
import sys
import requests
from pprint import pprint
import psycopg2
import psycopg2.extras  

In [8]:

os.getenv('RGPASS')

''

In [80]:
username = 'admin'
password = os.getenv('RGPASS')
DSN = os.getenv('RGDSN')
DSN = 'host=rg-db-prod port=5432 dbname=rgdb user=root password=rosgas2011 sslmode=disable'
print(DSN)
# input directory
source_dir = "/home/jovyan/work/csv/"

# elastic endpoint
elastic_endpoint = "http://rg-corpus-caddy:8080/elasticsearch/"




def save_batch(lines: list, elastic_endpoint:str, index_name:str):
    """saves batch of lines to database"""
    data = '\n'.join(lines)+'\n'
#     print(data)
    return
#     print('----------------------------------')
    r = requests.post(f'{elastic_endpoint}{index_name}/_bulk', 
                      headers = {'Content-Type': 'application/x-ndjson; charset=UTF-8'}, 
                      auth=(username,password),
                      data=data.encode('utf-8'))
    try:
        rjson=r.json()
        if rjson.get('errors') is not False:
            pprint(rjson)
    except:
        pprint(r)
        
#     lines.clear()
    
    
def save_csv_file_to_elastic(input_file_name: str, elastic_endpoint:str, index_name:str,  max_number=0, batch_size=1000):
    """Saves CSV file to elasticsearch
    - input_file_name - name of CSV file
    - max_number - max number of records to save
    - batch_size  - number of records in a batch 
    """
    # to process long fields in CSS file
    csv.field_size_limit(sys.maxsize)
    
    
    counter =0    # aka record id 
    lines =[]     # list of text lines to save
    
    with open(input_file_name) as input_file:
        reader = csv.DictReader(input_file)

        for row in reader:
            if counter >= max_number: break
            lines.append('{ "index" : {"_id" : "'+str(counter)+'" }}')
            lines.append(json.dumps(row, ensure_ascii=False))
            counter += 1
            if counter % batch_size ==0:
                print(f'counter = {counter}----------------')
                save_batch(lines, elastic_endpoint, index_name)
                lines.clear()
                
        print(f'counter = {counter}----------------')
        save_batch(lines, elastic_endpoint, index_name)
        lines.clear()

def save_table_to_elastic(table_name: str, idname: str, elastic_endpoint:str, index_name:str,  max_number=0, batch_size=1000):
    """Saves table to elasticsearch
    - table_name - name of postgres table
    - max_number - max number of records to save
    - batch_size  - number of records in a batch 
    """
    # to process long fields in CSS file
    csv.field_size_limit(sys.maxsize)
    
    
    counter =0    # aka record id 
    lines =[]     # list of text lines to save
    
    

    try:
        conn = psycopg2.connect(DSN)
        with conn:
            with conn.cursor('servercursor') as curs:
                curs.execute(f"SELECT {idname}, row_to_json(r,FALSE)::text FROM {table_name} r LIMIT {max_number}")
#                 curs.itersize=4000
                for record in curs:
                    if counter >= max_number: break
                    elastic_id = counter if record[0] is None else record[0]
                    lines.append('{"index" : {"_id" : "'+str(elastic_id)+'"}}')
                    lines.append(record[1])
                    counter += 1
                    if counter % batch_size ==0:
                        print(f'counter = {counter}----------------')
                        save_batch(lines, elastic_endpoint, index_name)
                        lines.clear()
                print(f'counter = {counter}----------------')
                save_batch(lines, elastic_endpoint, index_name)
                lines.clear()
       
    except Exception as ex:
        print(ex)
    finally:
        conn.close()    
    

# save_table_to_elastic('rubrics','id', elastic_endpoint, 'rubrics', 16 , 3)        

host=rg-db-prod port=5432 dbname=rgdb user=root password=rosgas2011 sslmode=disable


## Проверки соединений

In [67]:
# проверки
r = requests.get(elastic_endpoint, auth=(username, password))
display(r.status_code)


401

## Импорт таблиц в Эластик 

In [76]:
%%time

save_table_to_elastic('rubrics', 'id', elastic_endpoint, 'rubrics', 2000 , 500)

counter = 1153----------------
CPU times: user 4.04 ms, sys: 6.93 ms, total: 11 ms
Wall time: 23.9 ms


In [79]:
%%time

save_table_to_elastic('rubrics_objects', 'NULL', elastic_endpoint, 'rubrics_objects', 5000000 , 10000)

counter = 3053116----------------
CPU times: user 2.37 s, sys: 285 ms, total: 2.66 s
Wall time: 9.46 s


In [None]:
%%time
# Dockertest Wall time: 21min 43s 
save_table_to_elastic('articles', 'obj_id', elastic_endpoint, 'articles', 1250000 , 5000)