Экспорт таблиц из Postgres в Elasticsearch
========================

Таблицы копируются как есть, без изменений в базе данных Postgres.


To access notebook via ssh: `$ ssh -N -L 8888:localhost:8888 {user}@{server_ip}`

In [1]:
import os
import csv
import json
import sys
import requests
from pprint import pprint
import psycopg2
import psycopg2.extras  
import ipywidgets as widgets
import time

In [2]:
print(os.getenv('RGDSN'))
print(os.getenv('RGPASS'))

host=rg-db-prod port=5432 dbname=rgdb user=root password=rosgas2011 sslmode=disable
rosgas2011


In [3]:
RGUSER = 'admin'
RGPASS = os.getenv('RGPASS')
RGDSN = os.getenv('RGDSN')
# input directory
source_dir = "/home/jovyan/work/csv/"

# elastic endpoint
ELASTIC_ENDPOINT = "http://rg-corpus-caddy:8080/elasticsearch/"




def save_batch(lines: list, elastic_endpoint:str, index_name:str):
    """saves batch of lines to database"""
    data = '\n'.join(lines)+'\n'
    r = requests.post(f'{elastic_endpoint}{index_name}/_bulk', 
                      headers = {'Content-Type': 'application/x-ndjson; charset=UTF-8'}, 
                      auth=(RGUSER,RGPASS),
                      data=data.encode('utf-8'))
    try:
        rjson=r.json()
        if rjson.get('errors') is not False:
            pprint(rjson)
    except:
        pprint(r)
        
    
def set_progress(p1,p2, val):
    if p1 is not None:
        p1.value = val
    if p2 is not None:
        p2.value = str(val)

def save_table_to_elastic(table_name: str, idname: str, elastic_endpoint:str, index_name:str,  max_number=0, batch_size=1000):
    """Копировать таблицу в elasticsearch, как есть, без изменений в базе данных Postgres.

    - table_name - name of postgres table
    - max_number - max number of records to save
    - batch_size  - number of records in a batch 
    """
    p1 = widgets.IntProgress(min=0, max=max_number) 
    p2 = widgets.Label()
    box = widgets.HBox([p1,p2])
    display(box)
    
    start = time.time()
    counter =0    # aka record id 
    lines =[]     # list of text lines to save
    conn = psycopg2.connect(RGDSN)
    try:
        with conn:
            with conn.cursor('servercursor') as curs:
                curs.execute(f"SELECT {idname}, row_to_json(r,FALSE)::text FROM {table_name} r LIMIT {max_number}")
#                 curs.itersize=4000
                for record in curs:
                    if counter >= max_number: break
                    elastic_id = counter if record[0] is None else record[0]
                    lines.append('{"index" : {"_id" : "'+str(elastic_id)+'"}}')
                    lines.append(record[1])
                    counter += 1
                    if counter % batch_size ==0:
                        duration = (time.time()-start)/60
#                         print(f'counter = {counter}----------------')
#                         set_progress(p1,p2, counter)
                        p1.value=counter; p2.value = f'{counter}/{max_number}. Время {duration:.2f} мин'
                        save_batch(lines, elastic_endpoint, index_name)
                        lines.clear()
#                 print(f'counter = {counter}----------------')
#                 set_progress(p1,p2, counter)
                p1.value=counter; p2.value = f'{counter}/{max_number}. Время {duration:.2f} мин'
                save_batch(lines, elastic_endpoint, index_name)
                lines.clear()
       
    except Exception as ex:
        print(ex)
    finally:
        conn.close()    
    


## Проверки соединений

In [4]:
# проверки
r = requests.get(ELASTIC_ENDPOINT, auth=(RGUSER, RGPASS))
display("Есть контакт!" if r.status_code==200 else r.status_code)


'Есть контакт!'

## Импорт таблиц в Эластик 

In [5]:
save_table_to_elastic('rubrics', 'id', ELASTIC_ENDPOINT, 'rubrics', 2000 , 500)

HBox(children=(IntProgress(value=0, max=2000), Label(value='')))

In [6]:
save_table_to_elastic('rubrics_objects', "kind || '-' || rubric_id || '-' || object_id", ELASTIC_ENDPOINT, 'rubrics_objects', 3500000 , 10000)

HBox(children=(IntProgress(value=0, max=3500000), Label(value='')))

In [7]:
save_table_to_elastic('articles', 'obj_id', ELASTIC_ENDPOINT, 'articles', 1250000 , 5000)

HBox(children=(IntProgress(value=0, max=1250000), Label(value='')))