## Export CSV files to Elasticsearch

Change input_file_name, elastic_endpoint, index_name, max_number, batch_size variables here:

To access notebook via ssh: `$ ssh -N -L 8888:localhost:8888 {user}@{server_ip}`

In [1]:
!pip install psycopg2-binary


Collecting psycopg2-binary
  Downloading psycopg2_binary-2.8.6-cp38-cp38-manylinux1_x86_64.whl (3.0 MB)
[K     |████████████████████████████████| 3.0 MB 730 kB/s eta 0:00:01
[?25hInstalling collected packages: psycopg2-binary
Successfully installed psycopg2-binary-2.8.6


In [2]:
import os
import csv
import json
import sys
import requests
from pprint import pprint
import psycopg2
import psycopg2.extras  

In [4]:
username = 'admin'
password = os.getenv('RGPASS')
DSN = os.getenv('RGDSN')
print(DSN)
# input directory
source_dir = "/home/jovyan/work/csv/"

# elastic endpoint
elastic_endpoint = "http://rg-corpus-caddy:8080/elasticsearch/"



def save_batch(lines: list, elastic_endpoint:str, index_name:str):
    """saves batch of lines to database"""
    data = ''.join(lines)
#     print(data)
#     print('----------------------------------')
    r = requests.post(f'{elastic_endpoint}{index_name}/_bulk', 
                      headers = {'Content-Type': 'application/x-ndjson; charset=UTF-8'}, 
                      auth=(username,password),
                      data=data.encode('utf-8'))
    try:
        rjson=r.json()
        if rjson.get('errors') is not False:
            pprint(rjson)
    except:
        pprint(r)
        
    lines.clear()
    
    
def save_csv_file_to_elastic(input_file_name: str, elastic_endpoint:str, index_name:str,  max_number=0, batch_size=1000):
    """Saves CSV file to elasticsearch
    - input_file_name - name of CSV file
    - max_number - max number of records to save
    - batch_size  - number of records in a batch 
    """
    # to process long fields in CSS file
    csv.field_size_limit(sys.maxsize)
    
    
    counter =0    # aka record id 
    lines =[]     # list of text lines to save
    
    with open(input_file_name) as input_file:
        reader = csv.DictReader(input_file)

        for row in reader:
            if counter >= max_number: break
            lines.append('{ "index" : {"_id" : "'+str(counter)+'" } }\n')
            lines.append(json.dumps(row, ensure_ascii=False)+'\n')
            counter += 1
            if counter % batch_size ==0:
                print(f'counter = {counter}----------------')
                save_batch(lines, elastic_endpoint, index_name)
                
        print(f'counter = {counter}----------------')
        save_batch(lines, elastic_endpoint, index_name)

def save_table_to_elastic(table_name: str, elastic_endpoint:str, index_name:str,  max_number=0, batch_size=1000):
    """Saves table to elasticsearch
    - table_name - name of postgres table
    - max_number - max number of records to save
    - batch_size  - number of records in a batch 
    """
    # to process long fields in CSS file
    csv.field_size_limit(sys.maxsize)
    
    
    counter =0    # aka record id 
    lines =[]     # list of text lines to save
    
    
    conn = psycopg2.connect(DSN)
    try:
        # connection usage
        pass
    finally:
        conn.close()    
    
#     with open(input_file_name) as input_file:
#         reader = csv.DictReader(input_file)

#         for row in reader:
#             if counter >= max_number: break
#             lines.append('{ "index" : {"_id" : "'+str(counter)+'" } }\n')
#             lines.append(json.dumps(row, ensure_ascii=False)+'\n')
#             counter += 1
#             if counter % batch_size ==0:
#                 print(f'counter = {counter}----------------')
#                 save_batch(lines, elastic_endpoint, index_name)
                
#         print(f'counter = {counter}----------------')
#         save_batch(lines, elastic_endpoint, index_name)
        

None


## Проверки соединения и наличия csv файлов

In [40]:
# проверки
r = requests.get(elastic_endpoint, auth=(username, password))
display(r.status_code)
try: display(r.json())
except:pass
with open(source_dir+'articles.csv') as f:
    print('----------\n',f.readline())


200

{'name': '61eeb9f222df',
 'cluster_name': 'docker-cluster',
 'cluster_uuid': 'MfRelN7QTXedUsrgMLejWA',
 'version': {'number': '7.7.0',
  'build_flavor': 'default',
  'build_type': 'docker',
  'build_hash': '81a1e9eda8e6183f5237786246f6dced26a10eaf',
  'build_date': '2020-05-12T02:01:37.602180Z',
  'build_snapshot': False,
  'lucene_version': '8.5.1',
  'minimum_wire_compatibility_version': '6.8.0',
  'minimum_index_compatibility_version': '6.0.0-beta1'},
 'tagline': 'You Know, for Search'}

----------
 obj_id,announce,authors,date_modified,full_text,images,index_priority,is_active,is_announce,is_paid,link_title,links,obj_kind,projects,release_date,spiegel,title,uannounce,url,migration_status,process_status,lemmatized_text,entities_text,entities_grouped



## Импорт csv в Эластик 

In [3]:
%%time

# save_csv_file_to_elastic(source_dir+'rubrics.csv', elastic_endpoint, 'rubrics', 2000 , 500)

CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 7.39 µs


In [None]:
# save_csv_file_to_elastic(source_dir+'rubrics_objects.csv', elastic_endpoint, 'rubrics_objects', 5000000 , 10000)

In [None]:
%%time
# Dockertest Wall time: 21min 43s 
save_csv_file_to_elastic(source_dir+'articles.csv', elastic_endpoint, 'articles', 1250000 , 5000)