## Export CSV files to Elasticsearch

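Before running, set the `input_file_name`, `elastic_endpoint`, `index_name`, `max_number` and `batch_size` values in the cells below. The Elasticsearch password is read from the `RGPASS` environment variable.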

In [1]:
import os
import csv
import json
import sys
import requests
from pprint import pprint

In [3]:
# Credentials passed as `auth=(username, password)` on every bulk request.
# The password is not stored in the notebook: it is taken from the RGPASS
# environment variable and is None when that variable is not set.
username, password = 'admin', os.environ.get('RGPASS')


def save_batch(lines: list, elastic_endpoint:str, index_name:str):
    """Send one batch of NDJSON lines to the Elasticsearch ``_bulk`` endpoint.

    - lines - list of newline-terminated strings forming the bulk request
      body; the list is emptied in place once the request has been made so
      the caller can keep appending to the same object
    - elastic_endpoint - base URL; it is concatenated directly with
      ``index_name``, so it must end with ``/``
    - index_name - name of the target index

    Returns None. Problems reported by the server are printed rather than
    raised; exceptions raised by ``requests.post`` itself propagate.
    Uses the module-level ``username`` / ``password`` for authentication.
    """
    if not lines:
        # An empty body is not a valid bulk request, so there is nothing to send.
        return

    payload = ''.join(lines).encode('utf-8')
    r = requests.post(f'{elastic_endpoint}{index_name}/_bulk',
                      headers={'Content-Type': 'application/x-ndjson; charset=UTF-8'},
                      auth=(username, password),
                      data=payload)
    try:
        rjson = r.json()
    except ValueError:
        # The body is not JSON (the JSON decode errors are ValueError
        # subclasses). Show the response object and the start of its text
        # so the failure can be diagnosed.
        pprint(r)
        print(r.text[:500])
    else:
        # A successful bulk response is an object with "errors": false;
        # anything else is printed in full for inspection.
        if not isinstance(rjson, dict) or rjson.get('errors') is not False:
            pprint(rjson)

    lines.clear()
    
    
def save_csv_file_to_elastic(input_file_name: str, elastic_endpoint:str, index_name:str,  max_number=0, batch_size=1000):
    """Saves CSV file to elasticsearch

    Every CSV row (keyed by the header row) becomes one JSON document whose
    ``_id`` is the zero-based position of the row in the file. Rows are sent
    with ``save_batch`` in groups of ``batch_size``; a progress line is
    printed before each batch and once more at the end.

    - input_file_name - name of CSV file (read as UTF-8)
    - elastic_endpoint - base URL handed to ``save_batch``
    - index_name - target index handed to ``save_batch``
    - max_number - max number of records to save; 0 (the default) or a
      negative value means "no limit"
    - batch_size  - number of records in a batch
    """
    # Raise the csv field size limit so very long fields in the CSV file can
    # be read. sys.maxsize does not fit into a C long on every platform, in
    # which case field_size_limit raises OverflowError; halve until accepted.
    field_limit = sys.maxsize
    while True:
        try:
            csv.field_size_limit(field_limit)
            break
        except OverflowError:
            field_limit >>= 1

    counter = 0    # records queued so far; also used as the record id
    lines = []     # pending bulk lines: one action line + one document line per record

    # newline='' is what the csv module requires so that line breaks inside
    # quoted fields are preserved; the encoding is explicit so the result
    # does not depend on the platform default.
    with open(input_file_name, newline='', encoding='utf-8') as input_file:
        reader = csv.DictReader(input_file)

        for row in reader:
            # Only enforce the cap when one was actually requested.
            if max_number > 0 and counter >= max_number:
                break
            lines.append(json.dumps({"index": {"_id": str(counter)}}) + '\n')
            lines.append(json.dumps(row, ensure_ascii=False) + '\n')
            counter += 1
            if counter % batch_size == 0:
                print(f'counter = {counter}----------------')
                save_batch(lines, elastic_endpoint, index_name)

    print(f'counter = {counter}----------------')
    # Send the remainder, if any; skip the request when the last full batch
    # already emptied the list or the file had no rows.
    if lines:
        save_batch(lines, elastic_endpoint, index_name)


Saving to **Ilmira**'s computer

In [None]:
%%time

# Path of the CSV file to load
input_file_name = "/Volumes/ssd/dumps/articles_i.csv"

# Base URL of the Elasticsearch endpoint (keep the trailing "/")
elastic_endpoint = "http://134.0.107.93:9094/elasticsearch/"

# Upper bound on the number of records to save
max_number = 1_210_000

# Number of records sent per batch
batch_size = 5_000

# NOTE: index_name is not defined in this cell; define it before enabling the call below.
# save_csv_file_to_elastic(input_file_name, elastic_endpoint, index_name, max_number, batch_size)

Saving to **Azure**

In [5]:
%%time

# Directory holding the CSV dumps (keep the trailing "/": file names are appended directly)
source_dir = "/Volumes/ssd/dumps-2020-10-09/"

# Base URL of the Elasticsearch endpoint on Azure
azure_endpoint = "http://13.79.79.34:9094/elasticsearch/"

# Load at most 2000 records from rubrics.csv into the "rubrics" index, 500 per batch
save_csv_file_to_elastic(
    input_file_name=f'{source_dir}rubrics.csv',
    elastic_endpoint=azure_endpoint,
    index_name='rubrics',
    max_number=2_000,
    batch_size=500,
)

counter = 500----------------
counter = 1000----------------
counter = 1153----------------
CPU times: user 59.5 ms, sys: 11.6 ms, total: 71 ms
Wall time: 2.44 s


In [6]:
# Load at most 5,000,000 records from rubrics_objects.csv into the
# "rubrics_objects" index, 10,000 per batch
save_csv_file_to_elastic(
    input_file_name=f'{source_dir}rubrics_objects.csv',
    elastic_endpoint=azure_endpoint,
    index_name='rubrics_objects',
    max_number=5_000_000,
    batch_size=10_000,
)

counter = 10000----------------
counter = 20000----------------
counter = 30000----------------
counter = 40000----------------
counter = 50000----------------
counter = 60000----------------
counter = 70000----------------
counter = 80000----------------
counter = 90000----------------
counter = 100000----------------
counter = 110000----------------
counter = 120000----------------
counter = 130000----------------
counter = 140000----------------
counter = 150000----------------
counter = 160000----------------
counter = 170000----------------
counter = 180000----------------
counter = 190000----------------
counter = 200000----------------
counter = 210000----------------
counter = 220000----------------
counter = 230000----------------
counter = 240000----------------
counter = 250000----------------
counter = 260000----------------
counter = 270000----------------
counter = 280000----------------
counter = 290000----------------
counter = 300000----------------
counter = 310000---

In [7]:
%%time

# Load at most 1,250,000 records from articles.csv into the "articles"
# index, 5,000 per batch
save_csv_file_to_elastic(
    input_file_name=f'{source_dir}articles.csv',
    elastic_endpoint=azure_endpoint,
    index_name='articles',
    max_number=1_250_000,
    batch_size=5_000,
)

counter = 5000----------------
counter = 10000----------------
counter = 15000----------------
counter = 20000----------------
counter = 25000----------------
counter = 30000----------------
counter = 35000----------------
counter = 40000----------------
counter = 45000----------------
counter = 50000----------------
counter = 55000----------------
counter = 60000----------------
counter = 65000----------------
counter = 70000----------------
counter = 75000----------------
counter = 80000----------------
counter = 85000----------------
counter = 90000----------------
counter = 95000----------------
counter = 100000----------------
counter = 105000----------------
counter = 110000----------------
counter = 115000----------------
counter = 120000----------------
counter = 125000----------------
counter = 130000----------------
counter = 135000----------------
counter = 140000----------------
counter = 145000----------------
counter = 150000----------------
counter = 155000--------------