Количество записей в таблицах Postgres и статей в Elastic
--------------------

In [None]:
import pandas as pd
import psycopg2 as pg
from elasticsearch import Elasticsearch
# from pandasticsearch import Select
import os
import sys
import ipywidgets as w
import time
import matplotlib.pyplot as plt
import matplotlib as mpl
# import plotly.graph_objects as go

# Import seaborn
import seaborn as sns

# Use seaborn's default theme for all plots drawn below
sns.set_theme()

# PostgreSQL connection; the DSN comes from the RGDSN environment variable
pg_con = pg.connect(os.getenv('RGDSN'))

# Elasticsearch credentials: fixed user name, password from the RGPASS
# environment variable
RGUSER = 'admin'
RGPASS = os.getenv('RGPASS')

# A file named local-file.txt in the working directory selects the
# dockertest endpoint; without it the es01 host is used
if os.path.exists('local-file.txt'):
    ELASTIC_ENDPOINT = 'http://dockertest.rgwork.ru:9094/elasticsearch/'
else:
    ELASTIC_ENDPOINT = 'http://es01:9200/'
es_con = Elasticsearch(ELASTIC_ENDPOINT, http_auth=(RGUSER, RGPASS))


def get_counts():
    """Return row counts of the Postgres tables and the Elastic document count.

    Returns:
        dict mapping a human-readable label to a count. The Postgres entries
        come first, in the order listed below; the Elasticsearch ``articles``
        index count is the last entry.
    """
    # Label -> counting query, executed one by one against ``pg_con``
    pg_queries = {
        "PG: rubrics": 'SELECT count(1) FROM rubrics',
        "PG: rubrics_objects": 'SELECT count(1) FROM rubrics_objects',
        "PG: articles": 'SELECT count(1) FROM articles',
        "articles: migration_status = 'error'": "SELECT count(1) FROM articles WHERE migration_status = 'error'",
        "articles: process_status   = 'error'": "SELECT count(1) FROM articles WHERE process_status = 'error'",
        "articles: process_status IS NULL    ": "SELECT count(1) FROM articles WHERE process_status IS NULL",
        "articles: process_status   = 'success'": "SELECT count(1) FROM articles WHERE process_status = 'success'",
        "articles: elastic_status   = 'indexed'": "SELECT count(1) FROM articles WHERE elastic_status = 'indexed'",
    }

    totals = {}
    for label, sql in pg_queries.items():
        frame = pd.read_sql(sql, pg_con)
        # Each query yields a single row with a single "count" column
        totals[label] = frame['count'][0]

    totals["ES: articles"] = es_con.count(index='articles')['count']
    return totals

def display_counts():
    """Show the counts from ``get_counts`` as a one-column table."""
    table = pd.DataFrame.from_dict(
        get_counts(),
        orient='index',
        columns=['count'],
    )
    display(table)
    

In [None]:
%%time
# Show the current Postgres / Elasticsearch counts as a table
display_counts()

In [None]:

body = {
    "size": 0,
    "query": {
        "range": {
            "date_modified": {
                "from": "now-2d/h"
           }
        }
    },
    "aggs": {
        "articles_over_time": {
            "date_histogram": {
                "field": "date_modified",
                "calendar_interval": "hour"
            }
        }
    }
}


result = es_con.search(body,'articles')
buckets = result['aggregations']['articles_over_time']['buckets']
df = pd.DataFrame.from_dict(buckets)
# display(df)


mpl.rcParams['figure.figsize'] = [15.0, 7.0]
# mpl.rcParams['figure.dpi'] = 72
# mpl.rcParams['savefig.dpi'] = 100
# mpl.rcParams['font.size'] = 12
# mpl.rcParams['legend.fontsize'] = 'medium'
# mpl.rcParams['figure.titlesize'] = 'medium'
%config InlineBackend.figure_format = 'svg'
df.plot(kind='bar',x='key_as_string',y='doc_count');


## Очистка ошибок в поле articles.migration_status

In [None]:
# Delete the articles whose migration_status is 'error' and show the
# (obj_id, migration_status) pairs that were removed.
# NOTE: this is destructive - the matching rows are deleted, not just reset.
deleted_df = pd.read_sql_query("DELETE from articles WHERE migration_status = 'error' RETURNING obj_id, migration_status", pg_con)
# read_sql_query only executes and fetches; unless pg_con is in autocommit
# mode the DELETE stays in an open transaction and is discarded when the
# connection closes, so commit it explicitly.
pg_con.commit()
deleted_df

Загружаем идентификаторы статей из Postgres
-----------------------

In [None]:
%%time
# Load article ids (obj_id) from Postgres into a DataFrame.
# NOTE(review): the query is capped at 2,000,000 rows; if `articles` holds
# more rows than that, the id set built from this frame is incomplete -
# confirm the cap is intentional.
ids_pg_df = pd.read_sql('SELECT obj_id FROM articles limit 2000000', pg_con)
# mem_mb = ids_pg_df.memory_usage(deep=True).sum()/1024/1024
# Size of the DataFrame as reported by sys.getsizeof, converted to megabytes
mem_mb = sys.getsizeof(ids_pg_df)/1024/1024
print(f'Память под идентификаторы = {mem_mb:.2f} Mb')
display(ids_pg_df)

Загружаем идентификаторы статей из Elastic
-----------------------

In [None]:
def load_field_values(index_name: str, field_name: str, fetch_size: int) -> list:
    """Return the values of one field for every row of an Elasticsearch index.

    The values are read through the Elasticsearch SQL endpoint page by page,
    following the cursor returned with each page. A progress bar and a label
    with elapsed / estimated total time are displayed while loading.

    Args:
        index_name: Index to read from. Interpolated into the SQL text as-is,
            so pass trusted identifiers only.
        field_name: Field to select. Interpolated into the SQL text as-is.
        fetch_size: Number of rows requested per page; coerced with ``int()``
            so a numeric string is accepted as well.

    Returns:
        List with the first column of every returned row, in arrival order.
    """
    # Document count, used as the progress bar maximum and for the estimate
    es_count = es_con.count(index=index_name)['count']
    progress = w.IntProgress(min=0, max=es_count)
    label = w.Label()
    display(w.HBox([progress, label]))
    start = time.time()

    ids = []

    def show_progress():
        loaded = len(ids)
        elapsed = time.time() - start
        # With nothing loaded yet (e.g. an empty index) there is no rate to
        # extrapolate from; avoid dividing by zero
        estimate = elapsed * es_count / loaded if loaded else 0.0
        progress.value = loaded
        label.value = f'{loaded}/{es_count} время {elapsed:.1f}/{estimate:.1f} sec'

    cursor = None
    try:
        res = es_con.sql.query(body={
            "query": f"select {field_name} from {index_name}",
            "fetch_size": int(fetch_size),
        })
        while True:
            ids.extend(row[0] for row in res['rows'])
            # The last page carries no cursor, which ends the loop
            cursor = res.get('cursor')
            show_progress()
            if not cursor:
                break
            res = es_con.sql.query(body={"cursor": cursor})
    finally:
        # A cursor is still set only when paging was interrupted by an
        # error; release it so it does not linger on the server
        if cursor:
            es_con.sql.clear_cursor(body={"cursor": cursor})

    return ids



In [None]:
%%time
# Pull every obj_id from the 'articles' index, 10000 rows per page.
# fetch_size is passed as an int, matching the parameter's annotation.
ids_es = load_field_values(index_name='articles', field_name='obj_id', fetch_size=10000)
# sys.getsizeof on a list counts only the list object itself, so add the
# sizes of the elements for a more realistic (still approximate) figure
mb = (sys.getsizeof(ids_es) + sum(map(sys.getsizeof, ids_es)))/1024/1024
print(f'len={len(ids_es)} {mb} mb')

In [None]:
# Distinct ids loaded from Elasticsearch; the cell output is their number
ids_es_set = set(ids_es)
len(ids_es_set)

In [None]:
# Distinct ids loaded from Postgres; the cell output is their number
ids_pg_set = set(ids_pg_df['obj_id'].to_list())
len(ids_pg_set)

In [None]:
# Ids that were loaded from Postgres but are absent from Elasticsearch.
# NOTE(review): assumes obj_id values have the same Python type on both
# sides (e.g. both int) - otherwise no ids would match; TODO confirm.
ids_pg_set - ids_es_set