# Setup Elastic Client

In [None]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

import os

# Connection settings are read from the environment so that real credentials
# never have to be saved inside the notebook.  The fallbacks reproduce the
# previously hard-coded values (local node, "elastic" user, placeholder password).
es = Elasticsearch(
    [os.environ.get("ES_URL", "http://localhost:9200")],
    http_auth=(
        os.environ.get("ES_USER", "elastic"),
        os.environ.get("ES_PASSWORD", "xxxxxx"),
    ),
    timeout=30,
)
# Ask the cluster for its health, waiting for at least "yellow" status; the
# request itself is given up after 10 seconds.
es.cluster.health(wait_for_status='yellow', request_timeout=10)

# Load data

In [None]:
!pip install pysbd

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import re

In [None]:
# file = "Трудовой кодекс+8.02.2023.xlsx"
file = "fixed 10.02.23.xlsx"
df = pd.read_excel(file, engine="openpyxl")
df.head()

## Some cleanup

In [None]:
# Cells contain the literal marker "_x000D_" followed by a newline (presumably
# an escaped carriage return left over from the Excel export - confirm);
# collapse it to a single space in every text column.
# `.str.replace(..., regex=False)` does a literal replacement like `str.replace`
# but lets empty cells (NaN) pass through instead of raising AttributeError.
for col in ["text", "section_text", "chapter_text", "article_text"]:
    df[col] = df[col].str.replace("_x000D_\n", " ", regex=False)

# Heading columns are flattened to a single line.  `text` deliberately keeps
# its newlines: the clean-up regexes applied to it afterwards match on "\n".
for col in ["section_text", "chapter_text", "article_text"]:
    df[col] = df[col].str.replace("\n", " ", regex=False)

In [None]:
df.head()

In [None]:

# Patterns for editorial boilerplate that should not be indexed or embedded.
# Each one also consumes the trailing "\n" so the neighbouring lines join up.

# "КонсультантПлюс: примечание." marker line plus the single line that follows it.
note_cleanup_re: re.Pattern = re.compile(r'КонсультантПлюс\: примечание\.\n.*?\n', flags=re.MULTILINE)
# Parenthesised reference that mentions a federal law number such as "N 90-ФЗ".
ref_cleanup_re: re.Pattern = re.compile(r'\(.*?N\s+\d+-ФЗ.*?\)\n')
# "абзац утратил силу. ..." placeholder line ending in ";" or ".".
removed_cleanup_re: re.Pattern = re.compile(r'абзац утратил силу\..*?(;|\.)\n')
# "Часть <word> утратила силу[ с ... года]. - Федеральный закон от ..." placeholder.
# The period after "силу"/"года" is escaped so that only a literal "." matches;
# it used to be a bare `.`, which accepted any character in that position.
removed2_cleanup_re: re.Pattern = re.compile(r'Часть \w+ утратила силу(\sс.*?года)?\. - Федеральный закон от .*?\n')


def general_cleanup(s):
    """Strip editorial boilerplate lines from an article text.

    Applies, in order, the note, amendment-reference and the two
    "утратил(а) силу" patterns defined above, deleting every match.

    Args:
        s: article text with its original newlines preserved.

    Returns:
        The text with all matched fragments removed.
    """
    for pattern in (note_cleanup_re, ref_cleanup_re, removed_cleanup_re, removed2_cleanup_re):
        s = pattern.sub('', s)
    return s


# print(general_cleanup(s))


In [None]:
df.text = df.text.apply(lambda x: general_cleanup(x))

In [None]:
df[df.text.str.contains('утратила силу')]

In [None]:
# def remove_link(row):
#     for r in [x for x in re.findall(r"\((.*?)\)", row) if re.search(r"\d\d\.\d\d\.\d\d\d\d", x)]:
#         row = row.replace(f"({r})", "")
#     return row

import pysbd

def get_sentence_spans(text):
    """Split ``text`` into sentences and flatten bullet lists into full sentences.

    The text is segmented with ``pysbd`` (Russian rules, ``clean=True``).  A
    segment ending in ``:`` is treated as the lead-in of a list: it is not
    emitted on its own but is prepended (without the colon, joined by a space)
    to each segment that follows.  The list continues while segments end in
    ``;`` and stops after the first segment that ends in anything else.  A
    lead-in met inside a list extends the prefix for its own items, and the
    enclosing list ends together with that nested one.

    Returns:
        list of str, in document order.
    """
    segmenter = pysbd.Segmenter(language="ru", clean=True, char_span=False)
    # One shared iterator: the helper below consumes list items from it and the
    # main loop resumes from wherever the helper stopped.
    segments = iter(segmenter.segment(text))
    sentences = []

    def expand_list(prefix):
        for item in segments:
            if item.endswith(':'):
                # Nested lead-in: its items also close the current list.
                expand_list(prefix + ' ' + item[:-1])
                return
            sentences.append(prefix + ' ' + item)
            if not item.endswith(';'):
                # That was the last item of the list.
                return

    for segment in segments:
        if segment.endswith(':'):
            expand_list(segment[:-1])
        else:
            sentences.append(segment)

    return sentences
        

print(df.text[73])
# get_sentence_spans(remove_link(df.text[338]))
get_sentence_spans(general_cleanup(df.text[73]))

In [None]:


# Sentence-level chunks to embed, one list per row.  Plain column assignment
# appends the column on the first run and overwrites it on re-runs, whereas
# `df.insert(..., allow_duplicates=True)` added one more `text2embed` column
# every time this cell was executed.
df['text2embed'] = df.text.apply(get_sentence_spans)

In [None]:
df.head(10)

In [None]:
df.to_excel("fixed.xlsx")

In [None]:
!pip list | grep tensorflow

# Prepare indices

In [None]:
# Name of the Elasticsearch index that the cells below create, fill and query.
index_name = 'labor_law'

# source_no_vecs = ['title', 'db_id']
# Dimensionality declared for the `text_emb` vector field in the index mapping.
# NOTE(review): presumably has to equal the width of the vectors produced by
# the embedder - confirm whenever the model is switched.
# vector_dims = 512
vector_dims = 768
# vector_dims = 312

# Constant written to the `source_law` field of every indexed document.
source_law = 'Трудовой кодекс'

# DataFrame columns copied as-is into every indexed document and requested
# back via `_source` in the search cells.
columns_to_add_as_text = [
    "party",
    "section_number",
    "section_text",
    "chapter_number",
    "chapter_text",
    "article_number",
    "article_text",
    "href",
    # "text",
]

## Optional: clear indices

In [None]:
remove_indices = True
if remove_indices:
    es.indices.delete(index=index_name, ignore=[400, 404])
    es.indices.flush(index="*")
    # es.indices.shrink(index="*")
if es.indices.exists(index=index_name):
    es.indices.get_mapping(index=index_name)

## Settings

In [None]:
# Index-level settings passed to `es.indices.create(..., settings=settings)`.
settings = {
    "number_of_shards": 1,
    "number_of_replicas": 0,
    # NOTE(review): presumably the flag that enables the Elastiknn plugin for
    # this index - confirm against the installed plugin version.
    "elastiknn": True,    
  # "settings": {
  #   "number_of_shards": 1,
  #   "number_of_replicas": 0,
  #   "elastiknn": True,    
  # }
    
    # Custom Russian analyzer assembled from explicit token filters.
    "analysis": {
      "filter": {
        # Stop-word filter using the built-in "_russian_" list.
        "russian_stop": {
          "type":       "stop",
          "stopwords":  "_russian_"
        },
        # Words marked as keywords (presumably so the stemmer leaves them
        # untouched - confirm).
        "russian_keywords": {
          "type":       "keyword_marker",
          "keywords":   ["пример", "кодекс"]
        },
        "russian_stemmer": {
          "type":       "stemmer",
          "language":   "russian"
        }
      },
      "analyzer": {
        # Referenced by the text fields of `mapping` below.
        "rebuilt_russian": {
          "tokenizer":  "standard",
          "filter": [
            "lowercase",
            "russian_stop",
            "russian_keywords",
            "russian_stemmer"
          ]
        }
      }
    }
}

# Field mapping applied with `es.indices.put_mapping` once the index exists.
mapping = {
  # Dynamic mapping is switched off: only the fields listed here are declared.
  "dynamic": False,
  "properties": {
    # Provenance of each document: source law name and originating DataFrame row.
    "source_law" : {"type": "keyword"},
    "original_row": {"type": "long"},
      
    # Cumulative structure paths ("party", "party/section", ...) for a document.
    "doc_structure_tags": {"type": "keyword"},
    
    # Structural identifiers are exact-match keywords; titles and the body are
    # full-text fields analysed with `rebuilt_russian`.
    "party": {"type": "keyword" },
    "section_number": {"type": "keyword" },
    "section_text": {"type": "text", "analyzer": "rebuilt_russian" },
    "chapter_number": {"type": "keyword" },
    "chapter_text": {"type": "text", "analyzer": "rebuilt_russian" },
    "article_number": {"type": "keyword" },
    "article_text": {"type": "text", "analyzer": "rebuilt_russian" },
    "href": {"type": "text" },
    "text": {"type": "text", "store": True, "analyzer": "rebuilt_russian" },
      

    # Sentence embedding used by the `elastiknn_nearest_neighbors` queries.
    # NOTE(review): the parameter meanings below follow the Elastiknn
    # documentation as recalled - confirm for the installed version.
    "text_emb": {
        "type": "elastiknn_dense_float_vector", # 1: Elastiknn dense vector field type
        "elastiknn": {
            "dims": vector_dims,                        # 2: vector length
            "model": "lsh",                     # 3: index with locality-sensitive hashing
            "similarity": "cosine",             # 4: similarity the hashes approximate
            "L": 300,                            # 5: presumably the number of hash tables
            "k": 10                              # 6: presumably hash functions per table
        }
    },

  }
}

In [None]:
if not es.indices.exists(index=index_name):
    es.indices.create(index=index_name, settings=settings)
    es.indices.put_mapping(mapping, index=index_name,  )

In [None]:
es.cluster.put_settings({
    "persistent": { "cluster.max_shards_per_node": "5000" },
    "transient": {
        "cluster.routing.allocation.total_shards_per_node": 5100
    }
})

In [None]:
es.indices.get_settings(index=index_name)

In [None]:
!curl -X POST "localhost:9200/labor_law/_analyze?pretty" -H 'Content-Type: application/json' -d'{"field": "text", "text": "трудового кодекса"}'
!curl -X POST "localhost:9200/labor_law/_analyze?pretty" -H 'Content-Type: application/json' -d'{"field": "text","text": "трудовым кодексом"}'


In [None]:
es.indices.get_mapping(index=index_name)

# Generate and Index Embeddings

In [None]:

import numpy as np

from tqdm import tqdm

## Prepare embedder

In [None]:
# import tensorflow_hub as hub
# import tensorflow_text
# model_name = "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3"
# model_name = 'https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3'
# embed = hub.load(model_name)

from transformers import AutoTokenizer, AutoModel
import torch


# model_name = 'symanto/sn-xlm-roberta-base-snli-mnli-anli-xnli'
# model_name = 'inkoziev/sbert_pq'
model_name = 'sentence-transformers/distiluse-base-multilingual-cased-v1'

def mean_pooling(model_output, attention_mask):
    """Average token embeddings, counting only positions enabled in the mask.

    Args:
        model_output: model result whose first element holds all token
            embeddings.
        attention_mask: mask with one entry per token; it is given a trailing
            axis and expanded to the shape of the token embeddings.

    Returns:
        Tensor of masked sums over dimension 1 divided by the per-position mask
        totals; the divisor is clamped to at least 1e-9, so a fully masked row
        yields zeros instead of dividing by zero.
    """
    token_embeddings = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    masked_sum = (token_embeddings * mask).sum(1)
    mask_total = mask.sum(1).clamp(min=1e-9)
    return masked_sum / mask_total


# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

def get_sentence_embeddings(sentences):
    """Embed ``sentences`` with the module-level ``tokenizer`` and ``model``.

    Args:
        sentences: passed straight to the tokenizer (padding and truncation
            enabled, PyTorch tensors requested).

    Returns:
        The result of ``mean_pooling`` over the model output, using the
        tokenizer's attention mask.
    """
    batch = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

    # Inference only: no gradients are tracked for the forward pass.
    with torch.no_grad():
        outputs = model(**batch)

    return mean_pooling(outputs, batch['attention_mask'])


In [None]:
get_sentence_embeddings('Лица, замещающие указанные должности и достигшие возраста семидесяти лет, переводятся с их письменного согласия на иные должности, соответствующие их квалификации.').shape #.numpy()[0]

In [None]:
# Structure levels of a document row, from outermost to innermost.
structure_columns = ['party', 'section_number', 'chapter_number', 'article_number']
def collect_row_tags(row):
    """Return the cumulative '/'-joined structure paths of ``row``.

    For every level in ``structure_columns`` the result holds the path from the
    outermost level down to that level, e.g. ``['A', 'A/B', 'A/B/C', 'A/B/C/D']``.

    Args:
        row: mapping (dict or DataFrame row) with a string value for each
            structure column.
    """
    levels = [row[column] for column in structure_columns]
    return ['/'.join(levels[:depth]) for depth in range(1, len(levels) + 1)]

collect_row_tags({structure_columns[0]:'ЧАСТЬ ПЕРВАЯ',structure_columns[1]:'Раздел I',structure_columns[2]:'Глава 1',structure_columns[3]:'Статья 1'})

In [None]:
tqdm.pandas(desc="pandas processing...")

def heads_actions():
    """Yield one bulk "index" action per sentence of every DataFrame row.

    Each action carries the row's text columns (``columns_to_add_as_text``),
    its structure tags, the sentence itself as ``text`` and the sentence
    embedding as ``text_emb``.  No ``_id`` is set on the actions.
    """
    for row_id, record in df.iterrows():
        sentences = record['text2embed']
        row_tags = collect_row_tags(record)
        for sentence in sentences:
            yield {
                "_op_type": "index",
                "_index": index_name,
                "original_row": row_id,
                "source_law": source_law,
                **{column: record[column] for column in columns_to_add_as_text},
                'doc_structure_tags': row_tags,
                'text': sentence,
                # d['text_emb'] = embed(e).numpy()[0]
                # A single sentence gives a batch of one; take its only vector.
                'text_emb': get_sentence_embeddings(sentence).numpy()[0],
            }

bulk(es, heads_actions(), chunk_size=1000, max_retries=2)

In [None]:
es.indices.refresh(index=index_name)
es.indices.forcemerge(index=index_name, max_num_segments=1, request_timeout=120)

In [None]:
def vector_actions():
    """Yield one bulk "update" action per DataFrame row that sets ``text_emb``.

    The row's ``text2embed`` sentences are embedded as one batch and the vector
    of the first sentence is stored.

    NOTE(review): the actions address documents by ``_id`` equal to the
    DataFrame index, while ``heads_actions`` indexes documents without an
    explicit ``_id`` - confirm the ids line up before enabling the bulk call.
    """
    for index, row in df.iterrows():
        yield {
            "_op_type": "update",
            "_index": index_name,

            "_id": index,
            "doc": {
                # The TF-Hub `embed` encoder is no longer loaded in this
                # notebook (its setup is commented out), so calling it raised
                # NameError; use the same transformer embedder as heads_actions.
                "text_emb": { "values": get_sentence_embeddings(row['text2embed']).numpy()[0] }
            }
        }
        

# bulk(es, vector_actions(), chunk_size=50, max_retries=10, request_timeout=60)

In [None]:
es.indices.refresh(index=index_name)
es.indices.forcemerge(index=index_name, max_num_segments=1, request_timeout=300)

In [None]:
es.get(index=index_name, id="1")


# Search experiments

In [None]:
def display_hits(res):
    """Print a readable summary of an Elasticsearch search response.

    Prints the total hit count and the query time, then for every returned hit
    its id, source law, part / section / chapter breadcrumb, the text and the
    score.  Missing ``_source`` fields are printed as ``None``.

    Args:
        res: search response dict with ``took``, ``hits.total.value`` and
            ``hits.hits``.
    """
    found = res['hits']['hits']
    print(f"Found {res['hits']['total']['value']} hits in {res['took']} ms. Showing top {len(found)}.")
    print("")
    for hit in found:
        doc = hit['_source']
        breadcrumb = (
            f"({hit.get('_id')}) {doc.get('source_law')}: {doc.get('party')} > "
            f"{doc.get('section_number')} {doc.get('section_text')} > "
            f"{doc.get('chapter_number')} {doc.get('chapter_text')} >"
        )
        print(breadcrumb)
        print(doc.get('text'))
        print(f"Score   {hit.get('_score', None)}")


## Keyword search

In [None]:

body = {
  "query": {
    "multi_match": {
      "query": "Основные начала",
      "fields": ["text^2"]
    }
  }
}

res = es.search(index=index_name, body=body, size=5, _source=columns_to_add_as_text+['source_law', 'text'])
display_hits(res)

In [None]:
res

## Terms Aggregation

In [None]:

body = {
  "aggs": {
    "parties": {
      "terms": { 
          "field": "party",
          "order": { "_key": "asc" },
      }
    },
    "sections": {
        "terms": {
            "field": "section_number", 
            "order": { "_key": "asc" },
        },
        
        "aggs": {
            "chapter_numbers": {  
                "terms": {
                    "field": "chapter_number", 
                    "order": { "_key": "asc" },
                },
            }
      }

    }
    
  }
}
body = {
  "aggs": {
    "genres": {
      "terms": { "field": "party" }
    }
  }
}
es.search(index=index_name, body=body, size=0 )

## Semantic search

In [None]:
# query_vec = embed("Как устанавливается оплата на севере").numpy()[0]
query_vec = get_sentence_embeddings("Как устанавливается оплата на севере").numpy()[0]

In [None]:
body = {
    "query": {
        "elastiknn_nearest_neighbors": {
            "field": "text_emb",                     # 1
            "vec": {                               # 2
                "values": query_vec
            },
            "model": "lsh",                        # 3
            "similarity": "cosine",                # 4
            "candidates": 50                       # 5
        }
    }
}

res = es.search(index=index_name, body=body, size=10, _source=columns_to_add_as_text+['source_law','text'])
display_hits(res)

In [None]:
body = {
    "query": {
        "elastiknn_nearest_neighbors": {
            "field": "text_emb",                     # 1
            "vec": {                               # 2
                "values": query_vec
            },
            "model": "exact",                        # 3
            "similarity": "cosine",                # 4
            "candidates": 50                       # 5
        }
    }
}

res = es.search(index=index_name, body=body, size=10, _source=columns_to_add_as_text+['text', 'source_law'])
display_hits(res)

## Faceted search

In [None]:
# Nesting order of the facet aggregations; each level is requested under the
# aggregation name "<field>_a".
aggs_sequence = ["party", "section_number", "chapter_number", "article_number"]

def display_aggs(hits):
    """Print the nested terms aggregations of a search response as a tree.

    Every bucket is printed as ``key: doc_count``, indented with one tab per
    nesting level.  Child levels follow ``aggs_sequence`` and are printed only
    when the bucket actually contains the corresponding ``<field>_a`` entry.

    Args:
        hits: search response dict containing an ``aggregations`` section.
    """
    def walk(node, depth):
        pad = '\t' * depth
        has_child_level = depth + 1 < len(aggs_sequence)
        child_key = aggs_sequence[depth + 1] + '_a' if has_child_level else None
        for bucket in node['buckets']:
            print(f"{pad}{bucket['key']}: {bucket['doc_count']}")
            if child_key is not None and child_key in bucket:
                walk(bucket[child_key], depth + 1)

    walk(hits['aggregations'][aggs_sequence[0] + '_a'], 0)

In [None]:
top_k = 50
min_score = 1.
def filter_body(filters):
    """Build a list of Elasticsearch ``term`` filter clauses.

    Args:
        filters: either a dict mapping field name to value, or an iterable
            whose items are ``(field, value)`` tuples or ``"field:value"``
            strings (split on the first ``:`` only).  Items of any other type
            are ignored.

    Returns:
        list of ``{"term": {field: value}}`` dicts, in input order.

    Raises:
        IndexError: if a string item contains no ``:``.
    """
    # A dict used to be iterated by key only, which failed on the string split;
    # treat it as a collection of (field, value) pairs instead.
    if isinstance(filters, dict):
        filters = filters.items()

    clauses = []
    for item in filters:
        if isinstance(item, tuple):
            clauses.append({"term": {item[0]: item[1]}})
        elif isinstance(item, str):
            parts = item.split(':', maxsplit=1)
            clauses.append({"term": {parts[0]: parts[1]}})
    return clauses
        
body = {
    "size": top_k, "min_score": min_score, 
    "query": {
        "bool": {
            "must":{
                "elastiknn_nearest_neighbors": {
                    "field": "text_emb",                     # 1
                    "vec": {                               # 2
                        "values": query_vec
                    },
                    "model": "exact",                        # 3
                    "similarity": "cosine",                # 4
                    "candidates": 50                       # 5
                }
            },
            "filter": [
                { "terms": {"article_number": ["Статья 130"]} },
                {"term": {"party":"ЧАСТЬ ТРЕТЬЯ"}}
            ] 
                # filter_body({"chapter_number": "Глава 21"}),
                # filter_body({"section_number": "Раздел VI"}),
            # filter_body(['article_number:Статья 130', 'article_number:Статья 131']),
            # [
            #     { "term": {"chapter_number": "Глава 21"} },
            # ]
        }
    },
    
    "aggs": {
        "party_a": {
            "terms": { "field": "party" },
            "aggs": {
                "section_number_a": {
                    "terms": { "field": "section_number" },
                    "aggs": {
                        "chapter_number_a": {
                            "terms": { "field": "chapter_number" },
                            "aggs": {
                                "article_number_a": {
                                    "terms": { "field": "article_number" }
                                }        
                            }
                        },
                    },  
                },
            }

        },

    },
}

res = es.search(index=index_name, body=body, size=10, _source=columns_to_add_as_text)
# display_hits(res)
res['aggregations']

In [None]:
display_hits(res)

In [None]:
display_aggs(res)