In [30]:
# Install elasticsearch
!pip install elasticsearch==7.10.0
!pip install elasticsearch_dsl

# Create a directory containing bills
!mkdir Act_directory

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
mkdir: cannot create directory ‘Act_directory’: File exists


In [31]:
%%bash
# Stop at the first failing command instead of continuing with a bad download.
set -euo pipefail

ES_ARCHIVE="elasticsearch-7.10.0-linux-x86_64.tar.gz"
ES_BASE_URL="https://artifacts.elastic.co/downloads/elasticsearch"

# Fetch the release archive together with its published SHA-512 checksum.
wget -q "${ES_BASE_URL}/${ES_ARCHIVE}"
wget -q "${ES_BASE_URL}/${ES_ARCHIVE}.sha512"

# Verify the archive BEFORE unpacking it, so a corrupted or tampered download
# is never extracted.
shasum -a 512 -c "${ES_ARCHIVE}.sha512"

# Unpack and hand the installation over to the unprivileged `daemon` user,
# which is the account the server is started under later on.
tar -xzf "${ES_ARCHIVE}"
sudo chown -R daemon:daemon elasticsearch-7.10.0/

elasticsearch-7.10.0-linux-x86_64.tar.gz: OK


In [None]:
%%bash

# Install the Morfologik analysis plugin into the unpacked Elasticsearch
# release; the index analyzer defined later uses its `morfologik_stem` filter.
ES_HOME="elasticsearch-7.10.0"
MORFOLOGIK_PLUGIN="pl.allegro.tech.elasticsearch.plugin:elasticsearch-analysis-morfologik:7.10.0"

sudo "${ES_HOME}/bin/elasticsearch-plugin" install "${MORFOLOGIK_PLUGIN}"

In [None]:
%%bash --bg

# Start the Elasticsearch server as the `daemon` user, which owns the unpacked
# installation directory (see the earlier `chown`). The `--bg` flag of the cell
# magic runs the script in the background so the following cells can execute
# while the server keeps running.
# NOTE(review): the server needs some time to boot before it answers on
# port 9200 - cells that talk to it should not assume it is ready immediately.
sudo -H -u daemon elasticsearch-7.10.0/bin/elasticsearch

In [33]:
%%bash

# The server is launched in the background by the previous cell and needs a
# while to boot, so a single immediate request can hit a closed port.
# Poll the root endpoint for up to ~60 seconds and print the cluster info as
# soon as the server answers successfully.
for attempt in $(seq 1 30); do
    if curl -sf -X GET "localhost:9200/"; then
        exit 0
    fi
    sleep 2
done

# Fail the cell loudly instead of letting later cells run against a dead server.
echo "Elasticsearch did not respond on localhost:9200 within 60 seconds" >&2
exit 1

{
  "name" : "136fde9d307e",
  "cluster_name" : "elasticsearch",
  "cluster_uuid" : "W7CbFiEoRN-_Qh_c_DybZA",
  "version" : {
    "number" : "7.10.0",
    "build_flavor" : "default",
    "build_type" : "tar",
    "build_hash" : "51e9d6f22758d0374a0f3f5c6e8f3a7997850f96",
    "build_date" : "2020-11-09T21:30:33.964949Z",
    "build_snapshot" : false,
    "lucene_version" : "8.7.0",
    "minimum_wire_compatibility_version" : "6.8.0",
    "minimum_index_compatibility_version" : "6.0.0-beta1"
  },
  "tagline" : "You Know, for Search"
}


In [34]:
import elasticsearch
import elasticsearch_dsl
import os
import regex as reg
import tarfile
import tqdm
from pathlib import Path
import requests
from google.colab import files
# Import the tar file with the acts. The upload is skipped when the archive is
# already present: Colab saves a re-uploaded file under a new name
# (e.g. "ustawy.tar (1).gz"), so uploading again on every run would leave
# 'ustawy.tar.gz' - the file the notebook actually opens - untouched and stale.
# `uploaded` is an empty dict when nothing had to be uploaded.
uploaded = files.upload() if not Path('ustawy.tar.gz').exists() else {}

Saving ustawy.tar.gz to ustawy.tar (1).gz


In [35]:
# Unpack the uploaded archive of acts into the directory created at the top of
# the notebook. The path is relative (like the `mkdir` and the later
# `Path('./Act_directory')`) so all three refer to the same location regardless
# of where the notebook is run.
# The context manager closes the archive even when extraction raises.
with tarfile.open('ustawy.tar.gz') as tar:
    # NOTE: extractall() trusts the member paths stored in the archive; only
    # use it on archives from a trusted source.
    tar.extractall('Act_directory')

In [36]:
# Create the low-level client; no hosts are passed, so the library's default
# connection settings are used.
elastic = elasticsearch.Elasticsearch()
# Register the same client with elasticsearch_dsl under the alias 'python_client'.
elasticsearch_dsl.connections.add_connection('python_client', elastic)
# Ask the cluster for its basic info; displaying the result doubles as a
# connectivity check.
elastic.info()

{'name': '136fde9d307e',
 'cluster_name': 'elasticsearch',
 'cluster_uuid': 'W7CbFiEoRN-_Qh_c_DybZA',
 'version': {'number': '7.10.0',
  'build_flavor': 'default',
  'build_type': 'tar',
  'build_hash': '51e9d6f22758d0374a0f3f5c6e8f3a7997850f96',
  'build_date': '2020-11-09T21:30:33.964949Z',
  'build_snapshot': False,
  'lucene_version': '8.7.0',
  'minimum_wire_compatibility_version': '6.8.0',
  'minimum_index_compatibility_version': '6.0.0-beta1'},
 'tagline': 'You Know, for Search'}

In [37]:
# Token filter that treats each abbreviation of a legal code and its full
# name as synonyms.
synonym_filter = elasticsearch_dsl.analysis.token_filter(
    'synonym_filter',
    type='synonym',
    synonyms=[
        'kpk,kodeks postępowania karnego',
        'kpc,kodeks postępowania cywilnego',
        'kk,kodeks karny',
        'kc,kodeks cywilny',
    ],
)

# Custom analyzer for the acts: standard tokenization, then (in this order)
# lowercasing, synonym handling and Morfologik stemming.
analyzer = elasticsearch_dsl.analyzer(
    'analyzer',
    type='custom',
    tokenizer='standard',
    filter=['lowercase', synonym_filter, 'morfologik_stem'],
)

In [38]:
class Bill(elasticsearch_dsl.Document):
    """Mapping of a single act stored in the 'acts' index.

    Only the ``content`` field is analyzed with the custom ``analyzer``
    defined earlier in the notebook; the other fields use default settings.
    """

    # Text field for the file name.
    # NOTE(review): the indexing loop in this notebook only sends `content`,
    # so `file_name` and `year` appear to stay unpopulated - confirm intent.
    file_name = elasticsearch_dsl.Text()
    # Integer field for the year.
    year = elasticsearch_dsl.Integer()
    # Full text of the act, analyzed with the custom analyzer.
    content = elasticsearch_dsl.Text(
        analyzer=analyzer,
    )
    
    class Index:
        # Name of the Elasticsearch index backing this document type.
        name = 'acts'

In [39]:
# Start from a clean slate: drop the index if a previous run left one behind,
# then create it again with the mapping and analysis settings from `Bill`.
acts_index = Bill._index
index_already_exists = acts_index.exists(using=elastic)
if index_already_exists:
    acts_index.delete(using=elastic)

Bill.init(using=elastic)

In [40]:
# Index every extracted act into the 'acts' index, using the file stem as the
# document id.
path = Path('./Act_directory')

# List the directory once; tqdm derives the total from the list length.
act_files = list(path.iterdir())

# A Session reuses the HTTP connection across the ~1k requests.
with requests.Session() as session:
    for act in tqdm.tqdm(act_files, desc="Indexing acts"):
        act_id = act.stem
        reply = session.post(
            url=f"http://localhost:9200/acts/_doc/{act_id}",
            json={
                "content": act.read_text(encoding="utf8")
            }
        )
        # Surface rejected documents immediately instead of silently ending up
        # with an incomplete index.
        reply.raise_for_status()

# Make the freshly indexed documents visible to the searches that follow.
elastic.indices.refresh(index="acts")

Indexing acts: 100%|██████████| 1179/1179 [00:39<00:00, 29.89it/s]


In [41]:
# Number of documents in the index. The count API returns just the number,
# whereas a search would also fetch the (large) contents of the top hits.
elastic.count(index="acts")['count']

1179

In [42]:
# Sanity check: every file in the acts directory has been indexed. The count
# API is used so no document contents are fetched just to read a total.
len(os.listdir(path)) == elastic.count(index="acts")['count']

True

In [43]:
# Count the acts whose analyzed content matches the word "ustawa".
ustawa_query = {
    "query": {
        "match_phrase": {
            "content": {
                "query": "ustawa"
            }
        }
    }
}
ustawa = elastic.search(index="acts", body=ustawa_query)

ustawa_hits = ustawa['hits']['total']['value']
print(f"The number of files containing the word ustawa is {ustawa_hits}")

The number of files containing the word ustawa is 1178


In [44]:
# Request the term vectors of document 2004_894 with term statistics enabled
# and read the `ttf` value reported for the term "ustawa".
termvectors_url = "http://localhost:9200/acts/_termvectors/2004_894"
termvectors_request = {
    "fields": ["content"],
    "term_statistics": True
}
response = requests.get(url=termvectors_url, json=termvectors_request)

content_terms = response.json()['term_vectors']['content']['terms']
print(f"The number of times the word ustawa occurs is {content_terms['ustawa']['ttf']}")

The number of times the word ustawa occurs is 24934


In [46]:
# Request the term vectors of document 2004_894 with term statistics enabled
# and read the `ttf` value reported for the term "ustawić".
termvectors_url = "http://localhost:9200/acts/_termvectors/2004_894"
termvectors_request = {
    "fields": ["content"],
    "term_statistics": True
}
response = requests.get(url=termvectors_url, json=termvectors_request)

content_terms = response.json()['term_vectors']['content']['terms']
print(f"The number of times the word ustaw occurs is {content_terms['ustawić']['ttf']}")

The number of times the word ustaw occurs is 913


In [47]:
# Count the acts containing the phrase "kodeks postępowania cywilnego".
kpc_query = {
    "query": {
        "match_phrase": {
            "content": {
                "query": "kodeks postępowania cywilnego"
            }
        }
    }
}
kpc = elastic.search(index="acts", body=kpc_query)

kpc_hits = kpc['hits']['total']['value']
print(f"The number of files containing the word kodeks postępowania cywilnego is {kpc_hits}")

The number of files containing the word kodeks postępowania cywilnego is 99


In [48]:
# Count the acts containing the phrase "wchodzi w życie"; a slop of 2 is
# passed to the phrase query so the words need not be strictly adjacent.
wwz_query = {
    "query": {
        "match_phrase": {
            "content": {
                "query": "wchodzi w życie",
                "slop": 2
            }
        }
    }
}
wwz = elastic.search(index="acts", body=wwz_query)

wwz_hits = wwz['hits']['total']['value']
print(f"The number of files containing the word  wchodzi w życie is {wwz_hits}")

The number of files containing the word  wchodzi w życie is 1174


In [49]:
# Fetch the ids (and scores) of the 10 best-scoring acts for "konstytucja";
# `filter_path` trims the response down to those two fields.
konstytucja_search_url = "http://localhost:9200/acts/_search?filter_path=hits.hits._id,hits.hits._score"
konstytucja_query = {
    "query": {
        "match": {
            "content": {
                "query": "konstytucja"
            }
        }
    },
    "size": 10
}
response = requests.get(url=konstytucja_search_url, json=konstytucja_query)

print('The 10 documents that are the most relevant for the phrase konstytucja are:')
res = response.json()['hits']['hits']
# Keep only the document ids, in the order the hits were returned.
result = [hit['_id'] for hit in res]
print(result)

The 10 documents that are the most relevant for the phrase konstytucja are:
['1997_629', '2000_443', '1997_604', '1996_350', '1997_642', '2001_23', '1996_199', '1999_688', '2001_1082', '1997_681']


In [50]:
# Same "konstytucja" query as before, this time asking for up to 3 highlighted
# fragments of `content` per hit; `filter_path` keeps only ids and highlights.
highlight_search_url = "http://localhost:9200/acts/_search?filter_path=hits.hits._id,hits.hits.highlight"
highlight_query = {
    "query": {
        "match": {
            "content": {
                "query": "konstytucja"
            }
        }
    },
    "highlight": {
        "fields": {
            "content": {
                "number_of_fragments": 3
            }
        }
    },
    "size": 10
}
# NOTE: the misspelled name `resposne` is kept on purpose, in case cells
# further down the notebook refer to it.
resposne = requests.get(url=highlight_search_url, json=highlight_query)

resposne.json()['hits']['hits']

[{'_id': '1997_629',
  'highlight': {'content': ['o zmianie ustawy konstytucyjnej o trybie przygotowania\n           i uchwalenia <em>Konstytucji</em> Rzeczypospolitej',
    'W ustawie  konstytucyjnej z  dnia 23 kwietnia 1992 r. o trybie przygotowania i \nuchwalenia <em>Konstytucji</em>',
    'Do zgłoszenia projektu <em>Konstytucji</em> załącza się wykaz \n                obywateli popierających zgłoszenie']}},
 {'_id': '2000_443',
  'highlight': {'content': ['umowy międzynarodowej i nie wypełnia przesłanek określonych w art. 89\n     ust. 1 lub art. 90 <em>Konstytucji</em>',
    'międzynarodowej lub załącznika nie\n     wypełnia przesłanek określonych w art. 89 ust. 1 lub art. 90 <em>Konstytucji</em>',
    'co do zasadności wyboru\n  trybu ratyfikacji umowy międzynarodowej, o którym mowa w art. 89 ust. 2\n  <em>Konstytucji</em>']}},
 {'_id': '1997_604',
  'highlight': {'content': ['Jeżeli Trybunał Konstytucyjny wyda orzeczenie o sprzeczności celów partii \n   politycznej z <em>Konstyt