In [None]:
# Mount Google Drive at /content/drive; the dataset loaded later in this
# notebook is read from a path under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import time
import os
import gzip
import json
import torch
import pandas as pd

## **Corpus**

## **Retriever - Rerank**

In [None]:
%%capture

# Install the Elasticsearch Python client, pinned to 7.14.0.
!pip install elasticsearch==7.14.0
# Install a system JDK; stdout is discarded to keep the cell quiet.
# NOTE(review): the recorded `ps` output further down shows the server running
# elasticsearch-7.0.0/jdk/bin/java (a JDK inside the tarball) — confirm whether
# this system-wide JDK install is still required.
!apt install default-jdk > /dev/null

In [None]:
# Import everything the retrieval cells rely on. A failed import is reported
# and then re-raised, so the notebook stops at the real cause instead of
# failing later with an unrelated NameError.
try:
  import os
  import sys
  import json
  import datetime
  from ast import literal_eval

  import numpy as np
  import pandas as pd
  from tqdm import tqdm

  import elasticsearch
  from elasticsearch import Elasticsearch
  from elasticsearch import helpers

except Exception as e:
  print(f"error: {e}")
  raise

In [None]:
# Download & extract Elasticsearch 7.0.0
# NOTE(review): the server downloaded here is 7.0.0 while the Python client
# installed earlier is pinned to 7.14.0 — confirm this pairing is intended.

# -q silences wget; the tarball lands in the current working directory.
!wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.0.0-linux-x86_64.tar.gz -q
# Unpack into ./elasticsearch-7.0.0
!tar -xzf elasticsearch-7.0.0-linux-x86_64.tar.gz
# Give the whole tree to the `daemon` user/group; the server process is later
# launched after switching to uid 1.
!chown -R daemon:daemon elasticsearch-7.0.0

In [None]:
# Start Elasticsearch as a background process running as the daemon user.
import os
from subprocess import Popen, PIPE, STDOUT, DEVNULL

es_server = Popen(
    ['elasticsearch-7.0.0/bin/elasticsearch'],
    # Nothing in this notebook ever reads the server's console output. Leaving
    # it attached to an unread PIPE lets the OS pipe buffer fill up, at which
    # point the server blocks on write. Discard stdout and stderr instead.
    stdout=DEVNULL, stderr=STDOUT,
    # Runs in the child just before exec: switch to uid 1 (daemon), the user
    # that was given ownership of the install tree.
    preexec_fn=lambda: os.setuid(1),
)

In [None]:
# Give the freshly launched Elasticsearch process time to finish booting
# before any later cell tries to talk to it.
import time

STARTUP_WAIT_SECONDS = 25
time.sleep(STARTUP_WAIT_SECONDS)

In [None]:
%%bash
# List every running process whose command line mentions "elasticsearch".
# Rule of thumb: seeing 1 root process and 2 daemon processes means the
# Elasticsearch instance has started successfully.
ps -ef | grep elasticsearch

daemon       544      73 99 16:47 ?        00:00:32 /content/elasticsearch-7.0.0/jdk/bin/java -Xms1g -Xmx1g -XX:+UseConcMarkSweepGC -XX:CMSInitiatingOccupancyFraction=75 -XX:+UseCMSInitiatingOccupancyOnly -Des.networkaddress.cache.ttl=60 -Des.networkaddress.cache.negative.ttl=10 -XX:+AlwaysPreTouch -Xss1m -Djava.awt.headless=true -Dfile.encoding=UTF-8 -Djna.nosys=true -XX:-OmitStackTraceInFastThrow -Dio.netty.noUnsafe=true -Dio.netty.noKeySetOptimization=true -Dio.netty.recycler.maxCapacityPerThread=0 -Dlog4j.shutdownHookEnabled=false -Dlog4j2.disable.jmx=true -Djava.io.tmpdir=/tmp/elasticsearch-3056460647091196697 -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=data -XX:ErrorFile=logs/hs_err_pid%p.log -Xlog:gc*,gc+age=trace,safepoint:file=logs/gc.log:utctime,pid,tags:filecount=32,filesize=64m -Djava.locale.providers=COMPAT -Dio.netty.allocator.type=unpooled -Des.path.home=/content/elasticsearch-7.0.0 -Des.path.conf=/content/elasticsearch-7.0.0/config -Des.distribution.flavor=default 

In [None]:
# Check that Elasticsearch is running by requesting its root endpoint on
# localhost:9200 (-s hides curl's progress meter). A running server answers
# with a JSON document containing the node name and version details.
!curl -sX GET "localhost:9200/"

{
  "name" : "8ed652afc16c",
  "cluster_name" : "elasticsearch",
  "cluster_uuid" : "-FlTG-7ySxazVAtcBqx2Tg",
  "version" : {
    "number" : "7.0.0",
    "build_flavor" : "default",
    "build_type" : "tar",
    "build_hash" : "b7e28a7",
    "build_date" : "2019-04-05T22:55:32.697037Z",
    "build_snapshot" : false,
    "lucene_version" : "8.0.0",
    "minimum_wire_compatibility_version" : "6.7.0",
    "minimum_index_compatibility_version" : "6.0.0-beta1"
  },
  "tagline" : "You Know, for Search"
}


In [None]:
# Connect the Python client to the Elasticsearch node on this machine.
ES_HOST = {"host": "localhost", "port": 9200}
es = Elasticsearch(hosts=[ES_HOST])
# Keep ping() as the cell's last expression so the connectivity result is displayed.
es.ping()

True

In [None]:
# Passage corpus stored on the mounted Google Drive.
PASSAGES_CSV = "/content/drive/MyDrive/DS310/data/wiki/256/passages_256_lite.csv"
dataset = pd.read_csv(PASSAGES_CSV)

In [None]:
# Index configuration for Elasticsearch: one primary shard, no replicas, and
# two full-text fields that share the same analyzer and similarity settings.
def _bm25_text_field():
    """Return a fresh mapping dict for a standard-analyzed, BM25-scored text field."""
    return {"type": "text", "analyzer": "standard", "similarity": "BM25"}


Settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0,
    },
    "mappings": {
        "properties": {
            "title": _bm25_text_field(),
            "text": _bm25_text_field(),
        },
    },
}

In [None]:
def json_formatter(dataset, index_name, index_type='_doc'):
    """Convert a DataFrame into a list of bulk-index actions.

    Every row becomes one action dict of the form
    ``{'_index': index_name, '_source': {column: value, ...}}``.

    Args:
        dataset: pandas DataFrame; each of its columns is copied into ``_source``.
        index_name: value stored under ``'_index'`` in every action.
        index_type: accepted for backward compatibility only. It is not written
            into the actions (it was never used by this function).

    Returns:
        A list with one action dict per row, in row order. An empty frame
        yields an empty list.

    Raises:
        Exception: any failure is printed and then re-raised, so the caller
            never receives ``None`` in place of the action list.
    """
    try:
        # to_dict(orient="records") builds the per-row dicts column by column,
        # so every value keeps its own column's dtype. Iterating with
        # iterrows() instead packs each row into a single Series, which
        # upcasts mixed numeric rows (e.g. integer ids become floats) and is
        # far slower on large frames.
        records = dataset.to_dict(orient="records")
        return [{'_index': index_name, '_source': record} for record in records]

    except Exception as e:
        print("There is a problem: {}".format(e))
        raise

In [None]:
# Create the "wiki_index" index using the Settings dict as the request body and
# keep the server's response in MY_INDEX.
# NOTE(review): ignore=[400,404] presumably stops the client from raising on
# those HTTP statuses (e.g. when the index already exists on a re-run) —
# confirm; if so it would equally hide a rejected settings/mappings body.
MY_INDEX = es.indices.create(index="wiki_index", ignore=[400,404], body=Settings)

In [None]:
# Turn every row of the loaded dataset into a bulk action targeting "wiki_index".
json_Formatted_dataset = json_formatter(dataset=dataset, index_name='wiki_index', index_type='_doc')

In [None]:
# Send all prepared actions to Elasticsearch in bulk; report the outcome
# either way without interrupting the notebook.
try:
    res = helpers.bulk(es, json_Formatted_dataset)
except Exception as e:
    print(f"error: {e}")
else:
    print("successfully imported to elasticsearch.")

successfully imported to elasticsearch.


In [None]:
def elastic_search(q='', ner=[], top_k=200):
  """Search "wiki_index" for a query string and, optionally, for each named entity.

  Args:
    q: text of the main query.
    ner: entity strings; each one is searched on its own. The argument is
      never mutated here, so the shared default list is harmless.
    top_k: maximum number of hits requested for the main query.

  Returns:
    A ``(hits, ner_hits)`` tuple: the raw hit dicts of the main query, and the
    concatenated raw hit dicts of the per-entity queries (an empty list when
    ``ner`` is empty or None).
  """
  # Maximum number of hits requested for each individual entity query.
  NER_TOP_K = 10

  def search(text, k):
    """Run one cross-field multi_match query that returns at most ``k`` hits."""
    return es.search(
      index="wiki_index",
      body={
          # Honour the per-call limit. This used to be hard-wired to the outer
          # top_k, so entity queries silently asked for top_k hits, not 10.
          "size": k,
          "query": {"multi_match": {
                 "query": text,
                 # NOTE(review): the mapping defined in this notebook declares
                 # "title" and "text"; "raw_text" is not declared there —
                 # confirm it exists as a column of the indexed dataset.
                 "fields": ["text", "raw_text"],
                 "type": "cross_fields",
             },}
      }
    )

  hits = list(search(q, top_k)['hits']['hits'])

  ner_hits = []
  for entity in ner or ():
    ner_hits.extend(search(entity, NER_TOP_K)['hits']['hits'])
  return hits, ner_hits

## **Anvil**

In [None]:
!pip install anvil-uplink

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting anvil-uplink
  Downloading anvil_uplink-0.4.1-py2.py3-none-any.whl (87 kB)
[K     |████████████████████████████████| 87 kB 3.5 MB/s 
[?25hCollecting ws4py
  Downloading ws4py-0.5.1.tar.gz (51 kB)
[K     |████████████████████████████████| 51 kB 160 kB/s 
[?25hCollecting argparse
  Downloading argparse-1.4.0-py2.py3-none-any.whl (23 kB)
Building wheels for collected packages: ws4py
  Building wheel for ws4py (setup.py) ... [?25l[?25hdone
  Created wheel for ws4py: filename=ws4py-0.5.1-py3-none-any.whl size=45229 sha256=f0dee6155ece2fe0666baffcfa123525acd80f746c5d7a75fa951206facb2b09
  Stored in directory: /root/.cache/pip/wheels/ea/f9/a1/34e2943cce3cf7daca304bfc35e91280694ced9194a487ce2f
Successfully built ws4py
Installing collected packages: ws4py, argparse, anvil-uplink
Successfully installed anvil-uplink-0.4.1 argparse-1.4.0 ws4py-0.5.1


In [None]:
import os
from getpass import getpass

import anvil.server

# The uplink key is a secret and must not live in the notebook. Read it from
# the ANVIL_UPLINK_KEY environment variable, or prompt for it (input hidden)
# when the variable is unset or empty.
# NOTE(review): a key was previously hard-coded in this cell; treat it as
# leaked and rotate it in the Anvil app settings.
_anvil_uplink_key = os.environ.get("ANVIL_UPLINK_KEY") or getpass("Anvil uplink key: ")
anvil.server.connect(_anvil_uplink_key)

Connecting to wss://anvil.works/uplink
Anvil websocket open
Connected to "Default environment" as SERVER


In [None]:
!pip install timeout-decorator

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting timeout-decorator
  Downloading timeout-decorator-0.5.0.tar.gz (4.8 kB)
Building wheels for collected packages: timeout-decorator
  Building wheel for timeout-decorator (setup.py) ... [?25l[?25hdone
  Created wheel for timeout-decorator: filename=timeout_decorator-0.5.0-py3-none-any.whl size=5028 sha256=dab876a9609f3339f53a2af11ff52021e8974376c195c8038a016c6440b19f2c
  Stored in directory: /root/.cache/pip/wheels/38/05/4e/161d1463ca145ec1023bd4e5e1f31cbf9239aa8f39a2a2b643
Successfully built timeout-decorator
Installing collected packages: timeout-decorator
Successfully installed timeout-decorator-0.5.0


In [None]:
import time
import timeout_decorator
import string


@anvil.server.callable
def elastic(query='', ner=[], top_k = 200):
  """Run the retrieval step and return its hits ranked by score.

  Args:
    query: text of the main query, forwarded to elastic_search.
    ner: entity strings, forwarded to elastic_search.
    top_k: hit limit, forwarded to elastic_search.

  Returns:
    A ``(hits, seconds)`` tuple: the main-query hits and the entity hits merged
    into one list ordered by descending ``'_score'``, plus the wall-clock time
    in seconds spent inside this call.
  """
  started_at = time.time()

  main_hits, entity_hits = elastic_search(query, ner, top_k)

  # Merge both result sets, then rank in place; the sort is stable, so hits
  # with equal scores keep their merged order (main query first).
  ranked = [*main_hits, *entity_hits]
  ranked.sort(key=lambda hit: hit['_score'], reverse=True)

  return ranked, (time.time() - started_at)