In [1]:
!pip install tensorflow-io
!pip install elasticsearch==8.4.3

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow-io
  Downloading tensorflow_io-0.27.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (25.0 MB)
[K     |████████████████████████████████| 25.0 MB 1.4 MB/s 
Installing collected packages: tensorflow-io
Successfully installed tensorflow-io-0.27.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting elasticsearch==8.4.3
  Downloading elasticsearch-8.4.3-py3-none-any.whl (384 kB)
[K     |████████████████████████████████| 384 kB 3.1 MB/s 
[?25hCollecting elastic-transport<9,>=8
  Downloading elastic_transport-8.4.0-py3-none-any.whl (59 kB)
[K     |████████████████████████████████| 59 kB 6.1 MB/s 
[?25hCollecting urllib3<2,>=1.26.2
  Downloading urllib3-1.26.12-py2.py3-none-any.whl (140 kB)
[K     |████████████████████████████████| 140 kB 9.2 MB/s 
Installing collected packages: urllib3, elastic-trans

In [2]:
import os
import time
from sklearn.model_selection import train_test_split
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing
import tensorflow_io as tfio
import json
from tqdm import tqdm
import pickle



In [3]:
%%bash

wget -q https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-8.4.3-linux-x86_64.tar.gz
wget -q https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-8.4.3-linux-x86_64.tar.gz.sha512
tar -xzf elasticsearch-8.4.3-linux-x86_64.tar.gz

sudo chown -R daemon:daemon elasticsearch-8.4.3/
shasum -a 512 -c elasticsearch-8.4.3-linux-x86_64.tar.gz.sha512

elasticsearch-8.4.3-linux-x86_64.tar.gz: OK


In [32]:
%%bash --bg

sudo -H -u daemon /content/elasticsearch-8.4.3/bin/elasticsearch

In [24]:
# Sleep for few seconds to let the instance start.
time.sleep(20)

In [33]:
%%bash

ps -ef | grep elasticsearch

root         873     871  0 20:27 ?        00:00:00 sudo -H -u daemon /content/elasticsearch-8.4.3/bin/elasticsearch
daemon       874     873 99 20:27 ?        00:00:02 /content/elasticsearch-8.4.3/jdk/bin/java -Xms4m -Xmx64m -XX:+UseSerialGC -Dcli.name=server -Dcli.script=/content/elasticsearch-8.4.3/bin/elasticsearch -Dcli.libs=lib/tools/server-cli -Des.path.home=/content/elasticsearch-8.4.3 -Des.path.conf=/content/elasticsearch-8.4.3/config -Des.distribution.type=tar -cp /content/elasticsearch-8.4.3/lib/*:/content/elasticsearch-8.4.3/lib/cli-launcher/* org.elasticsearch.launcher.CliToolLauncher
root         903     901  0 20:27 ?        00:00:00 grep elasticsearch


In [34]:
%%bash

curl -sX GET "localhost:9200/"

CalledProcessError: ignored

In [None]:
x²df = pd.read_csv('/content/drive/MyDrive/search-engine/dataset_with_features/extracted_features_sub_dataset_1_reducted.csv')

In [None]:
ES_NODES = "http://localhost:9200"
index = 'open-images'

es_client = Elasticsearch(hosts = [ES_NODES])

In [None]:
print("creating the '{}' index.".format(index))
res = es_client.indices.create(index=index)
print("Response from server: {}".format(res))

creating the 'open-images' index.
Response from server: {'acknowledged': True, 'shards_acknowledged': True, 'index': 'open-images'}


In [None]:
settings = {
  "settings": {
    "elastiknn": True,
    "number_of_shards": 1,
    "number_of_replicas": 0
  }
}

mapping = {
  "dynamic": False,
  "properties": {
      "ImageID": { "type": "keyword" },
      "featureVec": {
          "type": "elastiknn_dense_float_vector",
          "elastiknn": {
            "dims": 512,
            "model": "lsh",
            "similarity": "l2",
            "L": 99,
            "k": 3,
            "w": 2
          }
    },
    "Title": { "type": "text" },
    "AuthorID": { "type": "text","index":False},
    "Title": { "type": "text" },
    "Tags": { "type": "text" },
    "OriginalURL":{"type":"text","index":False}
  }
}


if not es_client.indices.exists(index):
    es_client.indices.create(index, json.dumps(settings))
    es_client.indices.put_mapping(json.dumps(mapping), index)
es_client.indices.get_mapping(index)

{'open-images': {'mappings': {}}}

In [None]:
records = df.to_dict(orient="records")

In [None]:
def image_infos():
    for record in tqdm(records):
        yield {
          "_op_type": "index",
          "_index": index, 
          "_id": record["ImageID"], 
          "ImageID":  record["ImageID"],
          "Title": record["Title"],
          "AuthorID": record["AuthorID"],
          "Tags": record["tags"],
          "OriginalURL":record["OriginalURL"]

        }



In [None]:
bulk(es_client, image_infos(), chunk_size=2000, max_retries=2)

100%|██████████| 74222/74222 [00:16<00:00, 4458.90it/s]


(74222, [])

In [None]:
es_client.indices.refresh(index=index)
es_client.indices.forcemerge(index=index, max_num_segments=1, request_timeout=300)

{'_shards': {'total': 2, 'successful': 1, 'failed': 0}}

In [None]:
def vector_infos(df):
    for record in tqdm(df.iterrows()):
      record = record[1]
      yield { "_op_type": "update",
              "_index": index, 
              "_id": record['ImageID'], 
              "doc": { 
                "featureVec": { "values": record[["f {}".format(i+1) for i in range(512)]].tolist()}
      }}

In [None]:
bulk(es_client, vector_infos(df), chunk_size=50, max_retries=10, request_timeout=60)

74222it [06:01, 205.04it/s]


(74222, [])

In [None]:
source_no_vecs = ["ImageID", "Title", "AuthorID", "Tags", "OriginalURL"]

In [None]:
def search_by_query(q,size=5):
    body = {"query" : {
        "multi_match": {
          "query": q,
          "fields": ["Title", "Tags"]
        }
      }}

    res = es_client.search(index=index, body = body, size=5, _source=source_no_vecs)
    return res

In [None]:
search_by_query("tunis")

{'took': 18,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 3, 'relation': 'eq'},
  'max_score': 14.620897,
  'hits': [{'_index': 'open-images',
    '_type': '_doc',
    '_id': 'a44732f36852b641',
    '_score': 14.620897,
    '_source': {'OriginalURL': 'https://farm2.staticflickr.com/29/65920093_30333f588f_o.jpg',
     'AuthorID': 'sharif',
     'ImageID': 'a44732f36852b641',
     'Title': 'tunis',
     'Tags': 'Plant, Tree, Cannon'}},
   {'_index': 'open-images',
    '_type': '_doc',
    '_id': 'a74844f0b190931a',
    '_score': 12.75223,
    '_source': {'OriginalURL': 'https://c3.staticflickr.com/1/181/472352685_0c3edf7cac_o.jpg',
     'AuthorID': 'andredea',
     'ImageID': 'a74844f0b190931a',
     'Title': 'Tunis cartage',
     'Tags': 'House, Tree, Building'}},
   {'_index': 'open-images',
    '_type': '_doc',
    '_id': '60327f49489c4697',
    '_score': 10.156154,
    '_source': {'OriginalURL': 'https://farm8

In [None]:
def search_by_image_query(feature_vector,size=5):
    query = {"query" : {
            "elastiknn_nearest_neighbors": {
                "field": "featureVec",
              "vec": {"values": feature_vector},
              
              "model": "lsh",
              "similarity": "l2",
              "candidates": 150
            }
          }
          }
    
    return es_client.search(index=index, body = query, size=5)


In [None]:
search_by_image_query(final[0].tolist())




RequestError: ignored

In [None]:
!./elasticsearch-7.9.2/bin/elasticsearch-plugin install https://github.com/alexklibisz/elastiknn/releases/download/8.4.3.0/elastiknn-8.4.3.0.zip

-> Installing https://github.com/alexklibisz/elastiknn/releases/download/8.4.3.0/elastiknn-8.4.3.0.zip
-> Downloading https://github.com/alexklibisz/elastiknn/releases/download/8.4.3.0/elastiknn-8.4.3.0.zip
-> Failed installing https://github.com/alexklibisz/elastiknn/releases/download/8.4.3.0/elastiknn-8.4.3.0.zip
-> Rolling back https://github.com/alexklibisz/elastiknn/releases/download/8.4.3.0/elastiknn-8.4.3.0.zip
-> Rolled back https://github.com/alexklibisz/elastiknn/releases/download/8.4.3.0/elastiknn-8.4.3.0.zip
Exception in thread "main" java.lang.IllegalArgumentException: Plugin [elastiknn] was built for Elasticsearch version 8.4.3 but version 7.9.2 is running
	at org.elasticsearch.plugins.PluginsService.verifyCompatibility(PluginsService.java:349)
	at org.elasticsearch.plugins.InstallPluginCommand.loadPluginInfo(InstallPluginCommand.java:811)
	at org.elasticsearch.plugins.InstallPluginCommand.installPlugin(InstallPluginCommand.java:866)
	at org.elasticsearch.plugins.InstallP

In [None]:
from feature_extractor import FeatureExtractor
import matplotlib.pyplot as plt
from PIL import Image
from io import BytesIO, StringIO
import requests


In [None]:
loaded_model = pickle.load(open("/content/drive/MyDrive/search-engine/dataset_with_features/pca_model.pkl", 'rb'))

fe = FeatureExtractor()

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels.h5


In [None]:
img = Image.open(BytesIO(requests.get("https://assets.afcdn.com/recipe/20131023/6366_w1024h1024c1cx1872cy2808.jpg").content))

In [None]:
final = loaded_model.transform(np.expand_dims(fe.extract(img), axis=0))



In [None]:
final[0].tolist()



RequestError: ignored

In [None]:
!elasticsearch-plugin install https://github.com/alexklibisz/elastiknn/archive/refs/tags/8.4.3.0.tar.gz

/bin/bash: elasticsearch-plugin: command not found


In [None]:
!elasticsearch-head

/bin/bash: elasticsearch-head: command not found


In [None]:
!bin/plugin --install knapsack --url https://github.com/alexklibisz/elastiknn/archive/refs/tags/8.4.3.0.tar.gz

/bin/bash: bin/plugin: No such file or directory
