In [1]:
import pandas as pd
from dotenv import load_dotenv
import sys
import os

# Add the path to folder B to the Python path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))


In [2]:
df = pd.read_csv("Marine_Fish_Species_Full_Description_test.csv")
df

Unnamed: 0,Fish Name,Summary Description,Image Links
0,Brownbanded bamboo shark,This small carpet sharkis commonly found insha...,https://object-storage.example.com/brownbanded...
1,Whale shark,The world's largest livingshark and fish speci...,https://object-storage.example.com/whale_shark...
2,Zebra shark,A distinctive carpetshark with fivelongitudina...,https://object-storage.example.com/zebra_shark...
3,Coral catshark,A small bottom-dwellingshark found in coral re...,https://object-storage.example.com/coral_catsh...
4,Blacktip reef shark,A medium-sized reef shark easily identified by...,https://object-storage.example.com/blacktip_re...
...,...,...,...
86,Moorish idol,An iconic reef fishwith distinctiveblack and w...,https://object-storage.example.com/moorish_ido...
87,Little dragonfish,"A small, dragon-like fish withwing-like pector...",https://object-storage.example.com/little_drag...
88,Indian mackerel,"A small, schooling fishimportant in commercial...",https://object-storage.example.com/indian_mack...
89,Trumpetfish,An elongated fish with a tube-like snout used ...,https://object-storage.example.com/trumpetfish...


# Embedding Service

In [None]:
from INGESTION.embedding_service import EmbeddingService


  from .autonotebook import tqdm as notebook_tqdm


Loading environment variables...


In [26]:
# choose model to embed text
# emb = EmbeddingService('watsonx')
emb = EmbeddingService('sentence_transformer')

embeddings = emb.embed_text(df['Summary Description'])

Using embedding type: sentence_transformer
Embedding input: 0     This small carpet sharkis commonly found insha...
1     The world's largest livingshark and fish speci...
2     A distinctive carpetshark with fivelongitudina...
3     A small bottom-dwellingshark found in coral re...
4     A medium-sized reef shark easily identified by...
                            ...                        
86    An iconic reef fishwith distinctiveblack and w...
87    A small, dragon-like fish withwing-like pector...
88    A small, schooling fishimportant in commercial...
89    An elongated fish with a tube-like snout used ...
90    A delicate, leaf-like fishthat drifts with oce...
Name: Summary Description, Length: 91, dtype: object


In [29]:
print(type(embeddings))
print(embeddings.shape)
list_of_embeddings = list(embeddings)
print("No data:", len(list_of_embeddings))
print("Embedding Dimension:", len(list_of_embeddings[0]))

<class 'numpy.ndarray'>
(91, 1024)
No data: 91
Embedding Dimension: 1024


# ElasticsearchManager Service

In [5]:
from INGESTION.elasticsearch_manager import ElasticsearchManager
import pandas as pd
from dotenv import load_dotenv

load_dotenv()
print("Loading environment variables...")

es_endpoint = os.environ["es_endpoint"]
es_cert_path = os.environ["es_cert_path"]
es_username = os.environ["es_username"]
es_password = os.environ["es_password"]

Loading environment variables...


In [6]:
esm = ElasticsearchManager(es_endpoint, es_username, es_password)

In [None]:
#create index
esm.create_index('indexname')

# ingest data (also automatically create index if not yet exists)
# ** df must have a column named 'embedding' with the embeddings
esm.ingest_df_to_elasticsearch(df, index_name='indexname')

# list all indices (optional variable: creator="user"/"system"/"all")
esm.list_all_index()

# delete index
esm.delete_index('indexname') 

# get index info (no. row, no. column, data type, sample document)
esm.get_index_info('indexname')

# get document count (no. of row in the index)
esm.get_document_count('indexname')


In [9]:
# example of get_index_info
esm.get_index_info('17july2331')



📊 17july2331: 91 rows
Columns:
  fish_name: str
  general_description: str
  image_links: str
  embedding: list


{'rows': 91,
 'columns': ['fish_name', 'general_description', 'image_links', 'embedding'],
 'sample': {'fish_name': 'Brownbanded bamboo shark',
  'general_description': 'This small carpet sharkis commonly found inshallow coastal watersand coral reefsthroughout the Indo\x02West Pacific. Youngindividuals have distinctbrown banding thatfades to solid brown asthey mature. They arenocturnal feeders thatcan survive up to 12hours out of water.',
  'image_links': 'https://object-storage.example.com/brownbanded_bamboo_shark_1.jpg, https://object-storage.example.com/brownbanded_bamboo_shark_2.jpg',
  'embedding': [-0.038465630263090134,
   0.0282822884619236,
   -0.029576130211353302,
   -0.033045776188373566,
   -0.053246576339006424,
   -3.344085780554451e-05,
   -0.050171397626399994,
   -0.049581993371248245,
   -0.042542051523923874,
   0.056465912610292435,
   0.051204584538936615,
   0.032839998602867126,
   -0.016734551638364792,
   0.07672103494405746,
   0.08306562155485153,
   -0.0005

# ElasticQuery Service

In [3]:
from BE.elasticsearch_query import ElasticsearchQuery
from dotenv import load_dotenv

load_dotenv()
print("Loading environment variables...")

es_endpoint = os.environ["es_endpoint"]
es_cert_path = os.environ["es_cert_path"]
es_username = os.environ["es_username"]
es_password = os.environ["es_password"]

Loading environment variables...


In [None]:
esq = ElasticsearchQuery(es_endpoint, es_username, es_password)


# Search text in specific field
esq.search_text(index_name='indexname', field='fish_name', text='nemo', size=10)

# Search exact match
esq.search_exact(index_name='indexname', field='fish_name', text='nemo', size=10)

# Search similar vectors using kNN 
# ** query_vector must be a list of floats with the same dimension as the embeddings
esq.search_embedding(index_name='indexname', embedding_field='embedding', query_vector=[], size=10)


  _transport = transport_class(
