# Neural search demo - initial indexing

Code in this notebook shows how to prepare data for indexing in a vector search engine.

It contains the following steps:

* Loading the text data that we want to search
* Initializing a pre-trained text vectorization model (with SentenceTransformer)
* Converting the text data into vectors and saving them

In [1]:
# Mount Google Drive at /content/drive; the dataset and the generated files
# used by the cells of this notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# We use SentenceTransformer pre-trained models to convert our text into vectors.
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentencepiece (from sentence-transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone
  Created wheel for sentence-transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125923 sha256=0bfb6fd9a18e5073361c66675d42d5e96dbddf66b54071d26a783db9a4ec178e
  Stored in directory: /root/.cache/pip/wheels/62/f2/10/1e606fd5f02395388f74e7462910fe851042f97238cbbd902f
Successfully built sentence-tra

In [3]:
from sentence_transformers import SentenceTransformer
import numpy as np
import re

import pandas as pd
from tqdm.notebook import tqdm

import nltk
import ssl

# NOTE(review): when the private `ssl._create_unverified_context` exists, it is
# installed as the default HTTPS context factory. That turns off certificate
# verification for anything relying on that default for the rest of the kernel
# session — presumably a workaround so the nltk downloads below succeed on
# machines with a broken certificate store. Confirm it is still required.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # No such attribute in this ssl module: leave the default context untouched.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context
# NLTK resources: 'punkt' (tokenizer data) and the 'stopwords' corpus.
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from textblob import TextBlob


In [4]:
# Download (on first use) and load a pre-trained sentence encoder.

# all-MiniLM-L6-v2 is a small, distilled MiniLM model optimized for fast
# inference; it produces 384-dimensional sentence embeddings.
# Full list of available models: https://www.sbert.net/docs/pretrained_models.html
#
# No explicit device is passed: SentenceTransformer then uses the GPU when one
# is available and falls back to the CPU otherwise, so this cell also runs on a
# CPU-only runtime instead of failing on a hard-coded "cuda".
model = SentenceTransformer('all-MiniLM-L6-v2')

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [5]:
# Load the product catalogue CSV from Google Drive and preview the first rows.
df = pd.read_csv('/content/drive/MyDrive/bb_chaabi.csv')
df.head()

In [8]:
# Summarize the columns: names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27555 entries, 0 to 27554
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   product                27555 non-null  object
 1   category               27555 non-null  object
 2   sub_category           27555 non-null  object
 3   brand                  27555 non-null  object
 4   sale_price             27555 non-null  object
 5   market_price           27555 non-null  object
 6   type                   27555 non-null  object
 7   rating                 27555 non-null  object
 8   description            27555 non-null  object
 9   clean_description      27555 non-null  object
 10  my_description         27555 non-null  object
 11  description_sentiment  27555 non-null  object
dtypes: object(12)
memory usage: 2.5+ MB


In [7]:
# Missing ratings become 0, every other missing value becomes "NA",
# and finally all columns are cast to strings.
df = df.fillna({'rating': 0}).fillna("NA").astype(str)
df.info()

In [32]:

def clean_text(text):
    """Return ``text`` lowercased, keeping only ASCII letters and whitespace.

    The value is converted with ``str()`` first, so non-string input is accepted.
    Digits, punctuation and other symbols are deleted outright (they are not
    replaced by a space).
    """
    letters_and_spaces = re.sub(r'[^a-zA-Z\s]', '', str(text))
    return letters_and_spaces.lower()


_ENGLISH_STOP_WORDS = None  # filled on first use by _english_stop_words()


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, building it only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def remove_stopwords(text, stop_words=None):
    """Return ``text`` with stop words removed and whitespace normalized.

    Args:
        text: String to filter; it is split on whitespace.
        stop_words: Optional collection of lowercase words to drop. When
            omitted, NLTK's English stop-word list is used. That list is
            cached after the first call instead of being rebuilt for every
            row passed through ``DataFrame.apply``.

    Returns:
        The remaining words joined by single spaces. A word is dropped when
        its lowercase form is in ``stop_words``.
    """
    if stop_words is None:
        stop_words = _english_stop_words()
    kept_words = [word for word in text.split() if word.lower() not in stop_words]
    return ' '.join(kept_words)


# Normalize the short categorical/text columns in place ...
for column_name in ('product', 'category', 'sub_category', 'brand', 'type'):
    df[column_name] = df[column_name].apply(clean_text)

# ... and derive the cleaned description from the raw one (the raw column is kept).
df['clean_description'] = df['description'].apply(clean_text)
df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27555 entries, 0 to 27554
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   product                27555 non-null  object
 1   category               27555 non-null  object
 2   sub_category           27555 non-null  object
 3   brand                  27555 non-null  object
 4   sale_price             27555 non-null  object
 5   market_price           27555 non-null  object
 6   type                   27555 non-null  object
 7   rating                 27555 non-null  object
 8   description            27555 non-null  object
 9   clean_description      27555 non-null  object
 10  my_description         27555 non-null  object
 11  description_sentiment  27555 non-null  object
dtypes: object(12)
memory usage: 2.5+ MB


In [12]:
def _polarity(text):
    """Return TextBlob's sentiment polarity for ``text``."""
    return TextBlob(text).sentiment.polarity


# Stop-word-free variant of the cleaned description, plus a sentiment score
# computed on the cleaned description (before stop-word removal).
df['my_description'] = df['clean_description'].apply(remove_stopwords)
df['description_sentiment'] = df['clean_description'].apply(_polarity)
df.head()

In [11]:
# Persist the preprocessed frame to Google Drive, without writing the index.
df.to_csv("/content/drive/MyDrive/preprocessed_bb_products.csv",index = False)

In [12]:
# Build one text per product from its descriptive fields (space-separated),
# then embed all of them with the sentence encoder.
text_fields = ("product", "category", "sub_category", "type", "brand", "my_description")
product_texts = [
    " ".join(str(getattr(row, field)) for field in text_fields)
    for row in df.itertuples()
]
vectors = model.encode(product_texts, show_progress_bar=True)

Batches:   0%|          | 0/862 [00:00<?, ?it/s]

In [16]:
# vectors[0]

In [13]:
# Now we have all our descriptions converted into vectors.
# We have 27555 vectors of 384 dimensions; 384 is the size of the model's output layer.
vectors.shape

(27555, 384)

In [14]:
# Save the embeddings to Google Drive as a .npy file; with allow_pickle=False
# numpy refuses to fall back to pickling (object arrays would raise).
np.save('/content/drive/MyDrive/bb_chaabi_vectors.npy', vectors, allow_pickle=False)

In [None]:
# Offer the saved embeddings file as a browser download (Colab helper).
from google.colab import files
files.download('/content/drive/MyDrive/bb_chaabi_vectors.npy')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [38]:
from sklearn.metrics.pairwise import cosine_similarity

In [39]:
# Use the description of one fixed row (position 4321) as the sample query.
# Note that this is the raw `description` column, not the preprocessed
# `my_description` text.
sample_query = df.iloc[4321].description
print(sample_query)

Tasty and incredibly healthy, packed with the goodness of India's ancient wonder grain, Ragi. 
Source of iron and calcium
Source of dietary fibre
Makes 10-12 dosas
Ready to Cook pre-mix.
Only requires water to be added.
Prepare as per instructions provided on the pack.


str

In [40]:
query_vector = model.encode(sample_query)  # Convert query description into a vector.

In [41]:
# Brute-force scoring: cosine similarity between the query and every stored vector.
scores = cosine_similarity([query_vector], vectors)[0]
# Indices of the five highest scores, best match first.
top_scores_ids = np.argsort(scores)[::-1][:5]

In [42]:
# Check if result similar to the query
for top_id in top_scores_ids:
  print(df.iloc[top_id].description)
  print("-----")

Tasty and incredibly healthy, packed with the goodness of India's ancient wonder grain, Ragi. 
Source of iron and calcium
Source of dietary fibre
Makes 10-12 dosas
Ready to Cook pre-mix.
Only requires water to be added.
Prepare as per instructions provided on the pack.
-----
Ragi (finger millet), a native plant of Ethiopia and Uganda is a hardy crop that can withstand drought and the passing of time. Ragi finds a mention in ancient Sanskrit texts as ‘rajika’ and has been present in India for over thousands of years. Ragi is an excellent source of nutrition, full of calcium and protein, and is synonymous with health, particularly in rural areas. Graminway’s Gluten-Free Ragi Dosa is made from ragi that’s sourced from organic ragi farmers who cultivate the crop using sustainable farming methods, without chemicals or pesticides.
Our ragi dosa batter is made from clean whole grains and milled in-house with cumin seeds, which gives rich texture and flavour to the dosa. This ready-to-make mix

In [26]:
!pip install qdrant-client

Collecting qdrant-client
  Downloading qdrant_client-1.6.9-py3-none-any.whl (182 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.2/182.2 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Collecting grpcio-tools>=1.41.0 (from qdrant-client)
  Downloading grpcio_tools-1.59.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.7/2.7 MB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting httpx[http2]>=0.14.0 (from qdrant-client)
  Downloading httpx-0.25.1-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.0/75.0 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
Collecting portalocker<3.0.0,>=2.7.0 (from qdrant-client)
  Downloading portalocker-2.8.2-py3-none-any.whl (17 kB)
Collecting urllib3<2.0.0,>=1.26.14 (from qdrant-client)
  Downloading urllib3-1.26.18-py2.py3-none-any.whl (143 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[

In [27]:
# Import client library
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance
from qdrant_client import models
# Connect to a Qdrant server at localhost:6333 and display the client object.
# NOTE(review): on a hosted Colab runtime "localhost" is the Colab VM itself —
# confirm a Qdrant instance is actually reachable there.
qdrant_client = QdrantClient(host = "localhost", port = 6333)
qdrant_client

<qdrant_client.qdrant_client.QdrantClient at 0x7f1be5e41c90>

In [47]:
# Reload the catalogue CSV into a separate frame, untouched by the text
# preprocessing applied to `df`.
original_df = pd.read_csv('/content/drive/MyDrive/bb_chaabi.csv')

27555

In [None]:
# One dict per catalogue row; these records are the payload uploaded with the
# vectors. (Fixes the misspelled `originanl_df`, which raised a NameError.)
payload = original_df.to_dict('records')
payload[1:3]

In [None]:
# Create the collection: 384-dimensional vectors compared with cosine distance.
vector_params = models.VectorParams(size=384, distance=models.Distance.COSINE)
qdrant_client.create_collection(
    collection_name="products_colab",
    vectors_config=vector_params,
)

In [18]:
# Load all vectors into memory; a numpy array can be iterated over directly.
# Alternatively, pass mmap_mode="r" to np.load if you don't want to load all
# data into RAM.
# The path is the file written by the np.save cell of this notebook
# (the previous "bb_vectors.npy" is never created here).
vectors = np.load("/content/drive/MyDrive/bb_chaabi_vectors.npy")

In [None]:
# Upload vectors and payload records into the collection created in this
# notebook ("products_colab"); the previous name "startups" is never created here.
qdrant_client.upload_collection(
    collection_name="products_colab",
    vectors=vectors,
    payload=payload,
    ids=None,  # Vector ids will be assigned automatically
    batch_size=256,  # How many vectors will be uploaded in a single request?
)

In [None]:
from qdrant_client import QdrantClient
from sentence_transformers import SentenceTransformer


class NeuralSearcher:
    """Semantic search over a Qdrant collection using sentence embeddings.

    Queries are embedded with the ``all-MiniLM-L6-v2`` SentenceTransformer
    model and matched against the vectors stored in ``collection_name``.
    """

    def __init__(self, collection_name):
        """Load the encoder on the CPU and connect to Qdrant at localhost:6333.

        Args:
            collection_name: Name of the Qdrant collection to search.
        """
        self.collection_name = collection_name
        # Initialize encoder model
        self.model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")
        # Initialize Qdrant client
        self.qdrant_client = QdrantClient("http://localhost:6333")

    def search(self, text: str, query_filter=None, limit: int = 5):
        """Return the payloads of the stored points closest to ``text``.

        Args:
            text: Free-text query.
            query_filter: Optional Qdrant filter restricting the candidates.
                ``None`` (the default) applies no filter.
            limit: Maximum number of hits to request (default 5).

        Returns:
            A list with the payload of every hit, in the order returned by
            the Qdrant client.
        """
        # Convert the text query into a plain list of floats for the client.
        vector = self.model.encode(text).tolist()

        search_result = self.qdrant_client.search(
            collection_name=self.collection_name,
            query_vector=vector,
            query_filter=query_filter,
            limit=limit,
        )
        # Each hit carries an id, a score and the stored payload; callers of
        # this method only need the payload.
        return [hit.payload for hit in search_result]

In [None]:
from qdrant_client.models import Filter

# Minimum rating a product must have to be returned; lower it to widen the results.
rating_above = 5

# Keep only points whose "rating" payload field is >= rating_above.
# A range condition is used because the goal is a numeric threshold, which an
# exact-value match cannot express.
rating_filter = Filter(**{
    "must": [{
        "key": "rating",  # Payload field holding the product rating
        "range": {
            "gte": rating_above
        }
    }]
})

# Filtered search run directly against the notebook-level client, using the
# sample query vector encoded in an earlier cell of this notebook. (The
# previous version referenced `self`, `vector` and `city_filter`, none of
# which exist at this level.)
search_result = qdrant_client.search(
    collection_name="products_colab",
    query_vector=query_vector.tolist(),
    query_filter=rating_filter,
    limit=5
)


In [None]:
# Keep only the payload of each hit and display the list.
results = [hit.payload for hit in search_result]
results