## Data Preprocessing


In [None]:
#importing libraries
import numpy as np
import re

import pandas as pd
from tqdm.notebook import tqdm

import nltk
import ssl

# Presumably a workaround for certificate errors during the NLTK downloads
# below: default HTTPS contexts are switched to the unverified variant.
# NOTE(review): this replaces ssl's default HTTPS context factory for the whole
# kernel, so certificate verification is off for every later HTTPS request that
# uses the default context, not only for NLTK — confirm that is acceptable.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # This ssl module does not expose _create_unverified_context; leave the default in place.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context
# Download the punkt tokenizer data and the stop-word corpus
# (the stop words are read by removing_stopwords).
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from textblob import TextBlob

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Load the product data from bigBasketProducts.csv into a DataFrame
df = pd.read_csv('bigBasketProducts.csv')

In [None]:
# Preview the first five rows (five is the default for head())
df.head()

Unnamed: 0,index,product,category,sub_category,brand,sale_price,market_price,type,rating,description
0,1,Garlic Oil - Vegetarian Capsule 500 mg,Beauty & Hygiene,Hair Care,Sri Sri Ayurveda,220.0,220.0,Hair Oil & Serum,4.1,This Product contains Garlic Oil that is known...
1,2,Water Bottle - Orange,"Kitchen, Garden & Pets",Storage & Accessories,Mastercook,180.0,180.0,Water & Fridge Bottles,2.3,"Each product is microwave safe (without lid), ..."
2,3,"Brass Angle Deep - Plain, No.2",Cleaning & Household,Pooja Needs,Trm,119.0,250.0,Lamp & Lamp Oil,3.4,"A perfect gift for all occasions, be it your m..."
3,4,Cereal Flip Lid Container/Storage Jar - Assort...,Cleaning & Household,Bins & Bathroom Ware,Nakoda,149.0,176.0,"Laundry, Storage Baskets",3.7,Multipurpose container with an attractive desi...
4,5,Creme Soft Soap - For Hands & Body,Beauty & Hygiene,Bath & Hand Wash,Nivea,162.0,162.0,Bathing Bars & Soaps,4.4,Nivea Creme Soft Soap gives your skin the best...


In [None]:
# Column dtypes and non-null counts, to see which columns have missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27555 entries, 0 to 27554
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   index         27555 non-null  int64  
 1   product       27554 non-null  object 
 2   category      27555 non-null  object 
 3   sub_category  27555 non-null  object 
 4   brand         27554 non-null  object 
 5   sale_price    27555 non-null  float64
 6   market_price  27555 non-null  float64
 7   type          27555 non-null  object 
 8   rating        18929 non-null  float64
 9   description   27440 non-null  object 
dtypes: float64(3), int64(1), object(6)
memory usage: 2.1+ MB


In [None]:
# (rows, columns) of the raw data
df.shape

(27555, 10)

In [None]:
# Number of missing values in each column
df.isna().sum()

index              0
product            1
category           0
sub_category       0
brand              1
sale_price         0
market_price       0
type               0
rating          8626
description      115
dtype: int64

In [None]:
# Fill missing values.
# Ratings are numeric, so impute them with the column mean. fillna returns a new
# Series, so the result has to be assigned back; otherwise the imputation is
# lost and the blanket fill below writes the string "NA" into the rating column.
df['rating'] = df['rating'].fillna(df['rating'].mean())
# The remaining gaps are in text columns (product, brand, description); mark them with "NA".
df = df.fillna("NA")


In [None]:
# Cast every column to str, presumably so the text-cleaning steps can treat all values uniformly.
# NOTE(review): this also turns the numeric columns (index, sale_price,
# market_price, rating) into strings, so any later numeric comparison or filter
# on them needs a conversion back — confirm this is intended.
df = df.astype(str)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27555 entries, 0 to 27554
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   index         27555 non-null  object
 1   product       27555 non-null  object
 2   category      27555 non-null  object
 3   sub_category  27555 non-null  object
 4   brand         27555 non-null  object
 5   sale_price    27555 non-null  object
 6   market_price  27555 non-null  object
 7   type          27555 non-null  object
 8   rating        27555 non-null  object
 9   description   27555 non-null  object
dtypes: object(10)
memory usage: 2.1+ MB


In [None]:
# Stop-word sets already loaded from NLTK, keyed by language. Loading the list
# once per language instead of once per call matters because the function is
# applied to every row of a DataFrame column.
_STOP_WORDS_CACHE = {}


def removing_stopwords(text, stop_words=None, language='english'):
    """Return `text` with stop words removed.

    Args:
        text: String to filter; it is split on whitespace.
        stop_words: Optional collection of lower-case words to drop. When
            omitted, the NLTK stop-word list for `language` is used.
        language: Name of the NLTK stop-word list; only used when
            `stop_words` is None.

    Returns:
        The surviving words joined by single spaces. Matching is
        case-insensitive; the surviving words keep their original case.
    """
    if stop_words is None:
        if language not in _STOP_WORDS_CACHE:
            _STOP_WORDS_CACHE[language] = frozenset(stopwords.words(language))
        stop_words = _STOP_WORDS_CACHE[language]
    return ' '.join(word for word in text.split() if word.lower() not in stop_words)

In [None]:
# NOTE(review): this cell used to apply a helper named cleaning_text to the
# product, category, sub_category, brand, type and description columns. No
# definition of cleaning_text exists in this notebook, so the cell raised
# NameError on a fresh kernel. The following cell defines clean_text_combined
# and applies it to the same columns (and rebuilds c_description from the raw
# description), so the broken duplicate has been removed here.


In [None]:
def clean_text_combined(text):
    """Normalise `text` to lower-case ASCII words separated by single spaces.

    Apostrophes are dropped ("don't" -> "dont"). Every other character that is
    not an ASCII letter or whitespace (digits, punctuation, symbols) is
    replaced with a space rather than deleted, so words that were only
    separated by punctuation stay separate ("Container/Storage" becomes
    "container storage", not "containerstorage"). Runs of whitespace are then
    collapsed and the ends are trimmed.

    Args:
        text: Value to clean; it is converted with str() first.

    Returns:
        The cleaned, lower-cased string (possibly empty).
    """
    text = re.sub(r"['’]", '', str(text))  # drop apostrophes without splitting the word
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)  # other special characters and digits become spaces
    text = re.sub(r'\s+', ' ', text).strip()  # collapse the whitespace left behind
    return text.lower()

# Short text fields are normalised in place
columns_to_clean = ['product', 'category', 'sub_category', 'brand', 'type']
for col in columns_to_clean:
    df[col] = df[col].map(clean_text_combined)

# The raw description is kept; its cleaned form goes into a separate column
df['c_description'] = df['description'].map(clean_text_combined)


In [None]:
# my_description: cleaned description with stop words removed (used later as embedding text)
df['my_description'] = df['c_description'].map(removing_stopwords)
# description_sentiment: TextBlob polarity of the cleaned description
df['description_sentiment'] = [
    TextBlob(text).sentiment.polarity for text in df['c_description']
]
df.head()

Unnamed: 0,index,product,category,sub_category,brand,sale_price,market_price,type,rating,description,clean_description,c_description,my_description,description_sentiment
0,1,garlic oil vegetarian capsule mg,beauty hygiene,hair care,sri sri ayurveda,220.0,220.0,hair oil serum,4.1,This Product contains Garlic Oil that is known...,this product contains garlic oil that is known...,this product contains garlic oil that is known...,product contains garlic oil known help proper ...,0.166667
1,2,water bottle orange,kitchen garden pets,storage accessories,mastercook,180.0,180.0,water fridge bottles,2.3,"Each product is microwave safe (without lid), ...",each product is microwave safe without lid ref...,each product is microwave safe without lid ref...,product microwave safe without lid refrigerato...,0.45
2,3,brass angle deep plain no,cleaning household,pooja needs,trm,119.0,250.0,lamp lamp oil,3.4,"A perfect gift for all occasions, be it your m...",a perfect gift for all occasions be it your mo...,a perfect gift for all occasions be it your mo...,perfect gift occasions mother sister inlaws bo...,0.522619
3,4,cereal flip lid containerstorage jar assorted...,cleaning household,bins bathroom ware,nakoda,149.0,176.0,laundry storage baskets,3.7,Multipurpose container with an attractive desi...,multipurpose container with an attractive desi...,multipurpose container with an attractive desi...,multipurpose container attractive design made ...,0.285776
4,5,creme soft soap for hands body,beauty hygiene,bath hand wash,nivea,162.0,162.0,bathing bars soaps,4.4,Nivea Creme Soft Soap gives your skin the best...,nivea creme soft soap gives your skin the best...,nivea creme soft soap gives your skin the best...,nivea creme soft soap gives skin best care mus...,0.383333


In [None]:
# (rows, columns) after the derived columns were added
df.shape

(27555, 14)

In [None]:
# Column names of the preprocessed frame
df.columns

Index(['index', 'product', 'category', 'sub_category', 'brand', 'sale_price',
       'market_price', 'type', 'rating', 'description', 'clean_description',
       'c_description', 'my_description', 'description_sentiment'],
      dtype='object')

In [None]:
# Save the preprocessed DataFrame to preprocessed_file.csv, without the row index
df.to_csv("preprocessed_file.csv", index=False)

## Model Building

In [None]:
# using here  SentenceTransformer pre-trained models to convert  text into vectors.
!pip install sentence-transformers




In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
# all-MiniLM-L6-v2 is a small, lightweight sentence-embedding model from the MiniLM family.
# No device is forced here: SentenceTransformer uses a GPU when one is
# available and otherwise falls back to the CPU, so the notebook also runs on
# machines without CUDA.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:

def concatenate_columns(row):
    """
    Build the single text string that represents one product row.

    `row` must expose product, category, sub_category, type, brand and
    my_description as attributes (for example a tuple produced by
    DataFrame.itertuples()). The values are joined in that order, separated
    by single spaces.
    """
    fields = ('product', 'category', 'sub_category', 'type', 'brand', 'my_description')
    return ' '.join(format(getattr(row, field)) for field in fields)

# One text per product, in DataFrame row order
text_data = [concatenate_columns(row) for row in df.itertuples()]

# Encode the texts into vectors with the pre-trained model. The device is not
# overridden here, so encoding runs on the device the model was loaded on;
# forcing device="cpu" made the full encoding run on the CPU even when the
# model had been loaded for the GPU.
vectors = model.encode(text_data, show_progress_bar=True)


Batches:   0%|          | 0/862 [00:00<?, ?it/s]

In [None]:
# (number of products, embedding dimension)
vectors.shape

(27555, 384)

In [None]:
# Save the embeddings to model_vectors.npy; allow_pickle=False refuses to pickle object arrays
np.save('model_vectors.npy', vectors, allow_pickle=False)

## Qdrant


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Import client library
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance
# ":memory:" asks the client for a local in-process instance instead of a server URL.
# NOTE(review): the variable shares its name with the qdrant_client package;
# later `from qdrant_client import ...` statements still work, but the name is
# easy to misread.
qdrant_client = QdrantClient(":memory:")

In [None]:
# Use the description of one fixed product (row position 1000) as the example query
sample_query = df['description'].iloc[1000]
print(sample_query)

Celebrate a merry Christmas with this pretty Christmas tree. It is 58 inches in length and made of artificial branches. It is portable and compact for easy home use. Decorate the tree with hanging ornaments like bells, stars, and Santa elements to enhance its beauty.


In [None]:
# Embed the query with the same model that produced the product vectors
query_vector = model.encode(sample_query)

In [None]:
# Every column was cast to str during preprocessing, but numeric payload
# filters (for example a rating range) only work on numbers, so the numeric
# columns are converted back before the records are built. Values that are not
# numbers (such as the "NA" placeholder) become missing.
payload_df = df.copy()
for numeric_column in ['sale_price', 'market_price', 'rating']:
    payload_df[numeric_column] = pd.to_numeric(payload_df[numeric_column], errors='coerce')

# One dict per row, in the same order as the rows the vectors were built from.
# NaN is not valid JSON, so missing numbers are stored as None.
payload = [
    {
        key: (None if isinstance(value, float) and np.isnan(value) else value)
        for key, value in record.items()
    }
    for record in payload_df.to_dict('records')
]

In [None]:
 # Brute-force search: cosine similarity between the query and every product vector
scores = cosine_similarity([query_vector], vectors).ravel()
# Positions of the three highest scores, best match first
top_scores_ids = np.argsort(scores)[::-1][:3]

In [None]:
# Print the description of each match so it can be compared with the query by eye
for row_position in top_scores_ids:
  print(df['description'].iloc[row_position], "-----", sep="\n")

Celebrate a merry Christmas with this pretty Christmas tree. It is 58 inches in length and made of artificial branches. It is portable and compact for easy home use. Decorate the tree with hanging ornaments like bells, stars, and Santa elements to enhance its beauty.
-----
Make the best of Christmas celebration with cute decoration like this Merry Christmas Santa ornament. You can easily hang it on the Christmas tree, a window or door. It is ideal to gift to friends and family.
-----
Decorate the empty space of your home, cafe or room etc with this beautiful-looking merry Christmas tree. It is crafted using high-quality paper that offers to be lightweight yet strong. It comes with a hangable thread that lets you hang it easily on the roof or wall. This paper decoration adds to the aesthetics of the decor.
-----


In [None]:
# Create the collection that will hold the product embeddings.
# VectorParams and Distance are the names this notebook imports from
# qdrant_client.models; no `models` object is imported, so `models.VectorParams`
# raised NameError. The vector size is read from the embeddings instead of
# being hard-coded, so it follows the encoder that produced them.
qdrant_client.create_collection(
    collection_name="products",
    vectors_config=VectorParams(size=vectors.shape[1], distance=Distance.COSINE),
)

ValueError: ignored

In [None]:
# Upload the embeddings together with their payload records. Both were built
# from df in row order, so payload[i] describes vectors[i].
qdrant_client.upload_collection(
    collection_name="products",
    vectors=vectors,
    payload=payload,
    ids=None,  # Vector ids will be assigned automatically
    batch_size=256,  # Number of vectors sent in a single request
)

In [None]:
from qdrant_client import QdrantClient
from sentence_transformers import SentenceTransformer


class NeuralSearcher:
    """Semantic text search over a Qdrant collection of sentence embeddings.

    Args:
        collection_name: Name of the Qdrant collection to query.
        client: Optional Qdrant client to reuse, e.g. the in-memory client the
            collection was uploaded to. When omitted, a new client connected
            to http://localhost:6333 is created.
        model: Optional encoder exposing `encode(text)`. When omitted, a new
            all-MiniLM-L6-v2 SentenceTransformer is loaded on the CPU.
    """

    def __init__(self, collection_name, client=None, model=None):
        self.collection_name = collection_name
        # Initialize encoder model (reuse the caller's model so it is not loaded twice)
        if model is None:
            model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")
        self.model = model
        # Initialize Qdrant client. A client created here talks to a Qdrant
        # server and cannot see data held by a separate in-memory client, which
        # is why an existing client can be passed in.
        if client is None:
            client = QdrantClient("http://localhost:6333")
        self.qdrant_client = client

    def search(self, text: str, query_filter=None, limit: int = 3):
        """Return the payloads of the points closest to `text`.

        Args:
            text: Query text; it is encoded with the searcher's model.
            query_filter: Optional Qdrant filter restricting the candidates.
                None (the default) searches the whole collection.
            limit: Maximum number of results to return (default 3).

        Returns:
            A list with the stored payload of each hit, in the order the
            client returned the hits.
        """
        # Convert text query into vector
        vector = self.model.encode(text).tolist()

        # Use `vector` to search for the closest vectors in the collection
        search_result = self.qdrant_client.search(
            collection_name=self.collection_name,
            query_vector=vector,
            query_filter=query_filter,
            limit=limit,
        )
        # Each hit carries an id, a similarity score and the stored payload;
        # only the payload is of interest here
        return [hit.payload for hit in search_result]

In [None]:
from qdrant_client.models import FieldCondition, Filter, Range

rating_above = 3

# Keep only products rated strictly above the threshold. A `match` condition
# tests for equality with one value, so "rating above 3" needs a `range`
# condition. Range conditions compare numbers, so the `rating` payload field
# has to be stored as a number for this filter to match anything.
rating_filter = Filter(
    must=[
        FieldCondition(key="rating", range=Range(gt=rating_above)),
    ]
)
# No `collection_name` variable is defined in this notebook; use the name the
# collection was created with.
search_result = qdrant_client.search(
    collection_name="products",
    query_vector=query_vector,
    query_filter=rating_filter,
    limit=3,
)

In [None]:
# Keep only the stored payload of each hit and display the list
results = [hit.payload for hit in search_result]
results


[]