In [None]:
!pip install sentence-transformers qdrant-client

In [6]:
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import nltk
from qdrant_client import QdrantClient, models
import re,os

In [None]:
# Fetch the NLTK 'stopwords' corpus; filter_stopwords reads its English list.
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from textblob import TextBlob

In [None]:
# Instantiate the 'all-MiniLM-L6-v2' sentence-embedding model on the CPU.
model = SentenceTransformer(
    'all-MiniLM-L6-v2',
    device="cpu",
)

In [7]:
# Read the raw product catalogue CSV into a DataFrame.
product = pd.read_csv(
    filepath_or_buffer='/content/bigBasketProducts.csv',
)

In [None]:
# Print column dtypes and non-null counts to see which columns have gaps.
product.info()

The `rating` column has some null entries; we fill them with 0.
The `description` column also has null entries; we fill those too, to avoid errors later on.

In [None]:
# Missing ratings are replaced with 0.
product['rating'] = product['rating'].fillna(0)

# Next, handling missing values in all other columns.
# These are filled with "NA" to indicate data unavailability.
# fillna replaces the element-wise applymap call, which is deprecated
# since pandas 2.1 and did the same job one cell at a time.
product = product.fillna("NA")

# Converting all columns to string in a single pass.
product = product.astype(str)

# Display the DataFrame's structure and summary (dtypes, non-null counts).
# info() prints the report itself and returns None, so wrapping it in
# print() would only append a stray "None" line to the output.
product.info()

In [14]:
def clean_text(text):
    """Drop every character that is not an ASCII letter or whitespace, then lowercase.

    The argument is passed through ``str()`` first, so non-string values
    are accepted as well.
    """
    letters_only = re.compile(r'[^a-zA-Z\s]').sub('', str(text))
    return letters_only.lower()

# Stop-word sets that have already been loaded, keyed by language name.
_STOPWORD_SETS = {}


def _english_stopwords():
    """Return the English stop-word set, reading the NLTK corpus only on first use."""
    if 'english' not in _STOPWORD_SETS:
        _STOPWORD_SETS['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_SETS['english']


def filter_stopwords(text):
    """Remove English stop words from whitespace-separated text.

    Args:
        text: String to filter; it is split on whitespace.

    Returns:
        The words that are not stop words (compared case-insensitively),
        in their original order and casing, joined by single spaces.
    """
    # The set is cached: this function runs once per row, and rebuilding
    # the set from the corpus on every call is repeated, identical work.
    english_stopwords = _english_stopwords()
    kept_words = [word for word in text.split() if word.lower() not in english_stopwords]
    return ' '.join(kept_words)

Cleaning the text data so that it produces better vector embeddings.

In [15]:
# Full text pipeline: strip non-letter characters and lowercase, then drop stopwords
def clean_and_filter_text(column_data):
    """Return ``column_data`` run through ``clean_text`` and then ``filter_stopwords``."""
    return filter_stopwords(clean_text(column_data))

# Apply the 'clean_text' function to the short text columns
for column in ['product', 'category', 'sub_category', 'brand', 'type']:
    product[column] = product[column].apply(clean_text)

# For the 'description' column, apply both cleaning and filtering stopwords
product['description'] = product['description'].apply(clean_and_filter_text)

# Keep an independent snapshot of the cleaned DataFrame in 'original_df'.
# A plain assignment would only create a second name for the same object,
# so any later change to 'product' would silently change the "backup" too.
original_df = product.copy()

Saving the processed data for future use.


In [17]:
# Write the cleaned DataFrame to CSV, leaving out the pandas row index.
product.to_csv(
    path_or_buf="/content/preprocessed_products.csv",
    index=False,
)

Concatenating the text columns into a single string per product, so that only one vector per product has to be stored in the vector store.

In [19]:
# One string per row: product, category, sub_category, type, brand and
# description, in that order, separated by single spaces.
concated_text = [
    " ".join(str(field) for field in fields)
    for fields in zip(
        product['product'],
        product['category'],
        product['sub_category'],
        product['type'],
        product['brand'],
        product['description'],
    )
]

Converting the concatenated text into vector embeddings.

In [None]:
# Encode every concatenated product string on the CPU, with a progress bar.
vectors_bigbasket = model.encode(
    concated_text,
    device='cpu',
    show_progress_bar=True,
)

In [21]:
# Display the dimensions of the embedding array.
vectors_bigbasket.shape


(27555, 384)

Saving the vectors to disk as a NumPy `.npy` file.

In [22]:
# Store the embeddings in .npy format with pickling disabled.
np.save(
    file='/content/vectors_bigbasket.npy',
    arr=vectors_bigbasket,
    allow_pickle=False,
)

Running a sample similarity search against the vectors to sanity-check the embeddings.

In [23]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Take the descriptions at row positions 0, 441 and 3211 as sample queries.
sample_query1, sample_query2, sample_query3 = (
    product.iloc[row_position].description for row_position in (0, 441, 3211)
)

# Print the three queries, one per line.
for sample_query in (sample_query1, sample_query2, sample_query3):
    print(sample_query)

Run the check below with any of the three sample queries to verify the vector embeddings.

In [None]:
# Embed the chosen sample query and score it against every product vector.
query = model.encode(sample_query1, device='cpu')
similarity_score = cosine_similarity([query], vectors_bigbasket)[0]

# Retrieve the indices of the top 3 most similar items, best match first
scores_id = np.argsort(similarity_score)[-3:][::-1]

# Displaying the top 3 similar items
for top_id in scores_id:
    print(top_id)
    print(product.iloc[top_id].description)
    # Blank separator line between results; the previous print("/n") emitted
    # the literal characters "/n" (a mistyped "\n") instead of a newline.
    print()