In [77]:
import pandas as pd
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# This notebook uses the existing Fashion Product Images dataset from Kaggle: https://www.kaggle.com/datasets/paramaggarwal/fashion-product-images-dataset

In [78]:
# Read the raw product catalogue into a pandas DataFrame
DATASET_PATH = "similarity_app/dataset.csv"
df = pd.read_csv(DATASET_PATH)

# Preprocess Text Data

In [79]:
# Define a function to clean and preprocess text
# Cache for the NLTK resources: building the stop-word set and the lemmatizer
# is loop-invariant work, so it is done once instead of once per processed row.
_TEXT_RESOURCES = {}


def _get_text_resources():
    """Return the shared ``(stop_words, lemmatizer)`` pair, building it on first use."""
    if not _TEXT_RESOURCES:
        _TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _TEXT_RESOURCES['stop_words'], _TEXT_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Normalise a piece of text into a space-separated string of lemmatized tokens.

    Steps: replace every character outside ``a-z``/``A-Z`` with a space,
    lowercase, tokenize with ``word_tokenize``, drop English stop words,
    lemmatize the remaining tokens and join them with single spaces.

    Parameters
    ----------
    text : object
        The value to clean. Anything that is not a ``str`` (for example a
        missing value) is treated as having no text.

    Returns
    -------
    str
        The cleaned text, or ``''`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ''
    stop_words, lemmatizer = _get_text_resources()
    # Remove special characters and numbers, then convert to lowercase
    letters_only = re.sub(r'[^a-zA-Z]', ' ', text).lower()
    tokens = word_tokenize(letters_only)
    # Stop words are filtered out before lemmatization, as in the original pipeline
    return ' '.join(
        lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
    )
# Clean every product display name and store the result in a new column
cleaned_names = df['productDisplayName'].apply(preprocess_text)
df['cleaned_text'] = cleaned_names
# Persist the augmented table (without the index column) to a new CSV file
df.to_csv('preprocessed_data.csv', index=False)

# Measure Similarity

In [80]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [81]:
# Reload the preprocessed table from disk
PREPROCESSED_PATH = "preprocessed_data.csv"
data = pd.read_csv(PREPROCESSED_PATH)

In [82]:
# Fill missing values with an empty string.
# Assign the result back instead of calling fillna(..., inplace=True) on the
# selected column: the chained in-place form acts on a temporary object under
# pandas Copy-on-Write and would leave NaN values in `data`.
data['cleaned_text'] = data['cleaned_text'].fillna('')

In [83]:
# Fit a TF-IDF model on the cleaned texts and keep the resulting feature matrix
corpus = data['cleaned_text']
vectorizer = TfidfVectorizer()
text_features = vectorizer.fit_transform(corpus)

In [84]:
# Compute similarity between input text and texts in the database
def compute_similarity(input_text, text_features):
    """Score ``input_text`` against every row of ``text_features``.

    The query is vectorized with the notebook-level ``vectorizer`` (which must
    already be fitted), compared to ``text_features`` with cosine similarity,
    and the result is returned as a flat 1-D array with one score per row.
    """
    query_vector = vectorizer.transform([input_text])
    score_matrix = cosine_similarity(query_vector, text_features)
    return score_matrix.flatten()

In [85]:
# Example usage: score every database text against a sample query
input_text = "men"
similarity_scores = compute_similarity(input_text=input_text, text_features=text_features)

In [86]:
# Find top-k similar texts
k = 5
# argsort orders ascending, so the last k entries are the best matches; reverse for best-first
top_k_indices = similarity_scores.argsort()[-k:][::-1]
# argsort yields row positions, so select positionally (.iloc) rather than by index label (.loc)
similar_texts = data['cleaned_text'].iloc[top_k_indices]

In [87]:
# Show each matching text on its own line
for matched_text in similar_texts:
    print(matched_text)

mr men men white blue shirt
mr men men purple shirt
inkfruit men men blue white shirt
mr men men green black shirt
mr men men navy blue shirt


# Ranked Results

In [88]:
def find_similar_items(input_text, database_file, n):
    """Return the ``link`` values of the ``n`` rows most similar to ``input_text``.

    The CSV at ``database_file`` is loaded, a TF-IDF model is fitted on its
    ``cleaned_text`` column, and every row is scored against ``input_text``
    with cosine similarity.

    Parameters
    ----------
    input_text : str
        The query text.
    database_file : str
        Path to a CSV file containing ``cleaned_text`` and ``link`` columns.
    n : int
        Maximum number of items to return. Values ``<= 0`` yield an empty list.

    Returns
    -------
    list
        ``link`` values ordered from most to least similar; at most ``n``
        entries (fewer when the file has fewer rows).
    """
    # A slice of [-0:] would select every row, so handle non-positive n explicitly
    if n <= 0:
        return []
    # Load preprocessed CSV file into a pandas DataFrame
    data = pd.read_csv(database_file)
    # Fill missing values with an empty string. The result is assigned back
    # because a chained fillna(..., inplace=True) on the selected column does
    # not modify the DataFrame under pandas Copy-on-Write.
    cleaned_text = data['cleaned_text'].fillna('')
    # Extract features using TF-IDF vectorization
    vectorizer = TfidfVectorizer()
    text_features = vectorizer.fit_transform(cleaned_text)
    # Compute similarity between input text and texts in the database
    input_vector = vectorizer.transform([input_text])
    similarity_scores = cosine_similarity(input_vector, text_features).flatten()
    # Find top-N similar items: argsort is ascending, so take the tail and reverse it
    top_n_indices = similarity_scores.argsort()[-n:][::-1]
    # argsort yields row positions, hence positional (.iloc) selection
    return data['link'].iloc[top_n_indices].tolist()

In [90]:
# Example usage: fetch the best matches for a query and list their URLs
input_text = "women"
database_file = "preprocessed_data.csv"
n = 5
similar_items = find_similar_items(input_text=input_text, database_file=database_file, n=n)
# One URL per line, most similar first
for item_url in similar_items:
    print(item_url)

http://assets.myntassets.com/assets/images/51623/2016/5/20/11463745850786-Fossil-Women-Pink-Dial-Chronograph-Watch-ES3050-9371463745850648-1.jpg
http://assets.myntassets.com/v1/images/style/properties/Colorbar-Neutral-Triple-Act-Compact-001_9f89991d3bdb3129ea3d5e7d5d0ce5eb_images.jpg
http://assets.myntassets.com/v1/images/style/properties/d96f9ac926f29dde74fb606a6ee8a7a8_images.jpg
http://assets.myntassets.com/v1/images/style/properties/OTLS-Unisex-Beige-Bag_3be5576a798000bf84ba1829260e2e86_images.jpg
http://assets.myntassets.com/v1/images/style/properties/07dbb0a65f012de46ce5df53e5cce0aa_images.jpg
