In [1]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
# Fetch the NLTK resources used for tokenisation and stopword filtering
for _resource in ('punkt', 'stopwords'):
    nltk.download(_resource)

# Read the product catalogue from disk
data = pd.read_csv('Myntra Fasion Clothing.csv')

# Pull the two columns used downstream out as plain Python lists
descriptions, urls = (data[column].tolist() for column in ('Description', 'URL'))

# Text preprocessing
# Stopword sets already loaded, keyed by language name.
_STOPWORDS_BY_LANGUAGE = {}


def _load_stopwords(language='english'):
    """Return the NLTK stopword list for ``language`` as a frozenset, loading it only once."""
    if language not in _STOPWORDS_BY_LANGUAGE:
        _STOPWORDS_BY_LANGUAGE[language] = frozenset(stopwords.words(language))
    return _STOPWORDS_BY_LANGUAGE[language]


def preprocess_text(text):
    """Normalise a piece of text for the Word2Vec pipeline.

    The text is lower-cased, stripped of every character in
    ``string.punctuation``, tokenised with ``word_tokenize`` and filtered
    against the English NLTK stopword list.

    Parameters
    ----------
    text : str
        Raw text, e.g. a product description or a user query.

    Returns
    -------
    str
        The remaining tokens joined by single spaces (an empty string when
        nothing survives the filtering).
    """
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stopwords. The set is fetched once per call from a cache; the
    # previous version evaluated stopwords.words('english') for every token
    # and tested membership against a list.
    stop_words = _load_stopwords()
    tokens = [word for word in tokens if word not in stop_words]
    # Join tokens back to a clean text
    return ' '.join(tokens)

# Run every raw description through the text-cleaning function, keeping the original order
cleaned_descriptions = list(map(preprocess_text, descriptions))


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sweek\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sweek\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  data = pd.read_csv('Myntra Fasion Clothing.csv')


In [2]:
from gensim.models import Word2Vec

# Fit a CBOW (sg=0) Word2Vec model on the whitespace-tokenised cleaned descriptions
word2vec_model = Word2Vec(
    sentences=list(map(str.split, cleaned_descriptions)),
    vector_size=100,
    window=5,
    min_count=1,
    sg=0,
)

# Function to get the vector representation of a description
def get_description_vector(description):
    """Return the sum of the Word2Vec vectors of the known words in ``description``.

    Parameters
    ----------
    description : str
        Whitespace-separated tokens (already cleaned text).

    Returns
    -------
    numpy.ndarray
        1-D vector of length ``word2vec_model.wv.vector_size``. Words missing
        from the model vocabulary are skipped. When no word is known (for
        example an empty description) an all-zero vector is returned; the
        previous version returned the integer ``0`` from ``sum([])``, which
        gave the result a different shape from every other vector.
    """
    # Local import: in this notebook numpy is otherwise only imported in a later cell.
    import numpy as np

    keyed_vectors = word2vec_model.wv
    word_vectors = [keyed_vectors[word] for word in description.split() if word in keyed_vectors]
    if not word_vectors:
        return np.zeros(keyed_vectors.vector_size, dtype=np.float32)
    return sum(word_vectors)

# Build one vector per cleaned description, in the same order as the descriptions
description_vectors = list(map(get_description_vector, cleaned_descriptions))


In [5]:
# Save the Word2Vec model to a file
import numpy as np
word2vec_model.save("word2vec_model.model")

# Save the cleaned descriptions (one per line) and the description vectors to files.
# The text encoding is pinned to UTF-8: without it open() uses the platform's
# default encoding, which can fail on characters outside that code page.
with open("cleaned_descriptions.txt", "w", encoding="utf-8") as f:
    f.writelines(desc + "\n" for desc in cleaned_descriptions)

np.save("description_vectors.npy", np.array(description_vectors))


In [6]:
# Free-text search query to match against the product descriptions
user_query = "pink shirt"
# Clean the query with the same preprocessing that was applied to the descriptions
user_query_cleaned = preprocess_text(user_query)
# Embed the cleaned query with the same function used for the description vectors
user_query_vector = get_description_vector(user_query_cleaned)


In [7]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Shape the query as a single-row 2-D array for cosine_similarity.
# reshape(1, -1) is idempotent, so re-running this cell leaves the shape unchanged
# (wrapping the vector in another list added a new dimension on every run).
user_query_vector = np.asarray(user_query_vector).reshape(1, -1)

# Calculate cosine similarities between the query and every description vector
similarities = cosine_similarity(user_query_vector, description_vectors)

# Flatten the similarities array to 1-D: one score per description
similarities = similarities.flatten()


In [8]:
# Order all items from most to least similar and keep the first five positions
top_indices = similarities.argsort()[::-1][:5]

# Map each selected position to its product URL
top_item_links = [urls[position] for position in top_indices]

print("Top 5 similar item links:")
for link in top_item_links:
    print(link)


Top 5 similar item links:
https://www.myntra.com/shirts/kazo/kazo-pink-shirt/860926/buy
https://www.myntra.com/tshirts/hypernation/hypernation-pink-t-shirt/1064683/buy
https://www.myntra.com/shirts/only/only-mint-green-shirt/1206465/buy
https://www.myntra.com/tshirts/the-pink-moon/the-pink-moon-woman-pink-yoga-t-shirt/17100756/buy
https://www.myntra.com/tshirts/belliskey/belliskey-women-pink--white-boxy-t-shirt/17829962/buy
