In [5]:
# Step 1: Install required libraries
!pip install -q nltk scikit-learn

# Step 2: Import libraries
import pandas as pd
import numpy as np
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm
from nltk.tokenize import word_tokenize


import nltk
# Fetch the NLTK data packages this notebook relies on. Packages that are already
# present are reported as "already up-to-date" in the cell's log output.
nltk.download('punkt')      # tokenizer data; presumably what word_tokenize loads on older NLTK releases
nltk.download('stopwords')  # word lists read later via stopwords.words('english')
nltk.download('wordnet')    # presumably the lexical database behind WordNetLemmatizer
nltk.download('punkt_tab')  # tokenizer data; presumably the variant newer NLTK releases look for

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [6]:
# Step 3: Upload the file
from google.colab import files
uploaded = files.upload()

# `uploaded` is iterated for its file names below; presumably it is empty when the
# upload dialog is dismissed. Fail with a readable message instead of the bare
# StopIteration that next(iter(...)) would raise on an empty mapping.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and choose the product CSV.")

# Only the first uploaded file is read.
csv_name = next(iter(uploaded))
df = pd.read_csv(csv_name)

# Later steps read both of these columns. Checking here surfaces a wrong file
# immediately rather than after the pairwise similarity computation.
missing_columns = {'Product Description', 'Title'} - set(df.columns)
if missing_columns:
    raise KeyError(f"{csv_name} is missing required column(s): {sorted(missing_columns)}")

# Rows without a description cannot be preprocessed, so drop them up front.
df = df.dropna(subset=['Product Description']).copy()

Saving NikeProductDescriptions.csv to NikeProductDescriptions (4).csv


In [7]:
# Step 4: Preprocessing function
# Shared NLP resources, created once at cell level and reused by preprocess()
# for every row instead of being rebuilt per call.
lemmatizer = WordNetLemmatizer()
# Held as a set so the per-token "is this a stop word?" test is a hash lookup.
stop_words = set(stopwords.words('english'))

def preprocess(text):
    """Normalise one product description for bag-of-words comparison.

    The text is lowercased, stripped of every character in
    ``string.punctuation``, tokenised with ``nltk.word_tokenize``, filtered
    against the module-level ``stop_words`` set, and lemmatised with the
    module-level ``lemmatizer``.

    Args:
        text: The raw description string.

    Returns:
        The remaining lemmas joined by single spaces. This is an empty string
        when no token survives the filtering.
    """
    text = text.lower()
    # re.escape is required here: string.punctuation contains the sequence `\]`,
    # which an unescaped character class reads as an escaped bracket, so the
    # backslash itself would never be matched (and never removed).
    punctuation_class = '[' + re.escape(string.punctuation) + ']'
    text = re.sub(punctuation_class, '', text)  # remove punctuation
    tokens = nltk.word_tokenize(text)
    # Stop words are filtered before lemmatising, so the check is on the surface form.
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Run preprocess() element-wise over the raw descriptions and keep the result
# next to the original text in a new column.
df['Cleaned_Description'] = df['Product Description'].map(preprocess)

In [11]:
# Step 5: TF-IDF + Cosine Similarity
# Vectorise the cleaned descriptions, then compare every product with every other one.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['Cleaned_Description'])
cosine_sim = cosine_similarity(tfidf_matrix)

# Print top 3 similar product pairs (excluding self-comparison)
print("Top 3 similar product pairs using TF-IDF + Cosine Similarity:")
total = len(df)
# Only positions with row < col are collected, so each unordered pair appears
# exactly once and a product is never compared with itself.
similar_pairs = [
    ((row, col), cosine_sim[row][col])
    for row in range(total)
    for col in range(row + 1, total)
]
# list.sort is stable: pairs with equal scores keep their enumeration order.
similar_pairs.sort(key=lambda pair: pair[1], reverse=True)
similar_pairs = similar_pairs[:3]

for (first, second), similarity in similar_pairs:
    print(f"\n[{similarity:.2f}]")
    # Positions refer to rows of df by position, hence iloc rather than loc.
    print("Product 1:", df.iloc[first]['Title'])
    print("Product 2:", df.iloc[second]['Title'])


Top 3 similar product pairs using TF-IDF + Cosine Similarity:

[0.99]
Product 1: Nike Dri-FIT Victory
Product 2: Nike Dri-FIT Victory

[0.98]
Product 1: Nike SB Ishod Premium
Product 2: Nike SB Ishod Premium

[0.97]
Product 1: Liverpool F.C. 2022/23 Stadium Third
Product 2: F.C. Barcelona 2022/23 Stadium Home


In [10]:
# Step 6: Jaccard Similarity
def jaccard_similarity(s1, s2):
    """Return the Jaccard similarity of the whitespace-separated tokens of two strings.

    The score is ``|A & B| / |A | B|`` where ``A`` and ``B`` are the sets of
    tokens produced by ``str.split()`` on each input.

    Args:
        s1: First string.
        s2: Second string.

    Returns:
        A float in ``[0.0, 1.0]``. When neither string contains a token the
        union is empty and the ratio is undefined; 0.0 is returned in that
        case so that two blank texts are not ranked as a perfect match.
    """
    set1 = set(s1.split())
    set2 = set(s2.split())
    union = set1 | set2
    # Guard the division: two token-less inputs would otherwise raise ZeroDivisionError.
    if not union:
        return 0.0
    return len(set1 & set2) / len(union)

# Compute Jaccard for all pairs
print("\nTop 3 similar product pairs using Jaccard Similarity:")
# Read the column into a plain list once. Indexing df.iloc[i][...] inside the
# O(n^2) loop would materialise a full row object for every single lookup.
descriptions = df['Cleaned_Description'].tolist()

jaccard_scores = []
for i in range(len(descriptions)):
    # Start at i + 1 so each unordered pair is scored once and self-pairs are skipped.
    for j in range(i + 1, len(descriptions)):
        score = jaccard_similarity(descriptions[i], descriptions[j])
        jaccard_scores.append(((i, j), score))

# sorted() is stable, so pairs with equal scores keep their (i, j) enumeration order.
jaccard_scores = sorted(jaccard_scores, key=lambda x: x[1], reverse=True)[:3]

for (i, j), score in jaccard_scores:
    print(f"\n[{score:.2f}]")
    # i and j are row positions, so titles are looked up positionally.
    print("Product 1:", df.iloc[i]['Title'])
    print("Product 2:", df.iloc[j]['Title'])


Top 3 similar product pairs using Jaccard Similarity:

[0.97]
Product 1: Nike SB Ishod Premium
Product 2: Nike SB Ishod Premium

[0.95]
Product 1: Nike React Infinity 3
Product 2: Nike React Infinity 3 Premium

[0.94]
Product 1: Nike Air Max 90 SE
Product 2: Nike Air Max 97
