# Nike Product Descriptions Similarity Analysis
This notebook computes TF-IDF cosine similarity and Jaccard similarity between product descriptions in the Nike dataset.

## Step 1: Install dependencies

In [1]:
!pip install -q nltk scikit-learn pandas

## Step 2: Import libraries & download NLTK data

In [2]:
import re
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Fetch the NLTK data packages this notebook relies on: the English
# stop-word list and the WordNet corpus.
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)
# Kept as the cell's final expression so its boolean result is displayed.
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

## Step 3: Locate and load the Nike CSV

In [4]:
# Optional: list files in /content
!ls -lh /content


FILE_PATH = '/content/NikeProductDescriptions (1).csv'
df = pd.read_csv(FILE_PATH)
print('Original columns:', df.columns.tolist())
df = df.rename(columns={'Subtitle':'subtitle', 'Product Description':'description'})
print('Renamed columns:', df.columns.tolist())
df[['subtitle','description']].head(3)

total 132K
-rw-r--r-- 1 root root 126K Jul  7 13:35 'NikeProductDescriptions (1).csv'
drwxr-xr-x 1 root root 4.0K Jul  1 21:04  sample_data
Original columns: ['Title', 'Subtitle', 'Product Description']
Renamed columns: ['Title', 'subtitle', 'description']


Unnamed: 0,subtitle,description
0,Men's Shoes,It doesn't get more legendary than this. Desig...
1,Men's Shoes,Find out what moves you with the Air Max Dawn....
2,Skate Shoes,Pack your style—on your feet. Bringing a fresh...


## Step 4: Preprocess descriptions

In [5]:
# Shared text-processing helpers, built once and reused by preprocess().
lemmatizer = WordNetLemmatizer()              # maps each token to its lemma
stop_words = set(stopwords.words('english'))  # set gives fast membership tests
tokenizer = RegexpTokenizer(r'\w+')           # tokens are runs of word characters

def preprocess(text):
    """Turn a raw product description into a list of cleaned, lemmatized tokens.

    The text is lowercased and split with the module-level ``tokenizer``;
    tokens that appear in ``stop_words`` or are shorter than three characters
    are discarded, and each remaining token is passed through ``lemmatizer``.

    Args:
        text: The description to clean. Any non-string value (for example
            ``None`` or the float NaN pandas uses for a missing cell) is
            treated as an empty description instead of raising.

    Returns:
        list: The lemmatized tokens in their original order. Empty when
        ``text`` is not a string or no token survives the filters.
    """
    # Guard against missing values so the function is safe to call without
    # the caller having to remember `.fillna('')` first.
    if not isinstance(text, str):
        return []
    return [
        lemmatizer.lemmatize(token)
        for token in tokenizer.tokenize(text.lower())
        # Filter before lemmatizing, exactly as the stop-word check expects
        # the surface form of the token.
        if token not in stop_words and len(token) > 2
    ]

# Tokenize every description (missing ones become the empty string first),
# then rebuild a space-separated string from the tokens for the vectorizer.
df['tokens'] = df['description'].fillna('').map(preprocess)
df['clean_text'] = df['tokens'].map(' '.join)

# Preview the raw text next to both derived columns
df.loc[:, ['description', 'clean_text', 'tokens']].head(3)

Unnamed: 0,description,clean_text,tokens
0,It doesn't get more legendary than this. Desig...,get legendary designed turn head nike air forc...,"[get, legendary, designed, turn, head, nike, a..."
1,Find out what moves you with the Air Max Dawn....,find move air max dawn rooted sporty athletics...,"[find, move, air, max, dawn, rooted, sporty, a..."
2,Pack your style—on your feet. Bringing a fresh...,pack style foot bringing fresh twist iconic sk...,"[pack, style, foot, bringing, fresh, twist, ic..."


## Step 5: Compute TF-IDF & Cosine similarity

In [6]:
# Vectorize the cleaned descriptions with TF-IDF (one row per description).
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['clean_text'])

# Cosine similarity of every description against every other one.
cos_sim = cosine_similarity(tfidf_matrix)

# Label rows and columns with the DataFrame index and persist the matrix.
doc_labels = df.index
cos_df = pd.DataFrame(
    data=cos_sim,
    index=doc_labels,
    columns=doc_labels,
)
cos_df.to_csv('cosine_similarity.csv')

# Rank every unordered pair (i < j) of descriptions by cosine score and
# report the five highest. Only the upper triangle is visited, so the
# diagonal and mirrored duplicates are excluded.
n_docs = len(df)
pairs = [
    (i, j, cos_sim[i, j])
    for i in range(n_docs)
    for j in range(i + 1, n_docs)
]
# sorted() is stable, so equal scores keep their (i, j) enumeration order.
top5 = sorted(pairs, key=lambda pair: pair[2], reverse=True)[:5]

print('Top 5 TF-IDF cosine-similar pairs:')
for left, right, similarity in top5:
    print(f'  [{left}] vs [{right}]: {similarity:.3f}')

Top 5 TF-IDF cosine-similar pairs:
  [52] vs [63]: 1.000
  [154] vs [340]: 0.983
  [44] vs [51]: 0.972
  [40] vs [48]: 0.970
  [233] vs [275]: 0.968


## Step 6: Compute Jaccard similarity

In [8]:
# Pairwise Jaccard similarity |A ∩ B| / |A ∪ B| between the sets of unique
# tokens of every two descriptions, stored as a symmetric n x n matrix.
n = len(df)
jaccard = np.zeros((n, n))
lists = df['tokens'].tolist()

# Build each description's token set once up front; rebuilding set(lists[j])
# inside the inner loop repeated the same work for every row.
token_sets = [set(toks) for toks in lists]

for i, set_i in enumerate(token_sets):
    # Start at j = i: the matrix is symmetric, so each pair is computed once
    # and mirrored below.
    for j in range(i, n):
        set_j = token_sets[j]
        shared = len(set_i & set_j)
        # |A ∪ B| = |A| + |B| - |A ∩ B|, so the union set is never materialised.
        combined = len(set_i) + len(set_j) - shared
        # Two empty token sets have no union; score them 0.0 instead of
        # dividing by zero.
        score = shared / combined if combined else 0.0
        jaccard[i, j] = score
        jaccard[j, i] = score

jac_df = pd.DataFrame(jaccard, index=df.index, columns=df.index)
jac_df.to_csv('jaccard_similarity.csv')
print('Sample Jaccard[0,1]:', jaccard[0,1])

Sample Jaccard[0,1]: 0.03278688524590164
