## 1. Importation des données


In [2]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem import PorterStemmer

df = pd.read_csv('NikeProductDescriptions.csv')
df.head()

Unnamed: 0,Title,Subtitle,Product Description
0,Nike Air Force 1 '07,Men's Shoes,It doesn't get more legendary than this. Desig...
1,Nike Air Max Dawn SE,Men's Shoes,Find out what moves you with the Air Max Dawn....
2,Nike SB Dunk Low Pro Premium,Skate Shoes,Pack your style—on your feet. Bringing a fresh...
3,Nike Air Force 1 Mid '07 LX,Men's Shoes,The celebrations just keep coming. Unbox the A...
4,Nike Air Force 1 Mid '07,Men's Shoes,"Got your fave colour yet? No worries, the Colo..."


## 2. Pré-traitement
 - Mise en minuscules  
 - Suppression de la ponctuation  
 - Retrait des stop words  
 - Stemming via PorterStemmer  

In [3]:
stemmer = PorterStemmer()
stopwords = set(ENGLISH_STOP_WORDS)

def preprocess(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    tokens = text.split()
    tokens = [stemmer.stem(tok) for tok in tokens if tok not in stopwords]
    return ' '.join(tokens)

df['clean_desc'] = df['Product Description'].astype(str).apply(preprocess)
df[['Product Description', 'clean_desc']].head()

Unnamed: 0,Product Description,clean_desc
0,It doesn't get more legendary than this. Desig...,doesnt legendari design turn head nike air for...
1,Find out what moves you with the Air Max Dawn....,move air max dawn root sporti athlet dna kick ...
2,Pack your style—on your feet. Bringing a fresh...,pack styleon feet bring fresh twist icon skate...
3,The celebrations just keep coming. Unbox the A...,celebr just come unbox af1 firesid feel lotta ...
4,"Got your fave colour yet? No worries, the Colo...",got fave colour worri colour month programm le...


## 3. Similitude TF-IDF + Cosine

In [4]:
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['clean_desc'])
cosine_sim = cosine_similarity(tfidf_matrix)

# Affichage d’un sous-ensemble de la matrice
cosine_sim_df = pd.DataFrame(cosine_sim,
                             index=df.index,
                             columns=df.index)
cosine_sim_df.iloc[:5, :5]

Unnamed: 0,0,1,2,3,4
0,1.0,0.020546,0.017998,0.098811,0.029103
1,0.020546,1.0,0.043615,0.0,0.040592
2,0.017998,0.043615,1.0,0.029052,0.030639
3,0.098811,0.0,0.029052,1.0,0.138195
4,0.029103,0.040592,0.030639,0.138195,1.0


## 4. Similitude Jaccard

In [5]:
token_sets = df['clean_desc'].apply(lambda x: set(x.split()))

def jaccard(a, b):
    return len(a & b) / len(a | b) if len(a | b) > 0 else 0.0

n = len(token_sets)
jaccard_mat = np.zeros((n, n))
for i in range(n):
    for j in range(n):
        jaccard_mat[i, j] = jaccard(token_sets.iloc[i], token_sets.iloc[j])

jaccard_df = pd.DataFrame(jaccard_mat, index=df.index, columns=df.index)
jaccard_df.iloc[:5, :5]

Unnamed: 0,0,1,2,3,4
0,1.0,0.034483,0.015385,0.071429,0.027397
1,0.034483,1.0,0.034483,0.0,0.029851
2,0.015385,0.034483,1.0,0.034483,0.041667
3,0.071429,0.0,0.034483,1.0,0.078125
4,0.027397,0.029851,0.041667,0.078125,1.0
