In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from wordcloud import WordCloud
from wordcloud import STOPWORDS
import matplotlib.pyplot as plt 


In [2]:
data_root = pd.read_csv(r'website_dataset.csv')

In [14]:
data_root.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9168 entries, 0 to 9167
Data columns (total 21 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      9168 non-null   int64  
 1   brand                   9168 non-null   object 
 2   category                9168 non-null   object 
 3   name                    9168 non-null   object 
 4   size                    9168 non-null   object 
 5   rating                  9168 non-null   float64
 6   number_of_reviews       9168 non-null   int64  
 7   love                    9168 non-null   int64  
 8   price                   9168 non-null   float64
 9   value_price             9168 non-null   float64
 10  URL                     9168 non-null   object 
 11  MarketingFlags          9168 non-null   bool   
 12  MarketingFlags_content  9168 non-null   object 
 13  options                 9168 non-null   object 
 14  details                 9168 non-null   

In [3]:
data = data_root.dropna()

In [13]:
data.shape

(9168, 21)

In [4]:
data.head()
data.to_csv("data.csv")

In [5]:
tfidf = TfidfVectorizer(analyzer='word',
                      token_pattern=r'\w{1,}',
                      ngram_range=(1, 3), 
                      stop_words = 'english')

# Filling NaNs with empty string
data['details'] = data['details'].fillna('')

# Fitting the TF-IDF on the 'overview' text
tfidf_matrix = tfidf.fit_transform(data['details'])

tfidf_matrix.shape


(9168, 743673)

In [6]:
print(tfidf_matrix)

  (0, 482260)	0.04667115448828231
  (0, 183504)	0.04667115448828231
  (0, 34518)	0.04667115448828231
  (0, 183457)	0.04667115448828231
  (0, 98611)	0.04667115448828231
  (0, 183465)	0.04667115448828231
  (0, 101319)	0.04667115448828231
  (0, 183467)	0.04667115448828231
  (0, 164159)	0.04667115448828231
  (0, 684187)	0.04667115448828231
  (0, 202142)	0.04667115448828231
  (0, 433174)	0.03379046390556302
  (0, 10015)	0.034803989827272216
  (0, 478630)	0.029146974915898127
  (0, 4022)	0.03437337121359434
  (0, 150)	0.03294963492179967
  (0, 143165)	0.04466451088845632
  (0, 593221)	0.022896092491402082
  (0, 584445)	0.04667115448828231
  (0, 578189)	0.04667115448828231
  (0, 208843)	0.04466451088845632
  (0, 174663)	0.04466451088845632
  (0, 219095)	0.04466451088845632
  (0, 383816)	0.04667115448828231
  (0, 183500)	0.04667115448828231
  :	:
  (9167, 9426)	0.06014230189365319
  (9167, 196873)	0.024742589828352427
  (9167, 50667)	0.022926549566223052
  (9167, 591155)	0.1367641605777943
  (

In [7]:
# Compute the Cosine Similarity
similarity_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Create a pandas series with item titles as indices and indices as series values 
indices = pd.Series(data.index, index=data['name']).drop_duplicates()


In [8]:
print(indices)

name
Blu Mediterraneo MINIATURE Set                   0
Colonia                                          1
Arancia di Capri                                 2
Mirto di Panarea                                 3
Colonia Miniature Set                            4
                                              ... 
The Rose Gold Mask                            9163
Give Me Some Sugar Colorful Gloss Balm Set    9164
Weekend Warrior Tone Up Cream                 9165
Gift Card                                     9166
Happy Birthday Gift Card                      9167
Length: 9168, dtype: int64


In [9]:

title='Rosa Nobile'

# Get the index corresponding to movie title
index = indices[title]

# Get the cosine similarity scores 
similarity_scores = list(enumerate(similarity_matrix[index]))


# Sort the similarity scores in descending order
sorted_similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)


# Top-10 most similar movie scores
top_10_movies_scores = sorted_similarity_scores[1:10]

# Get movie indices
top_10_movie_indices=[]
for i in top_10_movies_scores:
    top_10_movie_indices.append(str(i[0]))
    
# Top 10 recommende movie
print(data['name'].iloc[top_10_movie_indices] + " ----0000--- " + data['URL'].iloc[top_10_movie_indices])

9       Rosa Nobile Hair Mist ----0000--- https://www....
32      Rosa Nobile Gift Set ----0000--- https://www.s...
4014    Red Roses Body Crème ----0000--- https://www.s...
37      Rose De Grasse ----0000--- https://www.sephora...
3974    Red Roses Scent Surround™ Diffuser ----0000---...
28      Peonia Nobile Gift Set ----0000--- https://www...
12      Peonia Nobile ----0000--- https://www.sephora....
4007    Red Roses Travel Candle ----0000--- https://ww...
1346    Gabrielle Chanel Shower Gel ----0000--- https:...
dtype: object


In [10]:
print(data)

           id               brand            category  \
0     2218774      Acqua Di Parma           Fragrance   
1     2044816      Acqua Di Parma             Cologne   
2     1417567      Acqua Di Parma             Perfume   
3     1417617      Acqua Di Parma             Perfume   
4     2218766      Acqua Di Parma           Fragrance   
...       ...                 ...                 ...   
9163  2208502  SEPHORA COLLECTION          Face Masks   
9164  2298909  SEPHORA COLLECTION            Lip Sets   
9165  2236750  SEPHORA COLLECTION  Tinted Moisturizer   
9166       50  SEPHORA COLLECTION         no category   
9167      304  SEPHORA COLLECTION         no category   

                                            name              size  rating  \
0                 Blu Mediterraneo MINIATURE Set    5 x 0.16oz/5mL     4.0   
1                                        Colonia     0.7 oz/ 20 mL     4.5   
2                               Arancia di Capri      5 oz/ 148 mL     4.5   
3  