In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from wordcloud import WordCloud
from wordcloud import STOPWORDS
import matplotlib.pyplot as plt 


In [3]:
data_root = pd.read_csv(r'website_dataset.csv')

In [4]:
data = data_root.dropna()

In [5]:
data.head()
data.to_csv("data.csv")

In [6]:
tfidf = TfidfVectorizer(analyzer='word',
                      token_pattern=r'\w{1,}',
                      ngram_range=(1, 3), 
                      stop_words = 'english')

# Filling NaNs with empty string
data['details'] = data['details'].fillna('')

# Fitting the TF-IDF on the 'overview' text
tfidf_matrix = tfidf.fit_transform(data['details'])

tfidf_matrix.shape


(9168, 743673)

In [7]:
# Compute the Cosine Similarity
similarity_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Create a pandas series with movie titles as indices and indices as series values 
indices = pd.Series(data.index, index=data['name']).drop_duplicates()


In [14]:

title='Rosa Nobile'

# Get the index corresponding to movie title
index = indices[title]

# Get the cosine similarity scores 
similarity_scores = list(enumerate(similarity_matrix[index]))


# Sort the similarity scores in descending order
sorted_similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)


# Top-10 most similar movie scores
top_10_movies_scores = sorted_similarity_scores[1:10]

# Get movie indices
top_10_movie_indices=[]
for i in top_10_movies_scores:
    top_10_movie_indices.append(str(i[0]))
    
# Top 10 recommende movie
print(data['name'].iloc[top_10_movie_indices] + " ----0000--- " + data['URL'].iloc[top_10_movie_indices])

9       Rosa Nobile Hair Mist ----0000--- https://www....
32      Rosa Nobile Gift Set ----0000--- https://www.s...
4014    Red Roses Body Crème ----0000--- https://www.s...
37      Rose De Grasse ----0000--- https://www.sephora...
3974    Red Roses Scent Surround™ Diffuser ----0000---...
28      Peonia Nobile Gift Set ----0000--- https://www...
12      Peonia Nobile ----0000--- https://www.sephora....
4007    Red Roses Travel Candle ----0000--- https://ww...
1346    Gabrielle Chanel Shower Gel ----0000--- https:...
dtype: object
