In [21]:
from sqlalchemy import create_engine
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from rapidfuzz import process, fuzz
from dotenv import load_dotenv

import os
import pandas as pd
import re
import numpy as np

In [22]:
load_dotenv()

username = os.getenv("DB_USER")
password = os.getenv("DB_PASSWORD")
host =  os.getenv("DB_HOST")
port = os.getenv("DB_PORT")
database =  os.getenv("DB_DATABASE")

connection_string = f'mysql+pymysql://{username}:{password}@{host}:{port}/{database}'

In [23]:
engine = create_engine(connection_string, echo=True)

In [24]:
table_name = 'Perfumes_v2'

df = pd.read_sql_table(table_name, con=engine)
data_list = df.to_dict(orient='records')  # Each row as a dictionary in a list

2025-05-14 18:37:25,350 INFO sqlalchemy.engine.Engine SELECT DATABASE()
2025-05-14 18:37:25,352 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-05-14 18:37:25,400 INFO sqlalchemy.engine.Engine SELECT @@sql_mode
2025-05-14 18:37:25,402 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-05-14 18:37:25,429 INFO sqlalchemy.engine.Engine SELECT @@lower_case_table_names
2025-05-14 18:37:25,429 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-05-14 18:37:25,473 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-05-14 18:37:25,474 INFO sqlalchemy.engine.Engine DESCRIBE `michals2`.`Perfumes_v2`
2025-05-14 18:37:25,474 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-05-14 18:37:25,499 INFO sqlalchemy.engine.Engine SHOW FULL TABLES FROM `michals2`
2025-05-14 18:37:25,500 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-05-14 18:37:25,530 INFO sqlalchemy.engine.Engine SHOW FULL TABLES FROM `michals2`
2025-05-14 18:37:25,531 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-05-14 18:37:25,564 INFO sqlalc

df #use as dataframe </br>
data_list #use as dict

define user prompt

In [25]:
user_prompt = "I like woody, tabacco notes which would be perfect for evening."

#### content based filtering

In [26]:
df['text_features'] = df['Description'].fillna('') + ' ' + df['Accords'].fillna('') + ' ' + df['Designer'].fillna('') + ' ' + df['TopNotes'].fillna('') + ' ' + df['MiddleNotes'].fillna('') + ' ' + df['BaseNotes'].fillna('')

In [27]:
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(df['text_features'])

user_vec = vectorizer.transform([user_prompt])

cos_similarities = cosine_similarity(user_vec, tfidf_matrix).flatten()

In [28]:
top_indices = cos_similarities.argsort()[::-1][:5]
recommended_perfumes = df.iloc[top_indices][['Name', 'Designer', 'Description', 'Accords', 'TopNotes', 'MiddleNotes', 'BaseNotes']]
recommended_perfumes['similarity'] = cos_similarities[top_indices]

In [29]:
print("Top perfume recommendations:")
recommended_perfumes

Top perfume recommendations:


Unnamed: 0,Name,Designer,Description,Accords,TopNotes,MiddleNotes,BaseNotes,similarity
2858,Sweet Milk,The Dua Brand,Sweet Milk by The Dua Brand is a fragrance for...,amber (100%); sweet (91.6003%); lactonic (80.4...,"['Milk', 'Marshmallow', 'Tonka Bean', 'Benzoin...",[],[],0.105903
2347,You Or Someone Like You,Etat Libre d'Orange,You Or Someone Like You by Etat Libre d'Orange...,green (100%); aromatic (69.7843%); fresh spicy...,"['Mint', 'Grapefruit', 'Bergamot', 'Anise']","['Green Notes', 'Cassis', 'Rose', 'Hedione']",['White Musk'],0.086034
2027,Sweet Diamond Pink Pepper 25,Kayali Fragrances,Sweet Diamond Pink Pepper 25 by Kayali Fragran...,rose (100%); warm spicy (83.0303%); soft spicy...,"['Pink Pepper', 'Saffron', 'Bergamot', 'Royal ...","['Bulgarian Rose', 'May Rose', 'Vanilla Orchid...","['Amber', 'Sandalwood', 'Patchouli', 'Musk']",0.077753
1098,Dia Woman,Amouage,Dia Woman by Amouage is a Floral fragrance for...,floral (100%); fresh (99.8436%); aldehydic (88...,"['Aldehydes', 'Cyclamen', 'Violet Leaf', 'Sage...","['Peony', 'Turkish Rose Oil', 'Orris Root', 'O...","['White Musk', 'Heliotrope', 'Sandalwood', 'In...",0.075209
52,Sand Desert At Sunset,Zara,Sand Desert At Sunset by Zara is a Oriental Sp...,warm spicy (100%); cinnamon (76.7838%); amber ...,"['Orange', 'Incense']","['Cinnamon', 'Tonka Bean', 'Chocolate']","['Iris', 'Cedar']",0.07407


#### sentence transformer

In [30]:
from sentence_transformers import SentenceTransformer, util

# model = SentenceTransformer('all-MiniLM-L6-v2')
# text_features = df['text_features'].tolist()

# embeddings = model.encode(text_features, convert_to_tensor=True)

model = SentenceTransformer('../models/transformer_model')
embeddings = np.load('../models/embeddings.npy')

user_embedding = model.encode(user_prompt, convert_to_tensor=True)

cos_scores = util.cos_sim(user_embedding, embeddings).cpu().numpy().flatten()
top_indices = cos_scores.argsort()[::-1][:5]

In [31]:
recommended_perfumes = df.iloc[top_indices][['Name', 'Designer', 'Description', 'Accords', 'TopNotes', 'MiddleNotes', 'BaseNotes']]
recommended_perfumes['similarity'] = cos_similarities[top_indices]

# Display
recommended_perfumes

Unnamed: 0,Name,Designer,Description,Accords,TopNotes,MiddleNotes,BaseNotes,similarity
1292,Trianon Palace Versailles,Maison Francis Kurkdjian,,floral (100%); woody (92.2222%),"['Floral Notes', 'Woodsy Notes']",[],[],0.037491
2181,The Tragedy of Lord George,Penhaligon's,,woody (100%); amber (77.6319%); vanilla (52.50...,"['Woodsy Notes', 'Brandy', 'Tonka Bean', 'Amber']",[],[],0.01573
1163,Lady Million Royal,Rabanne,,woody (100%); white floral (84.9506%); fruity ...,['Pomegranate'],['White Flowers'],['Woodsy Notes'],0.017597
998,Let s Settle This Argument Like Adults In The ...,By Kilian,,woody (100%); powdery (70.7588%); fruity (62.3...,"['Bergamot', 'Lemon']","['Fig Nectar', 'Green Notes', 'Hedione']","['Vanilla', 'Sandalwood', 'Cedar', 'Orris', 'A...",0.01356
2967,Shine,Ajmal,,woody (100%); fruity (95.7116%); powdery (69.1...,"['Strawberry', 'Pomegranate']","['Peony', 'Lily-of-the-Valley']","['Powdery Notes', 'Woody Notes']",0.026302


#### extracting accords from dataframe

In [32]:
unique_accords = set()

for row in df['Accords'].dropna():
    matches = re.findall(r'([\w\s\-]+)\s*\(\d', row)
    unique_accords.update(name.strip() for name in matches)

unique_accords_list = list(unique_accords)

#### matching accords to given prompt by user

In [33]:
prompt_words = re.findall(r'\b\w+\b', user_prompt.lower())

matched_accords = set()
threshold = 80 # adjusted manually, determines sensitivity to finding matching words

for word in prompt_words:
    match, score, _ = process.extractOne(word, unique_accords_list, scorer=fuzz.partial_ratio)
    if score >= threshold:
        matched_accords.add(match)


def extract_accord_scores(accords_string, matched_accords):
    scores = {}
    for match in matched_accords:
        pattern = re.compile(rf'{re.escape(match)}\s*\(([\d\.]+)%\)', re.IGNORECASE)
        result = pattern.search(accords_string)
        if result:
            scores[match] = float(result.group(1))
    return scores

#### content based but with accords

In [34]:
df_content = df

In [35]:
df_content['matched_accord_scores'] = df_content['Accords'].apply(lambda x: extract_accord_scores(str(x), matched_accords))

df_content['accord_match_score'] = df_content['matched_accord_scores'].apply(lambda d: sum(d.values()) if d else 0)

In [36]:
print(f"Mean: {df_content['accord_match_score'].mean()}, Std: {df_content['accord_match_score'].std()}")

Mean: 105.77039406332455, Std: 62.68145383697034


In [37]:
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(df['text_features'])

user_vec = vectorizer.transform([user_prompt])

cos_similarities = cosine_similarity(user_vec, tfidf_matrix).flatten()

df_content['similarity'] = cos_similarities

In [38]:
scaler = MinMaxScaler()

df_content['similarity_norm'] = scaler.fit_transform(df_content[['similarity']])
df_content['accord_score_norm'] = scaler.fit_transform(df_content[['accord_match_score']])

alpha = 0.7 #adjustable to match what is more important
 
df['final_score'] = alpha * df_content['similarity_norm'] + (1 - alpha) * df_content['accord_score_norm']
top_recommendations = df_content.sort_values(by='final_score', ascending=False).head(5)
top_recommendations[['Name', 'Designer', 'similarity', 'accord_match_score', 'final_score']]

Unnamed: 0,Name,Designer,similarity,accord_match_score,final_score
2858,Sweet Milk,The Dua Brand,0.105903,148.6669,0.844573
552,Tobacolor,Dior,0.063603,232.5541,0.646555
2027,Sweet Diamond Pink Pepper 25,Kayali Fragrances,0.077753,121.3394,0.631932
2125,Tobacco Rush,Afnan,0.056812,247.8435,0.616534
2420,Wanted by Night,Azzaro,0.046768,308.4945,0.609126


#### sentence transformer but with accords

In [39]:
df_transformer = df

In [40]:
model = SentenceTransformer('../models/transformer_model')
embeddings = np.load('../models/embeddings.npy')

user_embedding = model.encode(user_prompt, convert_to_tensor=True)

cos_scores = util.cos_sim(user_embedding, embeddings).cpu().numpy().flatten()
# top_indices = cos_scores.argsort()[::-1][:5]

df_transformer['similarity'] = cos_scores

In [41]:
scaler = MinMaxScaler()

df_transformer['similarity_norm'] = scaler.fit_transform(df_transformer[['similarity']])
df_transformer['accord_score_norm'] = scaler.fit_transform(df_transformer[['accord_match_score']])

alpha = 0.7 #adjustable to match what is more important
 
df_transformer['final_score'] = alpha * df_transformer['similarity_norm'] + (1 - alpha) * df_transformer['accord_score_norm']
top_recommendations_transformers = df_transformer.sort_values(by='final_score', ascending=False).head(5)
top_recommendations_transformers[['Name', 'Designer', 'similarity', 'accord_match_score', 'final_score']]

Unnamed: 0,Name,Designer,similarity,accord_match_score,final_score
2181,The Tragedy of Lord George,Penhaligon's,0.52477,177.6319,0.86278
1321,King,Armaf,0.458459,228.32,0.823713
766,More Than Words,Xerjoff,0.453095,219.3795,0.807871
2967,Shine,Ajmal,0.468706,195.7116,0.805657
950,Aoud Cafe,Mancera,0.458425,207.1295,0.803061
