### libraries

In [68]:
from sqlalchemy import create_engine
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer, util
from rapidfuzz import process, fuzz
from dotenv import load_dotenv

import os
import pandas as pd
import re
import numpy as np

### database connection

In [69]:
load_dotenv()

username = os.getenv("DB_USER")
password = os.getenv("DB_PASSWORD")
host =  os.getenv("DB_HOST")
port = os.getenv("DB_PORT")
database =  os.getenv("DB_DATABASE")

connection_string = f'mysql+pymysql://{username}:{password}@{host}:{port}/{database}'
engine = create_engine(connection_string, echo=True)

### loading the table

In [70]:
table_name = 'Perfumes_v2'

df = pd.read_sql_table(table_name, con=engine)

data_list = df.to_dict(orient='records')  # Each row as a dictionary in a list

df['text_features'] = df['Description'].fillna('') + ' ' + df['Accords'].fillna('') + ' ' + df['Designer'].fillna('') + ' ' + df['TopNotes'].fillna('') + ' ' + df['MiddleNotes'].fillna('') + ' ' + df['BaseNotes'].fillna('')

2025-05-15 21:51:00,500 INFO sqlalchemy.engine.Engine SELECT DATABASE()
2025-05-15 21:51:00,501 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-05-15 21:51:00,546 INFO sqlalchemy.engine.Engine SELECT @@sql_mode
2025-05-15 21:51:00,549 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-05-15 21:51:00,583 INFO sqlalchemy.engine.Engine SELECT @@lower_case_table_names
2025-05-15 21:51:00,584 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-05-15 21:51:00,630 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-05-15 21:51:00,632 INFO sqlalchemy.engine.Engine DESCRIBE `michals2`.`Perfumes_v2`
2025-05-15 21:51:00,633 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-05-15 21:51:00,660 INFO sqlalchemy.engine.Engine SHOW FULL TABLES FROM `michals2`
2025-05-15 21:51:00,661 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-05-15 21:51:00,685 INFO sqlalchemy.engine.Engine SHOW FULL TABLES FROM `michals2`
2025-05-15 21:51:00,686 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-05-15 21:51:00,710 INFO sqlalc

df #use as dataframe </br>
data_list #use as dict

#### define user prompt

In [71]:
user_prompt = "I like woody, tabacco notes which would be perfect for evening."

## basic

#### content based filtering

In [72]:
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(df['text_features'])
user_vec = vectorizer.transform([user_prompt])
#get results
cos_similarities = cosine_similarity(user_vec, tfidf_matrix).flatten()
#get top 5 recommendation indexes
top_indices = cos_similarities.argsort()[::-1][:5]
recommended_perfumes = df.iloc[top_indices][['Name', 'Designer', 'Description', 'Accords', 'TopNotes', 'MiddleNotes', 'BaseNotes']]
recommended_perfumes['similarity'] = cos_similarities[top_indices]

In [73]:
print("Top perfume recommendations:")
recommended_perfumes

Top perfume recommendations:


Unnamed: 0,Name,Designer,Description,Accords,TopNotes,MiddleNotes,BaseNotes,similarity
2858,Sweet Milk,The Dua Brand,Sweet Milk by The Dua Brand is a fragrance for...,amber (100%); sweet (91.6003%); lactonic (80.4...,"['Milk', 'Marshmallow', 'Tonka Bean', 'Benzoin...",[],[],0.105903
2347,You Or Someone Like You,Etat Libre d'Orange,You Or Someone Like You by Etat Libre d'Orange...,green (100%); aromatic (69.7843%); fresh spicy...,"['Mint', 'Grapefruit', 'Bergamot', 'Anise']","['Green Notes', 'Cassis', 'Rose', 'Hedione']",['White Musk'],0.086034
2027,Sweet Diamond Pink Pepper 25,Kayali Fragrances,Sweet Diamond Pink Pepper 25 by Kayali Fragran...,rose (100%); warm spicy (83.0303%); soft spicy...,"['Pink Pepper', 'Saffron', 'Bergamot', 'Royal ...","['Bulgarian Rose', 'May Rose', 'Vanilla Orchid...","['Amber', 'Sandalwood', 'Patchouli', 'Musk']",0.077753
1098,Dia Woman,Amouage,Dia Woman by Amouage is a Floral fragrance for...,floral (100%); fresh (99.8436%); aldehydic (88...,"['Aldehydes', 'Cyclamen', 'Violet Leaf', 'Sage...","['Peony', 'Turkish Rose Oil', 'Orris Root', 'O...","['White Musk', 'Heliotrope', 'Sandalwood', 'In...",0.075209
52,Sand Desert At Sunset,Zara,Sand Desert At Sunset by Zara is a Oriental Sp...,warm spicy (100%); cinnamon (76.7838%); amber ...,"['Orange', 'Incense']","['Cinnamon', 'Tonka Bean', 'Chocolate']","['Iris', 'Cedar']",0.07407


#### loading the prelearned model

In [74]:
#load the model
cwd = os.getcwd()

project_root = os.path.abspath(os.path.join(cwd, '..'))
model_path = os.path.join(project_root, 'backend', 'recommend_models', 'transformer_model')
embeddings_path = os.path.join(project_root, 'backend', 'recommend_models', 'embeddings.npy')

### sentence transformer

In [75]:
model = SentenceTransformer(model_path)
embeddings = np.load(embeddings_path)

user_embedding = model.encode(user_prompt, convert_to_tensor=True)

cos_scores = util.cos_sim(user_embedding, embeddings).cpu().numpy().flatten()
top_indices = cos_scores.argsort()[::-1][:5]

In [76]:
recommended_perfumes = df.iloc[top_indices][['Name', 'Designer', 'Description', 'Accords', 'TopNotes', 'MiddleNotes', 'BaseNotes']]
recommended_perfumes['similarity'] = cos_similarities[top_indices]

# Display
recommended_perfumes

Unnamed: 0,Name,Designer,Description,Accords,TopNotes,MiddleNotes,BaseNotes,similarity
1292,Trianon Palace Versailles,Maison Francis Kurkdjian,,floral (100%); woody (92.2222%),"['Floral Notes', 'Woodsy Notes']",[],[],0.037491
2181,The Tragedy of Lord George,Penhaligon's,,woody (100%); amber (77.6319%); vanilla (52.50...,"['Woodsy Notes', 'Brandy', 'Tonka Bean', 'Amber']",[],[],0.01573
1163,Lady Million Royal,Rabanne,,woody (100%); white floral (84.9506%); fruity ...,['Pomegranate'],['White Flowers'],['Woodsy Notes'],0.017597
998,Let s Settle This Argument Like Adults In The ...,By Kilian,,woody (100%); powdery (70.7588%); fruity (62.3...,"['Bergamot', 'Lemon']","['Fig Nectar', 'Green Notes', 'Hedione']","['Vanilla', 'Sandalwood', 'Cedar', 'Orris', 'A...",0.01356
2967,Shine,Ajmal,,woody (100%); fruity (95.7116%); powdery (69.1...,"['Strawberry', 'Pomegranate']","['Peony', 'Lily-of-the-Valley']","['Powdery Notes', 'Woody Notes']",0.026302


## accords

### extracting accords from dataframe

In [77]:
unique_accords = set()

for row in df['Accords'].dropna():
    matches = re.findall(r'([\w\s\-]+)\s*\(\d', row)
    unique_accords.update(name.strip() for name in matches)

unique_accords_list = list(unique_accords)

#### matching accords to given prompt by user

In [78]:
prompt_words = re.findall(r'\b\w+\b', user_prompt.lower())

matched_accords = set()
threshold = 80 # adjusted manually, determines sensitivity to finding matching words

for word in prompt_words:
    match, score, _ = process.extractOne(word, unique_accords_list, scorer=fuzz.partial_ratio)
    if score >= threshold:
        matched_accords.add(match)


def extract_accord_scores(accords_string, matched_accords):
    scores = {}
    for match in matched_accords:
        pattern = re.compile(rf'{re.escape(match)}\s*\(([\d\.]+)%\)', re.IGNORECASE)
        result = pattern.search(accords_string)
        if result:
            scores[match] = float(result.group(1))
    return scores

#### content based but with accords

In [79]:
df_content = df

In [80]:
df_content['matched_accord_scores'] = df_content['Accords'].apply(lambda x: extract_accord_scores(str(x), matched_accords))
df_content['accord_match_score'] = df_content['matched_accord_scores'].apply(lambda d: sum(d.values()) if d else 0)

In [81]:
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(df['text_features'])
user_vec = vectorizer.transform([user_prompt])
cos_similarities = cosine_similarity(user_vec, tfidf_matrix).flatten()
df_content['similarity'] = cos_similarities

In [82]:
scaler = MinMaxScaler()

df_content['similarity_norm'] = scaler.fit_transform(df_content[['similarity']])
df_content['accord_score_norm'] = scaler.fit_transform(df_content[['accord_match_score']])

alpha = 0.7 #adjustable to match what is more important
 
df['final_score'] = alpha * df_content['similarity_norm'] + (1 - alpha) * df_content['accord_score_norm']
top_recommendations = df_content.sort_values(by='final_score', ascending=False).head(5)
top_recommendations[['Name', 'Designer', 'similarity', 'accord_match_score', 'final_score']]

Unnamed: 0,Name,Designer,similarity,accord_match_score,final_score
2858,Sweet Milk,The Dua Brand,0.105903,148.6669,0.811271
52,Sand Desert At Sunset,Zara,0.07407,194.0874,0.634852
2420,Wanted by Night,Azzaro,0.046768,400.8253,0.609126
2027,Sweet Diamond Pink Pepper 25,Kayali Fragrances,0.077753,121.3394,0.604751
552,Tobacolor,Dior,0.063603,232.5541,0.594461


#### sentence transformer but with accords

In [83]:
df_transformer = df

In [84]:
model = SentenceTransformer(model_path)
embeddings = np.load(embeddings_path)

user_embedding = model.encode(user_prompt, convert_to_tensor=True)

cos_scores = util.cos_sim(user_embedding, embeddings).cpu().numpy().flatten()
top_indices = cos_scores.argsort()[::-1][:5]

df_transformer['similarity'] = cos_scores

In [85]:
scaler = MinMaxScaler()

df_transformer['similarity_norm'] = scaler.fit_transform(df_transformer[['similarity']])
df_transformer['accord_score_norm'] = scaler.fit_transform(df_transformer[['accord_match_score']])

alpha = 0.7 #adjustable to match what is more important
 
df_transformer['final_score'] = alpha * df_transformer['similarity_norm'] + (1 - alpha) * df_transformer['accord_score_norm']
top_recommendations_transformers = df_transformer.sort_values(by='final_score', ascending=False).head(5)
top_recommendations_transformers[['Name', 'Designer', 'similarity', 'accord_match_score', 'final_score']]

Unnamed: 0,Name,Designer,similarity,accord_match_score,final_score
2181,The Tragedy of Lord George,Penhaligon's,0.52477,177.6319,0.822989
1321,King,Armaf,0.458459,228.32,0.772568
2192,Halfeti Cedar,Penhaligon's,0.437604,262.2546,0.770176
1292,Trianon Palace Versailles,Maison Francis Kurkdjian,0.532245,92.2222,0.769024
2967,Shine,Ajmal,0.468706,195.7116,0.761815


## negative use of words

In [86]:
table_name = 'Perfumes_v2'

df = pd.read_sql_table(table_name, con=engine)

data_list = df.to_dict(orient='records')  # Each row as a dictionary in a list

df['text_features'] = df['Description'].fillna('') + ' ' + df['Accords'].fillna('') + ' ' + df['Designer'].fillna('') + ' ' + df['TopNotes'].fillna('') + ' ' + df['MiddleNotes'].fillna('') + ' ' + df['BaseNotes'].fillna('')

2025-05-15 21:51:02,802 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-05-15 21:51:02,804 INFO sqlalchemy.engine.Engine DESCRIBE `michals2`.`Perfumes_v2`
2025-05-15 21:51:02,805 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-05-15 21:51:02,829 INFO sqlalchemy.engine.Engine SHOW FULL TABLES FROM `michals2`
2025-05-15 21:51:02,829 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-05-15 21:51:02,854 INFO sqlalchemy.engine.Engine SHOW FULL TABLES FROM `michals2`
2025-05-15 21:51:02,855 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-05-15 21:51:02,885 INFO sqlalchemy.engine.Engine SHOW CREATE TABLE `Perfumes_v2`
2025-05-15 21:51:02,886 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-05-15 21:51:02,911 INFO sqlalchemy.engine.Engine SELECT `Perfumes_v2`.id, `Perfumes_v2`.`Name`, `Perfumes_v2`.`Designer`, `Perfumes_v2`.`URL`, `Perfumes_v2`.`Gender`, `Perfumes_v2`.`Accords`, `Perfumes_v2`.`Description`, `Perfumes_v2`.`ProsAndCons`, `Perfumes_v2`.`TopNotes`, `Perfumes_v2`.`MiddleNotes`, `Pe

In [271]:
# user_prompt = "however i like warm and spicy notes, I dislike woody smokey and fruity notes"
# user_prompt = "I want a sweet scent but without any fruity or candy-like notes."
# user_prompt = "I want something light and clean, not musky or heavy."
# user_prompt = "I'm a fan of floral scents, particularly rose and jasmine."
# user_prompt = "Give me something green, earthy, and natural."
# user_prompt = "I like woody and spicy fragrances, but no sweet or fruity notes."
# user_prompt = "Give me a scent with leather and vetiver, but skip anything floral."
# user_prompt = "I enjoy clean and citrusy perfumes, but I don’t want any musky or powdery notes."
# user_prompt = "I'm looking for something earthy and herbal, but avoid amber and vanilla."
# user_prompt = "I love rose and patchouli, but not interested in any aquatic or fresh notes."
# user_prompt = "I want a warm, smoky fragrance, but nothing green or citrusy."
# user_prompt = "I like sandalwood and incense, but no fruity or gourmand scents."
# user_prompt = "Please recommend a floral scent with jasmine and lily, but without any oud or leather."
user_prompt = "I enjoy sweet vanilla and tonka, but no spice or tobacco, please."
# user_prompt = "Something green and fresh, but no heavy amber or musky base."

take care of user prompt matched accords

In [272]:
def match_accords(prompt, threshold = 80):
    prompt_words = re.findall(r'\b\w+\b', prompt.lower())

    matched_accords = set()

    for word in prompt_words:
        match, score, _ = process.extractOne(word, unique_accords_list, scorer=fuzz.partial_ratio)
        if score >= threshold:
            matched_accords.add(match)

    return matched_accords

def extract_accord_scores(accords_string, matched_accords):
    scores = {}
    for match in matched_accords:
        pattern = re.compile(rf'{re.escape(match)}\s*\(([\d\.]+)%\)', re.IGNORECASE)
        result = pattern.search(accords_string)
        if result:
            scores[match] = float(result.group(1))
    return scores

#### keywords

In [273]:
INCLUDE_KEYWORDS = ['like', 'love', 'include', 'want', 'with', 'prefer', 'enjoy', 'looking']
EXCLUDE_KEYWORDS = ['not', 'don’t', 'exclude', 'avoid', 'without', 'but no', 'nothing', 'skip']

In [274]:
def parse_prompt(prompt):
    prompt = prompt.lower()
    
    include_words = []
    exclude_words = []

    tokens = re.split(r'[,.!?;]|and|but', prompt)

    for token in tokens:
        if any(kw in token for kw in EXCLUDE_KEYWORDS):
            exclude_words += re.findall(r'\b\w+\b', token)
        elif any(kw in token for kw in INCLUDE_KEYWORDS):
            include_words += re.findall(r'\b\w+\b', token)
        else:
            include_words += re.findall(r'\b\w+\b', token)
    
    stop_words = {'i', 'for', 'the', 'it', 'is', 'a', 'would', 'be'}
    include_words = [w for w in include_words if w not in stop_words]
    exclude_words = [w for w in exclude_words if w not in stop_words]

    return include_words, exclude_words

In [275]:
include_words, exclude_words = parse_prompt(user_prompt)

In [276]:
exclude_words

[]

create prompts with associated liknage to user liking based on user prompt

In [277]:
include_prompt = ' '.join(map(str, include_words))
exclude_prompt = ' '.join(map(str, exclude_words))

In [278]:
include_matched = match_accords(include_prompt)
exclude_matched = match_accords(exclude_prompt)

#### add accord columns

In [279]:
df['matched_include'] = df['Accords'].apply(lambda x: extract_accord_scores(str(x), include_matched))
df['accord_score_include'] = df['matched_include'].apply(lambda d: sum(d.values()) if d else 0)

df['matched_exclude'] = df['Accords'].apply(lambda x: extract_accord_scores(str(x), exclude_matched))
df['accord_score_exclude'] = df['matched_exclude'].apply(lambda d: sum(d.values()) if d else 0)

want

In [280]:
model = SentenceTransformer(model_path)
embeddings = np.load(embeddings_path)

In [281]:
include_embedding = model.encode(include_prompt, convert_to_tensor=True)

cos_scores_include = util.cos_sim(include_embedding, embeddings).cpu().numpy().flatten()
top_indices_include = cos_scores_include.argsort()[::-1][:5]

df['similarity_like'] = cos_scores_include

In [282]:
scaler = MinMaxScaler()

df['similarity_norm_like'] = scaler.fit_transform(df[['similarity_like']])
df['accord_score_norm_like'] = scaler.fit_transform(df[['accord_score_include']])

alpha = 0.7 #adjustable to match what is more important

df['final_score_like'] = alpha * df['similarity_norm_like'] + (1 - alpha) * df['accord_score_norm_like']

don't want

In [283]:
model = SentenceTransformer(model_path)
embeddings = np.load(embeddings_path)

In [284]:
exclude_embedding = model.encode(exclude_prompt, convert_to_tensor=True)

cos_scores_exclude = util.cos_sim(exclude_embedding, embeddings).cpu().numpy().flatten()
top_indices_exclude = cos_scores_exclude.argsort()[::-1][:5]

df['similarity_exclude'] = cos_scores_exclude

In [285]:
scaler = MinMaxScaler()

df['similarity_norm_dislike'] = scaler.fit_transform(df[['similarity_exclude']])
df['accord_score_norm_dislike'] = scaler.fit_transform(df[['accord_score_exclude']])

alpha = 0.7 #adjustable to match what is more important

df['final_score_dislike'] = alpha * df['similarity_norm_dislike'] + (1 - alpha) * df['accord_score_norm_dislike']

In [286]:
df['final_score'] = df['final_score_like'] - df['final_score_dislike']

In [288]:
top_recommendations_transformers = df.sort_values(by='final_score', ascending=False).head(5)
top_recommendations_transformers[['Name', 'Designer', 'similarity_like', 'similarity_exclude', 'accord_score_include', 'accord_score_exclude', 'final_score']]

Unnamed: 0,Name,Designer,similarity_like,similarity_exclude,accord_score_include,accord_score_exclude,final_score
1362,Oud Tobacco,Montale,0.60265,0.021865,393.4624,0,0.683191
747,XJ 1861 Naxos,Xerjoff,0.605667,0.007969,313.6635,0,0.674013
1174,Ange ou Demon,Givenchy,0.592937,0.009493,324.5034,0,0.661845
358,Tonka Imperiale,Guerlain,0.587277,0.003801,303.1614,0,0.65817
964,Smoking Hot,By Kilian,0.637958,0.03286,340.6235,0,0.650738
