### libraries

In [767]:
from sqlalchemy import create_engine
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer, util
from rapidfuzz import process, fuzz
from dotenv import load_dotenv

import os
import pandas as pd
import re
import numpy as np

### database connection

In [768]:
load_dotenv()

username = os.getenv("DB_USER")
password = os.getenv("DB_PASSWORD")
host =  os.getenv("DB_HOST")
port = os.getenv("DB_PORT")
database =  os.getenv("DB_DATABASE")

connection_string = f'mysql+pymysql://{username}:{password}@{host}:{port}/{database}'
engine = create_engine(connection_string, echo=True)

### loading the table

In [769]:
table_name = 'Perfumes_v2'

df = pd.read_sql_table(table_name, con=engine)

data_list = df.to_dict(orient='records')  # Each row as a dictionary in a list

df['text_features'] = df['Description'].fillna('') + ' ' + df['Accords'].fillna('') + ' ' + df['Designer'].fillna('') + ' ' + df['TopNotes'].fillna('') + ' ' + df['MiddleNotes'].fillna('') + ' ' + df['BaseNotes'].fillna('')

2025-05-24 14:03:35,921 INFO sqlalchemy.engine.Engine SELECT DATABASE()
2025-05-24 14:03:35,923 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-05-24 14:03:35,969 INFO sqlalchemy.engine.Engine SELECT @@sql_mode
2025-05-24 14:03:35,971 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-05-24 14:03:35,994 INFO sqlalchemy.engine.Engine SELECT @@lower_case_table_names
2025-05-24 14:03:35,995 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-05-24 14:03:36,044 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-05-24 14:03:36,046 INFO sqlalchemy.engine.Engine DESCRIBE `michals2`.`Perfumes_v2`
2025-05-24 14:03:36,049 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-05-24 14:03:36,075 INFO sqlalchemy.engine.Engine SHOW FULL TABLES FROM `michals2`
2025-05-24 14:03:36,078 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-05-24 14:03:36,103 INFO sqlalchemy.engine.Engine SHOW FULL TABLES FROM `michals2`
2025-05-24 14:03:36,105 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-05-24 14:03:36,144 INFO sqlalc

df #use as dataframe </br>
data_list #use as dict

#### define user prompt

In [770]:
user_prompt = "I like woody, tabacco notes which would be perfect for evening."

## basic

#### content based filtering

In [771]:
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(df['text_features'])
user_vec = vectorizer.transform([user_prompt])
#get results
cos_similarities = cosine_similarity(user_vec, tfidf_matrix).flatten()
#get top 5 recommendation indexes
top_indices = cos_similarities.argsort()[::-1][:5]
recommended_perfumes = df.iloc[top_indices][['Name', 'Designer', 'Description', 'Accords', 'TopNotes', 'MiddleNotes', 'BaseNotes']]
recommended_perfumes['similarity'] = cos_similarities[top_indices]

In [772]:
print("Top perfume recommendations:")
recommended_perfumes

Top perfume recommendations:


Unnamed: 0,Name,Designer,Description,Accords,TopNotes,MiddleNotes,BaseNotes,similarity
2858,Sweet Milk,The Dua Brand,Sweet Milk by The Dua Brand is a fragrance for...,amber (100%); sweet (91.6003%); lactonic (80.4...,"['Milk', 'Marshmallow', 'Tonka Bean', 'Benzoin...",[],[],0.105903
2347,You Or Someone Like You,Etat Libre d'Orange,You Or Someone Like You by Etat Libre d'Orange...,green (100%); aromatic (69.7843%); fresh spicy...,"['Mint', 'Grapefruit', 'Bergamot', 'Anise']","['Green Notes', 'Cassis', 'Rose', 'Hedione']",['White Musk'],0.086034
2027,Sweet Diamond Pink Pepper 25,Kayali Fragrances,Sweet Diamond Pink Pepper 25 by Kayali Fragran...,rose (100%); warm spicy (83.0303%); soft spicy...,"['Pink Pepper', 'Saffron', 'Bergamot', 'Royal ...","['Bulgarian Rose', 'May Rose', 'Vanilla Orchid...","['Amber', 'Sandalwood', 'Patchouli', 'Musk']",0.077753
1098,Dia Woman,Amouage,Dia Woman by Amouage is a Floral fragrance for...,floral (100%); fresh (99.8436%); aldehydic (88...,"['Aldehydes', 'Cyclamen', 'Violet Leaf', 'Sage...","['Peony', 'Turkish Rose Oil', 'Orris Root', 'O...","['White Musk', 'Heliotrope', 'Sandalwood', 'In...",0.075209
52,Sand Desert At Sunset,Zara,Sand Desert At Sunset by Zara is a Oriental Sp...,warm spicy (100%); cinnamon (76.7838%); amber ...,"['Orange', 'Incense']","['Cinnamon', 'Tonka Bean', 'Chocolate']","['Iris', 'Cedar']",0.07407


#### loading the prelearned model

In [773]:
#load the model
cwd = os.getcwd()

project_root = os.path.abspath(os.path.join(cwd, '..'))
model_path = os.path.join(project_root, 'backend', 'recommend_models', 'transformer_model')
embeddings_path = os.path.join(project_root, 'backend', 'recommend_models', 'embeddings.npy')

### sentence transformer

In [774]:
model = SentenceTransformer(model_path)
embeddings = np.load(embeddings_path)

user_embedding = model.encode(user_prompt, convert_to_tensor=True)

cos_scores = util.cos_sim(user_embedding, embeddings).cpu().numpy().flatten()
top_indices = cos_scores.argsort()[::-1][:5]

In [775]:
recommended_perfumes = df.iloc[top_indices][['Name', 'Designer', 'Description', 'Accords', 'TopNotes', 'MiddleNotes', 'BaseNotes']]
recommended_perfumes['similarity'] = cos_similarities[top_indices]

# Display
recommended_perfumes

Unnamed: 0,Name,Designer,Description,Accords,TopNotes,MiddleNotes,BaseNotes,similarity
1292,Trianon Palace Versailles,Maison Francis Kurkdjian,,floral (100%); woody (92.2222%),"['Floral Notes', 'Woodsy Notes']",[],[],0.037491
2181,The Tragedy of Lord George,Penhaligon's,,woody (100%); amber (77.6319%); vanilla (52.50...,"['Woodsy Notes', 'Brandy', 'Tonka Bean', 'Amber']",[],[],0.01573
1163,Lady Million Royal,Rabanne,,woody (100%); white floral (84.9506%); fruity ...,['Pomegranate'],['White Flowers'],['Woodsy Notes'],0.017597
998,Let s Settle This Argument Like Adults In The ...,By Kilian,,woody (100%); powdery (70.7588%); fruity (62.3...,"['Bergamot', 'Lemon']","['Fig Nectar', 'Green Notes', 'Hedione']","['Vanilla', 'Sandalwood', 'Cedar', 'Orris', 'A...",0.01356
2967,Shine,Ajmal,,woody (100%); fruity (95.7116%); powdery (69.1...,"['Strawberry', 'Pomegranate']","['Peony', 'Lily-of-the-Valley']","['Powdery Notes', 'Woody Notes']",0.026302


## accords

### extracting accords from dataframe

In [776]:
unique_accords = set()

for row in df['Accords'].dropna():
    matches = re.findall(r'([\w\s\-]+)\s*\(\d', row)
    unique_accords.update(name.strip() for name in matches)

unique_accords_list = list(unique_accords)

#### matching accords to given prompt by user

In [777]:
prompt_words = re.findall(r'\b\w+\b', user_prompt.lower())

matched_accords = set()
threshold = 80 # adjusted manually, determines sensitivity to finding matching words

for word in prompt_words:
    match, score, _ = process.extractOne(word, unique_accords_list, scorer=fuzz.partial_ratio)
    if score >= threshold:
        matched_accords.add(match)


def extract_accord_scores(accords_string, matched_accords):
    scores = {}
    for match in matched_accords:
        pattern = re.compile(rf'{re.escape(match)}\s*\(([\d\.]+)%\)', re.IGNORECASE)
        result = pattern.search(accords_string)
        if result:
            scores[match] = float(result.group(1))
    return scores

#### content based but with accords

In [778]:
df_content = df

In [779]:
df_content['matched_accord_scores'] = df_content['Accords'].apply(lambda x: extract_accord_scores(str(x), matched_accords))
df_content['accord_match_score'] = df_content['matched_accord_scores'].apply(lambda d: sum(d.values()) if d else 0)

In [780]:
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(df['text_features'])
user_vec = vectorizer.transform([user_prompt])
cos_similarities = cosine_similarity(user_vec, tfidf_matrix).flatten()
df_content['similarity'] = cos_similarities

In [781]:
scaler = MinMaxScaler()

df_content['similarity_norm'] = scaler.fit_transform(df_content[['similarity']])
df_content['accord_score_norm'] = scaler.fit_transform(df_content[['accord_match_score']])

alpha = 0.7 #adjustable to match what is more important
 
df['final_score'] = alpha * df_content['similarity_norm'] + (1 - alpha) * df_content['accord_score_norm']
top_recommendations = df_content.sort_values(by='final_score', ascending=False).head(5)
top_recommendations[['Name', 'Designer', 'similarity', 'accord_match_score', 'final_score']]

Unnamed: 0,Name,Designer,similarity,accord_match_score,final_score
2858,Sweet Milk,The Dua Brand,0.105903,106.6668,0.807101
1768,Mr Burberry Eau de Parfum,Burberry,0.069742,181.3343,0.643053
2347,You Or Someone Like You,Etat Libre d'Orange,0.086034,69.7843,0.638733
1098,Dia Woman,Amouage,0.075209,111.4076,0.608979
2027,Sweet Diamond Pink Pepper 25,Kayali Fragrances,0.077753,65.905,0.580107


#### sentence transformer but with accords

In [782]:
df_transformer = df

In [783]:
model = SentenceTransformer(model_path)
embeddings = np.load(embeddings_path)

user_embedding = model.encode(user_prompt, convert_to_tensor=True)

cos_scores = util.cos_sim(user_embedding, embeddings).cpu().numpy().flatten()
top_indices = cos_scores.argsort()[::-1][:5]

df_transformer['similarity'] = cos_scores

In [784]:
scaler = MinMaxScaler()

df_transformer['similarity_norm'] = scaler.fit_transform(df_transformer[['similarity']])
df_transformer['accord_score_norm'] = scaler.fit_transform(df_transformer[['accord_match_score']])

alpha = 0.7 #adjustable to match what is more important
 
df_transformer['final_score'] = alpha * df_transformer['similarity_norm'] + (1 - alpha) * df_transformer['accord_score_norm']
top_recommendations_transformers = df_transformer.sort_values(by='final_score', ascending=False).head(5)
top_recommendations_transformers[['Name', 'Designer', 'similarity', 'accord_match_score', 'final_score']]

Unnamed: 0,Name,Designer,similarity,accord_match_score,final_score
1292,Trianon Palace Versailles,Maison Francis Kurkdjian,0.532245,92.2222,0.792598
2181,The Tragedy of Lord George,Penhaligon's,0.52477,100.0,0.790447
1678,Royal Mayfair 2024,Creed,0.378687,272.251,0.768742
1471,Hermessence Vetiver Tonka,Hermès,0.389861,236.7209,0.747957
1595,Aura Mugler,Mugler,0.455639,144.5255,0.743036


## negative use of words

In [785]:
table_name = 'Perfumes_v2'

df = pd.read_sql_table(table_name, con=engine)

data_list = df.to_dict(orient='records')  # Each row as a dictionary in a list

df['text_features'] = df['Description'].fillna('') + ' ' + df['Accords'].fillna('') + ' ' + df['Designer'].fillna('') + ' ' + df['TopNotes'].fillna('') + ' ' + df['MiddleNotes'].fillna('') + ' ' + df['BaseNotes'].fillna('')

2025-05-24 14:03:39,259 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-05-24 14:03:39,261 INFO sqlalchemy.engine.Engine DESCRIBE `michals2`.`Perfumes_v2`
2025-05-24 14:03:39,262 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-05-24 14:03:39,290 INFO sqlalchemy.engine.Engine SHOW FULL TABLES FROM `michals2`
2025-05-24 14:03:39,291 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-05-24 14:03:39,317 INFO sqlalchemy.engine.Engine SHOW FULL TABLES FROM `michals2`
2025-05-24 14:03:39,318 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-05-24 14:03:39,341 INFO sqlalchemy.engine.Engine SHOW CREATE TABLE `Perfumes_v2`
2025-05-24 14:03:39,343 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-05-24 14:03:39,370 INFO sqlalchemy.engine.Engine SELECT `Perfumes_v2`.id, `Perfumes_v2`.`Name`, `Perfumes_v2`.`Designer`, `Perfumes_v2`.`URL`, `Perfumes_v2`.`Gender`, `Perfumes_v2`.`Accords`, `Perfumes_v2`.`Description`, `Perfumes_v2`.`ProsAndCons`, `Perfumes_v2`.`TopNotes`, `Perfumes_v2`.`MiddleNotes`, `Pe

In [None]:
#without keywords
# user_prompt = "Fresh citrus floral notes are my go-to scents."
# user_prompt = "Citrus, lavender, and patchouli always make a great combination."
# user_prompt = "Fresh, clean, and powdery notes are ideal for daily wear."
# user_prompt = "Woody, earthy scents work well in the fall and winter."

#with include keywords
# user_prompt = "I really enjoy vanilla, sandalwood, and amber accords."
# user_prompt = "I prefer vanilla, musk, and amber in my perfumes."
# user_prompt = "Looking for something with fresh, green, and aquatic vibes."
# user_prompt = "I love sweet floral combinations and soft musk notes."

#with exclude keyword
# user_prompt = "I dislike overly sweet, smoky, or synthetic fragrances."
# user_prompt = "Avoid strong synthetic and overly spicy notes."
# user_prompt = "I don’t like anything fruity or overly sweet."
# user_prompt = "Skip powdery and incense-based fragrances."

#with mix both exclude and include
# user_prompt = "I love woody and musky tones but avoid anything too fruity or aquatic."
# user_prompt = "I enjoy fresh and citrusy scents, but dislike woody and leathery notes."
user_prompt = "Looking for green and floral perfumes, without anything too musky or heavy."
# user_prompt = "Prefer amber and vanilla accords but avoid anything synthetic or spicy."

#absurdly long prompt:
# user_prompt = "Explore a world of olfactory elegance and complexity with a collection of perfumes that embody the artistry of scent composition. Each fragrance tells a unique story through its accords, blending notes in harmonious symphonies that evoke emotions and memories. Imagine a perfume where fresh citrus top notes dance delicately with vibrant bergamot and zesty lemon, leading into a heart of lush jasmine and romantic rose, underscored by warm base notes of sensual musk and velvety vanilla. Picture another fragrance, where crisp green apple mingles with spicy cinnamon and earthy patchouli, creating an aura of mystery and allure. Step into a realm where each perfume is meticulously crafted, balancing floral, fruity, woody, and oriental elements to create a sensory journey. Envision a scent that opens with a burst of Mediterranean herbs, intertwining with aromatic lavender and intoxicating tobacco, culminating in a rich base of amber and leather, reminiscent of an evening by the fireplace. Picture yet another composition where sparkling grapefruit intertwines with aquatic marine notes, complemented by a bouquet of water lilies and lotus flowers, finishing with a hint of warm sandalwood and vetiver. Delve deeper into the nuances of perfumery, where accords like oud and saffron lend an exotic touch, while ambergris and tonka bean add depth and sophistication. Imagine a fragrance inspired by a lush garden, with dewy green notes, jasmine blossoms, and a touch of sweet honey, evoking the tranquility of nature in full bloom. Consider the allure of a perfume that blends smoky incense with opulent spices, creating an aura of ceremonial mystique and spiritual reverence In this collection, each perfume is not just a blend of scents but a narrative woven from the finest essences sourced from around the world. Experience the contrast of light and shadow, freshness and warmth, purity and sensuality in every bottle. Visualize the craftsmanship behind each fragrance, from the selection of raw materials to the meticulous blending process, resulting in a masterpiece that transcends time and trends. Immerse yourself in the world of perfumery, where tradition meets innovation, and each scent is an invitation to explore the realms of beauty and desire. Discover your signature fragrance among these olfactory masterpieces, each one a testament to the art of perfumery and the power of scent to captivate, seduce, and inspire"

take care of user prompt matched accords

In [1050]:
def match_accords(prompt, threshold = 80):
    prompt_words = re.findall(r'\b\w+\b', prompt.lower())

    matched_accords = set()

    for word in prompt_words:
        match, score, _ = process.extractOne(word, unique_accords_list, scorer=fuzz.partial_ratio)
        if score >= threshold:
            matched_accords.add(match)

    return matched_accords

def extract_accord_scores(accords_string, matched_accords):
    scores = {}
    for match in matched_accords:
        pattern = re.compile(rf'{re.escape(match)}\s*\(([\d\.]+)%\)', re.IGNORECASE)
        result = pattern.search(accords_string)
        if result:
            scores[match] = float(result.group(1))
    return scores

#### keywords

In [1051]:
INCLUDE_KEYWORDS = {
    'like', 'love', 'include', 'want', 'with', 'prefer', 'enjoy', 'looking',
    'need', 'interested', 'favor', 'wish', 'choose', 'require', 'add', 'pick',
    'select', 'hope', 'desire', 'accept', 'appreciate', 'keen', 'fond', 'favoring',
    'crave', 'seek', 'inclined', 'opt', 'request'
}
EXCLUDE_KEYWORDS = {
    'not', 'exclude', 'avoid', 'without', 'nothing', 'skip', 'dislike',
    'never', 'none', 'refuse', 'reject', 'remove', 'deny', 'lack',
    'bypass', 'ban', 'prohibit', 'decline', 'omit', 'except',
    'nope', 'nix', 'hate', 'forbid'
}
STOP_WORDS = {'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours',
 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers',
 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves',
 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are',
 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does',
 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until',
 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into',
 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down',
 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here',
 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',
 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so',
 'than', 'too', 'very', 'can', 'will', 'just', 'don', 'should', 'now'}

In [1052]:
def split_prompt_by_keywords(prompt):
    prompt_lower = prompt.lower()

    keywords = {word: 'include' for word in INCLUDE_KEYWORDS}
    keywords.update({word: 'exclude' for word in EXCLUDE_KEYWORDS})

    tokens = re.findall(r'\b\w+\b|,', prompt_lower)

    included = set()
    excluded = set()

    current_mode = None

    for token in tokens:
        if token in keywords:
            current_mode = keywords[token]
        elif token == ',':
            continue
        else:
            if current_mode == 'include':
                included.add(token)
            elif current_mode == 'exclude':
                excluded.add(token)
            else:
                pass

    if not included and not excluded:
        all_words = [w for w in tokens if w != ',']
        included.update(all_words)

    included = {word for word in included if word not in STOP_WORDS}
    excluded = {word for word in excluded if word not in STOP_WORDS}

    return included, excluded

In [1053]:
include_words, exclude_words = split_prompt_by_keywords(user_prompt)

In [1054]:
include_words

{'musky', 'tones', 'woody'}

In [1055]:
exclude_words

{'anything', 'aquatic', 'fruity'}

create prompts with associated liknage to user liking based on user prompt

In [1056]:
include_prompt = ' '.join(map(str, include_words))
exclude_prompt = ' '.join(map(str, exclude_words))

In [1057]:
include_matched = match_accords(include_prompt)
exclude_matched = match_accords(exclude_prompt)

#### add accord columns

In [1058]:
df['matched_include'] = df['Accords'].apply(lambda x: extract_accord_scores(str(x), include_matched))
df['accord_score_include'] = df['matched_include'].apply(lambda d: sum(d.values()) if d else 0)

df['matched_exclude'] = df['Accords'].apply(lambda x: extract_accord_scores(str(x), exclude_matched))
df['accord_score_exclude'] = df['matched_exclude'].apply(lambda d: sum(d.values()) if d else 0)

want

In [1059]:
model = SentenceTransformer(model_path)
embeddings = np.load(embeddings_path)

In [1060]:
include_embedding = model.encode(include_prompt, convert_to_tensor=True)

cos_scores_include = util.cos_sim(include_embedding, embeddings).cpu().numpy().flatten()
top_indices_include = cos_scores_include.argsort()[::-1][:5]

df['similarity_like'] = cos_scores_include

In [1061]:
scaler = MinMaxScaler()

df['similarity_norm_like'] = scaler.fit_transform(df[['similarity_like']])
df['accord_score_norm_like'] = scaler.fit_transform(df[['accord_score_include']])

alpha = 0.7 #adjustable to match what is more important

df['final_score_like'] = alpha * df['similarity_norm_like'] + (1 - alpha) * df['accord_score_norm_like']

don't want

In [1062]:
model = SentenceTransformer(model_path)
embeddings = np.load(embeddings_path)

In [1063]:
exclude_embedding = model.encode(exclude_prompt, convert_to_tensor=True)

cos_scores_exclude = util.cos_sim(exclude_embedding, embeddings).cpu().numpy().flatten()
top_indices_exclude = cos_scores_exclude.argsort()[::-1][:5]

df['similarity_exclude'] = cos_scores_exclude

In [1064]:
scaler = MinMaxScaler()

df['similarity_norm_dislike'] = scaler.fit_transform(df[['similarity_exclude']])
df['accord_score_norm_dislike'] = scaler.fit_transform(df[['accord_score_exclude']])

alpha = 0.7 #adjustable to match what is more important

df['final_score_dislike'] = alpha * df['similarity_norm_dislike'] + (1 - alpha) * df['accord_score_norm_dislike']

In [1065]:
df['final_score'] = df['final_score_like'] - df['final_score_dislike']

In [1066]:
top_recommendations_transformers = df.sort_values(by='final_score', ascending=False).head(5)
top_recommendations_transformers[['Name', 'Designer', 'similarity_like', 'similarity_exclude', 'accord_score_include', 'accord_score_exclude', 'final_score']]

Unnamed: 0,Name,Designer,similarity_like,similarity_exclude,accord_score_include,accord_score_exclude,final_score
2777,Not A Perfume,Juliette Has A Gun,0.473416,0.151741,118.8231,0.0,0.610287
2157,Mukhlat Abiyad,Afnan,0.438166,0.206856,186.9769,0.0,0.603408
2667,Gaiac 10 Tokyo,Le Labo,0.518728,0.280006,170.4579,0.0,0.59783
2802,White Spirit,Juliette Has A Gun,0.444646,0.186047,156.7696,0.0,0.590497
2158,Dehn al Oudh Abiyad,Afnan,0.361906,0.17895,181.3332,0.0,0.528983
