In [56]:
from sqlalchemy import create_engine
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from rapidfuzz import process, fuzz

import pandas as pd
import re

In [57]:
username = 'michals2'
password = 'Nv1VZRmuTxRvJn3u'
host = 'mysql.agh.edu.pl'
port = 3306
database = 'michals2'

connection_string = f'mysql+pymysql://{username}:{password}@{host}:{port}/{database}'

In [58]:
engine = create_engine(connection_string, echo=True)

In [59]:
table_name = 'Perfumes'

df = pd.read_sql_table(table_name, con=engine)
data_list = df.to_dict(orient='records')  # Each row as a dictionary in a list

2025-05-03 12:01:49,577 INFO sqlalchemy.engine.Engine SELECT DATABASE()
2025-05-03 12:01:49,579 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-05-03 12:01:49,626 INFO sqlalchemy.engine.Engine SELECT @@sql_mode
2025-05-03 12:01:49,627 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-05-03 12:01:49,651 INFO sqlalchemy.engine.Engine SELECT @@lower_case_table_names
2025-05-03 12:01:49,653 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-05-03 12:01:49,699 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-05-03 12:01:49,702 INFO sqlalchemy.engine.Engine DESCRIBE `michals2`.`Perfumes`
2025-05-03 12:01:49,703 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-05-03 12:01:49,732 INFO sqlalchemy.engine.Engine SHOW FULL TABLES FROM `michals2`
2025-05-03 12:01:49,733 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-05-03 12:01:49,756 INFO sqlalchemy.engine.Engine SHOW FULL TABLES FROM `michals2`
2025-05-03 12:01:49,761 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-05-03 12:01:49,811 INFO sqlalchem

df #use as dataframe </br>
data_list #use as dict

define user prompt

In [60]:
user_prompt = "I like woody, tabacco notes which would be perfect for evening."

#### content based filtering

In [61]:
df['text_features'] = df['Description'].fillna('') + ' ' + df['Accords'].fillna('') + ' ' + df['Designer'].fillna('') + ' ' + df['TopNotes'].fillna('') + ' ' + df['MiddleNotes'].fillna('') + ' ' + df['BaseNotes'].fillna('')

In [62]:
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(df['text_features'])

user_vec = vectorizer.transform([user_prompt])

cos_similarities = cosine_similarity(user_vec, tfidf_matrix).flatten()

In [63]:
top_indices = cos_similarities.argsort()[::-1][:5]
recommended_perfumes = df.iloc[top_indices][['Name', 'Designer', 'Description', 'Accords', 'TopNotes', 'MiddleNotes', 'BaseNotes']]
recommended_perfumes['similarity'] = cos_similarities[top_indices]

In [64]:
print("Top perfume recommendations:")
recommended_perfumes

Top perfume recommendations:


Unnamed: 0,Name,Designer,Description,Accords,TopNotes,MiddleNotes,BaseNotes,similarity
12,Chilling By The Pacific,The Dua Brand,Chilling By The Pacific by The Dua Brand is a ...,citrus (100%); aromatic (83.5618%); green (59....,"['Citron', 'Coriander', 'Mint', 'Apricot', 'Or...",[],[],0.07764
1098,Dia Woman,Amouage,Dia Woman by Amouage is a Floral fragrance for...,floral (100%); fresh (99.8436%); aldehydic (88...,"['Aldehydes', 'Cyclamen', 'Violet Leaf', 'Sage...","['Peony', 'Turkish Rose Oil', 'Orris Root', 'O...","['White Musk', 'Heliotrope', 'Sandalwood', 'In...",0.076541
52,Sand Desert At Sunset,Zara,Sand Desert At Sunset by Zara is a Oriental Sp...,warm spicy (100%); cinnamon (76.7838%); amber ...,"['Orange', 'Incense']","['Cinnamon', 'Tonka Bean', 'Chocolate']","['Iris', 'Cedar']",0.075917
644,Cinéma,Yves Saint Laurent,Cinéma by Yves Saint Laurent is a Oriental Flo...,floral (100%); vanilla (75.8754%); amber (73.5...,"['Almond Blossom', 'Clementine', 'Cyclamen']","['Jasmine', 'Peony', 'Amaryllis']","['Vanilla', 'Amber', 'Benzoin', 'White Musk']",0.071787
0,Unicorn Milkshake,The Dua Brand,Unicorn Milkshake by The Dua Brand is a Floral...,sweet (100%); vanilla (58.1656%); lactonic (55...,[],[],[],0.071374


#### sentence transformer

In [65]:
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('all-MiniLM-L6-v2')
text_features = df['text_features'].tolist()

embeddings = model.encode(text_features, convert_to_tensor=True)
user_embedding = model.encode(user_prompt, convert_to_tensor=True)

cos_scores = util.cos_sim(user_embedding, embeddings).cpu().numpy().flatten()
top_indices = cos_scores.argsort()[::-1][:5]

In [66]:
recommended_perfumes = df.iloc[top_indices][['Name', 'Designer', 'Description', 'Accords', 'TopNotes', 'MiddleNotes', 'BaseNotes']]
recommended_perfumes['similarity'] = cos_similarities[top_indices]

# Display
recommended_perfumes

Unnamed: 0,Name,Designer,Description,Accords,TopNotes,MiddleNotes,BaseNotes,similarity
998,,By Kilian,,woody (100%); powdery (70.7588%); fruity (62.3...,"['Bergamot', 'Lemon']","['Fig Nectar', 'Green Notes', 'Hedione']","['Vanilla', 'Sandalwood', 'Cedar', 'Orris', 'A...",0.015269
950,,Mancera,,coffee (100%); woody (92.2342%); warm spicy (7...,"['Black Currant', 'Bergamot', 'Peach']","['Coffee', 'Amber', 'Floral Notes']","['Woody Notes', 'Sweet Notes', 'White Musk']",0.032694
766,More Than Words,Xerjoff,,amber (100%); oud (68.8718%); fruity (64.8205%...,"['Agarwood (Oud)', 'Fruity Notes', 'Floral Not...",[],[],0.040287
450,Dangerous Lily,Dzintars,,white floral (100%); floral (76.6668%); woody ...,"['Lily', 'Floral Notes', 'dark woodsy notes', ...",[],[],0.026576
977,,By Kilian,,sweet (100%); woody (75.308%); fruity (67.0148...,"['Cognac', 'Raspberry', 'Liquor']","['Tonka Bean', 'Bulgarian Rose', 'Oakmoss']","['Praline', 'Oak', 'Sandalwood']",0.007445


#### extracting accords from dataframe

In [67]:
unique_accords = set()

for row in df['Accords'].dropna():
    matches = re.findall(r'([\w\s\-]+)\s*\(\d', row)
    unique_accords.update(name.strip() for name in matches)

unique_accords_list = list(unique_accords)

#### matching accords to given prompt by user

In [68]:
prompt_words = re.findall(r'\b\w+\b', user_prompt.lower())

matched_accords = set()
threshold = 80 # adjusted manually, determines sensitivity to finding matching words

for word in prompt_words:
    match, score, _ = process.extractOne(word, unique_accords_list, scorer=fuzz.partial_ratio)
    if score >= threshold:
        matched_accords.add(match)


def extract_accord_scores(accords_string, matched_accords):
    scores = {}
    for match in matched_accords:
        pattern = re.compile(rf'{re.escape(match)}\s*\(([\d\.]+)%\)', re.IGNORECASE)
        result = pattern.search(accords_string)
        if result:
            scores[match] = float(result.group(1))
    return scores

#### content based but with accords

In [69]:
df_content = df

In [70]:
df_content['matched_accord_scores'] = df_content['Accords'].apply(lambda x: extract_accord_scores(str(x), matched_accords))

df_content['accord_match_score'] = df_content['matched_accord_scores'].apply(lambda d: sum(d.values()) if d else 0)

In [71]:
print(f"Mean: {df_content['accord_match_score'].mean()}, Std: {df_content['accord_match_score'].std()}")

Mean: 76.8074131059246, Std: 51.769600956639366


In [72]:
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(df['text_features'])

user_vec = vectorizer.transform([user_prompt])

cos_similarities = cosine_similarity(user_vec, tfidf_matrix).flatten()

df_content['similarity'] = cos_similarities

In [73]:
scaler = MinMaxScaler()

df_content['similarity_norm'] = scaler.fit_transform(df_content[['similarity']])
df_content['accord_score_norm'] = scaler.fit_transform(df_content[['accord_match_score']])

alpha = 0.7 #adjustable to match what is more important
 
df['final_score'] = alpha * df_content['similarity_norm'] + (1 - alpha) * df_content['accord_score_norm']
top_recommendations = df_content.sort_values(by='final_score', ascending=False).head(5)
top_recommendations[['Name', 'Designer', 'similarity', 'accord_match_score', 'final_score']]

Unnamed: 0,Name,Designer,similarity,accord_match_score,final_score
1098,Dia Woman,Amouage,0.076541,63.5253,0.772497
12,Chilling By The Pacific,The Dua Brand,0.07764,46.2534,0.760002
552,Tobacolor,Dior,0.06089,156.188,0.751599
52,Sand Desert At Sunset,Zara,0.075917,48.0105,0.746751
1069,Guidance,Amouage,0.057411,166.4206,0.733509


#### sentence transformer but with accords

In [74]:
df_transformer = df

In [75]:
model = SentenceTransformer('all-MiniLM-L6-v2')
text_features = df_transformer['text_features'].tolist()

embeddings = model.encode(text_features, convert_to_tensor=True)
user_embedding = model.encode(user_prompt, convert_to_tensor=True)

cos_scores = util.cos_sim(user_embedding, embeddings).cpu().numpy().flatten()
df_transformer['similarity'] = cos_scores

In [76]:
scaler = MinMaxScaler()

df_transformer['similarity_norm'] = scaler.fit_transform(df_transformer[['similarity']])
df_transformer['accord_score_norm'] = scaler.fit_transform(df_transformer[['accord_match_score']])

alpha = 0.7 #adjustable to match what is more important
 
df_transformer['final_score'] = alpha * df_transformer['similarity_norm'] + (1 - alpha) * df_transformer['accord_score_norm']
top_recommendations_transformers = df_transformer.sort_values(by='final_score', ascending=False).head(5)
top_recommendations_transformers[['Name', 'Designer', 'similarity', 'accord_match_score', 'final_score']]

Unnamed: 0,Name,Designer,similarity,accord_match_score,final_score
998,,By Kilian,0.471539,162.3165,0.910566
950,,Mancera,0.458425,146.2041,0.870754
948,,Mancera,0.425671,169.8644,0.85422
700,Le Beau Le Parfum,Jean Paul Gaultier,0.415989,166.1747,0.835473
994,,By Kilian,0.419754,158.2636,0.830638
