In [None]:
from bs4 import BeautifulSoup
import os
import numpy as np
import pandas as pd

In [None]:
file_name = '2000-2010.csv'

In [None]:
# Load the scraped CSV named by file_name.
raw_data = pd.read_csv(file_name)
# Work on a copy so raw_data keeps the data exactly as it was read.
perfume_data = raw_data.copy()

## Drop unnecessary scraping columns (web-scraper-order, web-scraper-start-url, product-link, product-link-href)

In [None]:
# Remove the scraper bookkeeping columns; they are not used by any later cell.
perfume_data = perfume_data.drop(
    columns=[
        'web-scraper-order',
        'web-scraper-start-url',
        'product-link',
        'product-link-href',
    ]
)

In [None]:
perfume_data.shape

(2307, 15)

In [None]:
print(perfume_data.at[0, 'winter'])

<div style="border-radius: 0.2rem; height: 0.3rem; background: rgb(120, 214, 240); width: 74.0864%; opacity: 1;"></div>


## Extracting season and day/night percentage values

In [None]:
import re

In [None]:
def get_season_day_night(html_string):
    """Extract the ``width`` percentage from a progress-bar HTML snippet.

    Parameters
    ----------
    html_string : str or missing value
        HTML containing an inline style such as ``width: 74.0864%``.

    Returns
    -------
    float or None
        The first ``width: <number>%`` value found, or ``None`` when the cell
        is missing (None/NaN) or contains no such width.
    """
    # Missing cells are not strings; re.search would raise TypeError on them.
    if pd.isna(html_string):
        return None

    match = re.search(r'width: (\d+\.?\d*)%', html_string)
    return float(match.group(1)) if match else None

In [None]:
get_season_day_night(perfume_data.at[0, 'winter'])

74.0864

In [None]:
# test_season = '<div style="border-radius: 0.2rem; height: 0.3rem; background: rgb(120, 214, 240); width: 98.5782%; opacity: 1;"></div>'
# print(get_season_day_night(test_season))

In [None]:
# Replace the progress-bar HTML in each season / time-of-day column with the
# numeric width percentage returned by get_season_day_night.
for column in ('winter', 'spring', 'fall', 'summer', 'day', 'night'):
    perfume_data[column] = perfume_data[column].apply(get_season_day_night)

In [None]:
perfume_data.head()

Unnamed: 0,name,accords,notes,longevity,sillage,gender,price-value,winter,spring,summer,fall,day,night,votes,rating
0,Nature Millenaire Yves Rocher for women,"<div class=""cell accord-box""><div class=""accor...","<div class=""cell""><div style=""display: flex; f...","<div class=""grid-x grid-margin-x""><div class=""...","<div class=""grid-x grid-margin-x""><div class=""...","<div class=""grid-x grid-margin-x""><div class=""...",,74.0864,15.6146,11.2957,100.0,71.0963,70.0997,847,4.35
1,Tea for Two L'Artisan Parfumeur for women and men,"<div class=""cell accord-box""><div class=""accor...","<div class=""cell""><div style=""display: flex; f...","<div class=""grid-x grid-margin-x""><div class=""...","<div class=""grid-x grid-margin-x""><div class=""...","<div class=""grid-x grid-margin-x""><div class=""...",,72.155,27.4818,14.4068,100.0,85.8354,49.7579,2641,4.11
2,I Am Eisenberg for women,"<div class=""cell accord-box""><div class=""accor...","<div class=""cell""><div style=""display: flex; f...","<div class=""grid-x grid-margin-x""><div class=""...",,"<div class=""grid-x grid-margin-x""><div class=""...",,57.783,75.2358,42.9245,75.7075,100.0,36.7925,1397,3.95
3,Noix de Coco de Malaisie Yves Rocher for women,"<div class=""cell accord-box""><div class=""accor...","<div class=""cell""><div style=""display: flex; f...","<div class=""grid-x grid-margin-x""><div class=""...",,"<div class=""grid-x grid-margin-x""><div class=""...",,53.4615,34.8077,73.4615,38.6538,100.0,23.4615,1309,3.76
4,Osmanthus The Different Company for women and men,"<div class=""cell accord-box""><div class=""accor...","<div class=""cell""><div style=""display: flex; f...","<div class=""grid-x grid-margin-x""><div class=""...","<div class=""grid-x grid-margin-x""><div class=""...","<div class=""grid-x grid-margin-x""><div class=""...",,13.75,86.25,69.375,40.625,100.0,17.5,534,3.73


In [None]:
nan_counts = perfume_data.isna().sum()
print(nan_counts)

name              0
accords          12
notes            12
longevity       274
sillage         994
gender            0
price-value    1713
winter           11
spring           11
summer           11
fall             11
day              11
night            11
votes             2
rating            2
dtype: int64


In [None]:
perfume_data.at[0, 'price-value']

nan

## Extracting Longevity, Sillage, Gender, Price-Value values

In [None]:
def extract_long_sill_gend_pv(html):
    """Parse a block of vote rows into a ``{label: vote_count}`` dict.

    Applied to the longevity, sillage, gender and price-value columns. Each
    row is a ``<div class="grid-x grid-margin-x">`` holding a
    ``vote-button-name`` span (the label) and a ``vote-button-legend`` span
    (the integer vote count).

    Parameters
    ----------
    html : str or missing value
        Raw HTML of the vote block.

    Returns
    -------
    dict or None
        Mapping of label to vote count (empty when no vote rows are found).
        ``None`` when the cell is missing (None/NaN) or when any row lacks an
        expected span or has a non-integer count.
    """
    # Missing cells have nothing to parse.
    if pd.isna(html):
        return None

    votes_by_label = {}
    try:
        soup = BeautifulSoup(html, 'html.parser')
        for row in soup.find_all('div', class_='grid-x grid-margin-x'):
            label = row.find('span', class_='vote-button-name').text.strip()
            count = int(row.find('span', class_='vote-button-legend').text.strip())
            votes_by_label[label] = count
    except (AttributeError, ValueError):
        # AttributeError: find() returned None because a span is absent.
        # ValueError: the legend text is not an integer.
        # Any other exception is unexpected and is allowed to surface.
        return None

    return votes_by_label

In [None]:
test_gender = """<div class="grid-x grid-margin-x"><div class="cell small-5 medium-5 large-5"><span class="vote-button-name">female</span></div> <div class="cell small-1 medium-1 large-1"><span class="vote-button-legend">266</span></div> <div class="cell small-6 medium-6 large-6" style="display: inline-flex; align-items: center; justify-content: center;"><progress max="614" class="alert" value="266" style="color: crimson; cursor: pointer;"></progress></div></div><div class="grid-x grid-margin-x"><div class="cell small-5 medium-5 large-5"><span class="vote-button-name">more female</span></div> <div class="cell small-1 medium-1 large-1"><span class="vote-button-legend">161</span></div> <div class="cell small-6 medium-6 large-6" style="display: inline-flex; align-items: center; justify-content: center;"><progress max="614" class="alert" value="161" style="color: crimson; cursor: pointer;"></progress></div></div><div class="grid-x grid-margin-x"><div class="cell small-5 medium-5 large-5"><span class="vote-button-name">unisex</span></div> <div class="cell small-1 medium-1 large-1"><span class="vote-button-legend">172</span></div> <div class="cell small-6 medium-6 large-6" style="display: inline-flex; align-items: center; justify-content: center;"><progress max="614" class="alert" value="172" style="color: crimson; cursor: pointer;"></progress></div></div><div class="grid-x grid-margin-x"><div class="cell small-5 medium-5 large-5"><span class="vote-button-name">more male</span></div> <div class="cell small-1 medium-1 large-1"><span class="vote-button-legend">7</span></div> <div class="cell small-6 medium-6 large-6" style="display: inline-flex; align-items: center; justify-content: center;"><progress max="614" class="alert" value="7" style="color: crimson; cursor: pointer;"></progress></div></div><div class="grid-x grid-margin-x"><div class="cell small-5 medium-5 large-5"><span class="vote-button-name">male</span></div> <div class="cell small-1 medium-1 large-1"><span 
class="vote-button-legend">8</span></div> <div class="cell small-6 medium-6 large-6" style="display: inline-flex; align-items: center; justify-content: center;"><progress max="614" class="alert" value="8" style="color: crimson; cursor: pointer;"></progress></div></div>"""


test_longevity = """
<div class="grid-x grid-margin-x"><div class="cell small-5 medium-5 large-5"><span class="vote-button-name">very weak</span></div> <div class="cell small-1 medium-1 large-1"><span class="vote-button-legend">10</span></div> <div class="cell small-6 medium-6 large-6" style="display: inline-flex; align-items: center; justify-content: center;"><progress max="237" class="alert" value="10" style="color: crimson; cursor: pointer;"></progress></div></div><div class="grid-x grid-margin-x"><div class="cell small-5 medium-5 large-5"><span class="vote-button-name">weak</span></div> <div class="cell small-1 medium-1 large-1"><span class="vote-button-legend">25</span></div> <div class="cell small-6 medium-6 large-6" style="display: inline-flex; align-items: center; justify-content: center;"><progress max="237" class="alert" value="25" style="color: crimson; cursor: pointer;"></progress></div></div><div class="grid-x grid-margin-x"><div class="cell small-5 medium-5 large-5"><span class="vote-button-name">moderate</span></div> <div class="cell small-1 medium-1 large-1"><span class="vote-button-legend">89</span></div> <div class="cell small-6 medium-6 large-6" style="display: inline-flex; align-items: center; justify-content: center;"><progress max="237" class="alert" value="89" style="color: crimson; cursor: pointer;"></progress></div></div><div class="grid-x grid-margin-x"><div class="cell small-5 medium-5 large-5"><span class="vote-button-name">long lasting</span></div> <div class="cell small-1 medium-1 large-1"><span class="vote-button-legend">84</span></div> <div class="cell small-6 medium-6 large-6" style="display: inline-flex; align-items: center; justify-content: center;"><progress max="237" class="alert" value="84" style="color: crimson; cursor: pointer;"></progress></div></div><div class="grid-x grid-margin-x"><div class="cell small-5 medium-5 large-5"><span class="vote-button-name">eternal</span></div> <div class="cell small-1 medium-1 large-1"><span 
class="vote-button-legend">29</span></div> <div class="cell small-6 medium-6 large-6" style="display: inline-flex; align-items: center; justify-content: center;"><progress max="237" class="alert" value="29" style="color: crimson; cursor: pointer;"></progress></div></div>
"""

print(extract_long_sill_gend_pv(test_gender))
print(extract_long_sill_gend_pv(test_longevity))

{'female': 266, 'more female': 161, 'unisex': 172, 'more male': 7, 'male': 8}
{'very weak': 10, 'weak': 25, 'moderate': 89, 'long lasting': 84, 'eternal': 29}


In [None]:
# Convert the vote-block HTML in each of these columns into a
# {label: vote_count} dict via extract_long_sill_gend_pv.
for column in ('longevity', 'sillage', 'gender', 'price-value'):
    perfume_data[column] = perfume_data[column].apply(extract_long_sill_gend_pv)

In [None]:
perfume_data.head()

Unnamed: 0,name,accords,notes,longevity,sillage,gender,price-value,winter,spring,summer,fall,day,night,votes,rating
0,Nature Millenaire Yves Rocher for women,"<div class=""cell accord-box""><div class=""accor...","<div class=""cell""><div style=""display: flex; f...","{'very weak': 7, 'weak': 16, 'moderate': 55, '...","{'intimate': 25, 'moderate': 80, 'strong': 87,...","{'female': 38, 'more female': 18, 'unisex': 26...",,74.0864,15.6146,11.2957,100.0,71.0963,70.0997,847,4.35
1,Tea for Two L'Artisan Parfumeur for women and men,"<div class=""cell accord-box""><div class=""accor...","<div class=""cell""><div style=""display: flex; f...","{'very weak': 43, 'weak': 66, 'moderate': 307,...","{'intimate': 156, 'moderate': 416, 'strong': 1...","{'female': 7, 'more female': 22, 'unisex': 199...",,72.155,27.4818,14.4068,100.0,85.8354,49.7579,2641,4.11
2,I Am Eisenberg for women,"<div class=""cell accord-box""><div class=""accor...","<div class=""cell""><div style=""display: flex; f...","{'intimate': 125, 'moderate': 307, 'strong': 9...",,"{'female': 177, 'more female': 33, 'unisex': 3...",,57.783,75.2358,42.9245,75.7075,100.0,36.7925,1397,3.95
3,Noix de Coco de Malaisie Yves Rocher for women,"<div class=""cell accord-box""><div class=""accor...","<div class=""cell""><div style=""display: flex; f...","{'intimate': 103, 'moderate': 222, 'strong': 8...",,"{'female': 53, 'more female': 28, 'unisex': 11...",,53.4615,34.8077,73.4615,38.6538,100.0,23.4615,1309,3.76
4,Osmanthus The Different Company for women and men,"<div class=""cell accord-box""><div class=""accor...","<div class=""cell""><div style=""display: flex; f...","{'very weak': 38, 'weak': 38, 'moderate': 58, ...","{'intimate': 84, 'moderate': 65, 'strong': 20,...","{'female': 30, 'more female': 14, 'unisex': 25...",,13.75,86.25,69.375,40.625,100.0,17.5,534,3.73


## Extracting accords

In [None]:
def get_accords(cell_value):
    """Parse accord bars into a ``{accord_name: width_percent}`` dict.

    Each accord is rendered as
    ``<div class="accord-bar" style="...; width: N%;">name</div>``.

    Parameters
    ----------
    cell_value : str or missing value
        Raw HTML of the accords cell.

    Returns
    -------
    dict or None
        Mapping of stripped accord name to its bar width as a float (empty
        when no accord bars are found). ``None`` when the cell is missing.
    """
    # Missing cells have nothing to parse.
    if pd.isna(cell_value):
        return None

    html_string = str(cell_value)  # convert to string in case it's not already

    # Width and name are captured from the SAME accord-bar element, so every
    # name is paired with its own width rather than a neighbouring bar's.
    # The lookbehind keeps properties such as "border-width" from matching.
    bar_pattern = (
        r'<div class="accord-bar"[^>]*?(?<![-\w])width:\s*(\d+(?:\.\d+)?)%[^>]*>'
        r'([^<]+)</div>'
    )

    return {
        name.strip(): float(width)
        for width, name in re.findall(bar_pattern, html_string)
    }

In [None]:
test_accords = """
<div class="cell accord-box"><div class="accord-bar" style="color: rgb(255, 255, 255); background: rgb(204, 51, 0); opacity: 1; width: 100%;">warm spicy</div></div><div class="cell accord-box"><div class="accord-bar" style="color: rgb(0, 0, 0); background: rgb(241, 227, 197); opacity: 0.897727; width: 88.0681%;">almond</div></div><div class="cell accord-box"><div class="accord-bar" style="color: rgb(255, 255, 255); background: rgb(238, 54, 59); opacity: 0.742954; width: 70.0114%;">sweet</div></div><div class="cell accord-box"><div class="accord-bar" style="color: rgb(255, 255, 255); background: rgb(206, 29, 51); opacity: 0.723861; width: 67.7838%;">cherry</div></div><div class="cell accord-box"><div class="accord-bar" style="color: rgb(0, 0, 0); background: rgb(180, 149, 95); opacity: 0.698634; width: 64.8406%;">nutty</div></div><div class="cell accord-box"><div class="accord-bar" style="color: rgb(0, 0, 0); background: rgb(252, 75, 41); opacity: 0.689432; width: 63.767%;">fruity</div></div><div class="cell accord-box"><div class="accord-bar" style="color: rgb(0, 0, 0); background: rgb(249, 255, 82); opacity: 0.631818; width: 57.0454%;">citrus</div></div><div class="cell accord-box"><div class="accord-bar" style="color: rgb(0, 0, 0); background: rgb(151, 176, 183); opacity: 0.621592; width: 55.8524%;">metallic</div></div><div class="cell accord-box"><div class="accord-bar" style="color: rgb(0, 0, 0); background: rgb(183, 167, 215); opacity: 0.584091; width: 51.4773%;">iris</div></div><div class="cell accord-box"><div class="accord-bar" style="color: rgb(255, 255, 255); background: rgb(120, 72, 58); opacity: 0.577271; width: 50.6816%;">leather</div></div>
"""

print(get_accords(test_accords))

{'warm spicy': 88.0681, 'almond': 70.0114, 'sweet': 67.7838, 'cherry': 64.8406, 'nutty': 63.767, 'fruity': 57.0454, 'citrus': 55.8524, 'metallic': 51.4773, 'iris': 50.6816, 'leather': 50.6816}


In [None]:
perfume_data['accords'] = perfume_data['accords'].apply(get_accords)

In [None]:
perfume_data.head()

Unnamed: 0,name,accords,notes,longevity,sillage,gender,price-value,winter,spring,summer,fall,day,night,votes,rating
0,Nature Millenaire Yves Rocher for women,"{'woody': 79.7803, 'amber': 48.5836, 'powdery'...","<div class=""cell""><div style=""display: flex; f...","{'very weak': 7, 'weak': 16, 'moderate': 55, '...","{'intimate': 25, 'moderate': 80, 'strong': 87,...","{'female': 38, 'more female': 18, 'unisex': 26...",,74.0864,15.6146,11.2957,100.0,71.0963,70.0997,847,4.35
1,Tea for Two L'Artisan Parfumeur for women and men,"{'warm spicy': 74.7873, 'sweet': 72.005, 'cinn...","<div class=""cell""><div style=""display: flex; f...","{'very weak': 43, 'weak': 66, 'moderate': 307,...","{'intimate': 156, 'moderate': 416, 'strong': 1...","{'female': 7, 'more female': 22, 'unisex': 199...",,72.155,27.4818,14.4068,100.0,85.8354,49.7579,2641,4.11
2,I Am Eisenberg for women,"{'fruity': 84.5919, 'sweet': 76.5142, 'fresh s...","<div class=""cell""><div style=""display: flex; f...","{'intimate': 125, 'moderate': 307, 'strong': 9...",,"{'female': 177, 'more female': 33, 'unisex': 3...",,57.783,75.2358,42.9245,75.7075,100.0,36.7925,1397,3.95
3,Noix de Coco de Malaisie Yves Rocher for women,"{'coconut': 58.0, 'sweet': 54.5, 'lactonic': 5...","<div class=""cell""><div style=""display: flex; f...","{'intimate': 103, 'moderate': 222, 'strong': 8...",,"{'female': 53, 'more female': 28, 'unisex': 11...",,53.4615,34.8077,73.4615,38.6538,100.0,23.4615,1309,3.76
4,Osmanthus The Different Company for women and men,"{'floral': 85.8822, 'fruity': 81.9403, 'citrus...","<div class=""cell""><div style=""display: flex; f...","{'very weak': 38, 'weak': 38, 'moderate': 58, ...","{'intimate': 84, 'moderate': 65, 'strong': 20,...","{'female': 30, 'more female': 14, 'unisex': 25...",,13.75,86.25,69.375,40.625,100.0,17.5,534,3.73


In [None]:
perfume_data.shape

(2307, 15)

## Extracting Notes

In [None]:
# def extract_notes(note_div):
#     """Extract notes from a note div."""
#     notes_list = []
#     note_divs = note_div.find_all("div")
#     for i in range(2, len(note_divs), 3):
#         notes_list.append(note_divs[i].get_text())
#     return notes_list

# def parse_fragrance_notes(html_element):
#     """Parse fragrance notes from the BeautifulSoup object."""
#     if pd.isna(html_element):
#         return None

#     # print(f'html_element: {html_element}')
#     soup = BeautifulSoup(html_element, 'html.parser')
#     note_style = "display: flex; justify-content: center; text-align: center; flex-flow: wrap; align-items: flex-end; padding: 0.5rem;"
#     notes = soup.find_all("div", attrs={"style": note_style})

#     top_notes_list = []
#     middle_notes_list = []
#     base_notes_list = []

#     if len(notes) == 3:
#         top_notes_list = extract_notes(notes[0])
#         middle_notes_list = extract_notes(notes[1])
#         base_notes_list = extract_notes(notes[2])
#     elif len(notes) == 2:
#         top_notes_list = extract_notes(notes[0])
#         middle_notes_list = extract_notes(notes[1])
#     elif len(notes) == 1:
#         middle_notes_list = extract_notes(notes[0])

#     return top_notes_list, middle_notes_list, base_notes_list

In [None]:
def extract_notes(note_div):
    """Collect note names from one note-group element.

    Takes every third ``<div>`` descendant of ``note_div``, starting at
    index 2, and returns their text.
    NOTE(review): this presumably relies on each note being rendered as a
    wrapper div, an image div and a label div, as in the sample HTML - confirm.

    Returns a list of note names, or ``None`` when no names are found.
    """
    descendant_divs = note_div.find_all("div")
    names = [div.get_text() for div in descendant_divs[2::3]]
    return names or None

def parse_fragrance_notes(html_element):
    """Split a perfume's notes HTML into top, middle and base note lists.

    Parameters
    ----------
    html_element : str or missing value
        Raw HTML of the notes cell (a string, parsed here with BeautifulSoup).

    Returns
    -------
    tuple or None
        ``(top, middle, base)``, each a list of note names or ``None``.
        Three note groups fill all three slots, two groups fill top and
        middle, a single group fills middle only, and any other count leaves
        all three as ``None``. Returns ``None`` (not a tuple) when the cell is
        missing.
    """
    if pd.isna(html_element):
        return None

    # Note groups are located by this exact inline style string.
    group_style = "display: flex; justify-content: center; text-align: center; flex-flow: wrap; align-items: flex-end; padding: 0.5rem;"
    parsed = BeautifulSoup(html_element, 'html.parser')
    groups = parsed.find_all("div", attrs={"style": group_style})

    top = middle = base = None
    group_count = len(groups)

    if group_count == 1:
        # A lone group is stored as middle notes.
        middle = extract_notes(groups[0])
    elif group_count in (2, 3):
        top = extract_notes(groups[0])
        middle = extract_notes(groups[1])
        if group_count == 3:
            base = extract_notes(groups[2])

    return top, middle, base

In [None]:
test_notes = """
<div class="cell"><div style="display: flex; flex-direction: column; justify-content: center; text-align: center; background: white;"><div class="strike-title"><span>Perfume Pyramid</span></div><div><div style="display: flex; justify-content: center;"><div class="button-group" style="margin: auto; flex-grow: 0;"><div class="switch tiny" style="display: inline-block;"><input id="showDiagram" type="checkbox" name="showDiagram" class="switch-input"> <label for="showDiagram" class="switch-paddle"><span class="show-for-sr">Show votes</span></label></div> <span style="margin: 0px 1rem;">Show votes</span></div></div> <div class="text-center notes-box"><!----> <!----></div><h4 style="margin-top: 0.5rem;"><b>Top Notes</b></h4><div><!----> <div style="display: flex; justify-content: center; text-align: center; flex-flow: wrap; align-items: flex-end; padding: 0.5rem;"><div style="margin: 0.2rem; display: flex; justify-content: center; flex-direction: column; text-align: center; opacity: 1; position: relative;"><div><img loading="lazy" src="https://fimgs.net/mdimg/sastojci/t.55.jpg" style="width: 4.7rem;"></div><div><a href="https://www.fragrantica.com/notes/Saffron-55.html"><span class="link-span"></span></a>Saffron</div></div><div style="margin: 0.2rem; display: flex; justify-content: center; flex-direction: column; text-align: center; opacity: 1; position: relative;"><div><img loading="lazy" src="https://fimgs.net/mdimg/sastojci/t.1376.jpg" style="width: 4rem;"></div><div><a href="https://www.fragrantica.com/notes/Black-Cherry-1376.html"><span class="link-span"></span></a>Black Cherry</div></div><div style="margin: 0.2rem; display: flex; justify-content: center; flex-direction: column; text-align: center; opacity: 0.930955; position: relative;"><div><img loading="lazy" src="https://fimgs.net/mdimg/sastojci/t.79.jpg" style="width: 3.5rem;"></div><div><a href="https://www.fragrantica.com/notes/Bitter-Orange-79.html"><span class="link-span"></span></a>Bitter 
Orange</div></div><div style="margin: 0.2rem; display: flex; justify-content: center; flex-direction: column; text-align: center; opacity: 0.735089; position: relative;"><div><img loading="lazy" src="https://fimgs.net/mdimg/sastojci/t.117.jpg" style="width: 2.5rem;"></div><div><a href="https://www.fragrantica.com/notes/Peach-117.html"><span class="link-span"></span></a>Peach</div></div></div></div><h4 style="margin-top: 0.5rem;"><b>Middle Notes</b></h4><div><!----> <div style="display: flex; justify-content: center; text-align: center; flex-flow: wrap; align-items: flex-end; padding: 0.5rem;"><div style="margin: 0.2rem; display: flex; justify-content: center; flex-direction: column; text-align: center; opacity: 0.854803; position: relative;"><div><img loading="lazy" src="https://fimgs.net/mdimg/sastojci/t.61.jpg" style="width: 3.05rem;"></div><div><a href="https://www.fragrantica.com/notes/Cloves-61.html"><span class="link-span"></span></a>Cloves</div></div><div style="margin: 0.2rem; display: flex; justify-content: center; flex-direction: column; text-align: center; opacity: 0.840458; position: relative;"><div><img loading="lazy" src="https://fimgs.net/mdimg/sastojci/t.136.jpg" style="width: 2.975rem;"></div><div><a href="https://www.fragrantica.com/notes/Dark-Chocolate-136.html"><span class="link-span"></span></a>Chocolate</div></div><div style="margin: 0.2rem; display: flex; justify-content: center; flex-direction: column; text-align: center; opacity: 0.804771; position: relative;"><div><img loading="lazy" src="https://fimgs.net/mdimg/sastojci/t.14.jpg" style="width: 2.8rem;"></div><div><a href="https://www.fragrantica.com/notes/Jasmine-14.html"><span class="link-span"></span></a>Jasmine</div></div></div></div><h4 style="margin-top: 0.5rem;"><b>Base Notes</b></h4><div><!----> <div style="display: flex; justify-content: center; text-align: center; flex-flow: wrap; align-items: flex-end; padding: 0.5rem;"><div style="margin: 0.2rem; display: flex; justify-content: 
center; flex-direction: column; text-align: center; opacity: 1; position: relative;"><div><img loading="lazy" src="https://fimgs.net/mdimg/sastojci/t.130.jpg" style="width: 5rem;"></div><div><a href="https://www.fragrantica.com/notes/Almond-130.html"><span class="link-span"></span></a>Sweet Almond</div></div><div style="margin: 0.2rem; display: flex; justify-content: center; flex-direction: column; text-align: center; opacity: 0.886308; position: relative;"><div><img loading="lazy" src="https://fimgs.net/mdimg/sastojci/t.11.jpg" style="width: 3.225rem;"></div><div><a href="https://www.fragrantica.com/notes/Iris-11.html"><span class="link-span"></span></a>Iris</div></div><div style="margin: 0.2rem; display: flex; justify-content: center; flex-direction: column; text-align: center; opacity: 0.815302; position: relative;"><div><img loading="lazy" src="https://fimgs.net/mdimg/sastojci/t.521.jpg" style="width: 2.85rem;"></div><div><a href="https://www.fragrantica.com/notes/Brown-Sugar-521.html"><span class="link-span"></span></a>Brown sugar</div></div></div></div></div></div></div><div class="cell text-center"><button data-open="NotesRatingModal" class="button small" aria-controls="NotesRatingModal" aria-haspopup="dialog" tabindex="0">Vote for Ingredients</button></div>
"""
print(parse_fragrance_notes(test_notes))

(['Saffron', 'Black Cherry', 'Bitter Orange', 'Peach'], ['Cloves', 'Chocolate', 'Jasmine'], ['Sweet Almond', 'Iris', 'Brown sugar'])


In [None]:
# Parse each row's notes HTML with parse_fragrance_notes.
parsed_notes = perfume_data['notes'].apply(parse_fragrance_notes)
# TODO (answered "yes"): ask team whether to store different note categories
# in different columns. Expand the parsed results into one column per category.
perfume_data[["top notes", "middle notes", "base notes"]] = pd.DataFrame(
    parsed_notes.tolist(),
    index=perfume_data.index,
)

In [None]:
perfume_data = perfume_data.drop(columns=['notes'])

In [None]:
perfume_data.head()

Unnamed: 0,name,accords,longevity,sillage,gender,price-value,winter,spring,summer,fall,day,night,votes,rating,top notes,middle notes,base notes
0,Nature Millenaire Yves Rocher for women,"{'woody': 79.7803, 'amber': 48.5836, 'powdery'...","{'very weak': 7, 'weak': 16, 'moderate': 55, '...","{'intimate': 25, 'moderate': 80, 'strong': 87,...","{'female': 38, 'more female': 18, 'unisex': 26...",,74.0864,15.6146,11.2957,100.0,71.0963,70.0997,847,4.35,,"[Benzoin, Woodsy Notes, Virginia Cedar, Iris, ...",
1,Tea for Two L'Artisan Parfumeur for women and men,"{'warm spicy': 74.7873, 'sweet': 72.005, 'cinn...","{'very weak': 43, 'weak': 66, 'moderate': 307,...","{'intimate': 156, 'moderate': 416, 'strong': 1...","{'female': 7, 'more female': 22, 'unisex': 199...",,72.155,27.4818,14.4068,100.0,85.8354,49.7579,2641,4.11,"[Tea, Star Anise, Bergamot]","[Cinnamon, Spices, Ginger, Gingerbread]","[Tobacco, Honey, Leather, Vanilla]"
2,I Am Eisenberg for women,"{'fruity': 84.5919, 'sweet': 76.5142, 'fresh s...","{'intimate': 125, 'moderate': 307, 'strong': 9...",,"{'female': 177, 'more female': 33, 'unisex': 3...",,57.783,75.2358,42.9245,75.7075,100.0,36.7925,1397,3.95,"[Raspberry, Black Pepper, Bergamot, Pink Pepper]","[Magnolia, Violet, Rose, Jasmine]","[Sandalwood, Amber, Vetiver, Musk, Vanilla, Be..."
3,Noix de Coco de Malaisie Yves Rocher for women,"{'coconut': 58.0, 'sweet': 54.5, 'lactonic': 5...","{'intimate': 103, 'moderate': 222, 'strong': 8...",,"{'female': 53, 'more female': 28, 'unisex': 11...",,53.4615,34.8077,73.4615,38.6538,100.0,23.4615,1309,3.76,,[Coconut],
4,Osmanthus The Different Company for women and men,"{'floral': 85.8822, 'fruity': 81.9403, 'citrus...","{'very weak': 38, 'weak': 38, 'moderate': 58, ...","{'intimate': 84, 'moderate': 65, 'strong': 20,...","{'female': 30, 'more female': 14, 'unisex': 25...",,13.75,86.25,69.375,40.625,100.0,17.5,534,3.73,"[Green Notes, Bergamot, Mandarin Orange]","[Osmanthus, Jasmine, Geranium]","[Musk, Rose]"


In [None]:
perfume_data.shape

(2307, 17)

## Storing the data in a new file

In [None]:
perfume_data.head()

Unnamed: 0,name,accords,longevity,sillage,gender,price-value,winter,spring,summer,fall,day,night,votes,rating,top notes,middle notes,base notes
0,Nature Millenaire Yves Rocher for women,"{'woody': 79.7803, 'amber': 48.5836, 'powdery'...","{'very weak': 7, 'weak': 16, 'moderate': 55, '...","{'intimate': 25, 'moderate': 80, 'strong': 87,...","{'female': 38, 'more female': 18, 'unisex': 26...",,74.0864,15.6146,11.2957,100.0,71.0963,70.0997,847,4.35,,"[Benzoin, Woodsy Notes, Virginia Cedar, Iris, ...",
1,Tea for Two L'Artisan Parfumeur for women and men,"{'warm spicy': 74.7873, 'sweet': 72.005, 'cinn...","{'very weak': 43, 'weak': 66, 'moderate': 307,...","{'intimate': 156, 'moderate': 416, 'strong': 1...","{'female': 7, 'more female': 22, 'unisex': 199...",,72.155,27.4818,14.4068,100.0,85.8354,49.7579,2641,4.11,"[Tea, Star Anise, Bergamot]","[Cinnamon, Spices, Ginger, Gingerbread]","[Tobacco, Honey, Leather, Vanilla]"
2,I Am Eisenberg for women,"{'fruity': 84.5919, 'sweet': 76.5142, 'fresh s...","{'intimate': 125, 'moderate': 307, 'strong': 9...",,"{'female': 177, 'more female': 33, 'unisex': 3...",,57.783,75.2358,42.9245,75.7075,100.0,36.7925,1397,3.95,"[Raspberry, Black Pepper, Bergamot, Pink Pepper]","[Magnolia, Violet, Rose, Jasmine]","[Sandalwood, Amber, Vetiver, Musk, Vanilla, Be..."
3,Noix de Coco de Malaisie Yves Rocher for women,"{'coconut': 58.0, 'sweet': 54.5, 'lactonic': 5...","{'intimate': 103, 'moderate': 222, 'strong': 8...",,"{'female': 53, 'more female': 28, 'unisex': 11...",,53.4615,34.8077,73.4615,38.6538,100.0,23.4615,1309,3.76,,[Coconut],
4,Osmanthus The Different Company for women and men,"{'floral': 85.8822, 'fruity': 81.9403, 'citrus...","{'very weak': 38, 'weak': 38, 'moderate': 58, ...","{'intimate': 84, 'moderate': 65, 'strong': 20,...","{'female': 30, 'more female': 14, 'unisex': 25...",,13.75,86.25,69.375,40.625,100.0,17.5,534,3.73,"[Green Notes, Bergamot, Mandarin Orange]","[Osmanthus, Jasmine, Geranium]","[Musk, Rose]"


In [None]:
perfume_data.shape

(2307, 17)

In [None]:
# NOTE(review): the output name below says "2010-2022" while the input loaded
# earlier in this notebook is '2000-2010.csv' - confirm the intended file name
# before enabling this line.
# perfume_data.to_csv('2010-2022_converted.csv', index=False)