In [None]:
import json
import re
from collections import Counter, defaultdict


In [3]:
# Read the dataset through a context manager so the file handle is closed
# as soon as parsing finishes instead of being left open for the garbage
# collector. JSON text is UTF-8 by specification, so the encoding is pinned
# rather than inherited from the platform default.
with open("../data/perfumes.json", encoding="utf-8") as perfumes_file:
    raw_data = json.load(perfumes_file)

In [19]:
def clean_ingredient(name):
    """Return ``name`` with parenthetical parts removed and outer whitespace trimmed.

    Every ``(...)`` group is dropped together with any whitespace directly
    in front of it. The match is non-greedy, so each group ends at the first
    closing parenthesis that follows its opening one.
    """
    parenthetical = re.compile(r"\s*\(.*?\)")
    without_parens = parenthetical.sub("", name)
    return without_parens.strip()

## Ingredients

In [None]:
# Tally cleaned ingredient names across every perfume in a single pass;
# perfumes without an "ingredients" key contribute nothing
ingredient_counter = Counter(
    clean_ingredient(raw_name)
    for entry in raw_data
    for raw_name in entry.get("ingredients", [])
)

# most_common() with no argument returns every (name, count) pair,
# highest count first
sorted_ingredients = ingredient_counter.most_common()

# Report one "name: count" line per ingredient
print("Occurrence of each unique ingredient (sorted in descending order):")
for name, total in sorted_ingredients:
    print(f"{name}: {total}")



Occurrence of each unique ingredient (sorted in descending order):
Musk: 12270
Bergamot: 9037
Jasmine: 8396
Vanilla: 8158
Sandalwood: 8087
Patchouli: 8020
Rose: 7730
Amber: 7580
Cedarwood: 7332
Vetiver: 4586
Mandarin: 4099
Lemon: 3780
Tonka Bean: 3453
Orange Blossom: 3318
Iris / Orris: 3106
Lavender: 2927
Cardamom: 2852
Pink Pepper: 2736
Grapefruit: 2573
Woody Notes: 2451
Geranium: 2348
Leather: 2284
Incense / Olibanum: 2209
Lily Of The Valley: 2180
Violet: 2174
Ylang-ylang: 1957
Oud / Agarwood: 1927
Ciste Labdanum: 1838
Benzoin: 1722
Neroli: 1717
Saffron: 1683
Ginger: 1647
Peach: 1621
Clary Sage: 1593
Cinnamon: 1591
Oakmoss: 1555
Pear: 1515
Freesia: 1513
Nutmeg: 1499
Cashmeran: 1453
Peony: 1442
Blackcurrant Bud: 1418
Black Pepper: 1391
Tuberose: 1366
Apple: 1256
Raspberry: 1234
Magnolia: 1220
Violet Leaves: 1171
Heliotrope: 1033
Guaiac Wood: 1021
Clove: 968
Ambergris: 959
Pepper: 956
Citruses: 951
Gardenia: 944
Coriander: 923
Plum: 875
Tangerine: 854
Mint: 851
Petitgrain: 823
Lime: 81

## Family / Subfamily

In [21]:
# Subfamily tallies grouped under their parent family; looking up a family
# that has not been seen yet creates an empty Counter for it
subfamily_counter = defaultdict(Counter)
# Flat tally of perfumes per family
family_counter = Counter()


def clean_name(name):
    """Return ``name`` upper-cased, with parenthetical parts and outer whitespace removed.

    Each ``(...)`` group is dropped along with any whitespace directly in
    front of it before the result is trimmed and upper-cased.
    """
    return re.sub(r"\s*\(.*?\)", "", name).strip().upper()


# Tally each perfume under its family and under its (family, subfamily) pair;
# a missing key is counted as "UNKNOWN" after normalisation
for entry in raw_data:
    family_key = clean_name(entry.get("family", "Unknown"))
    subfamily_key = clean_name(entry.get("subfamily", "Unknown"))
    family_counter.update([family_key])
    subfamily_counter[family_key].update([subfamily_key])

# Families ordered from most to least frequent
sorted_families = family_counter.most_common()

print("Occurrence of each unique family and subfamily (sorted in descending order):\n")

for family_key, family_total in sorted_families:
    print(f"{family_key}: {family_total}")
    # Subfamilies recorded under this family, most frequent first
    for subfamily_key, subfamily_total in subfamily_counter[family_key].most_common():
        print(f"  └── {subfamily_key}: {subfamily_total}")
    print()  # blank line between family groups

Occurrence of each unique family and subfamily (sorted in descending order):

FLORAL: 10774
  └── AMBERY: 3129
  └── FRUITY: 1993
  └── FLORAL: 1882
  └── CITRUS: 1342
  └── MUSK SKIN: 1026
  └── GREEN: 425
  └── ALDEHYDIC: 389
  └── WATERY: 247
  └── GOURMAND: 244
  └── WOODY: 54
  └── SPICY: 24
  └── CHYPRE: 8
  └── AROMATIC FOUGERE: 6
  └── LEATHER: 3
  └── TOBACCO: 2

WOODY: 7044
  └── AMBERY: 3396
  └── CITRUS: 1715
  └── WOODY: 854
  └── GREEN: 311
  └── FRUITY: 282
  └── FLORAL: 160
  └── WATERY: 88
  └── GOURMAND: 74
  └── SPICY: 60
  └── TOBACCO: 41
  └── AROMATIC FOUGERE: 29
  └── LEATHER: 25
  └── MUSK SKIN: 6
  └── CHYPRE: 3

AMBERY: 2362
  └── AMBERY: 646
  └── FLORAL: 551
  └── CITRUS: 423
  └── GOURMAND: 301
  └── FRUITY: 180
  └── SPICY: 88
  └── GREEN: 62
  └── WOODY: 44
  └── WATERY: 18
  └── AROMATIC FOUGERE: 16
  └── LEATHER: 15
  └── TOBACCO: 9
  └── MUSK SKIN: 7
  └── ALDEHYDIC: 1
  └── CHYPRE: 1

AROMATIC FOUGERE: 2330
  └── AROMATIC FOUGERE: 827
  └── WATERY: 41

## Subfamily Only

In [22]:
# Count how many perfumes fall under each subfamily, regardless of family
subfamily_only_counter = Counter()
for perfume in raw_data:
    # Default a missing key to "Unknown", as the family/subfamily breakdown
    # does: a bare .get() would return None, which clean_ingredient cannot
    # process (re.sub raises TypeError on a non-string)
    subfamily = perfume.get("subfamily", "Unknown")
    # Strip parenthetical content so variants of one subfamily share a key
    subfamily_only_counter[clean_ingredient(subfamily)] += 1

# Subfamilies ordered from most to least frequent
sorted_subfamily = subfamily_only_counter.most_common()

# Report one "subfamily: count" line per unique subfamily
print("Occurrence of each unique subfamily (sorted in descending order):")
for subfamily, count in sorted_subfamily:
    print(f"{subfamily}: {count}")

Occurrence of each unique subfamily (sorted in descending order):
AMBERY: 7243
CITRUS: 5407
FRUITY: 3130
FLORAL: 3010
WOODY: 1474
GREEN: 1376
MUSK SKIN: 1045
AROMATIC FOUGERE: 895
WATERY: 821
GOURMAND: 716
SPICY: 427
ALDEHYDIC: 391
CHYPRE: 192
LEATHER: 114
TOBACCO: 78


## Gender

In [23]:
# Count how many perfumes are listed under each gender label
gender_counter = Counter()
for perfume in raw_data:
    # Default a missing key to "Unknown": a bare .get() would return None,
    # which clean_ingredient cannot process (re.sub raises TypeError on a
    # non-string)
    gender = perfume.get("gender", "Unknown")
    # Strip parenthetical content so variants of one label share a key
    gender_counter[clean_ingredient(gender)] += 1

# Gender labels ordered from most to least frequent
sorted_gender = gender_counter.most_common()

# Report one "gender: count" line per unique label
print("Occurrence of each unique gender (sorted in descending order):")
for gender, count in sorted_gender:
    print(f"{gender}: {count}")

Occurrence of each unique gender (sorted in descending order):
Unisex: 11057
Female: 9735
Male: 5527


## Year

In [25]:
# Tally perfumes by the value stored under the "years" key; the value is
# counted as-is, so a perfume without that key is counted under None
year_counter = Counter(entry.get("years") for entry in raw_data)

# Years ordered by how many perfumes they have, most frequent first
sorted_year = year_counter.most_common()

# Report one "year: count" line per unique year value
print("Occurrence of each unique year (sorted in descending order):")
for year_value, total in sorted_year:
    print(f"{year_value}: {total}")

Occurrence of each unique year (sorted in descending order):
2019: 2369
2020: 2248
2021: 2195
2018: 2180
2022: 2036
2017: 1842
2023: 1686
2015: 1608
2016: 1586
2014: 1174
2013: 989
2012: 891
2010: 650
2011: 582
2009: 473
2024: 464
2008: 409
2007: 354
2006: 322
2005: 254
2000: 219
2004: 200
2003: 161
2002: 143
2001: 122
1999: 95
1998: 83
1996: 82
1997: 69
1995: 69
1993: 60
1994: 54
1988: 52
1989: 43
1992: 43
1991: 29
1987: 27
1985: 23
1984: 20
1990: 18
1973: 17
1978: 17
1976: 16
1986: 16
1981: 16
1905: 14
1982: 14
1983: 14
1980: 13
1979: 13
1970: 12
1975: 11
1977: 10
1948: 10
1969: 9
1925: 7
1974: 7
1965: 7
1972: 6
1955: 6
1828: 6
1935: 5
1967: 5
1971: 5
1933: 4
1966: 4
1929: 4
1949: 4
1947: 4
1921: 4
1932: 4
1968: 4
1901: 3
1934: 3
1912: 3
1959: 3
1889: 3
1936: 3
1939: 3
1960: 3
1964: 3
1941: 3
1963: 3
1919: 3
1946: 3
1916: 2
1956: 2
1913: 2
1904: 2
1853: 2
1859: 2
1953: 2
1903: 2
1937: 2
1874: 2
1951: 2
1911: 2
1930: 2
1952: 2
1890: 2
1917: 2
1902: 2
1938: 2
1910: 2
1877: 2
1924: 1
18