In [1]:
import json
import pandas as pd
from numpy import trapz
from statistics import mean
from sklearn.preprocessing import QuantileTransformer

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
with open("data/parfumo/collection/graph.json", "r") as f:
    graph = json.load(f)

nodes_df = pd.DataFrame(graph["nodes"])

In [3]:
all_columns = [
    "type_animal",
    "type_aquatic",
    "type_chypre",
    "type_citrus",
    "type_creamy",
    "type_earthy",
    "type_floral",
    "type_fougere",
    "type_fresh",
    "type_fruity",
    "type_gourmand",
    "type_green",
    "type_leathery",
    "type_oriental",
    "type_powdery",
    "type_resinous",
    "type_smoky",
    "type_spicy",
    "type_sweet",
    "type_synthetic",
    "type_woody",
    "occasion_evening",
    "occasion_business",
    "occasion_night_out",
    "occasion_leisure",
    "occasion_sport",
    "occasion_daily",
    "season_spring",
    "season_summer",
    "season_fall",
    "season_winter",
    "audience_youthful",
    "audience_mature",
    "audience_feminine",
    "audience_masculine",
]

nodes_df[all_columns] = (
    nodes_df[all_columns].astype(int).apply(lambda row: row / row.sum(), axis=1)
)


components = ["type_spicy", "type_woody", "type_sweet"]

In [4]:
nodes_df = nodes_df[
    nodes_df["collection_group"].isin(["I have", "Decants", "Miniatures"])
]
nodes_df = nodes_df[["short_name"] + components]

nodes_df["trapz"] = nodes_df[components].apply(trapz, axis=1)
nodes_df["mean"] = nodes_df[components].apply(mean, axis=1)

In [5]:
nodes_df[components + ["trapz", "mean"]] = pd.DataFrame(
    QuantileTransformer().fit_transform(nodes_df[components + ["trapz", "mean"]].values)
)



In [6]:
nodes_df[nodes_df[components].apply(min, axis=1) > 0].sort_values(
    "trapz", ascending=False
)

Unnamed: 0,short_name,type_spicy,type_woody,type_sweet,trapz,mean
40,Haltane Parfums de Marly,1.0,0.96,0.64,1.0,1.0
64,Oud for Greatness Initio,0.986667,0.986667,0.586667,0.986667,0.986667
59,Santal Eau de Santal Floris,0.933333,0.973333,0.506667,0.973333,0.893333
57,The One for Men Dolce & Gabbana,0.946667,0.853333,0.8,0.96,0.973333
10,Select Night Mercedes-Benz,0.866667,0.733333,0.933333,0.946667,0.96
61,Gentleman Givenchy Réserve Privée Givenchy,0.853333,0.826667,0.813333,0.933333,0.946667
36,Ishq Al Shuyukh Silver Lattafa / لطافة,0.573333,0.92,0.6,0.92,0.8
60,Royal Oud Creed,0.84,0.946667,0.386667,0.906667,0.746667
41,Angels' Share Kilian,0.826667,0.653333,0.96,0.893333,0.933333
62,Emporio Armani - Stronger With You Absolutely ...,0.76,0.626667,0.986667,0.88,0.92
