In [None]:
import pandas as pd
import ast

# Data Preprocessing

In [None]:
# Load the raw perfume dataset; the path is relative to the current working directory.
perfumes = pd.read_csv("perfume_data_final.csv")

In [None]:
# (rows, columns) of the dataset as loaded, before any filtering.
perfumes.shape

(478, 13)

## Longevity Preprocessing

`weighted_longevity` varies from $1$ to $5$; this is a continuous value

Get rid of the rows where longevity was not rated.

In [None]:
def all_zeros(longevity_dict):
    """Return True when every vote count in the dictionary equals 0.

    An empty dictionary also yields True, since it contains no non-zero count.
    """
    for count in longevity_dict.values():
        if count == 0:
            continue
        # Found a category that received votes.
        return False
    return True

In [None]:
# The CSV stores each vote tally as the string form of a dict; parse it into a real dict.
perfumes['longevity'] = perfumes['longevity'].apply(ast.literal_eval)

# Keep only perfumes with at least one longevity vote. The explicit .copy()
# makes the filtered result an independent DataFrame, so adding columns to it
# later does not raise SettingWithCopyWarning.
perfumes = perfumes[~perfumes['longevity'].apply(all_zeros)].copy()

In [None]:
# Shape after removing perfumes whose longevity votes were all zero.
perfumes.shape

(457, 13)

Convert the dictionary into the weighted average

In [None]:
def calculate_weighted_average(longevity):
    """Collapse a longevity vote tally into one score, rounded to 3 decimals.

    ``longevity`` is either a dict mapping the category labels
    ('very weak', 'weak', 'moderate', 'long lasting', 'eternal') to vote
    counts, or the string literal of such a dict. The categories carry the
    weights 1 through 5 in that order, and the result is the vote-weighted
    mean of those weights.

    Raises ValueError when ``longevity`` is neither a str nor a dict.
    """
    if not isinstance(longevity, (str, dict)):
        raise ValueError(f"Unexpected type for longevity: {type(longevity)}")

    # Strings are parsed into a dict; dicts are used as they are.
    votes = ast.literal_eval(longevity) if isinstance(longevity, str) else longevity

    vote_total = sum(votes.values())

    # Category label -> weight, 1 for the weakest up to 5 for the strongest.
    labels = ('very weak', 'weak', 'moderate', 'long lasting', 'eternal')
    scale = {label: rank for rank, label in enumerate(labels, start=1)}

    weighted_sum = sum(scale[label] * count for label, count in votes.items())

    # Keep only 3 digits after the decimal point.
    return round(weighted_sum / vote_total, 3)

In [None]:
# Add the weighted score with assign(), which returns a new DataFrame instead
# of writing into the filtered slice; plain item assignment here is what
# emitted SettingWithCopyWarning.
perfumes = perfumes.assign(
    weighted_longevity=perfumes['longevity'].apply(calculate_weighted_average)
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  perfumes['weighted_longevity'] = perfumes['longevity'].apply(calculate_weighted_average)


In [None]:
# Remove the raw vote-tally column.
perfumes = perfumes.drop('longevity', axis=1)

In [None]:
# Preview the first rows after the longevity step.
perfumes.head()

Unnamed: 0,name,company,for_gender,rating,number_votes,main accords,top notes,middle notes,base notes,sillage,gender_vote,price value,weighted_longevity
0,Angels' Share,By Kilian,for women and men,4.31,682.0,"{'woody': 100.0, 'sweet': 92.6987, 'warm spicy...",['Cognac'],"['Cinnamon', 'Tonka Bean', 'Oak']","['Praline', 'Vanilla', 'Sandalwood']","{'intimate': 40, 'moderate': 187, 'strong': 15...","{'female': 40, 'more female': 39, 'unisex': 22...","{'way overpriced': 64, 'overpriced': 143, 'ok'...",3.69
1,My Way,Giorgio Armani,for women,3.57,1471.0,"{'white floral': 100.0, 'citrus': 60.4322, 'tu...","['Orange Blossom', 'Bergamot']","['Tuberose', 'Indian Jasmine']","['White Musk', 'Madagascar Vanilla', 'Virginia...","{'intimate': 127, 'moderate': 322, 'strong': 2...","{'female': 349, 'more female': 21, 'unisex': 4...","{'way overpriced': 38, 'overpriced': 121, 'ok'...",3.332
2,Libre Intense,Yves Saint Laurent,for women,4.02,858.0,"{'vanilla': 100.0, 'aromatic': 71.4216, 'sweet...","['Lavender', 'Mandarin Orange', 'Bergamot']","['Lavender', 'Tunisian Orange Blossom', 'Jasmi...","['Madagascar Vanilla', 'Tonka Bean', 'Ambergri...","{'intimate': 39, 'moderate': 155, 'strong': 23...","{'female': 162, 'more female': 91, 'unisex': 7...","{'way overpriced': 11, 'overpriced': 59, 'ok':...",3.736
3,Dior Homme 2020,Christian Dior,for men,3.42,1402.0,"{'woody': 100.0, 'musky': 72.7229, 'amber': 53...","['Bergamot', 'Pink Pepper', 'elemi']","['Cashmere Wood', 'Atlas Cedar', 'Patchouli']","['Iso E Super', 'Haitian Vetiver', 'White Musk']","{'intimate': 214, 'moderate': 370, 'strong': 1...","{'female': 3, 'more female': 1, 'unisex': 17, ...","{'way overpriced': 31, 'overpriced': 59, 'ok':...",3.146
4,Acqua di Giò Profondo,Giorgio Armani,for men,4.03,869.0,"{'aromatic': 100.0, 'marine': 93.2493, 'citrus...","['Sea Notes', 'Aquozone', 'Bergamot', 'Green M...","['Rosemary', 'Cypress', 'Lavender', 'Mastic or...","['Mineral notes', 'Musk', 'Patchouli', 'Amber']","{'intimate': 115, 'moderate': 333, 'strong': 1...","{'female': 3, 'more female': 0, 'unisex': 5, '...","{'way overpriced': 32, 'overpriced': 84, 'ok':...",3.188


## Sillage Preprocessing

`weighted_sillage` varies from $1$ to $4$; this is a continuous value

Get rid of the rows where sillage was not rated

In [None]:
def all_zeros(sillage_dict):
    """Tell whether the tally has no votes at all, i.e. every count equals 0.

    Returns True for an empty dictionary as well.
    """
    vote_counts = sillage_dict.values()
    return all(map(lambda count: count == 0, vote_counts))

In [None]:
# The CSV stores each vote tally as the string form of a dict; parse it into a real dict.
perfumes['sillage'] = perfumes['sillage'].apply(ast.literal_eval)

# Keep only perfumes with at least one sillage vote. The explicit .copy()
# makes the filtered result an independent DataFrame, so adding columns to it
# later does not raise SettingWithCopyWarning.
perfumes = perfumes[~perfumes['sillage'].apply(all_zeros)].copy()

In [None]:
# Shape after removing perfumes whose sillage votes were all zero.
perfumes.shape

(453, 13)

Convert sillage dictionary into the weighted sillage value

In [None]:
def calculate_sillage_weighted_average(sillage):
    """Turn a sillage vote tally into a weighted mean, rounded to 3 decimals.

    ``sillage`` is a dict (or the string literal of a dict) mapping
    'intimate', 'moderate', 'strong' and 'enormous' to vote counts. Those
    categories are weighted 1, 2, 3 and 4 respectively.

    Raises ValueError when ``sillage`` is neither a str nor a dict.
    """
    if isinstance(sillage, str):
        tally = ast.literal_eval(sillage)
    else:
        if not isinstance(sillage, dict):
            raise ValueError(f"Unexpected type for sillage: {type(sillage)}")
        tally = sillage

    n_votes = sum(tally.values())

    # Pair each category with its weight: intimate=1 ... enormous=4.
    level_of = dict(zip(('intimate', 'moderate', 'strong', 'enormous'), range(1, 5)))

    score_sum = sum(level_of[name] * votes for name, votes in tally.items())
    return round(score_sum / n_votes, 3)

In [None]:
# Add the weighted score with assign(), which returns a new DataFrame instead
# of writing into the filtered slice (item assignment on a filtered frame can
# emit SettingWithCopyWarning).
perfumes = perfumes.assign(
    weighted_sillage=perfumes['sillage'].apply(calculate_sillage_weighted_average)
)

In [None]:
# Remove the raw vote-tally column.
perfumes = perfumes.drop(columns='sillage')

In [None]:
# Preview the first rows after the sillage step.
perfumes.head()

Unnamed: 0,name,company,for_gender,rating,number_votes,main accords,top notes,middle notes,base notes,gender_vote,price value,weighted_longevity,weighted_sillage
0,Angels' Share,By Kilian,for women and men,4.31,682.0,"{'woody': 100.0, 'sweet': 92.6987, 'warm spicy...",['Cognac'],"['Cinnamon', 'Tonka Bean', 'Oak']","['Praline', 'Vanilla', 'Sandalwood']","{'female': 40, 'more female': 39, 'unisex': 22...","{'way overpriced': 64, 'overpriced': 143, 'ok'...",3.69,2.594
1,My Way,Giorgio Armani,for women,3.57,1471.0,"{'white floral': 100.0, 'citrus': 60.4322, 'tu...","['Orange Blossom', 'Bergamot']","['Tuberose', 'Indian Jasmine']","['White Musk', 'Madagascar Vanilla', 'Virginia...","{'female': 349, 'more female': 21, 'unisex': 4...","{'way overpriced': 38, 'overpriced': 121, 'ok'...",3.332,2.615
2,Libre Intense,Yves Saint Laurent,for women,4.02,858.0,"{'vanilla': 100.0, 'aromatic': 71.4216, 'sweet...","['Lavender', 'Mandarin Orange', 'Bergamot']","['Lavender', 'Tunisian Orange Blossom', 'Jasmi...","['Madagascar Vanilla', 'Tonka Bean', 'Ambergri...","{'female': 162, 'more female': 91, 'unisex': 7...","{'way overpriced': 11, 'overpriced': 59, 'ok':...",3.736,2.819
3,Dior Homme 2020,Christian Dior,for men,3.42,1402.0,"{'woody': 100.0, 'musky': 72.7229, 'amber': 53...","['Bergamot', 'Pink Pepper', 'elemi']","['Cashmere Wood', 'Atlas Cedar', 'Patchouli']","['Iso E Super', 'Haitian Vetiver', 'White Musk']","{'female': 3, 'more female': 1, 'unisex': 17, ...","{'way overpriced': 31, 'overpriced': 59, 'ok':...",3.146,2.3
4,Acqua di Giò Profondo,Giorgio Armani,for men,4.03,869.0,"{'aromatic': 100.0, 'marine': 93.2493, 'citrus...","['Sea Notes', 'Aquozone', 'Bergamot', 'Green M...","['Rosemary', 'Cypress', 'Lavender', 'Mastic or...","['Mineral notes', 'Musk', 'Patchouli', 'Amber']","{'female': 3, 'more female': 0, 'unisex': 5, '...","{'way overpriced': 32, 'overpriced': 84, 'ok':...",3.188,2.38


## Gender Preprocessing

`weighted_gender` varies from $1$ to $5$; this is a continuous value

In [None]:
def all_zeros(gender_dict):
    """Report whether no category in the tally received a vote.

    True means every count equals 0 (or the dictionary is empty).
    """
    voted = [count for count in gender_dict.values() if not count == 0]
    return not voted

In [None]:
# The CSV stores each vote tally as the string form of a dict; parse it into a real dict.
perfumes['gender_vote'] = perfumes['gender_vote'].apply(ast.literal_eval)

# Keep only perfumes with at least one gender vote. The explicit .copy()
# makes the filtered result an independent DataFrame, so adding columns to it
# later does not raise SettingWithCopyWarning.
perfumes = perfumes[~perfumes['gender_vote'].apply(all_zeros)].copy()

In [None]:
# Shape after removing perfumes whose gender votes were all zero.
perfumes.shape

(433, 13)

In [None]:
def calculate_gender_weighted_average(gender):
    """Convert a gender vote tally into a weighted average on a 1-5 scale.

    Parameters
    ----------
    gender : dict or str
        Mapping from vote category ('female', 'more female', 'unisex',
        'more male', 'male') to vote count, or the string literal of such
        a dict.

    Returns
    -------
    float
        Vote-weighted mean of the category weights (female = 1 ... male = 5),
        rounded to 3 decimals.

    Raises
    ------
    ValueError
        If ``gender`` is neither a str nor a dict.
    KeyError
        If the tally contains a category other than the five listed above.
    ZeroDivisionError
        If the vote counts sum to zero.
    """
    if isinstance(gender, str):
        gender_dict = ast.literal_eval(gender)
    elif isinstance(gender, dict):
        gender_dict = gender
    else:
        # The message names this function's own field; it previously said
        # "sillage", a leftover from the function it was copied from.
        raise ValueError(f"Unexpected type for gender: {type(gender)}")

    total_responses = sum(gender_dict.values())

    # Weight of each category on the female (1) to male (5) axis.
    weights = {'female': 1, 'more female': 2, 'unisex': 3, 'more male': 4, 'male': 5}

    weighted_average = sum(weights[key] * value for key, value in gender_dict.items()) / total_responses

    return round(weighted_average, 3)

In [None]:
# Add the weighted score with assign(), which returns a new DataFrame instead
# of writing into the filtered slice; plain item assignment here is what
# emitted SettingWithCopyWarning.
perfumes = perfumes.assign(
    weighted_gender=perfumes['gender_vote'].apply(calculate_gender_weighted_average)
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  perfumes['weighted_gender'] = perfumes['gender_vote'].apply(calculate_gender_weighted_average)


In [None]:
# Remove the raw vote-tally column.
perfumes = perfumes.drop(labels=['gender_vote'], axis='columns')

In [None]:
# Preview the first rows after the gender step.
perfumes.head()

Unnamed: 0,name,company,for_gender,rating,number_votes,main accords,top notes,middle notes,base notes,price value,weighted_longevity,weighted_sillage,weighted_gender
0,Angels' Share,By Kilian,for women and men,4.31,682.0,"{'woody': 100.0, 'sweet': 92.6987, 'warm spicy...",['Cognac'],"['Cinnamon', 'Tonka Bean', 'Oak']","['Praline', 'Vanilla', 'Sandalwood']","{'way overpriced': 64, 'overpriced': 143, 'ok'...",3.69,2.594,2.926
1,My Way,Giorgio Armani,for women,3.57,1471.0,"{'white floral': 100.0, 'citrus': 60.4322, 'tu...","['Orange Blossom', 'Bergamot']","['Tuberose', 'Indian Jasmine']","['White Musk', 'Madagascar Vanilla', 'Virginia...","{'way overpriced': 38, 'overpriced': 121, 'ok'...",3.332,2.615,1.078
2,Libre Intense,Yves Saint Laurent,for women,4.02,858.0,"{'vanilla': 100.0, 'aromatic': 71.4216, 'sweet...","['Lavender', 'Mandarin Orange', 'Bergamot']","['Lavender', 'Tunisian Orange Blossom', 'Jasmi...","['Madagascar Vanilla', 'Tonka Bean', 'Ambergri...","{'way overpriced': 11, 'overpriced': 59, 'ok':...",3.736,2.819,1.79
3,Dior Homme 2020,Christian Dior,for men,3.42,1402.0,"{'woody': 100.0, 'musky': 72.7229, 'amber': 53...","['Bergamot', 'Pink Pepper', 'elemi']","['Cashmere Wood', 'Atlas Cedar', 'Patchouli']","['Iso E Super', 'Haitian Vetiver', 'White Musk']","{'way overpriced': 31, 'overpriced': 59, 'ok':...",3.146,2.3,4.62
4,Acqua di Giò Profondo,Giorgio Armani,for men,4.03,869.0,"{'aromatic': 100.0, 'marine': 93.2493, 'citrus...","['Sea Notes', 'Aquozone', 'Bergamot', 'Green M...","['Rosemary', 'Cypress', 'Lavender', 'Mastic or...","['Mineral notes', 'Musk', 'Patchouli', 'Amber']","{'way overpriced': 32, 'overpriced': 84, 'ok':...",3.188,2.38,4.596


## Price Preprocessing

`weighted_price` varies from $1$ to $5$; this is a continuous value

In [None]:
# The CSV stores each vote tally as the string form of a dict; parse it into a real dict.
perfumes['price value'] = perfumes['price value'].apply(ast.literal_eval)

# Keep only perfumes with at least one price vote. The explicit .copy()
# makes the filtered result an independent DataFrame, so adding columns to it
# later does not raise SettingWithCopyWarning.
perfumes = perfumes[~perfumes['price value'].apply(all_zeros)].copy()

In [None]:
# Shape after removing perfumes whose price votes were all zero.
perfumes.shape

(431, 13)

In [None]:
def calculate_price_weighted_average(price):
    """Convert a price-value vote tally into a weighted average on a 1-5 scale.

    Parameters
    ----------
    price : dict or str
        Mapping from vote category ('way overpriced', 'overpriced', 'ok',
        'good value', 'great value') to vote count, or the string literal of
        such a dict.

    Returns
    -------
    float
        Vote-weighted mean of the category weights (way overpriced = 1 ...
        great value = 5), rounded to 3 decimals.

    Raises
    ------
    ValueError
        If ``price`` is neither a str nor a dict.
    KeyError
        If the tally contains a category other than the five listed above.
    ZeroDivisionError
        If the vote counts sum to zero.
    """
    if isinstance(price, str):
        price_dict = ast.literal_eval(price)
    elif isinstance(price, dict):
        price_dict = price
    else:
        # The message names this function's own field; it previously said
        # "sillage", a leftover from the function it was copied from.
        raise ValueError(f"Unexpected type for price value: {type(price)}")

    total_responses = sum(price_dict.values())

    # Weight of each category, from worst value (1) to best value (5).
    weights = {'way overpriced': 1, 'overpriced': 2, 'ok': 3, 'good value': 4, 'great value': 5}

    weighted_average = sum(weights[key] * value for key, value in price_dict.items()) / total_responses

    return round(weighted_average, 3)

In [None]:
# Add the weighted score with assign(), which returns a new DataFrame instead
# of writing into the filtered slice (item assignment on a filtered frame can
# emit SettingWithCopyWarning).
perfumes = perfumes.assign(
    weighted_price=perfumes['price value'].apply(calculate_price_weighted_average)
)

In [None]:
# Remove the raw vote-tally column.
perfumes = perfumes.drop('price value', axis=1)

In [None]:
# Preview the first rows after the price step.
perfumes.head()

Unnamed: 0,name,company,for_gender,rating,number_votes,main accords,top notes,middle notes,base notes,weighted_longevity,weighted_sillage,weighted_gender,weighted_price
0,Angels' Share,By Kilian,for women and men,4.31,682.0,"{'woody': 100.0, 'sweet': 92.6987, 'warm spicy...",['Cognac'],"['Cinnamon', 'Tonka Bean', 'Oak']","['Praline', 'Vanilla', 'Sandalwood']",3.69,2.594,2.926,2.353
1,My Way,Giorgio Armani,for women,3.57,1471.0,"{'white floral': 100.0, 'citrus': 60.4322, 'tu...","['Orange Blossom', 'Bergamot']","['Tuberose', 'Indian Jasmine']","['White Musk', 'Madagascar Vanilla', 'Virginia...",3.332,2.615,1.078,2.513
2,Libre Intense,Yves Saint Laurent,for women,4.02,858.0,"{'vanilla': 100.0, 'aromatic': 71.4216, 'sweet...","['Lavender', 'Mandarin Orange', 'Bergamot']","['Lavender', 'Tunisian Orange Blossom', 'Jasmi...","['Madagascar Vanilla', 'Tonka Bean', 'Ambergri...",3.736,2.819,1.79,2.862
3,Dior Homme 2020,Christian Dior,for men,3.42,1402.0,"{'woody': 100.0, 'musky': 72.7229, 'amber': 53...","['Bergamot', 'Pink Pepper', 'elemi']","['Cashmere Wood', 'Atlas Cedar', 'Patchouli']","['Iso E Super', 'Haitian Vetiver', 'White Musk']",3.146,2.3,4.62,2.685
4,Acqua di Giò Profondo,Giorgio Armani,for men,4.03,869.0,"{'aromatic': 100.0, 'marine': 93.2493, 'citrus...","['Sea Notes', 'Aquozone', 'Bergamot', 'Green M...","['Rosemary', 'Cypress', 'Lavender', 'Mastic or...","['Mineral notes', 'Musk', 'Patchouli', 'Amber']",3.188,2.38,4.596,2.462
