In [4]:
mens_decade = "https://www.fragrantica.com/awards/category/the-legend-of-the-latest-decade-for-men"

mens_all_time_2017 = (
    "https://www.fragrantica.com/awards/category/best-fragrance-of-all-times-for-men"
)
mens_all_time_2018 = (
    "https://www.fragrantica.com/awards2018/category/best-fragrance-for-men"
)
mens_all_time_2019 = (
    "https://www.fragrantica.com/awards2019/category/Best-Men-s-Perfume-of-All-Time"
)
mens_all_time_2020 = (
    "https://www.fragrantica.com/awards2020/category/Best-Men-s-Perfume-of-All-Time"
)
mens_all_time_2021 = (
    "https://www.fragrantica.com/awards2021/category/Best-Men-s-Perfume-of-All-Time"
)
mens_all_time_2022 = (
    "https://www.fragrantica.com/awards2022/category/Best-Men-s-Perfume-of-All-Time"
)

In [29]:
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import undetected_chromedriver

import pandas as pd

from re import search, sub, findall, split
from ast import literal_eval
from numpy import int64, isnan, NaN
import numpy as np
from scipy.stats import beta

# import matplotlib.pyplot as plt
import statistics

In [17]:
singleton_driver = undetected_chromedriver.Chrome()
singleton_driver.implicitly_wait(10)

In [3]:
def get_year(name: str):
    try:
        year = search("(\d{4})$", name).group(0)
    except AttributeError:
        year = 0
    return year


def get_awards_votes(driver: undetected_chromedriver.Chrome, url):
    index = url.rindex("/") + 1
    file_name = url[index:] + ".txt"
    driver.get(url)
    frag_dict = dict()
    i = 1
    while True:
        try:
            name = driver.find_element(
                By.CSS_SELECTOR, f"div.small-6:nth-child({i}) > a:nth-child(2)"
            ).text
            upvotes = driver.find_element(
                By.CSS_SELECTOR,
                f"div.small-6:nth-child({i}) > div:nth-child(1) > div:nth-child(1) > div:nth-child(2)",
            ).text
            downvotes = driver.find_element(
                By.CSS_SELECTOR,
                f"div.small-6:nth-child({i}) > div:nth-child(1) > div:nth-child(2) > div:nth-child(2)",
            ).text
            year = get_year(name)
            frag_dict[search("(.+\(\w+\))", name).group(0)] = (
                int(year),
                int(upvotes),
                int(downvotes),
            )
            i = i + 1
        except (NoSuchElementException, AttributeError):
            print(name, upvotes, downvotes, year)
            break
    return file_name, frag_dict


def positive_votes(x, **kwargs):
    return x[1][1]


def vote_difference(x, **kwargs):
    diff = x[1][1] - x[1][2]
    return diff


def bayesian_rating(x, **kwargs):
    a = x[1][1] + kwargs["avg_pos_votes"]
    b = x[1][2] + kwargs["avg_neg_votes"]
    rank = beta.ppf(0.05, a, b)
    return rank


def print_console_and_file(text, file):
    if file != None:
        print(text, file=file)
    print(text)


def print_list(
    file_name,
    frag_dict,
    min_year=0,
    max_year=3000,
    cutoff=1,
    ranking_func=bayesian_rating,
    max_print=50,
    include_female=True,
):
    if file_name != None:
        out_file = open(file_name, "w")
    else:
        out_file = None

    avg_pos_votes = statistics.mean([item[1][1] for item in frag_dict.items()])
    avg_neg_votes = statistics.mean([item[1][2] for item in frag_dict.items()])
    filtered_dict = dict()

    i = 0
    for key, value in sorted(
        frag_dict.items(),
        key=lambda x: ranking_func(
            x, avg_pos_votes=avg_pos_votes, avg_neg_votes=avg_neg_votes
        ),
        reverse=True,
    ):
        if (
            value[0] in [*range(min_year, max_year + 1)]
            and value[1] >= cutoff
            and i < max_print
        ):
            if include_female:
                filtered_dict[key] = value
                i = i + 1
            else:
                if "female" not in findall("\((\w*?)\)", key):
                    filtered_dict[key] = value
                    i = i + 1

    name_spaces = max([len(item[0]) for item in filtered_dict.items()]) + 2
    total_spaces = name_spaces + 32
    print_console_and_file("-" * total_spaces, file=out_file)
    print_console_and_file(
        "{1:<{0}s}{2:>10s}{3:>10s}{4:>12s}".format(
            name_spaces, "Name", "Year", "Upvotes", "Ratio"
        ),
        file=out_file,
    )
    print_console_and_file("-" * total_spaces, file=out_file)

    for key, value in sorted(
        filtered_dict.items(),
        key=lambda x: ranking_func(
            x, avg_pos_votes=avg_pos_votes, avg_neg_votes=avg_neg_votes
        ),
        reverse=True,
    ):
        ratio = value[1] / (value[1] + value[2]) if (value[1] + value[2]) > 0 else 1
        print_console_and_file(
            "{1:<{0}s}{2:>10d}{3:>10d}{4:>12f}".format(
                name_spaces, key, int(value[0]), int(value[1]), ratio
            ),
            file=out_file,
        )


def graph_years(frag_dict, ranking_func=bayesian_rating):
    avg_pos_votes = statistics.mean([item[1][1] for item in frag_dict.items()])
    avg_neg_votes = statistics.mean([item[1][2] for item in frag_dict.items()])

    x = [item[1][0] for item in frag_dict.items()]
    y = [
        ranking_func(item, avg_pos_votes=avg_pos_votes, avg_neg_votes=avg_neg_votes)
        for item in frag_dict.items()
    ]

    return x, y

In [49]:
singleton_driver.get(mens_all_time_2022)
count_fragrances = len(WebDriverWait(singleton_driver, 10).until(
    EC.presence_of_all_elements_located((
        By.CLASS_NAME, 'nomination-box'
    ))
))
print(count_fragrances)

fragrances = []
for i in range(1, count_fragrances + 1):
    try:
        name = singleton_driver.find_element(
            By.CSS_SELECTOR, f"div.small-6:nth-child({i}) > a:nth-child(2)"
        ).text
        upvotes = singleton_driver.find_element(
            By.CSS_SELECTOR,
            f"div.small-6:nth-child({i}) > div:nth-child(1) > div:nth-child(1) > div:nth-child(2)",
        ).text
        downvotes = singleton_driver.find_element(
            By.CSS_SELECTOR,
            f"div.small-6:nth-child({i}) > div:nth-child(1) > div:nth-child(2) > div:nth-child(2)",
        ).text
        year = get_year(name)
        fragrances.append(
            {
                "name": name,
                "upvotes": upvotes,
                "downvotes": downvotes,
                "year": year,
                "order": i
            }
        )
    except (NoSuchElementException, AttributeError):
        print(name, upvotes, downvotes, year)
        break

pd.DataFrame(fragrances).to_csv("mens_all_time_2022.csv", index=False)

1175


In [7]:
with undetected_chromedriver.Chrome(headless=True) as driver:
    # driver.implicitly_wait(10)
    file_name, frag_dict = get_awards_votes(driver, mens_all_time_2017)

ValueError: I/O operation on closed file

In [6]:
print_list(None, frag_dict, ranking_func=bayesian_rating)

StatisticsError: mean requires at least one data point

In [21]:
# df1 = DataFrame.from_dict(frag_dict1, orient='index', columns=['year','upvotes','downvotes']).reset_index()
# df2 = DataFrame.from_dict(frag_dict2, orient='index', columns=['year','upvotes','downvotes']).reset_index()
# df3 = DataFrame.from_dict(frag_dict3, orient='index', columns=['year','upvotes','downvotes']).reset_index()
# df4 = DataFrame.from_dict(frag_dict4, orient='index', columns=['year','upvotes','downvotes']).reset_index()
# df5 = DataFrame.from_dict(frag_dict5, orient='index', columns=['year','upvotes','downvotes']).reset_index()

# df_combined = concat([df1, df4, df5]).groupby('index').sum()
# df_combined['year'] = df_combined['year'].apply(lambda x: x/2 if x>3000 else x)

# combined_dict = df_combined.to_dict(orient='index')

In [10]:
# print_list('testfile.txt', {k: tuple(v.values()) for k, v in combined_dict.items()}, ranking_func=vote_difference)

In [7]:
# with Firefox(options=options) as driver:
#     driver.implicitly_wait(10)
#     driver.get('https://www.fragrantica.com/notes/')

#     note_group_names = []
#     for i in ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']:
#         note_group_names.append(
#             driver.find_element(By.CSS_SELECTOR,
#                                 f'#groupnotes_group_{i}_title > div:nth-child(1) > h2:nth-child(1)').text.title())

#     note_group_notes = []
#     i = 0
#     for j in [6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39]:
#         note_group_notes.append(
#             sub(r'\n\n', ',',
#                 driver.find_element(By.CSS_SELECTOR,
#                                     f'.grid-padding-x > div:nth-child({j}) > div:nth-child(1)').text).split(','))
#         i = i + 1

# note_groups = list(zip(note_group_names, note_group_notes))

In [8]:
# fragdb = read_csv('frags.csv')

# notes_to_add = fragdb['notes'].tolist()
# seasons_to_add = fragdb['seasons'].tolist()
# times_to_add = fragdb['times'].tolist()
# accords_to_add = fragdb['accords'].tolist()
# last_index = fragdb['accords'].last_valid_index()
# start = last_index + 1 if last_index != None else 0

# with Firefox(options=options) as driver:
#     driver.implicitly_wait(10)

#     for i in range(start, start + 10, 1):
#         # Get page
#         driver.get(fragdb.iloc[i].url)

#         # Get notes
#         notes = driver.find_element(By.ID, 'userMainNotes').find_elements(By.TAG_NAME, 'div')
#         names = []
#         votes = []
#         image_nums = []
#         for note in notes:
#             img_element = note.find_elements(By.TAG_NAME, 'img')[0]
#             names.append(img_element.get_attribute('title').title())
#             votes.append(note.find_elements(By.TAG_NAME, 'span')[0].text)
#             image_nums.append(sub(r'[^0-9]', '', img_element.get_attribute('src')))
#         votes = [int(i) for i in votes]
#         votes = [float(i)/sum(votes) for i in votes]

#         # Get seasons
#         winter = int(driver.find_element(By.ID, 'clswinterD').get_property('style').get('height')[:-2])
#         spring = int(driver.find_element(By.ID, 'clsspringD').get_property('style').get('height')[:-2])
#         summer = int(driver.find_element(By.ID, 'clssummerD').get_property('style').get('height')[:-2])
#         fall = int(driver.find_element(By.ID, 'clsautumnD').get_property('style').get('height')[:-2])
#         seasons_total = winter + spring + summer + fall
#         winter = winter / seasons_total
#         spring = spring / seasons_total
#         summer = summer / seasons_total
#         fall = fall / seasons_total

#         # Get times
#         day = int(driver.find_element(By.ID, 'clsdayD').get_property('style').get('height')[:-2])
#         night = int(driver.find_element(By.ID, 'clsnightD').get_property('style').get('height')[:-2])
#         times_total = day + night
#         day = day / times_total
#         night = night / times_total

#         # Get accords
#         accords = []
#         values = []
#         accords_css = '#prettyPhotoGallery > div:nth-child(1) > '
#         # TODO: support variable number of accords
#         for j in [2, 4, 6, 8, 10]:
#             accords.append(driver.find_element(By.CSS_SELECTOR,
#                                              accords_css + f'div:nth-child({j}) > span:nth-child(1)')
#                            .text)
#             values.append(driver.find_element(By.CSS_SELECTOR,
#                                              accords_css + f'div:nth-child({j}) > div:nth-child(2)')
#                           .get_property('style').get('width')[:-2])
#         values = [int(i) for i in values]
#         values = [float(i)/130 for i in values]


#         # Add notes, seasons, and times
#         notes_to_add[i] = list(zip(names, votes, image_nums))
#         seasons_to_add[i] = (winter, spring, summer, fall)
#         times_to_add[i] = (day, night)
#         accords_to_add[i] = list(zip(accords, values))

# fragdb['notes'] = Series(notes_to_add)
# fragdb['seasons'] = Series(seasons_to_add)
# fragdb['times'] = Series(times_to_add)
# fragdb['accords'] = Series(accords_to_add)
# fragdb.to_csv('frags.csv', index=False)

In [9]:
# fragdb = read_csv('frags.csv')
# last_valid_index = fragdb['accords'].last_valid_index()
# valid_indices = range(0, last_valid_index+1)

# # Get unique notes
# unique_notes = set()
# for i in valid_indices:
#     for note in literal_eval(fragdb['notes'].iloc[i]):
#         unique_notes.add(note[0])

# # Get unique accords
# unique_accords = set()
# for i in valid_indices:
#     for accord in literal_eval(fragdb['accords'].iloc[i]):
#         unique_accords.add(accord[0])

# # Output notes sheet
# frag_dict = {}
# for i in valid_indices:
#     note_dict = {}
#     note_list = literal_eval(fragdb['notes'].iloc[i])
#     for note in note_list:
#         note_dict.update({note[0]:note[1]})
#     frag_dict.update({fragdb['name'].iloc[i]:note_dict})
# fragdb_notes = DataFrame(index=frag_dict.keys(), columns=sorted(unique_notes))
# for frag in fragdb_notes.iterrows():
#     for note in frag_dict.get(frag[0]):
#         fragdb_notes.loc[frag[0], note] = frag_dict.get(frag[0]).get(note)
# fragdb_notes = fragdb_notes.fillna(0)
# fragdb_notes.to_csv('frags_notes.csv')

# # Output accords sheet
# frag_dict = {}
# for i in valid_indices:
#     accord_dict = {}
#     accord_list = literal_eval(fragdb['accords'].iloc[i])
#     for accord in accord_list:
#         accord_dict.update({accord[0]:accord[1]})
#     frag_dict.update({fragdb['name'].iloc[i]:accord_dict})
# fragdb_accords = DataFrame(index=frag_dict.keys(), columns=sorted(unique_accords))
# for frag in fragdb_accords.iterrows():
#     for accord in frag_dict.get(frag[0]):
#         fragdb_accords.loc[frag[0], accord] = frag_dict.get(frag[0]).get(accord)
# fragdb_accords = fragdb_accords.fillna(0)
# fragdb_accords.to_csv('frags_accords.csv')

# # Output seasons sheet
# fragdb_seasons = DataFrame(index=fragdb_notes.index)
# fragdb_seasons.insert(fragdb_seasons.shape[1], 'Winter', [
#     literal_eval(season_tuple)[0] for season_tuple in fragdb['seasons'].iloc[valid_indices]
# ])
# fragdb_seasons.insert(fragdb_seasons.shape[1], 'Spring', [
#     literal_eval(season_tuple)[1] for season_tuple in fragdb['seasons'].iloc[valid_indices]
# ])
# fragdb_seasons.insert(fragdb_seasons.shape[1], 'Summer', [
#     literal_eval(season_tuple)[2] for season_tuple in fragdb['seasons'].iloc[valid_indices]
# ])
# fragdb_seasons.insert(fragdb_seasons.shape[1], 'Fall', [
#     literal_eval(season_tuple)[3] for season_tuple in fragdb['seasons'].iloc[valid_indices]
# ])
# fragdb_seasons.to_csv('frags_seasons.csv')