# Import packages

In [None]:
import pickle
import random
import time
import urllib.request
from operator import itemgetter

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

# Open perfume urls

In [None]:
# Load the pickled list of perfume urls; despite the .txt extension the file
# holds pickle data, so it is opened in binary mode.
with open("perfume_links_all.txt", "rb") as links_file:
    perfume_list = pickle.load(links_file)

# Sort perfume list to figure out which perfumes to scrape first
How to rank: I hypothesize that the more perfumes a brand has, the more popular the brand is overall, and therefore the more reviews and user data each of its perfumes will have. So I counted the number of perfumes per brand, ordered the perfumes by their brand's perfume count in descending order, and started scraping with the most prolific brands.

In [None]:
def perfume_dict(perfume_list):
    """
    Count how many perfume urls belong to each designer.

    input: perfume_list, list of perfume urls; the designer is taken to be
           the piece at index 4 after splitting the url on '/'
    returns: dictionary mapping each designer to its number of perfume urls
    """
    designer_counts = {}
    for url in perfume_list:
        designer = url.split('/')[4]
        designer_counts[designer] = designer_counts.get(designer, 0) + 1
    return designer_counts

In [None]:
# Count perfumes per designer
perfumes_dict = perfume_dict(perfume_list)

# Rank (designer, count) pairs from the largest count to the smallest
sorted_perfumes = list(perfumes_dict.items())
sorted_perfumes.sort(key=itemgetter(1), reverse=True)

In [None]:
def sort_perfume_urls(sorted_perfumes_tuples, perfume_list):
    """
    Order perfume urls so that urls of the highest-ranked designers come first.

    input:
    sorted_perfumes_tuples: nested list of tuples where [0] is designer and [1] is designer count
    perfume_list: list of perfume urls
    returns: list of strings of perfume urls, sorted by most popular designer;
             within one designer the urls keep their order from perfume_list.
             Urls that do not start with the fragrantica perfume prefix are dropped.
    """
    url_prefix = 'https://www.fragrantica.com/perfume/'

    # Group the urls by their exact designer segment in a single pass.
    # Matching the whole segment (instead of a bare startswith on the designer
    # name) keeps e.g. 'Dior' from also capturing 'Dior-Homme' urls, which
    # would otherwise be emitted twice.
    urls_by_designer = {}
    for perfume_url in perfume_list:
        if not perfume_url.startswith(url_prefix):
            continue
        designer = perfume_url[len(url_prefix):].partition('/')[0]
        urls_by_designer.setdefault(designer, []).append(perfume_url)

    sorted_perfume_urls = []
    for sorted_perfume in sorted_perfumes_tuples:
        sorted_perfume_urls.extend(urls_by_designer.get(sorted_perfume[0], ()))

    return sorted_perfume_urls

In [None]:
# Build the scraping order: urls grouped by designer, biggest designers first
sorted_perfume_urls = sort_perfume_urls(
    sorted_perfumes_tuples=sorted_perfumes,
    perfume_list=perfume_list,
)

In [None]:
# Display how many urls ended up in the sorted list (notebook cell output)
len(sorted_perfume_urls)

In [None]:
# Persist the sorted url list as a pickle (binary write, despite the .txt name)
with open("sorted_perfumes.txt", "wb") as out_file:
    pickle.dump(sorted_perfume_urls, out_file)

# Read the sorted url list back from the same pickle file
with open("sorted_perfumes.txt", "rb") as in_file:
    sorted_perfume_urls = pickle.load(in_file)

In [None]:
# Display the sorted url list (notebook cell output)
sorted_perfume_urls

# Scrape perfume data: 
- 1300 pages/hour if average of 1 second wait
- 2142 pages/hour if average of 0.5 second wait

## Features to get

In [None]:
def _safe_extract(extractor, *args):
    """Run extractor(*args); return np.nan if it raises any ordinary exception."""
    try:
        return extractor(*args)
    except Exception:
        # Best effort: a page missing this feature yields NaN instead of
        # aborting the scrape. KeyboardInterrupt/SystemExit still propagate.
        return np.nan


def _extract_itemprop_name(soup, index):
    """Return the text of the index-th itemprop='name' tag."""
    # The slice drops the first 22 and last 7 characters of the tag's markup,
    # i.e. the surrounding opening and closing tag text.
    return str(soup.find_all(itemprop='name')[index])[22:-7]


def _extract_image(soup):
    """Return the src url found in the itemprop='image' markup."""
    return str(soup.find_all(itemprop='image')).split('src="')[1].split('" style')[0]


def _extract_votes(soup):
    """Return the nine vote-bar heights (px / 100), or nine NaNs if any one fails."""
    try:
        bars = soup.find('div', attrs={'id': 'diagramresult'}).find_all('div')
        return [int(str(bars[i]).split("height: ")[1].split('px')[0]) / 100
                for i in range(9)]
    except Exception:
        # All-or-nothing: one unparsable bar invalidates the whole group.
        return [np.nan] * 9


def _extract_accords(soup):
    """Return a list of (accord name, bar width in px / 130) tuples."""
    gallery_divs = soup.find('div', attrs={'id': 'prettyPhotoGallery'}).find_all('div')
    inner_divs = gallery_divs[0].find_all('div')
    all_accords = []
    # Every third nested div, starting at index 1, holds one accord.
    for accord_div in inner_divs[1::3]:
        markup = str(accord_div)
        all_accords.append((markup.split('z-index: 60;">')[1].split('<')[0],
                            int(markup.split('width: ')[2].split('px')[0]) / 130))
    return all_accords


def _extract_rating(soup):
    """Return the ratingValue divided by 5."""
    tag = soup.find_all('span', attrs={'itemprop': 'ratingValue'})[0]
    return float(str(tag).split('">')[1].split('<')[0]) / 5


def _extract_rating_count(soup):
    """Return the ratingCount as an int."""
    tag = soup.find_all('span', attrs={'itemprop': 'ratingCount'})[0]
    return int(str(tag).split('">')[1].split('<')[0])


def _extract_description(soup):
    """Return the text of the itemprop='description' div."""
    return soup.find('div', attrs={'itemprop': 'description'}).get_text()


def _extract_reminds(soup):
    """Return 2-tuples built from the img tags of the first 'votes' div."""
    images = soup.find_all('div', attrs={'class': 'votes'})[0].find_all('img')
    all_remind = []
    for image_tag in images:
        # Split on quotes after src=": piece 0 is the src value, piece 2 is
        # the value of the attribute that follows it in the markup.
        pieces = str(image_tag).split('src="')[1].split('"')
        all_remind.append((pieces[2], pieces[0]))
    return all_remind


def _extract_notes(soup):
    """Return the alt text of the image inside every 'rtgNote' span."""
    return [str(span.img).split('alt="')[1].split('" class')[0]
            for span in soup.find_all('span', attrs={'class': 'rtgNote'})]


def _extract_reviews(soup):
    """Return a dict mapping each review's <b> text to its <p> text."""
    return {review.find('b').get_text(): review.find('p').get_text()
            for review in soup.find_all('div', attrs={'class': 'pwq'})}


def get_features(soup):
    """
    Extract the features of one perfume page.

    Every feature is extracted on a best-effort basis: if its markup is
    missing or cannot be parsed, that feature is np.nan. The nine vote
    values (love ... night) are parsed as one group and are all np.nan if
    any of them fails.

    input: soup parsed by BeautifulSoup
    returns: a list of features in the order
             [name, image, designer, accords, notes, description, rating,
              rating_count, love, like, dislike, winter, spring, summer,
              fall, day, night, reminds, reviews_dict]
    """
    # The first itemprop='name' tag is read as the designer, the second as the name.
    designer = _safe_extract(_extract_itemprop_name, soup, 0)
    name = _safe_extract(_extract_itemprop_name, soup, 1)
    image = _safe_extract(_extract_image, soup)

    # love/like/dislike/winter/spring/summer/fall/day/night
    (love, like, dislike, winter, spring,
     summer, fall, day, night) = _extract_votes(soup)

    accords = _safe_extract(_extract_accords, soup)
    rating = _safe_extract(_extract_rating, soup)
    rating_count = _safe_extract(_extract_rating_count, soup)
    description = _safe_extract(_extract_description, soup)
    reminds = _safe_extract(_extract_reminds, soup)
    notes = _safe_extract(_extract_notes, soup)
    reviews_dict = _safe_extract(_extract_reviews, soup)

    return [name, image, designer, accords, notes, description, rating,
            rating_count, love, like, dislike, winter, spring, summer,
            fall, day, night, reminds, reviews_dict]

## Scrape url!

In [None]:
def scrape_perfume_urls(sorted_perfume_urls):
    """
    Download every perfume page and store its features in the global dataframe.

    For each url the page is fetched with a browser-like User-Agent and a
    10 second timeout, parsed with BeautifulSoup, run through get_features,
    and written to row n of the module-level dataframe `df`, where n counts
    the pages scraped successfully so far. A url whose request fails is
    reported and skipped without consuming a row index.

    input: sorted_perfume_urls, list of perfume urls in scraping order
    returns: None (rows are written into the global `df`)
    """
    # The headers are identical for every request, so build them once.
    user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
    headers = {'User-Agent': user_agent}

    # set index count
    n = 0

    # get webpage and scrape features
    for perf_url in sorted_perfume_urls:
        try:
            # Building the Request inside the try means a malformed url is
            # skipped like any other failed download instead of aborting.
            request = urllib.request.Request(perf_url, None, headers)
            # The context manager closes the connection even if read() fails.
            with urllib.request.urlopen(request, timeout=10) as response:
                data = response.read()
        except Exception as err:
            # Exception (not a bare except) so Ctrl-C can still stop the run.
            print(n, 'iswrong', perf_url, err)
            continue

        # Random pause of up to one second between successful requests.
        time.sleep(random.random())

        # parse and get features
        designer_soup = BeautifulSoup(data, 'html.parser')
        single_url = get_features(designer_soup)

        # append to dataframe
        df.loc[n] = single_url

        # increase index count
        n += 1
        print(n)

## Implementation

In [None]:
time_a = time.time()

# set up dataframe; the column order matches the list returned by get_features
df = pd.DataFrame(columns=['name', 'image', 'designer', 'accords', 'notes', 'description', 'rating', 
                           'rating_count', 'love', 'like', 'dislike', 'winter', 'spring', 'summer', 
                           'fall', 'day', 'night', 'reminds', 'reviews_dict'])

try:
    # scrape perfume urls (fills the global df row by row)
    scrape_perfume_urls(sorted_perfume_urls)
finally:
    # Save inside finally so rows scraped before an interruption or crash
    # are still written out. Rows are appended without a header line.
    df.to_csv('perfumes_temp.csv', mode='a', header=False)

    # report elapsed wall-clock seconds
    time_b = time.time()
    print(time_b - time_a)