## Data Scraper
The data used in this project comes from nosetime.com, a Chinese online community for perfume lovers.

In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import json
from multiprocess import Pool

#### Get all brands and their urls

In [8]:
# Base URL of the site; relative hrefs scraped from its pages are appended to it.
main = 'https://www.nosetime.com'
# Browser-like User-Agent sent with the page requests below.
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36 Edg/91.0.864.41'
}

# Download and parse the brand index page (`/pinpai/`).
pp_main_url = main + '/pinpai/'
pp_main = requests.get(pp_main_url, headers=headers, timeout=1)
# Fail loudly on an HTTP error instead of parsing an error page as if it were
# the brand index.
pp_main.raise_for_status()
pp_main_sp = BeautifulSoup(pp_main.content, 'html.parser')

In [9]:
# Every <li> inside the "brandmenu" <div> is one menu entry.
pp_menu = pp_main_sp.find('div', attrs={'class':'brandmenu'}).find_all('li')

# Map the visible text of each menu entry to the href of the link it contains.
menu_dict = {entry.text: entry.a['href'] for entry in pp_menu}

In [10]:
# Visit every menu page and collect its brands, keyed by the brand page href:
#     brand_url[href] = (text just before the <br>, text just after the <br>)
# The two texts are later stored as the `name_cn` / `name_en` columns.
brand_url = {}
for url in menu_dict.values():  # the menu labels themselves are not needed
    pp_cat_url = main + url
    pp_cat = requests.get(pp_cat_url, headers=headers, timeout=1)
    # Surface an HTTP error directly rather than as an AttributeError further
    # down, when an error page turns out not to contain the expected <div>.
    pp_cat.raise_for_status()
    pp_cat_sp = BeautifulSoup(pp_cat.content, 'html.parser')
    pp_ls = pp_cat_sp.find('div', attrs={'class':'odorlist'}).find_all('li')
    for pp in pp_ls:
        # The second <a> of each entry carries the href and both name texts.
        tmp = pp.find_all('a')[1]
        br = tmp.find('br')  # looked up once instead of once per name
        # Store plain strings: the parsed text nodes keep a reference to their
        # whole document, which would keep every downloaded page in memory.
        brand_url[tmp['href']] = (str(br.previous_element), str(br.next_element))

In [11]:
# One row per brand, indexed by the brand page href, with the two scraped
# name texts as columns.
brands = pd.DataFrame(
    list(brand_url.values()),
    index=list(brand_url.keys()),
)
brands.columns = ['name_cn', 'name_en']
# The brand id is the third '/'-separated piece of whatever precedes the
# first '-' in the href.
brands['id'] = [href.split('-')[0].split('/')[2] for href in brands.index]

#### Query for all perfumes by brands
Luckily, the site exposes an API (strangely, without any protection) that I can query directly for this information. I use `multiprocess` to run the requests in parallel and speed up the crawl.

In [30]:
def crawl_item(b_id):
    """Collect the ids of all perfumes listed for one brand via the search API.

    Result pages are requested one after another (page 1, 2, ...) until the
    API answers with an empty ``data`` payload.

    Args:
        b_id: Brand id, substituted into the ``word`` query parameter.

    Returns:
        A DataFrame with the columns ``id`` (taken from the API records) and
        ``brand_id`` (always ``b_id``), concatenated over all pages, or
        ``None`` when the first page is already empty.

    Raises:
        requests.RequestException: if a request fails or times out.
    """
    # Imported inside the function, presumably so that it is self-contained
    # when it runs in a pool worker process.
    import requests
    import json
    import pandas as pd

    # These do not change between pages, so build them once.
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36 Edg/91.0.864.41'
    }
    tpl = 'https://app.nosetime.com/app/search.php?type=item&in=brandid&word={}&page={}&orderby=hot&desc=-'

    res = []
    page = 1
    while True:
        search_url = tpl.format(b_id, page)
        # Without a timeout a stalled connection would block this worker
        # (and therefore the whole Pool.map call) indefinitely.
        search = requests.get(search_url, headers=headers, timeout=30)
        search_content = json.loads(search.content)['item']['data']
        if not search_content:
            # An empty page marks the end of the listing.
            break
        tmp = pd.DataFrame(search_content)[['id']]
        tmp['brand_id'] = b_id
        res.append(tmp)
        page += 1

    return pd.concat(res) if res else None

In [31]:
# Run crawl_item for every brand id using a pool of 5 worker processes;
# `res` collects one result (a DataFrame or None) per brand.
with Pool(5) as p:
    res = p.map(crawl_item, brands['id'])

In [33]:
# Stack the per-brand results into one table of (id, brand_id) rows.
# crawl_item returns None for brands without items; pd.concat drops None
# entries silently but raises ValueError if every entry is None.
items = pd.concat(res)

#### Query perfume info
Again, I query the API in parallel with `multiprocess`.

In [38]:
def crawl_item_detail(item_id):
    """Fetch the detail record of one perfume from the item API.

    Args:
        item_id: Perfume id; it is formatted into the request URL, so both
            string and integer ids are accepted.

    Returns:
        A Series parsed from the JSON response. If the request or the parsing
        fails, a Series holding only ``id`` is returned instead, so the caller
        can recognise the failed id and retry it.
    """
    # Imported inside the function, presumably so that it is self-contained
    # when it runs in a pool worker process.
    import io
    import requests
    import pandas as pd
    headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE'
    }
    # Formatting (rather than `+`) keeps a non-str id from raising TypeError
    # before the best-effort handling below can take effect.
    url = f'https://www.nosetime.com/app/item.php?id={item_id}'
    try:
        # Without a timeout a stalled connection would block the worker forever.
        item = requests.get(url, headers=headers, timeout=30)
        # Hand pandas a file-like object: raw bytes / literal JSON text are not
        # accepted by read_json across pandas versions, and such a failure
        # would be silently converted into the placeholder below.
        payload = io.StringIO(item.content.decode('utf-8'))
        ss = pd.read_json(payload, typ='series')
    except Exception:
        # Deliberate best effort: any failure yields an id-only placeholder.
        # Unlike a bare `except:`, this lets KeyboardInterrupt and SystemExit
        # propagate so the crawl can still be interrupted.
        ss = pd.Series({'id':item_id})
    return ss

In [39]:
# Fetch the detail record of every perfume using a pool of 10 worker processes.
with Pool(10) as p:
    res = p.map(crawl_item_detail, items['id'])

res_df = pd.DataFrame(res).set_index('id')

# A failed fetch yields an id-only record, which leaves 'title' empty.
# Retry those ids once and fill the gaps with whatever the retry returns.
missing_title = res_df['title'].isna()
if missing_title.any():
    with Pool(10) as p:
        # The retry must use crawl_item_detail; the name `crawl` that stood
        # here is not defined and raised NameError as soon as a retry was needed.
        res = p.map(crawl_item_detail, res_df.index[missing_title])

    # combine_first keeps existing values and only fills the missing ones.
    res_df = res_df.combine_first(pd.DataFrame(res).set_index('id'))

# Drop rows whose id is missing before writing the result.
# NOTE(review): index=None is falsy, so the 'id' index is presumably not
# written to the CSV — confirm this is intended.
res_df[~res_df.index.isna()].to_csv('data/perfumes.csv', index=None)