In [None]:
import re
import requests
import random
import json
from tqdm import tqdm

# Seed the RNG so that random.sample() in _request() yields the same proxy
# order on every run for a given proxy list.
random.seed(2020)


In [None]:
# Send requests through proxies to reduce the risk of an IP ban.

PROXIES = None # list of proxy URLs; stays None until request() fills it via get_proxies()

def get_proxies(limit=20, timeout=10):
    """Scrape proxy addresses from free-proxy-list.net.

    Args:
        limit: Maximum number of proxies to keep, taken in page order.
        timeout: Seconds to wait for the listing page; without a timeout a
            stalled server would block the notebook indefinitely.

    Returns:
        A list of proxy URLs of the form ``http://<ip>:<port>``. The list is
        empty when no table row matches the expected markup.

    Raises:
        requests.RequestException: If the listing page cannot be fetched,
            including when ``timeout`` expires.
    """
    res = requests.get('https://free-proxy-list.net/', timeout=timeout)
    # Each table row is expected to start with <td>ip</td><td>port</td>.
    pattern = r'<tr><td>([\d\.]+)<\/td><td>([\d]+)<\/td>'
    rows = re.findall(pattern, res.text)
    return ['http://{}:{}'.format(ip, port) for ip, port in rows[:limit]]

def _request(url, timeout=10):
    """Fetch ``url`` through the first proxy in PROXIES that answers.

    The proxies are tried in a random order. Any failure with one proxy is
    ignored and the next proxy is tried.

    Args:
        url: The URL to fetch.
        timeout: Seconds to wait on each proxy before moving to the next one,
            so that a dead proxy cannot hang the request.

    Returns:
        The ``requests.Response`` from the first proxy that responds, or None
        when PROXIES is empty/unset or every proxy failed.
    """
    candidates = PROXIES or []
    for proxy in random.sample(candidates, len(candidates)):
        try:
            # requests picks the proxy by URL scheme, so the 'https' key is
            # required for https:// URLs to actually go through the proxy.
            return requests.get(
                url,
                proxies={'http': proxy, 'https': proxy},
                timeout=timeout,
            )
        except Exception:
            # Best effort: a failing proxy is skipped. Unlike a bare
            # ``except:``, this lets KeyboardInterrupt/SystemExit propagate.
            continue
    # all proxies are dead
    return None

def request(url):
    """Fetch ``url`` via the proxy pool, refreshing the pool once on failure.

    PROXIES is populated on first use. If the first attempt yields a falsy
    result (None, or a response that evaluates to False), the proxy list is
    fetched again and the request is retried a single time.

    Returns:
        Whatever the last ``_request`` call returned; this may be None.
    """
    global PROXIES
    if PROXIES is None:
        PROXIES = get_proxies()
    response = _request(url)
    if not response:
        # First pass failed: refresh the proxies and retry once.
        PROXIES = get_proxies()
        response = _request(url)
    return response

In [None]:
def allshoes_url():
    """Return the absolute URL of the shoes listing on zappos.com.

    The listing path is read from the first matching ``/shoes/....zso`` link
    on the ``/null/.zso`` page. Returns None when no such link is found.
    """
    base = 'https://www.zappos.com'
    landing = request('{}{}'.format(base, '/null/.zso'))
    match = re.search(r'class="" href="(/shoes/[\_\w]+\.zso)"', landing.text)
    if match is None:
        return None
    return '{}{}'.format(base, match.group(1))

def get_numpages(text):
    """Extract the total number of result pages from a listing page.

    Args:
        text: Source of a listing page containing a ``"pageCount":<n>`` field.

    Returns:
        The page count as an int, or None if the field is absent.
    """
    # Raw string: '\d' in a plain literal is an invalid escape sequence and
    # triggers a SyntaxWarning on recent Python versions.
    m = re.search(r'"pageCount":([\d]+)', text)
    if m:
        return int(m.group(1))
    return None

def get_pageinfo(url_root, npage):
    """Download the source of one listing page.

    Page 0 is the listing root itself; every other page is addressed with a
    ``?p=<npage>`` query string appended to ``url_root``.
    """
    url_page = url_root if npage == 0 else '{}?p={}'.format(url_root, npage)
    response = request(url_page)
    return response.text

def get_links(pageinfo):
    """Extract the product links contained in a listing page.

    Args:
        pageinfo: Source of a listing page.

    Returns:
        A list of ``(style_id, href)`` string tuples, one per matching product
        anchor, in page order. Empty when nothing matches.
    """
    # Raw string: the plain literal contained invalid escape sequences
    # (\d, \/, \w, \-), which trigger a SyntaxWarning on recent Python versions.
    pattern = r'" data-style-id="([\d]+)" itemProp="url" class="bg" href="([\/\w\-]+)"><meta item'
    return re.findall(pattern, pageinfo)

def _make_iteminfo(name, brand, category, gender):
    """Build the item record, or return None when the category is an accessory."""
    if 'accessories' in category.lower():
        return None
    return {
        "name": name,
        "brand": brand,
        "category": category,
        "gender": gender,
    }


def get_iteminfo(metadata):
    """Extract name, brand, category and gender of a product page.

    Two page layouts are recognised. The primary layout carries a
    ``"category"``/``"subCategory"``/``"gender"`` block; the fallback layout
    carries ``"defaultProductType"``, ``"productName"``, ``"brandName"``,
    ``"defaultCategory"`` and ``"genders"`` fields.

    Args:
        metadata: Source of a product page.

    Returns:
        A dict with keys ``name``, ``brand``, ``category`` and ``gender``, or
        None when the product is not of type "Shoes", when its category
        contains "accessories", or when a required field is missing.
        In the primary layout ``category`` holds the sub-category; in the
        fallback layout it holds the default category and ``gender`` is the
        raw text of the ``"genders"`` array.
    """
    pattern = (r'","price":".+","name":"(.+)","brand":"(.+)",'
               r'"category":"(.+)","subCategory":"(.+)","gender":"(.+)"}},"pageType"')
    m = re.search(pattern, metadata)
    if m and m.group(3) == "Shoes":
        return _make_iteminfo(m.group(1), m.group(2), m.group(4), m.group(5))

    # Fallback layout (also tried when the primary layout is not "Shoes").
    m = re.search(r'"defaultProductType":"(.+)","defaultProductUrl"', metadata)
    if not (m and m.group(1) == "Shoes"):
        return None
    name_m = re.search(r'"productName":"(.+)","productRating"', metadata)
    brand_m = re.search(
        r'"brandName":"(.+)","defaultCategory":"(.+)","defaultImageUrl"', metadata)
    gender_m = re.search(r'"genders":([",\[\]\w]+),"overallRating"', metadata)
    # A missing field makes the page unusable; report it as None so the
    # caller can record the link instead of crashing on `.group()`.
    if not (name_m and brand_m and gender_m):
        return None
    return _make_iteminfo(
        name_m.group(1), brand_m.group(1), brand_m.group(2), gender_m.group(1))

def get_itemimages(metadata, link):
    """Collect the image download links of every style on a product page.

    Args:
        metadata: Source of a product page.
        link: Unused; kept so existing callers keep working.

    Returns:
        A dict mapping each style id to ``{image_type: image_url}``. Only the
        first occurrence of a style id is used. Returns None when a style id
        that has a "PAIR" image block on the page could not be parsed, so the
        caller can treat the page as broken.
    """
    # Raw strings: the plain literals contained invalid escape sequences
    # (\d, \w, \s, ...), which trigger a SyntaxWarning on recent Python
    # versions. The regex text itself is unchanged.
    style_pattern = (r'''"styleId":"([\d]+)"[\w\s\+\]\['&-{},":]*"images":'''
                     r'''(\[{"type":"PAIR"[\w\+-\{\}\]\[,"]+),("[a-zA-Z]+")''')
    image_pattern = r'"type":"([\w]+)","imageId":"([\w\+-]+)"}'
    check_pattern = r'''"styleId":"([\d]+)"[\w\s\+\]\['&-{},":]*"images":\[{"type":"PAIR"'''

    items_dict = {}
    for item_id, images_data, _ in re.findall(style_pattern, metadata):
        images = re.findall(image_pattern, images_data)
        if not images or item_id in items_dict:
            continue
        items_dict[item_id] = {
            image_type: 'https://m.media-amazon.com/images/I/{}.jpg'.format(image_id)
            for image_type, image_id in images
        }

    # Consistency check: every style that starts a "PAIR" image block must
    # have been captured above; otherwise the page is reported as unusable.
    expected_ids = re.findall(check_pattern, metadata)
    if any(style_id not in items_dict for style_id in expected_ids):
        return None
    return items_dict

In [None]:
# Collect the product links of every listing page.
url_main = 'https://www.zappos.com'
url_root = allshoes_url()
if url_root is None:
    raise RuntimeError('could not find the shoes listing URL on zappos.com')
res = request(url_root)
if res is None:
    raise RuntimeError('could not download {}'.format(url_root))
num_pages = get_numpages(res.text)
if num_pages is None:
    raise RuntimeError('could not find "pageCount" in {}'.format(url_root))

itemlinks = []
for npage in tqdm(range(num_pages)):
    pageinfo = get_pageinfo(url_root, npage)
    _links = get_links(pageinfo)
    itemlinks.extend(_links)
print('\ntotal links', len(itemlinks))

In [None]:
# Visit every product page and store its info and image links per style id.
DATASET = {}
broken_links = []

for item_id, link in tqdm(itemlinks):
    if item_id in DATASET:
        continue
    response = request('https://www.zappos.com{}'.format(link))
    if response is None:
        # request() gave up on this page; record it instead of aborting
        # the whole crawl on `None.text`.
        broken_links.append(link)
        continue
    metadata = response.text
    iteminfo = get_iteminfo(metadata)
    itemimages = get_itemimages(metadata, link)
    if iteminfo is None or itemimages is None:
        broken_links.append(link)
        continue
    # A product page can list several styles; keep the first record seen
    # for each style id.
    for _id in itemimages:
        if _id in DATASET:
            continue
        DATASET[_id] = {
                'info': iteminfo,
                'images': itemimages[_id],
                }

print('Total shoes:', len(DATASET))
print('Broken links:', len(broken_links))

In [None]:
# Serialise the collected dataset to dataset.json in the working directory.
with open('dataset.json', 'w') as f:
    f.write(json.dumps(DATASET))