In [1]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import json

## getting list of products

In [2]:
def get_products(url, pages=5):
    
    # Looking for products page
    prds_link = url + '/page/offerlist.htm'
    driver = webdriver.Firefox()

    ids = set()   
    for i in range(pages):
        
        prds_page = prds_link + '?pageNum=' + str(i)
        driver.get(prds_page)
        page = driver.page_source
        soup = BeautifulSoup(page, 'html.parser')

        prds = soup.find_all(attrs={'class':'offer-list-row-offer'})

        for p in prds:
            ids.add(p['data-offerid'])
    
    driver.close()
    driver.quit()
    return ids

In [4]:
url = 'https://shop934899wt55923.1688.com'

In [63]:
print(get_products(url))

{'565503532653', '582574059616', '585996499728', '565921082239', '585717265883', '566017573067', '566091022626', '566179163281', '575152039953', '569187016916', '589922080678', '573102375316', '585850238688', '585479245396', '587830298073', '565428089381', '587104053333', '565702953175', '586034411590', '565368924250', '565638027874', '565479858645', '576852113639', '565528881870', '589267401060', '585548980105', '565702681819', '579596430944', '568886687366', '589260879343', '588863734752', '590916719889', '589570415056', '585467715959', '589467741800', '571152751936', '565585035407', '587103289793'}


In [3]:
pages = {'565503532653', '582574059616', '585996499728', '565921082239', '585717265883', '566017573067', '566091022626', '566179163281', '575152039953', '569187016916', '589922080678', '573102375316', '585850238688', '585479245396', '587830298073', '565428089381', '587104053333', '565702953175', '586034411590', '565368924250', '565638027874', '565479858645', '576852113639', '565528881870', '589267401060', '585548980105', '565702681819', '579596430944', '568886687366', '589260879343', '588863734752', '590916719889', '589570415056', '585467715959', '589467741800', '571152751936', '565585035407', '587103289793'}

## Extracting data about product

In [43]:
def get_product_info(prd, percent=0.17):
    
    url = 'https://detail.1688.com/offer/'
    prd_page = url + prd + '.html'
    
    driver = webdriver.Firefox()
    driver.get(prd_page)
    page = driver.page_source
    soup = BeautifulSoup(page, 'html.parser')
    driver.close()
    driver.quit()
    
    # Getting prices and quantities
    info = soup.find_all(attrs={'data-range': True})
    prices = []
    for s in info:
        prices.append(json.loads(s['data-range']))
    
    # Getting available sizes
    info = soup.find_all(attrs={'data-sku-config': True})
    sizes = []
    for s in info:
        sizes.append(json.loads(s['data-sku-config']))
        
        
    # Finding product specifications
    info = soup.find_all(attrs={'class': 'unit-detail-spec-operator'})
    specs = []
    for p in info:
        specs.append(json.loads(p['data-imgs'])['original'])
    
    # Finding photos
    url = 'https://detail.1688.com/pic/'
    photo_page = url + prd + '.html'
    
    driver = webdriver.Firefox()
    driver.get(photo_page)
    page = driver.page_source
    soup = BeautifulSoup(page, 'html.parser')
    driver.close()
    driver.quit()
    
    info = soup.find_all(attrs={'data-img':True})
    photos = []
    for s in info:
        photos.append(s['data-img'])
        
    
    answer = {}
    answer['id'] = prd
    answer['prices'] = prices
    answer['sizes'] = sizes
    answer['photos'] = photos
    answer['specs'] = specs
    
    
    # Preparing answer for being saved in SQL
    
    ranges = []
    for r in answer['prices']:
        range_ = ''
        if r['begin'] == '':
            range_ += '>='
        else:
            range_ += r['begin']

        if r['end'] == '':
            range_ += '=<'
        else:
            range_ += '-' + r['end']

        ranges.append((range_, float(r['price']) * (1 + percent)))
    answer['prices'] = ranges
    
    sizes = []
    for s in answer['sizes']:
        sizes.append(s['skuName'])

    answer['sizes'] = sizes
    
    return answer

{'id': '565503532653', 'prices': [('2-9', 117.0), ('10-49', 111.14999999999999), ('50=<', 105.3)], 'sizes': ['S', 'M', 'L', 'XL', 'XXL'], 'photos': ['https://cbu01.alicdn.com/img/ibank/2018/835/790/8576097538_126174301.jpg', 'https://cbu01.alicdn.com/img/ibank/2018/989/150/8596051989_126174301.jpg', 'https://cbu01.alicdn.com/img/ibank/2018/244/690/8596096442_126174301.jpg', 'https://cbu01.alicdn.com/img/ibank/2018/859/191/8610191958_126174301.jpg', 'https://cbu01.alicdn.com/img/ibank/2018/852/721/8576127258_126174301.jpg'], 'specs': ['https://cbu01.alicdn.com/img/ibank/2018/244/690/8596096442_126174301.jpg', 'https://cbu01.alicdn.com/img/ibank/2018/989/150/8596051989_126174301.jpg', 'https://cbu01.alicdn.com/img/ibank/2018/835/790/8576097538_126174301.jpg']}


## Saving data in PDF

In [49]:
from fpdf import FPDF
import urllib.request

In [None]:
product = get_product_info('565503532653')

In [265]:
# A4 size is 210 x 297

def add_product(product, pdf):
    
    # Setting some parameters
    price = product['prices'][0][1]
    min_quantity = product['prices'][0][0].split('-')[0]
    sizes = ''
    for s in product['sizes']:
        sizes += s + ' '

    
    # Putting images into PDF
    name = 0
    for spec in product['specs']:
        
        urllib.request.urlretrieve(spec, "img/{}.jpg".format(name))
        pdf.image("img/{}.jpg".format(name), x=10 + (60 * name), y=8, w=30)
        
        name += 1

        
    # Putting text description into PDF    
    pdf.add_font('DejaVu', '', 'DejaVuSansCondensed.ttf', uni=True)
    pdf.set_font('DejaVu', '', 8)
    
    articules = ''
    name = 0
    for spec in product['specs']:    
        pdf.cell(w=25 + (60 * name))
        text = 'Артикул {}/{}'.format(product['id'], name) + ' ' * 40 * name
        pdf.cell(0, 5, text, name, ln=1)
        name += 1
    pdf.cell(0, ln=1)
    
    
    """"
    name = 0
    for spec in product['specs']:
        pdf.cell(w=25 + (60 * name))
        text = 'Размеры: {} + '.format(sizes) + ' ' * 40 * name 
        pdf.cell(0, 5, text, ln=1)
        name += 1
        
        
        pdf.cell(w=25 + (60 * name))
        pdf.cell(0, 5 - 5 * name, 'Мин заказ: {}'.format(min_quantity), ln=1)

        pdf.set_font('DejaVu', '', 12)
        pdf.cell(w=25 + (60 * name), h=0)
        pdf.cell(0, 10 - 5 * name, 'Цена: {}'.format(price), ln=1)
        
    """
        
        
        #pdf.ln(20)

        #pdf.cell(70, 40, ln=1)
        
        
    pdf.output("add_image.pdf")

In [266]:
image_path = "img/1.jpg"
pdf = FPDF()
pdf.add_page()

In [267]:
add_product(product, pdf)

# Testing new functions

In [17]:
# Handling prices and sizes

ranges = []
for r in product['prices']:
    range_ = ''
    if r['begin'] == '':
        range_ += '>='
    else:
        range_ += r['begin']
    
    if r['end'] == '':
        range_ += '=<'
    else:
        range_ += '-' + r['end']
        
    ranges.append((range_, r['price']))
    
product['prices'] = ranges

In [24]:
# Handling sizes

sizes = []
for s in product['sizes']:
    sizes.append(s['skuName'])

product['sizes'] = sizes

In [26]:
product

{'id': '565503532653',
 'prices': [('2-9', '100.00'), ('10-49', '95.00'), ('50=<', '90.00')],
 'sizes': ['S', 'M', 'L', 'XL', 'XXL'],
 'photos': ['https://cbu01.alicdn.com/img/ibank/2018/835/790/8576097538_126174301.jpg',
  'https://cbu01.alicdn.com/img/ibank/2018/989/150/8596051989_126174301.jpg',
  'https://cbu01.alicdn.com/img/ibank/2018/244/690/8596096442_126174301.jpg',
  'https://cbu01.alicdn.com/img/ibank/2018/859/191/8610191958_126174301.jpg',
  'https://cbu01.alicdn.com/img/ibank/2018/852/721/8576127258_126174301.jpg']}

In [27]:
url = 'https://detail.1688.com/offer/'
prd_page = url + '565503532653' + '.html'

driver = webdriver.Firefox()
driver.get(prd_page)
page = driver.page_source
soup = BeautifulSoup(page, 'html.parser')
driver.close()
driver.quit()

In [35]:
products

['https://cbu01.alicdn.com/img/ibank/2018/244/690/8596096442_126174301.jpg',
 'https://cbu01.alicdn.com/img/ibank/2018/989/150/8596051989_126174301.jpg',
 'https://cbu01.alicdn.com/img/ibank/2018/835/790/8576097538_126174301.jpg']

In [67]:
url = 'https://detail.1688.com/offer/'

In [129]:
prd_page = url + '565921082239' + '.html'

In [152]:
driver = webdriver.Firefox()
driver.get(prd_page)
page = driver.page_source
soup = BeautifulSoup(page, 'html.parser')
driver.close()
driver.quit()

In [132]:
info = soup.find_all(attrs={'data-range': True})
prices = []
for s in info:
    prices.append(json.loads(s['data-range']))

In [133]:
info = soup.find_all(attrs={'data-sku-config': True})
sizes = []
for s in info:
    sizes.append(json.loads(s['data-sku-config']))

## Finding photos

In [135]:
url = 'https://detail.1688.com/pic/'

In [136]:
photo_page = url + '565921082239' + '.html'

In [137]:
driver = webdriver.Firefox()
driver.get(photo_page)
page = driver.page_source
soup = BeautifulSoup(page, 'html.parser')
driver.close()
driver.quit()

In [144]:
photos = soup.find_all(attrs={'data-img':True})

In [148]:
for p in photos:
    print(p['data-img'])

https://cbu01.alicdn.com/img/ibank/2018/773/823/8591328377_126174301.jpg
https://cbu01.alicdn.com/img/ibank/2018/445/114/8611411544_126174301.jpg
https://cbu01.alicdn.com/img/ibank/2018/554/518/8625815455_126174301.jpg
https://cbu01.alicdn.com/img/ibank/2018/479/977/8625779974_126174301.jpg
https://cbu01.alicdn.com/img/ibank/2018/210/054/8611450012_126174301.jpg
