In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import qgrid
import pygsheets
from collections import namedtuple
from itertools import zip_longest

In [2]:
# Catalogue page whose filter dropdowns and product tiles are scraped below.
URL = 'http://www.brandkamp.de/produkte/chrysanthemen.html'
# Upper bound (seconds) for the download; without a timeout requests can
# block forever on an unresponsive server.
REQUEST_TIMEOUT = 30
r = requests.get(URL, timeout=REQUEST_TIMEOUT)
# Fail here on 4xx/5xx instead of parsing an error page as if it were the
# catalogue and crashing later with an unrelated AttributeError.
r.raise_for_status()
soup = BeautifulSoup(r.text, 'html5lib')

In [3]:
# Record for one filter dropdown: the name of its filter group and the
# collection of option values it offers.
Filter = namedtuple('Filter', 'category options')


def parse_filters(f):
    """Convert one filter ``<select>`` element into a ``Filter`` record.

    Args:
        f: element supporting ``f['data-filter-group']`` and
            ``f.find_all('option')``.

    Returns:
        ``Filter`` whose ``category`` is the element's ``data-filter-group``
        attribute and whose ``options`` is a set built from the
        ``data-filter-value`` of every option except the first.

    Raises:
        KeyError: if a required attribute is missing.
    """
    category = f['data-filter-group']
    option_values = set()
    # The first <option> is skipped (presumably a "show all" placeholder --
    # TODO confirm against the page).
    for option in f.find_all('option')[1:]:
        raw_value = option['data-filter-value']
        # Drop the first character (presumably the '.' of a CSS class
        # selector -- TODO confirm) and surrounding whitespace.
        option_values.add(raw_value[1:].strip())
    return Filter(category=category, options=option_values)

In [4]:
# Parse every <select> dropdown inside the element with id="filters".
filters = []
filter_container = soup.find(id='filters')
filters.extend(map(parse_filters, filter_container.select('select')))
# The first two dropdowns are stored under the short names 'cat' and
# 'subcat' instead of the group names found on the page.
for position, short_name in enumerate(('cat', 'subcat')):
    filters[position] = filters[position]._replace(category=short_name)


def find_category(css_class):
    """Return the category of the first filter that lists ``css_class``.

    The module-level ``filters`` list is scanned in order.

    Args:
        css_class: option value to look up.

    Returns:
        The matching filter's ``category``, or ``None`` when no filter's
        ``options`` contain ``css_class``.
    """
    matching_categories = (
        candidate.category
        for candidate in filters
        if css_class in candidate.options
    )
    return next(matching_categories, None)

In [5]:
# Filter groups recorded for every product, in output column order.
FILTER_CATEGORIES = ['cat', 'subcat', 'color', 'time', 'form', 'size']
# One scraped product: its id and name followed by one field per filter
# category.
Chrysanthemum = namedtuple('Chrysanthemum',
                           ['id', 'name'] + FILTER_CATEGORIES)
# Backward-compatible alias: the record was originally defined under this
# misspelled name, and other cells still refer to it.
Chrysamthemum = Chrysanthemum

               
def parse_product(pr):
    """Build a ``Chrysamthemum`` record from one product element.

    The first child of the element's ``div.product-text`` supplies the
    title text: its first whitespace-separated word becomes ``id`` and the
    remaining words, joined by single spaces, become ``name``.

    Each CSS class of the element from the sixth onward is looked up with
    ``find_category``. Classes that map to a category are collected under
    it, joined with ``', '`` when there are several. Classes with no
    category are printed and skipped. Categories without any class stay
    ``None``.

    Args:
        pr: product element supporting ``find``, and ``pr['class']``
            returning a list of class names.

    Returns:
        A ``Chrysamthemum`` record.

    Raises:
        IndexError: if the title text contains no words.
        KeyError: if ``find_category`` returns a category that is not in
            ``FILTER_CATEGORIES``.
    """
    text_block = pr.find('div', class_='product-text')
    words = text_block.contents[0].get_text().strip().split()
    product_id, name_words = words[0], words[1:]
    product_name = ' '.join(name_words)

    # category -> CSS classes seen for it, in page order
    classes_by_category = {category: [] for category in FILTER_CATEGORIES}
    # The first five classes are skipped (presumably layout classes that are
    # not filter values -- TODO confirm against the page markup).
    for css_class in pr['class'][5:]:
        if css_class == '':
            continue
        category = find_category(css_class)
        if category is None:
            # Unknown class: surface it in the cell output and move on.
            print(css_class)
            continue
        classes_by_category[category].append(css_class)

    field_values = {
        category: ', '.join(classes) if classes else None
        for category, classes in classes_by_category.items()
    }
    return Chrysamthemum(id=product_id, name=product_name, **field_values)

In [6]:
# Turn every div.product inside the element with id="isotope-container"
# into a product record.
products = []
container = soup.find(id='isotope-container')
products.extend(map(parse_product, container.find_all('div', class_='product')))

In [7]:
# Tabulate the scraped records: one row per product, with columns named
# after the record fields (id, name, then the filter categories).
df = pd.DataFrame.from_records(products, columns=Chrysamthemum._fields)

In [8]:
# Client object used for the Google Sheets calls in the cells below.
# NOTE(review): authorize() is called without arguments, so credential
# lookup is left to pygsheets' defaults -- confirm where it expects them.
gc = pygsheets.authorize()

In [9]:
# Create a spreadsheet titled 'brandkamp'.
# NOTE(review): parent_id is presumably the Drive folder that receives the
# file -- confirm. Every run of this cell calls create() again.
sheet = gc.create('brandkamp', parent_id='0B_ntZsb545hpRG8wdDR5T1l4d28')

In [10]:
# Export target: the spreadsheet's `sheet1` worksheet (presumably its first
# tab -- confirm against the pygsheets documentation).
worksheet = sheet.sheet1

In [11]:
# Write the product table into the worksheet, anchored at cell A1.
worksheet.set_dataframe(df, 'A1')

In [12]:
# NOTE(review): presumably pushes any pending local worksheet changes to
# Google Sheets -- confirm against the installed pygsheets version.
worksheet.sync()