In [112]:
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup, NavigableString, Tag

In [157]:
# Root of the scraped shop; prepended to pagination links that are not absolute.
base_url = 'https://3trolle.pl'

In [175]:
def get_soup_from_url(url, parser='html.parser', timeout=30):
    """Download ``url`` and parse the response body into a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        parser: Parser name passed to BeautifulSoup. Naming it explicitly keeps
            the parse result independent of which optional parsers happen to be
            installed and avoids bs4's "no parser was explicitly specified"
            warning.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the whole scrape.

    Returns:
        The parsed document as a ``BeautifulSoup`` object.

    Raises:
        requests.RequestException: On connection problems, timeouts, or a
            4xx/5xx response status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of silently parsing an error page.
    response.raise_for_status()
    return BeautifulSoup(response.text, parser)

def get_games_from_page(category_page):
    """Return a lazy iterator over the parsed product pages linked from a category page.

    The product links are located immediately; each product page is only
    downloaded when the returned generator is advanced.
    """
    product_list = category_page.find('ul', {'class': 'product_list grid row'})
    product_links = product_list.find_all('a', {'class': 'product_img_link'})
    return (get_soup_from_url(link.get('href')) for link in product_links)

def get_next_page(category_page):
    """Fetch the page that follows ``category_page`` in a paginated listing.

    Args:
        category_page: Parsed category listing page.

    Returns:
        The parsed next page, or ``None`` when there is no further page: the
        ``pagination_next`` item is missing, it contains no link, or the link
        has no ``href``.
    """
    next_item = category_page.find('li', {'id': 'pagination_next'})
    if next_item is None:
        # No pagination widget on this page.
        return None

    next_page_tag = next_item.find('a')
    if next_page_tag is None:
        # The "next" item exists but carries no link.
        return None

    link = next_page_tag.get('href')
    if not link:
        return None

    # urljoin leaves absolute URLs untouched and resolves relative ones
    # against the site root.
    return get_soup_from_url(urljoin(base_url, link))

def get_all_category_pages(category_url):
    """Collect every page of a paginated category, starting at ``category_url``.

    Follows "next page" links until ``get_next_page`` reports there are none,
    and returns the parsed pages in visiting order.
    """
    pages = [get_soup_from_url(category_url)]
    while True:
        following = get_next_page(pages[-1])
        if following is None:
            return pages
        pages.append(following)

def get_game_title(game_page):
    """Extract the game's title from its product page.

    Args:
        game_page: Parsed product page.

    Returns:
        The text of the ``<h1>`` inside the ``product_name_wrap`` div.

    Raises:
        AttributeError: If the wrapper div or its ``<h1>`` is missing.
    """
    heading = game_page.find('div', {'id': 'product_name_wrap'}).find('h1')
    title = heading.string
    if title is None:
        # `.string` is None when the heading holds more than one child node;
        # str(None) would otherwise yield the literal title 'None'.
        return heading.get_text(' ', strip=True)
    return str(title)

def get_game_description(game_page):
    """Extract the short description from a product page.

    Args:
        game_page: Parsed product page.

    Returns:
        The description text, or ``None`` when the page should be skipped:
        it carries an 'Instrukcja: angielska' ("Instructions: English") span,
        it has no ``short_description_content`` div, or that div has no text.
    """
    description_tag = game_page.find('div', {'id': 'short_description_content'})
    has_english_rules = game_page.find('span', string='Instrukcja: angielska') is not None
    if has_english_rules or description_tag is None:
        return None

    if description_tag.string is not None:
        return str(description_tag.string)

    # `.string` is None when the div holds several child nodes; str(None)
    # would return the literal 'None', which the caller cannot filter out.
    text = description_tag.get_text(' ', strip=True)
    return text or None
    
def get_titles_and_descriptions(category_url):
    """Scrape ``(title, description)`` pairs for every game in a category.

    Walks all pages of the category and every product linked from them;
    games whose description is ``None`` are left out of the result.
    """
    pairs = []
    for page in get_all_category_pages(category_url):
        for game_page in get_games_from_page(page):
            title = get_game_title(game_page)
            description = get_game_description(game_page)
            if description is not None:
                pairs.append((title, description))
    return pairs

In [153]:
# Category label -> URL of the first listing page for that category.
# The labels end up as the values of the dataset's 'category' column.
# NOTE(review): `n=80` looks like a products-per-page setting — confirm against the site.
category_urls = {
    'Familijne': 'https://3trolle.pl/28-rodzinne//s-1/pokaz_tylko-gry_planszowe?n=80',
    'Strategiczne': 'https://3trolle.pl/39-strategiczne//s-1/pokaz_tylko-gry_planszowe?n=80',
    'Imprezowe': 'https://3trolle.pl/34-imprezowe//s-1/pokaz_tylko-gry_planszowe+gry_karciane?n=80',
    'Przygodowe': 'https://3trolle.pl/36-przygodowe/?id_category=36&n=80'
}

In [173]:
# Smoke test on a single category before scraping all of them.
# Uses get_titles_and_descriptions: the previously referenced
# get_descriptions_of_games_in_category is not defined in this notebook.
descs = get_titles_and_descriptions(category_urls['Strategiczne'])
len(descs)

139

In [176]:
# Scrape every category: category label -> list of (title, description) pairs.
# Results are stored in `titles_and_descriptions` itself; the loop used to write
# into an undefined `descriptions` dict, leaving this mapping empty.
titles_and_descriptions = {
    category_name: get_titles_and_descriptions(category_url)
    for category_name, category_url in category_urls.items()
}

In [184]:
# Flatten the per-category mapping into (title, description, category) rows.
desc_arr = [
    (title, description, category_name)
    for category_name, games in titles_and_descriptions.items()
    for title, description in games
]

In [201]:
import pandas as pd

# One row per game; a title that appears more than once keeps only one row.
dataset = (
    pd.DataFrame(desc_arr, columns=['title', 'description', 'category'])
    .drop_duplicates(subset='title')
)

In [205]:
# Persist the de-duplicated dataset; the DataFrame index is not written.
dataset.to_csv('gry_planszowe.csv', index=False)

In [203]:
# Number of rows left after dropping duplicate titles.
len(dataset)

654

In [204]:
# Number of scraped games per category (before de-duplication by title).
for category_name, games in titles_and_descriptions.items():
    print(f'{category_name}: {len(games)}')

Familijne: 322
Strategiczne: 139
Imprezowe: 179
Przygodowe: 70
