In [1]:
import requests
import time
import json
from bs4 import BeautifulSoup

In [2]:
base_url = 'https://www2.hm.com'
url = 'https://www2.hm.com/en_gb/index.html'
headers = {'User-Agent':('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
                         ' Chrome/89.0.4389.90 Safari/537.36')}

In [3]:
class Category:
    def __init__(self, id, parent_id, title):
        self.id = id
        self.parent_id = parent_id
        self.title = title
        
    def to_dict(self):
        return {'id': self.id, 'parent_id': self.parent_id, 'title': self.title}
    

class Offer:
    def __init__(self, title, images, category_id, description=None):
        self.title = title
        self.description = description
        self.images = images
        self.category_id = category_id
        
    def to_dict(self):
        return {'title': self.title,
                'images': [x for x in self.images],
                'description': self.description,
                'category_id': self.category_id}

In [4]:
def parse_all_categories():
    page = requests.get(url, headers=headers)
    soup = BeautifulSoup(page.content, 'html.parser')

    women = Category(id=1, parent_id=None, title="Women")
    men = Category(id=2, parent_id=None, title="Men")

    sections = soup.find_all("li", {"class": "menu__block menu--unfolded"})[:2]
    all_categories = [women, men]
    all_offers = []

    for section in sections:
        categories = section.find_all("a", {"class": "menu__sub-link"})
        parent_section = women if "ladies" in categories[0]['href'] else men

        for entry in categories:
            category = Category(id=len(all_categories) + 1, parent_id=parent_section.id, title=entry.text)
            all_categories.append(category)
            
            print(category.to_dict())

            if "view all" in category.title.lower():
                continue
            if "sizes" in category.title.lower():
                all_categories.pop()
                continue
            if "h&m" in category.title.lower():
                all_categories.pop()
                continue
            if "premium selection" in category.title.lower():
                all_categories.pop()
                break

            parse_subcategories(base_url + entry['href'], category, all_categories)
            time.sleep(2.5)
    return all_categories

def parse_subcategories(subcat_url, parent_category, all_categories):
    page = requests.get(subcat_url, headers=headers)
    soup = BeautifulSoup(page.content, 'html.parser')

    subcategories = [x.find("a").text.strip() for x in soup.find("a", {"class": "link current"})
        .find_parent()
        .find_all("li", {"class": "item"})]

    for entry in subcategories:
        subcategory = Category(id=len(all_categories) + 1, parent_id=parent_category.id, title=entry)
        all_categories.append(subcategory)

In [5]:
# all_categories = parse_all_categories()
# with open("categories.json", "w", encoding="utf-8") as f:
#     json.dump([x.to_dict() for x in all_categories], f)

In [6]:
def parse_all_offers():
    page = requests.get(url, headers=headers)
    soup = BeautifulSoup(page.content, 'html.parser')

    women = Category(id=1, parent_id=None, title="Women")
    men = Category(id=2, parent_id=None, title="Men")

    sections = soup.find_all("li", {"class": "menu__block menu--unfolded"})[:2]
    all_categories = [women, men]
    all_offers = []

    for section in sections:
        categories = section.find_all("a", {"class": "menu__sub-link"})
        parent_section = women if "ladies" in categories[0]['href'] else men

        for entry in categories:
            category = Category(id=len(all_categories) + 1, parent_id=parent_section.id, title=entry.text)
            all_categories.append(category)
            
            print(category.to_dict())

            if "view all" in category.title.lower():
                continue
            if "sizes" in category.title.lower():
                all_categories.pop()
                continue
            if "h&m" in category.title.lower():
                all_categories.pop()
                continue
            if "premium selection" in category.title.lower():
                all_categories.pop()
                break

            parse_subcategories(base_url + entry['href'], category, all_categories, all_offers)
            time.sleep(2.5)
    return all_offers

def parse_subcategories(subcat_url, parent_category, all_categories, all_offers):
    page = requests.get(subcat_url, headers=headers)
    soup = BeautifulSoup(page.content, 'html.parser')

    subcategories = [x.find("a").text.strip() for x in soup.find("a", {"class": "link current"})
        .find_parent()
        .find_all("li", {"class": "item"})]
    subcategories_refs = [x.find("a")['href'] for x in soup.find("a", {"class": "link current"})
        .find_parent()
        .find_all("li", {"class": "item"})]

    for i in range(len(subcategories)):
        subcategory = Category(id=len(all_categories) + 1, parent_id=parent_category.id, title=subcategories[i])
        all_categories.append(subcategory)
        
        print(subcategory.to_dict())
        
        parse_items(base_url + subcategories_refs[i], subcategory.id, all_offers)
        
def parse_items(subcat_url, category_id, all_offers):
    page = requests.get(subcat_url, headers=headers)
    soup = BeautifulSoup(page.content, 'html.parser')
    
    items = soup.find_all("li", "product-item")
    for item in items:
        title = item.find("a", {"class": "link"}).text
        img = item.find("img", {"class": "item-image"})
        images = [img['data-src'], img['data-altimage']]
        
        all_offers.append(Offer(title=title, images=images, category_id=category_id))
        
    time.sleep(2.5)

In [7]:
all_offers = parse_all_offers()
with open("offers.json", "w", encoding="utf-8") as f:
    json.dump([x.to_dict() for x in all_offers], f)

{'id': 3, 'parent_id': 1, 'title': 'View All'}
{'id': 4, 'parent_id': 1, 'title': 'Dresses'}
{'id': 5, 'parent_id': 4, 'title': 'T-Shirt Dresses'}
{'id': 6, 'parent_id': 4, 'title': 'Denim Dresses'}
{'id': 7, 'parent_id': 4, 'title': 'Short Dresses'}
{'id': 8, 'parent_id': 4, 'title': 'Midi Dresses'}
{'id': 9, 'parent_id': 4, 'title': 'Maxi Dresses'}
{'id': 10, 'parent_id': 4, 'title': 'Bodycon Dresses'}
{'id': 11, 'parent_id': 4, 'title': 'Party Dresses'}
{'id': 12, 'parent_id': 4, 'title': 'Shirt Dresses'}
{'id': 13, 'parent_id': 4, 'title': 'Wrap Dresses'}
{'id': 14, 'parent_id': 4, 'title': 'Kaftan Dresses'}
{'id': 15, 'parent_id': 1, 'title': 'Shirts & Blouses'}
{'id': 16, 'parent_id': 15, 'title': 'Shirts'}
{'id': 17, 'parent_id': 15, 'title': 'Blouses'}
{'id': 18, 'parent_id': 15, 'title': 'Tunics'}
{'id': 19, 'parent_id': 15, 'title': 'Off-Shoulder'}
{'id': 20, 'parent_id': 15, 'title': 'Denim shirts'}
{'id': 21, 'parent_id': 1, 'title': 'Tops'}
{'id': 22, 'parent_id': 21, 'tit

{'id': 158, 'parent_id': 154, 'title': 'Nails'}
{'id': 159, 'parent_id': 154, 'title': 'Bath & Body Care'}
{'id': 160, 'parent_id': 154, 'title': 'Fragrance & Perfume'}
{'id': 161, 'parent_id': 154, 'title': 'Hair'}
{'id': 162, 'parent_id': 154, 'title': 'Brushes & Tools'}
{'id': 163, 'parent_id': 154, 'title': 'Removers & Cleansers'}
{'id': 164, 'parent_id': 154, 'title': 'Make-up Bags & Travel'}
{'id': 165, 'parent_id': 154, 'title': 'Conscious'}
{'id': 166, 'parent_id': 154, 'title': 'Other'}
{'id': 167, 'parent_id': 154, 'title': 'Limited Edition'}
{'id': 168, 'parent_id': 1, 'title': 'Care Products'}
{'id': 169, 'parent_id': 1, 'title': 'Premium Selection'}
{'id': 169, 'parent_id': 2, 'title': 'View All'}
{'id': 170, 'parent_id': 2, 'title': 'Hoodies & Sweatshirts'}
{'id': 171, 'parent_id': 170, 'title': 'Hoodies'}
{'id': 172, 'parent_id': 170, 'title': 'Sweatshirts'}
{'id': 173, 'parent_id': 2, 'title': 'T-shirts & Tanks'}
{'id': 174, 'parent_id': 173, 'title': 'Vests'}
{'id': 17