In [None]:
import requests
import time
import json
from bs4 import BeautifulSoup

In [None]:
# Site root; prepended to the relative hrefs read from the scraped menus.
base_url = "https://www2.hm.com"

# Landing page whose navigation menu is the starting point of the crawl.
url = "https://www2.hm.com/en_gb/index.html"

# Browser-like User-Agent sent with every request.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)"
        " Chrome/89.0.4389.90 Safari/537.36"
    )
}

In [None]:
class Category:
    """One node of the category tree, linked to its parent through `parent_id`."""

    _FIELDS = ('id', 'parent_id', 'title')

    def __init__(self, id, parent_id, title):
        self.id, self.parent_id, self.title = id, parent_id, title

    def to_dict(self):
        """Return the category as a plain dict with keys id, parent_id and title."""
        return {field: getattr(self, field) for field in self._FIELDS}
    

class Offer:
    """A scraped product: title, images, owning category id and optional description."""

    def __init__(self, title, images, category_id, description=None):
        self.title = title
        self.images = images
        self.category_id = category_id
        self.description = description

    def to_dict(self):
        """Return the offer as a plain dict; `images` is copied into a new list."""
        return dict(
            title=self.title,
            images=list(self.images),
            description=self.description,
            category_id=self.category_id,
        )

In [None]:
def parse_all_categories():
    """Scrape the category tree reachable from the landing-page menu.

    Fetches `url`, takes the first two ``menu__block menu--unfolded`` list
    items and turns every ``menu__sub-link`` anchor inside them into a
    `Category` whose parent is either Women (id 1) or Men (id 2). A section
    is treated as Women when the href of its first link contains "ladies".

    Entries are filtered on their lower-cased title:

    * "view all"           -> kept in the result, but not descended into
    * "sizes" / "h&m"      -> skipped
    * "premium selection"  -> skipped, and the rest of the section is ignored

    Every other entry is kept and passed to `parse_subcategories`, followed
    by a 2.5 s pause. Each candidate entry is printed, including the ones
    that end up skipped.

    Returns:
        list: the `Category` objects collected, starting with Women and Men.
    """
    page = requests.get(url, headers=headers)
    soup = BeautifulSoup(page.content, 'html.parser')

    women = Category(id=1, parent_id=None, title="Women")
    men = Category(id=2, parent_id=None, title="Men")
    all_categories = [women, men]

    sections = soup.find_all("li", {"class": "menu__block menu--unfolded"})[:2]

    for section in sections:
        categories = section.find_all("a", {"class": "menu__sub-link"})
        if not categories:
            # Without links there is nothing to classify or descend into;
            # indexing categories[0] below would raise IndexError.
            continue
        parent_section = women if "ladies" in categories[0]['href'] else men

        for entry in categories:
            # The id is the position the category would take in the list, so
            # skipped entries do not leave gaps in the numbering.
            category = Category(id=len(all_categories) + 1, parent_id=parent_section.id, title=entry.text)

            print(category.to_dict())

            lowered_title = category.title.lower()
            if "view all" in lowered_title:
                all_categories.append(category)
                continue
            if "sizes" in lowered_title or "h&m" in lowered_title:
                continue
            if "premium selection" in lowered_title:
                break

            all_categories.append(category)
            parse_subcategories(base_url + entry['href'], category, all_categories)
            time.sleep(2.5)
    return all_categories

def parse_subcategories(subcat_url, parent_category, all_categories):
    """Append the subcategories listed on a category page to `all_categories`.

    Fetches `subcat_url`, finds the ``a.link.current`` anchor, and reads every
    ``li.item`` under that anchor's parent element. The stripped text of each
    item's first anchor becomes the title of a new `Category` whose parent is
    `parent_category`. Ids continue from the current length of the list.

    Returns None; the list is extended in place.
    """
    response = requests.get(subcat_url, headers=headers)
    document = BeautifulSoup(response.content, 'html.parser')

    current_link = document.find("a", {"class": "link current"})
    menu_items = current_link.find_parent().find_all("li", {"class": "item"})

    # All titles are resolved before the list is touched, so a malformed item
    # fails before anything has been appended.
    titles = [item.find("a").text.strip() for item in menu_items]

    for title in titles:
        next_id = len(all_categories) + 1
        all_categories.append(Category(id=next_id, parent_id=parent_category.id, title=title))

In [None]:
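# Uncomment to scrape the category tree and write it to categories.json.
# Note: this issues live HTTP requests and sleeps 2.5 s after each category page.
# all_categories = parse_all_categories()
# with open("categories.json", "w", encoding="utf-8") as f:
#     json.dump([x.to_dict() for x in all_categories], f)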

In [None]:
def parse_all_offers():
    """Scrape the products of every category reachable from the landing-page menu.

    Walks the menu the same way the category scrape does: fetches `url`, takes
    the first two ``menu__block menu--unfolded`` list items and builds a
    `Category` for each ``menu__sub-link`` anchor, parented to Women (id 1)
    when the section's first href contains "ladies" and to Men (id 2)
    otherwise.

    Entries are filtered on their lower-cased title:

    * "view all"           -> recorded as a category, but not descended into
    * "sizes" / "h&m"      -> skipped
    * "premium selection"  -> skipped, and the rest of the section is ignored

    Every other entry is handed to `parse_subcategories` together with the
    shared offer list, followed by a 2.5 s pause. Each candidate entry is
    printed, including the ones that end up skipped.

    Returns:
        list: the offers accumulated by `parse_subcategories`.
    """
    page = requests.get(url, headers=headers)
    soup = BeautifulSoup(page.content, 'html.parser')

    women = Category(id=1, parent_id=None, title="Women")
    men = Category(id=2, parent_id=None, title="Men")
    # The category list is only kept locally: it drives id assignment.
    all_categories = [women, men]
    all_offers = []

    sections = soup.find_all("li", {"class": "menu__block menu--unfolded"})[:2]

    for section in sections:
        categories = section.find_all("a", {"class": "menu__sub-link"})
        if not categories:
            # Without links there is nothing to classify or descend into;
            # indexing categories[0] below would raise IndexError.
            continue
        parent_section = women if "ladies" in categories[0]['href'] else men

        for entry in categories:
            category = Category(id=len(all_categories) + 1, parent_id=parent_section.id, title=entry.text)

            print(category.to_dict())

            lowered_title = category.title.lower()
            if "view all" in lowered_title:
                all_categories.append(category)
                continue
            if "sizes" in lowered_title or "h&m" in lowered_title:
                continue
            if "premium selection" in lowered_title:
                break

            all_categories.append(category)
            parse_subcategories(base_url + entry['href'], category, all_categories, all_offers)
            time.sleep(2.5)
    return all_offers

def parse_subcategories(subcat_url, parent_category, all_categories, all_offers=None):
    """Register the subcategories of a category page and optionally scrape their products.

    Fetches `subcat_url`, finds the ``a.link.current`` anchor, and reads every
    ``li.item`` under that anchor's parent element. For each item a `Category`
    is appended to `all_categories` (parent: `parent_category`, id: next
    position in the list) and printed.

    Args:
        subcat_url: absolute URL of the category page.
        parent_category: object whose `id` becomes the new categories' parent_id.
        all_categories: list extended in place with the new categories.
        all_offers: list that `parse_items` fills with the products of each
            subcategory. When omitted or None, product pages are not visited
            and only the categories are collected. The default keeps
            three-argument callers working, since this definition replaces
            any earlier function of the same name.

    Returns None.
    """
    page = requests.get(subcat_url, headers=headers)
    soup = BeautifulSoup(page.content, 'html.parser')

    items = (soup.find("a", {"class": "link current"})
        .find_parent()
        .find_all("li", {"class": "item"}))

    # Resolve title and href of every item in a single pass, before anything is
    # appended, so a malformed item fails before the shared lists are modified.
    anchors = [item.find("a") for item in items]
    subcategories = [anchor.text.strip() for anchor in anchors]
    subcategories_refs = [anchor['href'] for anchor in anchors]

    for title, href in zip(subcategories, subcategories_refs):
        subcategory = Category(id=len(all_categories) + 1, parent_id=parent_category.id, title=title)
        all_categories.append(subcategory)

        print(subcategory.to_dict())

        if all_offers is not None:
            parse_items(base_url + href, subcategory.id, all_offers)
        
def parse_items(subcat_url, category_id, all_offers):
    """Scrape the products listed on one subcategory page into `all_offers`.

    Fetches `subcat_url` and, for every ``li.product-item``, builds an `Offer`
    from the text of its ``a.link`` anchor and the ``src`` and
    ``data-altimage`` attributes of its ``img.item-image`` element.

    Args:
        subcat_url: absolute URL of the product listing.
        category_id: id stored on every offer created from this page.
        all_offers: list extended in place with the new offers.

    Returns None. Sleeps 2.5 s after the page has been processed.
    """
    page = requests.get(subcat_url, headers=headers)
    soup = BeautifulSoup(page.content, 'html.parser')

    products = soup.find_all("li", {"class": "product-item"})
    for product in products:
        title = product.find("a", {"class": "link"}).text
        # The image is looked up on the current loop item. The previous code
        # read an undefined name here and raised NameError on the first product.
        img = product.find("img", {"class": "item-image"})
        images = [img['src'], img['data-altimage']]

        all_offers.append(Offer(title=title, images=images, category_id=category_id))

    time.sleep(2.5)

In [None]:
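# Uncomment to scrape all offers and write them to offers.json.
# Note: this issues live HTTP requests and sleeps 2.5 s after each category
# page and after each subcategory product listing.
# all_offers = parse_all_offers()
# with open("offers.json", "w", encoding="utf-8") as f:
#     json.dump([x.to_dict() for x in all_offers], f)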