In [1]:
print('hello')

hello


In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import csv

BASE_URL = "https://www.konimex.com"
TARGET_PATH = "/products/pharmaceutical/over-the-counter/"

def scrape_otc_products():
    resp = requests.get(urljoin(BASE_URL, TARGET_PATH), timeout=10)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    products = []
    for h3 in soup.find_all("h3"):
        name = h3.get_text(strip=True)

        # Extract description
        desc = ""
        for sib in h3.next_siblings:
            if isinstance(sib, str) and sib.strip():
                desc = sib.strip()
                break
            if hasattr(sib, "get_text"):
                text = sib.get_text(strip=True)
                if text:
                    desc = text
                    break

        # Extract link
        a = h3.find("a")
        href = a["href"] if a and a.has_attr("href") else None
        link = urljoin(BASE_URL, href) if href else None

        products.append({
            "name": name,
            "description": desc,
            "detail_url": link
        })

    return products

def save_to_csv(data, filename="konimex_otc_products.csv"):
    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=["name", "description", "detail_url"])
        writer.writeheader()
        writer.writerows(data)

if __name__ == "__main__":
    items = scrape_otc_products()
    save_to_csv(items)
    print(f"✅ Saved {len(items)} items to 'konimex_otc_products.csv'")


✅ Saved 38 items to 'konimex_otc_products.csv'


In [16]:
import re
import time
import csv
import requests
from bs4 import BeautifulSoup

# ─── Settings ─────────────────────────────────────────────
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
                  " AppleWebKit/537.36 (KHTML, like Gecko)"
                  " Chrome/113.0.0.0 Safari/537.36"
}
base_listing_url = "https://www.mandjur.co.id/collections/obat-bebas-otc?page={}&grid_list=grid-view"
base_product_url = "https://www.mandjur.co.id/collections/obat-bebas-otc/products/{}"

# ─── 1. Get raw product names from all pages ───────────────────────────────
product_names = []

for page in range(1, 25):
    url = base_listing_url.format(page)
    resp = requests.get(url, headers=headers)
    if resp.status_code != 200:
        print(f"⚠️ Page {page} returned status {resp.status_code}, skipping")
        continue

    soup = BeautifulSoup(resp.text, "html.parser")
    name_tags = soup.select("a.grid-view-item__title, .productitem--title")

    if not name_tags:
        print(f"⚠️ No product titles found on page {page}")

    for tag in name_tags:
        name = tag.get_text(strip=True)
        product_names.append(name)

    time.sleep(0.3)

# ─── 2. Slugify helper ─────────────────────────────────────────────────────
def slugify(name: str) -> str:
    name = name.lower().strip()
    name = re.sub(r"[ _/]+", "-", name)
    name = re.sub(r"[^a-z0-9\-]", "", name)
    name = re.sub(r"-{2,}", "-", name)
    return name

# ─── 3. Visit each product page ────────────────────────────────────────────
rows = []
unique_names = list(dict.fromkeys(product_names))

for name in unique_names:
    slug = slugify(name)
    product_url = base_product_url.format(slug)

    resp = requests.get(product_url, headers=headers)
    if resp.status_code != 200:
        print(f"❌ Failed to load {product_url} (status {resp.status_code})")
        continue

    soup = BeautifulSoup(resp.text, "html.parser")

    title = soup.select_one("h1.product__title, h1.product-single__title")
    price = soup.select_one(".product__price, .product-single__price")
    desc = soup.select_one(".product__description, .rte")

    rows.append({
        "url": product_url,
        "title": title.get_text(strip=True) if title else name,
        "price": price.get_text(strip=True) if price else "",
        "description": desc.get_text(" ", strip=True) if desc else "",
    })

    time.sleep(0.2)

# ─── 4. Save to CSV ────────────────────────────────────────────────────────
with open("products.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["url", "title", "price", "description"])
    writer.writeheader()
    for row in rows:
        writer.writerow(row)

print(f"✅ Done! Scraped {len(rows)} products to products.csv")


❌ Failed to load https://www.mandjur.co.id/collections/obat-bebas-otc/products/salonpas-pain-relief-patch (status 404)
❌ Failed to load https://www.mandjur.co.id/collections/obat-bebas-otc/products/miconazole-2-cream-10-gr (status 404)
❌ Failed to load https://www.mandjur.co.id/collections/obat-bebas-otc/products/alerzin-10-mg-10-tablet (status 404)
❌ Failed to load https://www.mandjur.co.id/collections/obat-bebas-otc/products/degirol-dus-isi-5-strip (status 404)
❌ Failed to load https://www.mandjur.co.id/collections/obat-bebas-otc/products/mirasic-500-mg-10-kaplet (status 404)
❌ Failed to load https://www.mandjur.co.id/collections/obat-bebas-otc/products/cendo-cenfresh-minidose-06-ml (status 404)
❌ Failed to load https://www.mandjur.co.id/collections/obat-bebas-otc/products/acetylcysteine-200-mg-strip-10-kapsul (status 404)
❌ Failed to load https://www.mandjur.co.id/collections/obat-bebas-otc/products/obh-combi-anak-batuk-plus-flu-jeruk-60-ml (status 404)
❌ Failed to load https://www.