In [15]:
import requests
import random
import time
import os
import pandas as pd
from bs4 import BeautifulSoup

In [16]:
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)",
    "Mozilla/5.0 (X11; Linux x86_64)"
]

SEARCH_TERMS = {
    "industrial-machinery": "https://dir.indiamart.com/search.mp?ss=industrial+machinery",
    "packaging-machinery": "https://dir.indiamart.com/search.mp?ss=packaging+machinery"
}



In [17]:
def fetch_page(url):
    headers = {
        "User-Agent": random.choice(USER_AGENTS),
        "Accept-Language": "en-US,en;q=0.9"
    }
    
    try:
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()
        time.sleep(random.uniform(2, 3))
        return response.text
    except Exception as e:
        print(f"[ERROR] {e}")
        return None


In [18]:
def parse_products(html, category):
    soup = BeautifulSoup(html, "html.parser")
    products = []

    cards = soup.select("div.card")  # search page cards

    for card in cards:
        product = {
            "product_name": None,
            "category": category,
            "price": None,
            "supplier_name": None,
            "supplier_location": None
        }

        name = card.select_one("h2")
        price = card.select_one(".price")
        company = card.select_one(".company-name")
        location = card.select_one(".location")

        if name:
            product["product_name"] = name.get_text(strip=True)
        if price:
            product["price"] = price.get_text(strip=True)
        if company:
            product["supplier_name"] = company.get_text(strip=True)
        if location:
            product["supplier_location"] = location.get_text(strip=True)

        products.append(product)

    return products


In [19]:
all_products = []

for category, url in SEARCH_TERMS.items():
    print(f"Crawling: {category}")
    
    html = fetch_page(url)
    if not html:
        continue
    
    data = parse_products(html, category)
    print(f"  → {len(data)} items found")
    all_products.extend(data)

df_raw = pd.DataFrame(all_products)

if df_raw.empty:
    raise ValueError("❌ No data scraped. HTML structure likely blocked.")

# ensure directory exists
output_path = "../data/raw/products_raw.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)

df_raw.to_csv(output_path, index=False)
print(f"✅ Saved {len(df_raw)} rows")

df_raw.head()



Crawling: industrial-machinery
  → 10 items found
Crawling: packaging-machinery
  → 10 items found
✅ Saved 20 rows


Unnamed: 0,product_name,category,price,supplier_name,supplier_location
0,,industrial-machinery,,,
1,,industrial-machinery,,,
2,,industrial-machinery,,,
3,,industrial-machinery,,,
4,,industrial-machinery,,,
