In [None]:
import requests
from bs4 import BeautifulSoup
import csv

def get_urls_from_sitemap(sitemap_url):
    """Fetch a sitemap and return the URLs listed in its ``<loc>`` elements.

    Args:
        sitemap_url: Address of the sitemap XML document to download.

    Returns:
        A list of URL strings in document order. An empty list is returned
        when the download or parsing fails; the error is printed rather
        than raised so the caller can carry on with nothing to scrape.
    """
    urls = []
    try:
        response = requests.get(sitemap_url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
        # Treat 4xx/5xx as a failure: otherwise an HTML error page would be
        # parsed as XML and silently reported as "Found 0 URLs".
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'xml')  # XML parser
        # <loc> values may be pretty-printed with surrounding whitespace or
        # newlines; strip them and drop entries that end up empty.
        stripped = (loc.text.strip() for loc in soup.find_all("loc"))
        urls = [loc_url for loc_url in stripped if loc_url]
        print(f"✅ Found {len(urls)} URLs in sitemap")
    except Exception as e:
        # Deliberately broad: this is a best-effort boundary, so network,
        # HTTP-status and parser errors are all reported the same way.
        print(f"❌ Error reading sitemap: {e}")
    return urls

def _meta_content(tag, default):
    """Return the stripped ``content`` attribute of *tag*, or *default*.

    *default* is returned when the tag itself is missing or when it has no
    ``content`` attribute at all.
    """
    if tag is None:
        return default
    content = tag.get("content")
    if content is None:
        return default
    return content.strip()


def extract_page_data(url):
    """Download a page and collect its basic SEO fields.

    Args:
        url: Address of the HTML page to fetch.

    Returns:
        An eight-item list:
        ``[url, title, meta description, OG title, OG description,
        OG image, H1 text, H1 description]``.
        Missing elements are replaced by ``"No ..."`` placeholders (the H1
        description falls back to an empty string). If anything raises, the
        row is ``[url, "Error" x 6, "Error: <message>"]`` instead.
    """
    try:
        response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')

        # Meta Title
        # get_text() is used instead of .string because .string is None for
        # an empty <title> or one with several children, which previously
        # turned the whole row into an "Error" row.
        title_tag = soup.title
        title = title_tag.get_text().strip() if title_tag is not None else "No Title"

        # Meta Description
        meta_description = _meta_content(
            soup.find("meta", attrs={"name": "description"}),
            "No Meta Description",
        )

        # OG Tags
        og_title_content = _meta_content(
            soup.find("meta", property="og:title"), "No OG Title"
        )
        og_desc_content = _meta_content(
            soup.find("meta", property="og:description"), "No OG Description"
        )

        og_image = soup.find("meta", property="og:image")
        og_image_url = _meta_content(og_image, "No OG Image")
        # Image URLs ending in a common raster extension are replaced by a
        # marker rather than written to the report.
        if og_image is not None and og_image_url.lower().endswith((".png", ".jpg", ".jpeg")):
            og_image_url = "Skipped (.png/.jpg)"

        # H1 and H1 Description
        h1 = soup.find("h1")
        h1_text = h1.get_text(strip=True) if h1 is not None else "No H1 Tag"

        # The "description" is the first <p> that follows the <h1> in
        # document order, wherever it is nested.
        first_paragraph = h1.find_next("p") if h1 is not None else None
        h1_desc = first_paragraph.get_text(strip=True) if first_paragraph is not None else ""

        return [url, title, meta_description, og_title_content, og_desc_content, og_image_url, h1_text, h1_desc]
    except Exception as e:
        # Deliberately broad: one failing page must not abort the crawl, so
        # any failure is recorded as an error row for that URL.
        return [url, "Error", "Error", "Error", "Error", "Error", "Error", f"Error: {e}"]

def scrape_from_sitemap(sitemap_url, output_file="sitemap_scraped_data.csv"):
    """Scrape every URL listed in a sitemap and write the results to a CSV.

    Args:
        sitemap_url: Address of the sitemap XML document to read URLs from.
        output_file: Path of the CSV file to (over)write. Defaults to
            ``sitemap_scraped_data.csv`` in the current working directory.

    The file always receives the header row, even when the sitemap yields
    no URLs. One row is then written per URL, in sitemap order, and
    progress is printed as each page is scraped.
    """
    urls = get_urls_from_sitemap(sitemap_url)

    # newline='' lets the csv module control line endings itself.
    with open(output_file, mode="w", newline='', encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow([
            "URL", "Meta Title", "Meta Description",
            "OG Title", "OG Description", "OG Image",
            "H1 Tag", "H1 Description"
        ])

        for url in urls:
            print(f"🔍 Scraping: {url}")
            writer.writerow(extract_page_data(url))

    print(f"✅ Done! Data saved to {output_file}")

# Example usage: crawl every URL in the sitemap and write the CSV report.
# Guarded so that importing this code does not start a network crawl; it
# still runs when executed as a script or as a notebook cell.
if __name__ == "__main__":
    scrape_from_sitemap("https://www.comfygen.com/sitemap.xml")


✅ Found 104 URLs in sitemap
🔍 Scraping: https://www.comfygen.com/
🔍 Scraping: https://www.comfygen.com/altcoin-development-services
🔍 Scraping: https://www.comfygen.com/cryptocurrency-exchange-development
🔍 Scraping: https://www.comfygen.com/crypto-wallet-development
🔍 Scraping: https://www.comfygen.com/cryptocurrency-mlm-software-development
🔍 Scraping: https://www.comfygen.com/smart-contract-development
🔍 Scraping: https://www.comfygen.com/polygon-blockchain-development
🔍 Scraping: https://www.comfygen.com/multichain-blockchain-development
🔍 Scraping: https://www.comfygen.com/cricket-live-line-api
🔍 Scraping: https://www.comfygen.com/live-casino-game-api-integration
🔍 Scraping: https://www.comfygen.com/mobile-app-development
🔍 Scraping: https://www.comfygen.com/hybrid-mobile-app-development
🔍 Scraping: https://www.comfygen.com/ios-app-development
🔍 Scraping: https://www.comfygen.com/web-development
🔍 Scraping: https://www.comfygen.com/web-design
🔍 Scraping: https://www.comfygen.com/f