In [14]:
import requests
from bs4 import BeautifulSoup
import json
import xml.etree.ElementTree as ET

url = "https://en.wikipedia.org/wiki/Data_science"

# Request headers: send a browser-like User-Agent along with the GET.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}

# Without a timeout, requests waits indefinitely on a stalled connection.
response = requests.get(url, headers=headers, timeout=30)

# Raise requests.HTTPError on a 4xx/5xx reply so an error page is never
# parsed and written out as if it were the article.
response.raise_for_status()

# Parse the returned HTML with the standard-library-backed parser.
soup = BeautifulSoup(response.text, "html.parser")

# Page title: taken from the <h1 id="firstHeading"> element, with a
# placeholder when that element is not found.
title_tag = soup.find("h1", id="firstHeading")
if title_tag:
    title = title_tag.text.strip()
else:
    title = "Unknown Title"

# Summary: the first <p> whose stripped text is non-empty; stays "" when
# no such paragraph exists.
intro_paragraph = ""
for p in soup.select("p"):
    paragraph_text = p.text.strip()
    if not paragraph_text:
        continue
    intro_paragraph = paragraph_text
    break

# Section headings: stripped text of every <h2>/<h3>, skipping empty ones.
headings = []
for heading_tag in soup.find_all(["h2", "h3"]):
    heading_text = heading_tag.get_text(strip=True)
    if heading_text:
        headings.append(heading_text)

# Collect the scraped values into one record.
data = {}
data["page_title"] = title
data["summary"] = intro_paragraph
data["headings"] = headings

# Serialize as pretty-printed JSON, keeping non-ASCII characters as-is,
# and write it out as UTF-8.
json_text = json.dumps(data, indent=2, ensure_ascii=False)
with open("data_science.json", "w", encoding="utf-8") as f:
    f.write(json_text)

# Build the XML version of the same data (viewable in a browser):
# <article title=...> containing <summary> and <sections>/<heading> items.
root = ET.Element("article")
root.set("title", title)

summary_element = ET.SubElement(root, "summary")
summary_element.text = intro_paragraph

sections = ET.SubElement(root, "sections")
for h in headings:
    heading_element = ET.Element("heading")
    heading_element.text = h
    sections.append(heading_element)

# Write the tree as UTF-8 with an explicit XML declaration.
tree = ET.ElementTree(root)
tree.write("data_science.xml", encoding="utf-8", xml_declaration=True)

print("Files saved successfully with title, summary, and headings(data_science.json and data_science.xml).")


Files saved successfully with title, summary, and headings(data_science.json and data_science.xml).
