# Python Web Scraping Cheatsheet

## Setup Libraries

In [None]:
# Install (pandas is required by the table-handling and CSV examples below)
pip install requests beautifulsoup4 lxml selenium pandas

- requests → Fetch web pages
- BeautifulSoup → Parse HTML/XML
- lxml → Faster parsing engine
- selenium → For dynamic (JavaScript-heavy) sites
- pandas → Store scraped tables/data

## Basic HTTP Request

In [None]:
import requests

url = "https://example.com"
headers = {"User-Agent": "Mozilla/5.0"}  # avoid blocking

# Always pass a timeout: without one, requests can wait indefinitely on a
# server that never answers.
response = requests.get(url, headers=headers, timeout=10)

# Raise requests.HTTPError on 4xx/5xx so an error page is never parsed as if
# it were the real content.
response.raise_for_status()

print(response.status_code)   # 200 = OK
print(response.text[:500])    # preview HTML

## BeautifulSoup Basics

In [None]:
from bs4 import BeautifulSoup

soup = BeautifulSoup(response.text, "lxml")

# Find by tag.
# `soup.<tag>` is None when the document has no such tag, so check before
# reading `.text` to avoid an AttributeError on pages without a title/h1.
title_tag = soup.title
title = title_tag.text if title_tag is not None else None
h1_tag = soup.h1
heading = h1_tag.text if h1_tag is not None else None

# Find first element (None when nothing matches)
div = soup.find("div", {"class": "content"})

# Find all elements (empty list when nothing matches)
links = soup.find_all("a")
for link in links:
    print(link.get("href"))   # .get() yields None for anchors without href

# CSS selectors
soup.select("div.article h2")     # nested tags
soup.select_one("span.price")     # first match

## Extract Attributes

In [None]:
img = soup.find("img")
# find() returns None when the page has no <img>; subscripting None would
# raise a TypeError, so guard first.
if img is not None:
    print(img["src"])       # image source (KeyError if the attribute is missing)
    print(img.get("alt"))   # alt text (None if the attribute is missing)
else:
    print("No <img> tag found")

## Handling Tables

In [None]:
from io import StringIO

import pandas as pd

table = soup.find("table")
# find() returns None when the page has no <table>; fail with a clear message
# instead of handing the string "None" to pandas.
if table is None:
    raise ValueError("No <table> element found in the page")

# Passing literal HTML as a plain string to read_html is deprecated
# (pandas >= 2.1); wrap the markup in a file-like StringIO instead.
df = pd.read_html(StringIO(str(table)))[0]
print(df.head())

## Pagination Scraping

In [None]:
import time

for page in range(1, 6):
    url = f"https://example.com/page/{page}"
    # Timeout keeps one unresponsive page from stalling the whole crawl.
    response = requests.get(url, headers=headers, timeout=10)
    # Stop on 4xx/5xx rather than parsing an error page as content.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")
    # parse as usual

    # Pause between requests so the crawl stays under the site's rate limits.
    time.sleep(1)

## Selenium for Dynamic Pages

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
# try/finally guarantees the browser is shut down even if a lookup below
# raises (find_element raises NoSuchElementException when nothing matches).
try:
    driver.get("https://example.com")

    # Extract text
    element = driver.find_element(By.CLASS_NAME, "price")
    print(element.text)

    # Get multiple elements (empty list when nothing matches)
    items = driver.find_elements(By.TAG_NAME, "a")
    for item in items:
        print(item.get_attribute("href"))
finally:
    driver.quit()

## Save Data

In [None]:
# CSV
df.to_csv("output.csv", index=False)

# JSON
import json

# NOTE: `scraped_data` is not defined in the cells shown here; substitute the
# list/dict you built while parsing.
# Explicit UTF-8 plus ensure_ascii=False keeps non-ASCII text readable in the
# file instead of \uXXXX escapes, independent of the platform default encoding.
with open("data.json", "w", encoding="utf-8") as f:
    json.dump(scraped_data, f, indent=4, ensure_ascii=False)

## Best Practices

- ✅ Respect robots.txt (`https://site.com/robots.txt`)
- ✅ Set a `User-Agent` header to avoid being blocked
- ✅ Add `time.sleep()` between requests to avoid rate limits
- ✅ Store intermediate data (CSV/JSON)
- ✅ Handle errors with `try/except`
- ✅ When a site offers an API, prefer requests + JSON over scraping HTML