In [1]:

!pip install requests beautifulsoup4 selenium pandas lxml

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

Collecting selenium
  Downloading selenium-4.35.0-py3-none-any.whl.metadata (7.4 kB)
Collecting trio~=0.30.0 (from selenium)
  Downloading trio-0.30.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.12.2 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting typing-extensions>=4.0.0 (from beautifulsoup4)
  Downloading typing_extensions-4.14.1-py3-none-any.whl.metadata (3.0 kB)
Collecting outcome (from trio~=0.30.0->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.12.2->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.35.0-py3-none-any.whl (9.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.6/9.6 MB[0m [31m51.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio-0.30.0-py3-none-any.whl (499 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m499.2/499.2 kB[0m [3

In [2]:

# Q1: Books Scraping
base_url = "https://books.toscrape.com/catalogue/page-{}.html"
books = []

page = 1
while True:
    url = base_url.format(page)
    response = requests.get(url)
    if response.status_code != 200:
        break  # no more pages
    soup = BeautifulSoup(response.text, "lxml")
    articles = soup.find_all("article", class_="product_pod")
    if not articles:
        break

    for article in articles:
        title = article.h3.a["title"]
        price = article.find("p", class_="price_color").text.strip()
        availability = article.find("p", class_="instock availability").text.strip()
        rating = article.p["class"][1]  # e.g. "One", "Two", etc.
        books.append([title, price, availability, rating])

    page += 1

# Save to DataFrame
books_df = pd.DataFrame(books, columns=["Title", "Price", "Availability", "Star Rating"])
books_df.to_csv("books.csv", index=False)
books_df.head()


Unnamed: 0,Title,Price,Availability,Star Rating
0,A Light in the Attic,Â£51.77,In stock,Three
1,Tipping the Velvet,Â£53.74,In stock,One
2,Soumission,Â£50.10,In stock,One
3,Sharp Objects,Â£47.82,In stock,Four
4,Sapiens: A Brief History of Humankind,Â£54.23,In stock,Five


In [3]:

# Q2: IMDB Top 250
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

# Setup Selenium in Colab
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

driver = webdriver.Chrome(options=options)
driver.get("https://www.imdb.com/chart/top/")

movies = []

rows = driver.find_elements(By.CSS_SELECTOR, "li.ipc-metadata-list-summary-item")
for idx, row in enumerate(rows, start=1):
    title = row.find_element(By.CSS_SELECTOR, "h3").text
    year = row.find_element(By.CSS_SELECTOR, ".cli-title-metadata-item").text
    rating = row.find_element(By.CSS_SELECTOR, ".ipc-rating-star--imdb").text.split()[0]
    movies.append([idx, title, year, rating])
driver.quit()

# Save to DataFrame
imdb_df = pd.DataFrame(movies, columns=["Rank", "Title", "Year", "IMDB Rating"])
imdb_df.to_csv("imdb_top250.csv", index=False)
imdb_df.head()

Unnamed: 0,Rank,Title,Year,IMDB Rating


In [4]:

# Q3: Weather Scraping
weather_url = "https://www.timeanddate.com/weather/"
response = requests.get(weather_url)
soup = BeautifulSoup(response.text, "lxml")

cities = []
for row in soup.select("table tbody tr"):
    cols = row.find_all("td")
    if len(cols) >= 4:
        city = cols[0].text.strip()
        temp = cols[1].text.strip()
        condition = cols[2].text.strip()
        cities.append([city, temp, condition])

weather_df = pd.DataFrame(cities, columns=["City", "Temperature", "Condition"])
weather_df.to_csv("weather.csv", index=False)
weather_df.head()


Unnamed: 0,City,Temperature,Condition
