<a href="https://colab.research.google.com/github/vidhya130401-a11y/PYTHON-Daily-Challenge/blob/main/Python_DA_Assignment_3_Web_Scraping_and_PySpark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Wep Scraping

In [15]:
pip install requests beautifulsoup4



In [3]:
import requests
from bs4 import BeautifulSoup

# Base URL
url = "https://books.toscrape.com/catalogue/page-1.html"
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

books = []

for article in soup.find_all("article", class_="product_pod"):
    # Title
    title = article.h3.a["title"]

    # Price
    price = article.find("p", class_="price_color").text

    # Rating (stored in class names like 'star-rating Three')
    rating = article.find("p", class_="star-rating")["class"][1]

    # Availability
    availability = article.find("p", class_="instock availability").text.strip()

    # Genre (from breadcrumb navigation)
    genre = soup.find("ul", class_="breadcrumb").find_all("li")[1].text.strip()

    books.append({
        "Title": title,
        "Price": price,
        "Rating": rating,
        "Availability": availability,
        "Genre": genre
    })

# Print results
for book in books:
    print(book)

{'Title': 'A Light in the Attic', 'Price': 'Â£51.77', 'Rating': 'Three', 'Availability': 'In stock', 'Genre': 'All products'}
{'Title': 'Tipping the Velvet', 'Price': 'Â£53.74', 'Rating': 'One', 'Availability': 'In stock', 'Genre': 'All products'}
{'Title': 'Soumission', 'Price': 'Â£50.10', 'Rating': 'One', 'Availability': 'In stock', 'Genre': 'All products'}
{'Title': 'Sharp Objects', 'Price': 'Â£47.82', 'Rating': 'Four', 'Availability': 'In stock', 'Genre': 'All products'}
{'Title': 'Sapiens: A Brief History of Humankind', 'Price': 'Â£54.23', 'Rating': 'Five', 'Availability': 'In stock', 'Genre': 'All products'}
{'Title': 'The Requiem Red', 'Price': 'Â£22.65', 'Rating': 'One', 'Availability': 'In stock', 'Genre': 'All products'}
{'Title': 'The Dirty Little Secrets of Getting Your Dream Job', 'Price': 'Â£33.34', 'Rating': 'Four', 'Availability': 'In stock', 'Genre': 'All products'}
{'Title': 'The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull', 'Pr

In [6]:
import requests
from bs4 import BeautifulSoup

def scrape_books(base_url="https://books.toscrape.com/catalogue/page-{}.html", total_pages=50):
    all_books = []

    for page in range(1, total_pages + 1):
        url = base_url.format(page)
        response = requests.get(url)
        soup = BeautifulSoup(response.text, "html.parser")

        # Extract genre from breadcrumb (same for all books on the page)
        # Changed index from 2 to 1 to correctly get 'All products' from catalogue pages
        genre = soup.find("ul", class_="breadcrumb").find_all("li")[1].text.strip()

        # Loop through each book on the page
        for article in soup.find_all("article", class_="product_pod"):
            title = article.h3.a["title"]
            price = article.find("p", class_="price_color").text
            rating = article.find("p", class_="star-rating")["class"][1]
            availability = article.find("p", class_="instock availability").text.strip()

            all_books.append({
                "Title": title,
                "Price": price,
                "Rating": rating,
                "Availability": availability,
                "Genre": genre
            })

    return all_books


# Example usage
books_data = scrape_books()
print(f"Total books scraped: {len(books_data)}")
print(books_data[:5])  # Show first 5 results

Total books scraped: 1000
[{'Title': 'A Light in the Attic', 'Price': 'Â£51.77', 'Rating': 'Three', 'Availability': 'In stock', 'Genre': 'All products'}, {'Title': 'Tipping the Velvet', 'Price': 'Â£53.74', 'Rating': 'One', 'Availability': 'In stock', 'Genre': 'All products'}, {'Title': 'Soumission', 'Price': 'Â£50.10', 'Rating': 'One', 'Availability': 'In stock', 'Genre': 'All products'}, {'Title': 'Sharp Objects', 'Price': 'Â£47.82', 'Rating': 'Four', 'Availability': 'In stock', 'Genre': 'All products'}, {'Title': 'Sapiens: A Brief History of Humankind', 'Price': 'Â£54.23', 'Rating': 'Five', 'Availability': 'In stock', 'Genre': 'All products'}]


In [8]:
import requests
from bs4 import BeautifulSoup
import csv

def scrape_books(base_url="https://books.toscrape.com/catalogue/page-{}.html", total_pages=50):
    all_books = []

    for page in range(1, total_pages + 1):
        url = base_url.format(page)
        response = requests.get(url)
        soup = BeautifulSoup(response.text, "html.parser")

        # Extract genre from breadcrumb (same for all books on the page)
        # Corrected index from 2 to 1 for 'All products' on catalogue pages
        genre = soup.find("ul", class_="breadcrumb").find_all("li")[1].text.strip()

        # Loop through each book on the page
        for article in soup.find_all("article", class_="product_pod"):
            title = article.h3.a["title"]
            price = article.find("p", class_="price_color").text
            rating = article.find("p", class_="star-rating")["class"][1]
            availability = article.find("p", class_="instock availability").text.strip()

            all_books.append({
                "Title": title,
                "Price": price,
                "Rating": rating,
                "Availability": availability,
                "Genre": genre
            })

    return all_books


def save_to_csv(data, filename="books_data.csv"):
    # Define column names
    fieldnames = ["Title", "Price", "Rating", "Availability", "Genre"]

    # Write to CSV
    with open(filename, mode="w", newline="", encoding="utf-8") as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)

    print(f"Data successfully saved to {filename}")


# Example usage
books_data = scrape_books()
save_to_csv(books_data)

Data successfully saved to books_data.csv


## PySpark

In [10]:
from pyspark.sql import SparkSession

# Create or get a Spark session
spark = SparkSession.builder \
    .appName("BooksAnalysis") \
    .getOrCreate()

print("PySpark environment setup complete!")


PySpark environment setup complete!


In [11]:
from pyspark.sql import SparkSession

# 1. Initialize Spark Session
spark = SparkSession.builder.appName("BooksDataAnalysis").getOrCreate()

# 2. Load CSV into Spark DataFrame
df = spark.read.csv("books_data.csv", header=True, inferSchema=True)

# 3. Check Schema
df.printSchema()

# 4. View First Few Rows
df.show(5)

# 5. Summary Statistics for Numerical Columns
df.describe().show()

root
 |-- Title: string (nullable = true)
 |-- Price: string (nullable = true)
 |-- Rating: string (nullable = true)
 |-- Availability: string (nullable = true)
 |-- Genre: string (nullable = true)

+--------------------+-------+------+------------+------------+
|               Title|  Price|Rating|Availability|       Genre|
+--------------------+-------+------+------------+------------+
|A Light in the Attic|Â£51.77| Three|    In stock|All products|
|  Tipping the Velvet|Â£53.74|   One|    In stock|All products|
|          Soumission|Â£50.10|   One|    In stock|All products|
|       Sharp Objects|Â£47.82|  Four|    In stock|All products|
|Sapiens: A Brief ...|Â£54.23|  Five|    In stock|All products|
+--------------------+-------+------+------------+------------+
only showing top 5 rows

+-------+--------------------+-------+------+------------+------------+
|summary|               Title|  Price|Rating|Availability|       Genre|
+-------+--------------------+-------+------+-----------

In [12]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import regexp_replace, col, when

# 1. Initialize Spark Session
spark = SparkSession.builder.appName("BooksFiltering").getOrCreate()

# 2. Load CSV
df = spark.read.csv("books_data.csv", header=True, inferSchema=True)

# 3. Clean Price column (remove £ and cast to float)
df = df.withColumn("Price", regexp_replace(col("Price"), "£", "").cast("float"))

# 4. Convert Rating words to numbers
rating_map = {
    "One": 1,
    "Two": 2,
    "Three": 3,
    "Four": 4,
    "Five": 5
}
df = df.withColumn("RatingNum",
                   when(col("Rating") == "One", 1)
                   .when(col("Rating") == "Two", 2)
                   .when(col("Rating") == "Three", 3)
                   .when(col("Rating") == "Four", 4)
                   .when(col("Rating") == "Five", 5))

# 5. Filter books with Price > 20
df_price_filtered = df.filter(col("Price") > 20)

# 6. Filter books with Rating >= 4
df_rating_filtered = df.filter(col("RatingNum") >= 4)

# Show results
print("Books with Price > 20:")
df_price_filtered.show(5)

print("Books with Rating >= 4:")
df_rating_filtered.show(5)

Books with Price > 20:
+-----+-----+------+------------+-----+---------+
|Title|Price|Rating|Availability|Genre|RatingNum|
+-----+-----+------+------------+-----+---------+
+-----+-----+------+------------+-----+---------+

Books with Rating >= 4:
+--------------------+-----+------+------------+------------+---------+
|               Title|Price|Rating|Availability|       Genre|RatingNum|
+--------------------+-----+------+------------+------------+---------+
|       Sharp Objects| NULL|  Four|    In stock|All products|        4|
|Sapiens: A Brief ...| NULL|  Five|    In stock|All products|        5|
|The Dirty Little ...| NULL|  Four|    In stock|All products|        4|
|The Boys in the B...| NULL|  Four|    In stock|All products|        4|
|Shakespeare's Son...| NULL|  Four|    In stock|All products|        4|
+--------------------+-----+------+------------+------------+---------+
only showing top 5 rows

