Web Scraping:

In [28]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [29]:
url = "https://books.toscrape.com/"
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

In [30]:
books = soup.find_all("article", class_="product_pod")

data = []

for book in books:
    # Title
    title = book.h3.a["title"]

    # Price
    price = book.find("p", class_="price_color").text.strip()

    # Rating (stored in class name, e.g., "star-rating Three")
    rating = book.find("p", class_="star-rating")["class"][1]

    # Availability
    availability = book.find("p", class_="instock availability").text.strip()

    # Genre ‚Äî comes from sidebar (for this demo page, all are ‚ÄúTravel‚Äù or same genre on subpages)
    genre = soup.find("ul", class_="nav-list").find("a").text.strip()

    data.append({
        "Title": title,
        "Price": price,
        "Rating": rating,
        "Availability": availability,
        "Genre": genre
    })

In [31]:
df = pd.DataFrame(data)
print(df.head())

                                   Title    Price Rating Availability  Genre
0                   A Light in the Attic  √Ç¬£51.77  Three     In stock  Books
1                     Tipping the Velvet  √Ç¬£53.74    One     In stock  Books
2                             Soumission  √Ç¬£50.10    One     In stock  Books
3                          Sharp Objects  √Ç¬£47.82   Four     In stock  Books
4  Sapiens: A Brief History of Humankind  √Ç¬£54.23   Five     In stock  Books


In [32]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_books():
    base_url = "https://books.toscrape.com/catalogue/page-{}.html"
    all_books = []

    # Loop through all pages (there are 50 in total)
    for page in range(1, 51):
        url = base_url.format(page)
        response = requests.get(url)
        if response.status_code != 200:
            print(f"Page {page} not found, stopping.")
            break

        soup = BeautifulSoup(response.text, "html.parser")
        books = soup.find_all("article", class_="product_pod")

        for book in books:
            title = book.h3.a["title"]
            price = book.find("p", class_="price_color").text.strip()
            rating = book.find("p", class_="star-rating")["class"][1]
            availability = book.find("p", class_="instock availability").text.strip()

            # Visit each book‚Äôs page to get the exact genre
            detail_link = book.h3.a["href"]
            if "catalogue/" not in detail_link:
                detail_link = "catalogue/" + detail_link

            detail_url = "https://books.toscrape.com/" + detail_link
            detail_res = requests.get(detail_url)
            detail_soup = BeautifulSoup(detail_res.text, "html.parser")
            genre = detail_soup.find("ul", class_="breadcrumb").find_all("a")[2].text.strip()

            all_books.append({
                "Title": title,
                "Price": price,
                "Rating": rating,
                "Availability": availability,
                "Genre": genre
            })

        print(f"‚úÖ Page {page} scraped successfully!")

    return pd.DataFrame(all_books)

# Run the scraper
df = scrape_books()

# Save to CSV
df.to_csv("all_books.csv", index=False)
print("üéâ All book data saved to all_books.csv successfully!")

# Display first few rows
print(df.head())

‚úÖ Page 1 scraped successfully!
‚úÖ Page 2 scraped successfully!
‚úÖ Page 3 scraped successfully!
‚úÖ Page 4 scraped successfully!
‚úÖ Page 5 scraped successfully!
‚úÖ Page 6 scraped successfully!
‚úÖ Page 7 scraped successfully!
‚úÖ Page 8 scraped successfully!
‚úÖ Page 9 scraped successfully!
‚úÖ Page 10 scraped successfully!
‚úÖ Page 11 scraped successfully!
‚úÖ Page 12 scraped successfully!
‚úÖ Page 13 scraped successfully!
‚úÖ Page 14 scraped successfully!
‚úÖ Page 15 scraped successfully!
‚úÖ Page 16 scraped successfully!
‚úÖ Page 17 scraped successfully!
‚úÖ Page 18 scraped successfully!
‚úÖ Page 19 scraped successfully!
‚úÖ Page 20 scraped successfully!
‚úÖ Page 21 scraped successfully!
‚úÖ Page 22 scraped successfully!
‚úÖ Page 23 scraped successfully!
‚úÖ Page 24 scraped successfully!
‚úÖ Page 25 scraped successfully!
‚úÖ Page 26 scraped successfully!
‚úÖ Page 27 scraped successfully!
‚úÖ Page 28 scraped successfully!
‚úÖ Page 29 scraped successfully!
‚úÖ Page 30 scraped suc

In [33]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Step 1: Define a function to scrape book data (with pagination)
def scrape_books():
    base_url = "https://books.toscrape.com/catalogue/page-{}.html"
    all_books = []

    # Loop through all 50 pages
    for page in range(1, 51):
        url = base_url.format(page)
        response = requests.get(url)

        if response.status_code != 200:
            print(f"Page {page} not found. Stopping.")
            break

        soup = BeautifulSoup(response.text, "html.parser")
        books = soup.find_all("article", class_="product_pod")

        for book in books:
            title = book.h3.a["title"]
            price = book.find("p", class_="price_color").text.strip()
            rating = book.find("p", class_="star-rating")["class"][1]
            availability = book.find("p", class_="instock availability").text.strip()

            # Get genre from the book‚Äôs detail page
            detail_link = book.h3.a["href"]
            if "catalogue/" not in detail_link:
                detail_link = "catalogue/" + detail_link

            detail_url = "https://books.toscrape.com/" + detail_link
            detail_res = requests.get(detail_url)
            detail_soup = BeautifulSoup(detail_res.text, "html.parser")

            genre = detail_soup.find("ul", class_="breadcrumb").find_all("a")[2].text.strip()

            all_books.append({
                "Title": title,
                "Price": price,
                "Rating": rating,
                "Availability": availability,
                "Genre": genre
            })

        print(f"‚úÖ Page {page} scraped successfully!")

    # Step 2: Convert to DataFrame
    df = pd.DataFrame(all_books)

    # Step 3: Save to CSV
    df.to_csv("books_data.csv", index=False, encoding="utf-8-sig")
    print("üìÅ Data successfully saved to books_data.csv")

    return df

# Run the scraper
books_df = scrape_books()

# Step 4: Display first few rows
print(books_df.head())

‚úÖ Page 1 scraped successfully!
‚úÖ Page 2 scraped successfully!
‚úÖ Page 3 scraped successfully!
‚úÖ Page 4 scraped successfully!
‚úÖ Page 5 scraped successfully!
‚úÖ Page 6 scraped successfully!
‚úÖ Page 7 scraped successfully!
‚úÖ Page 8 scraped successfully!
‚úÖ Page 9 scraped successfully!
‚úÖ Page 10 scraped successfully!
‚úÖ Page 11 scraped successfully!
‚úÖ Page 12 scraped successfully!
‚úÖ Page 13 scraped successfully!
‚úÖ Page 14 scraped successfully!
‚úÖ Page 15 scraped successfully!
‚úÖ Page 16 scraped successfully!
‚úÖ Page 17 scraped successfully!
‚úÖ Page 18 scraped successfully!
‚úÖ Page 19 scraped successfully!
‚úÖ Page 20 scraped successfully!
‚úÖ Page 21 scraped successfully!
‚úÖ Page 22 scraped successfully!
‚úÖ Page 23 scraped successfully!
‚úÖ Page 24 scraped successfully!
‚úÖ Page 25 scraped successfully!
‚úÖ Page 26 scraped successfully!
‚úÖ Page 27 scraped successfully!
‚úÖ Page 28 scraped successfully!
‚úÖ Page 29 scraped successfully!
‚úÖ Page 30 scraped suc

PySpark:
1. Setup PySpark Environment

In [4]:
!apt-get install openjdk-11-jdk-y
!pip install pyspark

0% [Working]            Hit:1 http://archive.ubuntu.com/ubuntu jammy InRelease
0% [Waiting for headers] [Waiting for headers] [Connected to cloud.r-project.or                                                                               Get:2 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
                                                                               Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
                                                                               Get:4 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
0% [Waiting for headers] [Connecting to r2u.stat.illinois.edu] [Connected to de                                                                               Get:5 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
0% [5 InRelease 127 kB/127 kB 100%] [Connecting to r2u.stat.illinois.edu] [Wait0% [Connecting to r2u.stat.illinois.edu] [Waitin

In [5]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

In [6]:
pyspark.__version__

'3.5.1'

In [7]:
spark = SparkSession.builder.appName("Logistics").getOrCreate()

2. Create a Spark DataFrame

In [14]:
df = spark.read.csv("/content/Logistics_dataset.csv", header = True, inferSchema=True)

In [15]:
df.show()

+-------+----------+---------+---------+-----------+-----------+---------------+---------------+-------------+------------+-----------+----------------+------------+---------+--------------+
|Trip_ID|Vehicle_ID|Driver_ID|Departure|Desgination|Distance_km|Fuel_Consumed_L|Delivery_Status|Delivery_Date|Vehicle_Type|Capacity_kg|Maintenance_Cost|On-Time Trip|Fuel_Cost|Total_TripCost|
+-------+----------+---------+---------+-----------+-----------+---------------+---------------+-------------+------------+-----------+----------------+------------+---------+--------------+
|   T001|       V04|      D01|    Delhi|       Pune|       1173|         108.42|        On-Time|   27/01/2023|  Mini-Truck|       8803|            9033|           1|  9269.91|      18302.91|
|   T004|       V04|      D09|Hyderabad|       Pune|        382|           26.6|        On-Time|   18/02/2023|  Mini-Truck|       8803|            9033|           1|   2274.3|       11307.3|
|   T002|       V06|      D08|   Mumbai|  Ban

In [16]:
print(type(df))

<class 'pyspark.sql.dataframe.DataFrame'>


In [17]:
df.printSchema()

root
 |-- Trip_ID: string (nullable = true)
 |-- Vehicle_ID: string (nullable = true)
 |-- Driver_ID: string (nullable = true)
 |-- Departure: string (nullable = true)
 |-- Desgination: string (nullable = true)
 |-- Distance_km: integer (nullable = true)
 |-- Fuel_Consumed_L: double (nullable = true)
 |-- Delivery_Status: string (nullable = true)
 |-- Delivery_Date: string (nullable = true)
 |-- Vehicle_Type: string (nullable = true)
 |-- Capacity_kg: integer (nullable = true)
 |-- Maintenance_Cost: integer (nullable = true)
 |-- On-Time Trip: integer (nullable = true)
 |-- Fuel_Cost: double (nullable = true)
 |-- Total_TripCost: double (nullable = true)



In [20]:
df.show(10)

+-------+----------+---------+---------+-----------+-----------+---------------+---------------+-------------+------------+-----------+----------------+------------+---------+--------------+
|Trip_ID|Vehicle_ID|Driver_ID|Departure|Desgination|Distance_km|Fuel_Consumed_L|Delivery_Status|Delivery_Date|Vehicle_Type|Capacity_kg|Maintenance_Cost|On-Time Trip|Fuel_Cost|Total_TripCost|
+-------+----------+---------+---------+-----------+-----------+---------------+---------------+-------------+------------+-----------+----------------+------------+---------+--------------+
|   T001|       V04|      D01|    Delhi|       Pune|       1173|         108.42|        On-Time|   27/01/2023|  Mini-Truck|       8803|            9033|           1|  9269.91|      18302.91|
|   T004|       V04|      D09|Hyderabad|       Pune|        382|           26.6|        On-Time|   18/02/2023|  Mini-Truck|       8803|            9033|           1|   2274.3|       11307.3|
|   T002|       V06|      D08|   Mumbai|  Ban

In [24]:
df.describe().select('summary',"Distance_km",'Fuel_Consumed_L','Capacity_kg','Maintenance_Cost','Fuel_Cost','Total_TripCost').show()

+-------+-----------------+------------------+------------------+-----------------+------------------+------------------+
|summary|      Distance_km|   Fuel_Consumed_L|       Capacity_kg| Maintenance_Cost|         Fuel_Cost|    Total_TripCost|
+-------+-----------------+------------------+------------------+-----------------+------------------+------------------+
|  count|               50|                50|                50|               50|                50|                50|
|   mean|          1058.82| 91.15680000000002|            6218.7|         10400.54|         7793.9064|18194.446399999993|
| stddev|576.7382489094182|54.197731048222956|3136.1710324690457|4774.131297024046|4633.9060046230625| 7301.501093818775|
|    min|               64|              4.96|              1207|             5633|            424.08|           6338.08|
|    max|             1956|            211.34|              9941|            18031|          18069.57|          33869.46|
+-------+---------------

3. Filtering Data

In [27]:
df.filter((df.Fuel_Cost<=8000) & (df.Total_TripCost < 10000)).select('Vehicle_ID','Fuel_Cost','Total_TripCost').show()

+----------+---------+--------------+
|Vehicle_ID|Fuel_Cost|Total_TripCost|
+----------+---------+--------------+
|       V06|   2838.6|        8752.6|
|       V02|   448.02|       7224.02|
|       V02|  1386.81|       8162.81|
|       V06| 2665.035|      8579.035|
|       V02| 2464.965|      9240.965|
|       V01| 3752.595|      9385.595|
|       V02| 3217.365|      9993.365|
|       V06|   424.08|       6338.08|
|       V06|  744.705|      6658.705|
+----------+---------+--------------+

