<a href="https://colab.research.google.com/github/vanya642004/vanya642004/blob/main/book_price_prediction_using_web_scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import requests
from bs4 import BeautifulSoup

# URL of the website to scrape (only this landing page is requested)
url = "http://books.toscrape.com/"

# Send an HTTP GET request to the URL. An explicit timeout keeps the cell from
# hanging indefinitely when the server never answers; requests applies no
# timeout unless one is passed.
response = requests.get(url, timeout=10)

# Fail fast on 4xx/5xx responses instead of parsing an error page as if it
# were the catalogue.
response.raise_for_status()

# Parse the raw response bytes using BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')

# Find all book containers
books = soup.find_all('article', class_='product_pod')

# List to store scraped data: one {'title': ..., 'price': ...} dict per book
book_data = []

# Loop through each book container to extract details
for book in books:
    # The full title is stored in the link's `title` attribute
    title = book.h3.a['title']

    # Price text is kept exactly as shown on the page (currency symbol included)
    price = book.find('p', class_='price_color').text

    # Add details to the list
    book_data.append({'title': title, 'price': price})

# Print one numbered "title - price" line per scraped book, counting from 1
for position, entry in enumerate(book_data, start=1):
    title, price = entry['title'], entry['price']
    print(f"{position}. {title} - {price}")


import csv

# Persist the scraped records to books.csv: a header row, then one row per book
csv_columns = ['title', 'price']
with open('books.csv', 'w', newline='', encoding='utf-8') as out_file:
    book_writer = csv.DictWriter(out_file, fieldnames=csv_columns)
    book_writer.writeheader()
    for record in book_data:
        book_writer.writerow(record)
print("Data saved to books.csv")


1. A Light in the Attic - £51.77
2. Tipping the Velvet - £53.74
3. Soumission - £50.10
4. Sharp Objects - £47.82
5. Sapiens: A Brief History of Humankind - £54.23
6. The Requiem Red - £22.65
7. The Dirty Little Secrets of Getting Your Dream Job - £33.34
8. The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull - £17.93
9. The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics - £22.60
10. The Black Maria - £52.15
11. Starving Hearts (Triangular Trade Trilogy, #1) - £13.99
12. Shakespeare's Sonnets - £20.66
13. Set Me Free - £17.46
14. Scott Pilgrim's Precious Little Life (Scott Pilgrim #1) - £52.29
15. Rip it Up and Start Again - £35.02
16. Our Band Could Be Your Life: Scenes from the American Indie Underground, 1981-1991 - £57.25
17. Olio - £23.88
18. Mesaerion: The Best Science Fiction Stories 1800-1849 - £37.59
19. Libertarianism for Beginners - £51.33
20. It's Only the Himalayas - £45.17
Data saved to books.csv

In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Load the CSV file containing the book data
df = pd.read_csv('books.csv')

# Check the shape of the DataFrame and first few rows
print(df.shape)
print(df.head())

# Preprocessing: strip the currency symbol so the price can be cast to float.
# The symbols are listed inside a regex character class on purpose: as a
# standalone pattern '$' is the end-of-string anchor (it matches an empty
# position and removes nothing), whereas inside [...] it is a literal dollar.
df['price'] = df['price'].replace(r'[£€$]', '', regex=True).astype(float)

# Features are the raw title strings; the target is the numeric price column
X, y = df['title'], df['price']

# Sanity check: report how many entries each series holds
for label, series in (("X", X), ("y", y)):
    print(f"{label} length: {len(series)}")

# Encode the titles as TF-IDF weights, ignoring English stop words.
# NOTE(review): the vectorizer is fitted on every title before the train/test
# split below, so its vocabulary and IDF weights also see the test titles.
tfidf = TfidfVectorizer(stop_words='english')
X_tfidf = tfidf.fit_transform(X)

# Report the dimensions of the TF-IDF matrix
print(f"Shape of X_tfidf: {X_tfidf.shape}")

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
split = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

# Report the dimensions of both partitions
train_shape, test_shape = X_train.shape, X_test.shape
print(f"Train size: {train_shape}, Test size: {test_shape}")

# Fit an ordinary least-squares regressor on the training partition
# (fit() returns the estimator itself, so construction and fitting can be chained).
# NOTE(review): in the recorded run there are 16 training rows against 71
# TF-IDF features, so the fit looks under-determined - confirm before relying
# on the predictions.
model = LinearRegression().fit(X_train, y_train)

# Score the held-out rows
y_pred = model.predict(X_test)

# Report the mean squared error between true and predicted prices
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# Show up to five actual/predicted pairs; slicing stops early on a smaller test set
for actual, predicted in zip(y_test.to_numpy()[:5], y_pred[:5]):
    print(f"Actual Price: {actual} | Predicted Price: {predicted}")


(20, 2)
                                   title   price
0                   A Light in the Attic  £51.77
1                     Tipping the Velvet  £53.74
2                             Soumission  £50.10
3                          Sharp Objects  £47.82
4  Sapiens: A Brief History of Humankind  £54.23
X length: 20
y length: 20
Shape of X_tfidf: (20, 71)
Train size: (16, 71), Test size: (4, 71)
Mean Squared Error: 286.2662215948846
Actual Price: 51.77 | Predicted Price: 34.939944557456286
Actual Price: 37.59 | Predicted Price: 34.939944557456286
Actual Price: 57.25 | Predicted Price: 34.859170589521035
Actual Price: 53.74 | Predicted Price: 34.939944557456286
