In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# base url of the best sellers page for teaching & education books
base_url = "import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import numpy as np
from urllib.parse import urljoin
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import joblib
import os

def scrape_books_toscrape(base_url='http://books.toscrape.com/', num_pages=5):
    all_books = []
    for page in range(1, num_pages + 1):
        url = base_url if page == 1 else urljoin(base_url, f'catalogue/page-{page}.html')
        print(f"Scraping page {page}/{num_pages}: {url}")

        try:
            time.sleep(random.uniform(1, 3))
            headers = {'User-Agent': 'Mozilla/5.0'}
            response = requests.get(url, headers=headers)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            book_containers = soup.select('article.product_pod')

            for book in book_containers:
                title = book.h3.a['title']
                price_text = book.select_one('div.product_price p.price_color').text
                price = re.sub(r'[^0-9.]', '', price_text)
                price = pd.to_numeric(price, errors='coerce')
                rating_class = book.select_one('p.star-rating')['class'][1]
                rating_map = {'One': 1, 'Two': 2, 'Three': 3, 'Four': 4, 'Five': 5}
                rating = rating_map.get(rating_class, 0)

                availability_text = book.select_one('p.availability').text.strip()
                in_stock = 1 if 'In stock' in availability_text else 0
                detail_url = urljoin(base_url, book.h3.a['href'])
                category, description = "Unknown", "No description available"

                try:
                    time.sleep(random.uniform(0.5, 1.5))
                    detail_response = requests.get(detail_url, headers=headers)
                    detail_response.raise_for_status()
                    detail_soup = BeautifulSoup(detail_response.text, 'html.parser')
                    breadcrumb = detail_soup.select('ul.breadcrumb li')
                    if len(breadcrumb) >= 3:
                        category = breadcrumb[2].text.strip()
                    desc_element = detail_soup.select_one('div#product_description + p')
                    if desc_element:
                        description = desc_element.text.strip()
                except Exception as e:
                    print(f"Error fetching detail page for {title}: {e}")

                book_data = {'title': title, 'price': price, 'rating': rating, 'category': category,
                             'in_stock': in_stock, 'description': description, 'url': detail_url}
                all_books.append(book_data)
        except Exception as e:
            print(f"Error scraping page {page}: {e}")

    return pd.DataFrame(all_books)

def clean_book_data(df):
    df_clean = df.copy()
    df_clean['description'].fillna('No description available', inplace=True)
    df_clean['title_length'] = df_clean['title'].str.len()
    df_clean['desc_length'] = df_clean['description'].str.len()
    category_dummies = pd.get_dummies(df_clean['category'], prefix='category')
    return pd.concat([df_clean, category_dummies], axis=1)

def build_book_price_model(df):
    feature_cols = ['rating', 'in_stock', 'title_length', 'desc_length'] + [col for col in df.columns if col.startswith('category_')]
    y = df['price']
    X_numerical = df[feature_cols]
    X_num_train, X_num_test, y_train, y_test = train_test_split(X_numerical, y, test_size=0.2, random_state=42)

    title_vectorizer = TfidfVectorizer(max_features=100, stop_words='english')
    desc_vectorizer = TfidfVectorizer(max_features=200, stop_words='english')

    X_title_train = title_vectorizer.fit_transform(df.loc[X_num_train.index, 'title'])
    X_title_test = title_vectorizer.transform(df.loc[X_num_test.index, 'title'])
    X_desc_train = desc_vectorizer.fit_transform(df.loc[X_num_train.index, 'description'])
    X_desc_test = desc_vectorizer.transform(df.loc[X_num_test.index, 'description'])

    scaler = StandardScaler()
    X_num_train_scaled = scaler.fit_transform(X_num_train)
    X_num_test_scaled = scaler.transform(X_num_test)

    X_train_combined = np.hstack((X_num_train_scaled, X_title_train.toarray(), X_desc_train.toarray()))
    X_test_combined = np.hstack((X_num_test_scaled, X_title_test.toarray(), X_desc_test.toarray()))

    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train_combined, y_train)
    y_pred = model.predict(X_test_combined)

    return {'model': model, 'feature_cols': feature_cols, 'title_vectorizer': title_vectorizer,
            'desc_vectorizer': desc_vectorizer, 'scaler': scaler,
            'metrics': {'mse': mean_squared_error(y_test, y_pred),
                        'rmse': np.sqrt(mean_squared_error(y_test, y_pred)),
                        'r2': r2_score(y_test, y_pred)}}

def save_model_components(model_data, output_dir='book_model'):
    os.makedirs(output_dir, exist_ok=True)
    joblib.dump(model_data['model'], f"{output_dir}/model.joblib")
    joblib.dump(model_data['title_vectorizer'], f"{output_dir}/title_vectorizer.joblib")
    joblib.dump(model_data['desc_vectorizer'], f"{output_dir}/desc_vectorizer.joblib")
    joblib.dump(model_data['scaler'], f"{output_dir}/scaler.joblib")
    pd.Series(model_data['feature_cols']).to_csv(f"{output_dir}/feature_cols.csv", index=False)

def main():
    books_df = scrape_books_toscrape(num_pages=10)
    books_df.to_csv("scraped_books_raw.csv", index=False)
    books_df_clean = clean_book_data(books_df)
    books_df_clean.to_csv("scraped_books_clean.csv", index=False)
    model_data = build_book_price_model(books_df_clean)
    save_model_components(model_data)
    print("Model training complete and components saved.")

if __name__ == "__main__":
    main()
}"



# http headers to mimic a browser visit
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
}

In [3]:
# initialize a list to store book data
book_list = []

# iterate over the first 3 pages to get top 50 books (assuming each page has about 20 items)
for page in range(1, 4):
    # construct the URL for the current page
    url = base_url.format(page, page)

    # send a GET request to the url
    response = requests.get(url, headers=headers)

    # parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, "lxml")

    # find all the book elements
    books = soup.find_all("div", {"class": "zg-grid-general-faceout"})

    # iterate over each book element to extract data
    for book in books:
        if len(book_list) < 50:  # stop once we've collected 50 books
            author = book.find("a", class_="a-size-small a-link-child").get_text(strip=True) if book.find("a", class_="a-size-small a-link-child") else "N/A"
            rating = book.find("span", class_="a-icon-alt").get_text(strip=True) if book.find("span", class_="a-icon-alt") else "N/A"

            # append the extracted data to the book_list
            book_list.append({
                "Author": author,
                "Rating": rating
            })
        else:
            break

In [4]:
# convert the list of dictionaries into a DataFrame
df = pd.DataFrame(book_list)

print(df.head())

# save the DataFrame to a CSV file
df.to_csv("amazon_top_50_books_authors_ratings.csv", index=False)

                   Author              Rating
0                PR Yadav  4.4 out of 5 stars
1                     N/A  4.4 out of 5 stars
2  Oswaal Editorial Board  4.7 out of 5 stars
3          एम लक्ष्मीकांत  4.4 out of 5 stars
4      Wonder House Books  4.7 out of 5 stars


In [5]:
print(df.sample(10))

                           Author              Rating
43  Scholastic Teaching Resources  4.6 out of 5 stars
30                            N/A                 N/A
32                   Sweta Adatia  4.0 out of 5 stars
3                  एम लक्ष्मीकांत  4.4 out of 5 stars
29                     R.K. Gupta  4.5 out of 5 stars
2          Oswaal Editorial Board  4.7 out of 5 stars
34                         Mangal  4.5 out of 5 stars
25                Timothy Ferriss  4.5 out of 5 stars
28         ALLEN Expert Faculties  4.1 out of 5 stars
6         EduGorilla Prep Experts  4.2 out of 5 stars


In [6]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import numpy as np
from urllib.parse import urljoin
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import joblib
import os

def scrape_books_toscrape(base_url='http://books.toscrape.com/', num_pages=5):
    all_books = []
    for page in range(1, num_pages + 1):
        url = base_url if page == 1 else urljoin(base_url, f'catalogue/page-{page}.html')
        print(f"Scraping page {page}/{num_pages}: {url}")

        try:
            time.sleep(random.uniform(1, 3))
            headers = {'User-Agent': 'Mozilla/5.0'}
            response = requests.get(url, headers=headers)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            book_containers = soup.select('article.product_pod')

            for book in book_containers:
                title = book.h3.a['title']
                price_text = book.select_one('div.product_price p.price_color').text
                price = re.sub(r'[^0-9.]', '', price_text)
                price = pd.to_numeric(price, errors='coerce')
                rating_class = book.select_one('p.star-rating')['class'][1]
                rating_map = {'One': 1, 'Two': 2, 'Three': 3, 'Four': 4, 'Five': 5}
                rating = rating_map.get(rating_class, 0)

                availability_text = book.select_one('p.availability').text.strip()
                in_stock = 1 if 'In stock' in availability_text else 0
                detail_url = urljoin(base_url, book.h3.a['href'])
                category, description = "Unknown", "No description available"

                try:
                    time.sleep(random.uniform(0.5, 1.5))
                    detail_response = requests.get(detail_url, headers=headers)
                    detail_response.raise_for_status()
                    detail_soup = BeautifulSoup(detail_response.text, 'html.parser')
                    breadcrumb = detail_soup.select('ul.breadcrumb li')
                    if len(breadcrumb) >= 3:
                        category = breadcrumb[2].text.strip()
                    desc_element = detail_soup.select_one('div#product_description + p')
                    if desc_element:
                        description = desc_element.text.strip()
                except Exception as e:
                    print(f"Error fetching detail page for {title}: {e}")

                book_data = {'title': title, 'price': price, 'rating': rating, 'category': category,
                             'in_stock': in_stock, 'description': description, 'url': detail_url}
                all_books.append(book_data)
        except Exception as e:
            print(f"Error scraping page {page}: {e}")

    return pd.DataFrame(all_books)

def clean_book_data(df):
    df_clean = df.copy()
    df_clean['description'].fillna('No description available', inplace=True)
    df_clean['title_length'] = df_clean['title'].str.len()
    df_clean['desc_length'] = df_clean['description'].str.len()
    category_dummies = pd.get_dummies(df_clean['category'], prefix='category')
    return pd.concat([df_clean, category_dummies], axis=1)

def build_book_price_model(df):
    feature_cols = ['rating', 'in_stock', 'title_length', 'desc_length'] + [col for col in df.columns if col.startswith('category_')]
    y = df['price']
    X_numerical = df[feature_cols]
    X_num_train, X_num_test, y_train, y_test = train_test_split(X_numerical, y, test_size=0.2, random_state=42)

    title_vectorizer = TfidfVectorizer(max_features=100, stop_words='english')
    desc_vectorizer = TfidfVectorizer(max_features=200, stop_words='english')

    X_title_train = title_vectorizer.fit_transform(df.loc[X_num_train.index, 'title'])
    X_title_test = title_vectorizer.transform(df.loc[X_num_test.index, 'title'])
    X_desc_train = desc_vectorizer.fit_transform(df.loc[X_num_train.index, 'description'])
    X_desc_test = desc_vectorizer.transform(df.loc[X_num_test.index, 'description'])

    scaler = StandardScaler()
    X_num_train_scaled = scaler.fit_transform(X_num_train)
    X_num_test_scaled = scaler.transform(X_num_test)

    X_train_combined = np.hstack((X_num_train_scaled, X_title_train.toarray(), X_desc_train.toarray()))
    X_test_combined = np.hstack((X_num_test_scaled, X_title_test.toarray(), X_desc_test.toarray()))

    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train_combined, y_train)
    y_pred = model.predict(X_test_combined)

    return {'model': model, 'feature_cols': feature_cols, 'title_vectorizer': title_vectorizer,
            'desc_vectorizer': desc_vectorizer, 'scaler': scaler,
            'metrics': {'mse': mean_squared_error(y_test, y_pred),
                        'rmse': np.sqrt(mean_squared_error(y_test, y_pred)),
                        'r2': r2_score(y_test, y_pred)}}

def save_model_components(model_data, output_dir='book_model'):
    os.makedirs(output_dir, exist_ok=True)
    joblib.dump(model_data['model'], f"{output_dir}/model.joblib")
    joblib.dump(model_data['title_vectorizer'], f"{output_dir}/title_vectorizer.joblib")
    joblib.dump(model_data['desc_vectorizer'], f"{output_dir}/desc_vectorizer.joblib")
    joblib.dump(model_data['scaler'], f"{output_dir}/scaler.joblib")
    pd.Series(model_data['feature_cols']).to_csv(f"{output_dir}/feature_cols.csv", index=False)

def main():
    books_df = scrape_books_toscrape(num_pages=10)
    books_df.to_csv("scraped_books_raw.csv", index=False)
    books_df_clean = clean_book_data(books_df)
    books_df_clean.to_csv("scraped_books_clean.csv", index=False)
    model_data = build_book_price_model(books_df_clean)
    save_model_components(model_data)
    print("Model training complete and components saved.")

if __name__ == "__main__":
    main()


Scraping page 1/10: http://books.toscrape.com/
Scraping page 2/10: http://books.toscrape.com/catalogue/page-2.html
Error fetching detail page for In Her Wake: 404 Client Error: Not Found for url: http://books.toscrape.com/in-her-wake_980/index.html
Error fetching detail page for How Music Works: 404 Client Error: Not Found for url: http://books.toscrape.com/how-music-works_979/index.html
Error fetching detail page for Foolproof Preserving: A Guide to Small Batch Jams, Jellies, Pickles, Condiments, and More: A Foolproof Guide to Making Small Batch Jams, Jellies, Pickles, Condiments, and More: 404 Client Error: Not Found for url: http://books.toscrape.com/foolproof-preserving-a-guide-to-small-batch-jams-jellies-pickles-condiments-and-more-a-foolproof-guide-to-making-small-batch-jams-jellies-pickles-condiments-and-more_978/index.html
Error fetching detail page for Chase Me (Paris Nights #2): 404 Client Error: Not Found for url: http://books.toscrape.com/chase-me-paris-nights-2_977/index.h