# Preprocessing:
This notebook scrapes book-cover image URLs from the Goodreads Choice Awards pages (one GET request per page) and splits them into training and testing sets, each saved as a CSV file. Every image URL is paired with its book genre, which is stored as a column in the CSV files.

In [None]:
# import statements
import requests
from bs4 import BeautifulSoup
import pandas as pd
from IPython.display import display, HTML

In [None]:
# links to pull image urls from and their corresponding genre
# Maps each genre label to the list of Goodreads Choice Awards category pages
# to scrape. The key is stored as the 'Genre' value for every cover image
# found on that genre's pages.
# The split loop further down sends any page whose URL contains "2024" to the
# test set and every other page to the training set (Humor is the exception,
# see the note on that entry).
genre_links = {
    "Fantasy": [
        "https://www.goodreads.com/choiceawards/readers-favorite-fantasy-books-2024",
        "https://www.goodreads.com/choiceawards/best-fantasy-books-2023",
        "https://www.goodreads.com/choiceawards/best-fantasy-books-2022",
        "https://www.goodreads.com/choiceawards/best-fantasy-books-2021",
        "https://www.goodreads.com/choiceawards/best-fantasy-books-2020",
        "https://www.goodreads.com/choiceawards/best-fantasy-books-2019"
    ],
    "Fiction": [
        "https://www.goodreads.com/choiceawards/readers-favorite-fiction-books-2024",
        "https://www.goodreads.com/choiceawards/best-fiction-books-2023",
        "https://www.goodreads.com/choiceawards/best-fiction-books-2022",
        "https://www.goodreads.com/choiceawards/best-fiction-books-2021",
        "https://www.goodreads.com/choiceawards/best-fiction-books-2020",
        "https://www.goodreads.com/choiceawards/best-fiction-books-2019"
    ],
    "Historical Fiction": [
        "https://www.goodreads.com/choiceawards/readers-favorite-historical-fiction-books-2024",
        "https://www.goodreads.com/choiceawards/best-historical-fiction-books-2023",
        "https://www.goodreads.com/choiceawards/best-historical-fiction-books-2022",
        "https://www.goodreads.com/choiceawards/best-historical-fiction-books-2021",
        "https://www.goodreads.com/choiceawards/best-historical-fiction-books-2020",
        "https://www.goodreads.com/choiceawards/best-historical-fiction-books-2019"
    ],
    "Mystery & Thriller": [
        "https://www.goodreads.com/choiceawards/readers-favorite-mystery-thriller-books-2024",
        "https://www.goodreads.com/choiceawards/best-mystery-thriller-books-2023",
        "https://www.goodreads.com/choiceawards/best-mystery-thriller-books-2022",
        "https://www.goodreads.com/choiceawards/best-mystery-thriller-books-2021",
        "https://www.goodreads.com/choiceawards/best-mystery-thriller-books-2020",
        "https://www.goodreads.com/choiceawards/best-mystery-thriller-books-2019"
    ],
    "Romance": [
        "https://www.goodreads.com/choiceawards/readers-favorite-romance-books-2024",
        "https://www.goodreads.com/choiceawards/best-romance-books-2023",
        "https://www.goodreads.com/choiceawards/best-romance-books-2022",
        "https://www.goodreads.com/choiceawards/best-romance-books-2021",
        "https://www.goodreads.com/choiceawards/best-romance-books-2020",
        "https://www.goodreads.com/choiceawards/best-romance-books-2019"
    ],
    "Science Fiction": [
        "https://www.goodreads.com/choiceawards/readers-favorite-science-fiction-books-2024",
        "https://www.goodreads.com/choiceawards/best-science-fiction-books-2023",
        "https://www.goodreads.com/choiceawards/best-science-fiction-books-2022",
        "https://www.goodreads.com/choiceawards/best-science-fiction-books-2021",
        "https://www.goodreads.com/choiceawards/best-science-fiction-books-2020",
        "https://www.goodreads.com/choiceawards/best-science-fiction-books-2019"
    ],
    "Horror": [
        "https://www.goodreads.com/choiceawards/readers-favorite-horror-books-2024",
        "https://www.goodreads.com/choiceawards/best-horror-books-2023",
        "https://www.goodreads.com/choiceawards/best-horror-books-2022",
        "https://www.goodreads.com/choiceawards/best-horror-books-2021",
        "https://www.goodreads.com/choiceawards/best-horror-books-2020",
        "https://www.goodreads.com/choiceawards/best-horror-books-2019"
    ],
    "Young Adult Fantasy": [
        "https://www.goodreads.com/choiceawards/readers-favorite-ya-fantasy-books-2024",
        "https://www.goodreads.com/choiceawards/best-young-adult-fantasy-books-2023",
        "https://www.goodreads.com/choiceawards/best-young-adult-fantasy-books-2022",
        "https://www.goodreads.com/choiceawards/best-young-adult-fantasy-books-2021",
        "https://www.goodreads.com/choiceawards/best-young-adult-fantasy-books-2020",
        "https://www.goodreads.com/choiceawards/best-young-adult-fantasy-books-2019"
    ],
    "Young Adult Fiction": [
        "https://www.goodreads.com/choiceawards/readers-favorite-ya-fiction-books-2024",
        "https://www.goodreads.com/choiceawards/best-young-adult-fiction-books-2023",
        "https://www.goodreads.com/choiceawards/best-young-adult-fiction-books-2022",
        "https://www.goodreads.com/choiceawards/best-young-adult-fiction-books-2021",
        "https://www.goodreads.com/choiceawards/best-young-adult-fiction-books-2020",
        "https://www.goodreads.com/choiceawards/best-young-adult-fiction-books-2019"
    ],
    # No 2024 page is listed for Humor, so this genre has five links instead
    # of six; the split loop uses its 2023 page as the test data instead.
    "Humor": [

        "https://www.goodreads.com/choiceawards/best-humor-books-2023",
        "https://www.goodreads.com/choiceawards/best-humor-books-2022",
        "https://www.goodreads.com/choiceawards/best-humor-books-2021",
        "https://www.goodreads.com/choiceawards/best-humor-books-2020",
        "https://www.goodreads.com/choiceawards/best-humor-books-2019"
    ],
    "History & Biography": [
        "https://www.goodreads.com/choiceawards/readers-favorite-history-bio-books-2024",
        "https://www.goodreads.com/choiceawards/best-history-biography-books-2023",
        "https://www.goodreads.com/choiceawards/best-history-biography-books-2022",
        "https://www.goodreads.com/choiceawards/best-history-biography-books-2021",
        "https://www.goodreads.com/choiceawards/best-history-biography-books-2020",
        "https://www.goodreads.com/choiceawards/best-history-biography-books-2019"
    ],
    "Memoir and Autobiography": [
        "https://www.goodreads.com/choiceawards/readers-favorite-memoir-books-2024",
        "https://www.goodreads.com/choiceawards/best-memoir-autobiography-books-2023",
        "https://www.goodreads.com/choiceawards/best-memoir-autobiography-books-2022",
        "https://www.goodreads.com/choiceawards/best-memoir-autobiography-books-2021",
        "https://www.goodreads.com/choiceawards/best-memoir-autobiography-books-2020",
        "https://www.goodreads.com/choiceawards/best-memoir-autobiography-books-2019"
    ],
    "Nonfiction": [
        "https://www.goodreads.com/choiceawards/readers-favorite-nonfiction-books-2024",
        "https://www.goodreads.com/choiceawards/best-nonfiction-books-2023",
        "https://www.goodreads.com/choiceawards/best-nonfiction-books-2022",
        "https://www.goodreads.com/choiceawards/best-nonfiction-books-2021",
        "https://www.goodreads.com/choiceawards/best-nonfiction-books-2020",
        "https://www.goodreads.com/choiceawards/best-nonfiction-books-2019"
    ]
}

In [None]:
# function to pull data from website url using get request
#https://www.geeksforgeeks.org/image-scraping-with-python/
def getdata(url, timeout=30):
    """Download a page with a GET request and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait on the server before requests gives up and
            raises a timeout error. Without a timeout a single stalled
            connection would block the whole scrape indefinitely.

    Returns:
        The response body as text. The body is returned even when the server
        answers with an error status, so callers stay best-effort; a warning
        is emitted in that case so the failed page does not go unnoticed.
    """
    # imported here so this cell does not depend on an extra top-level import
    import warnings

    # Send a browser-like User-Agent; see the linked question about pages that
    # return 404 to python requests but load fine in a browser.
    #https://stackoverflow.com/questions/67706387/page-404-through-python-requests-but-loads-fine-through-browser
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36"
    }
    r = requests.get(url, headers=headers, timeout=timeout)
    if not r.ok:
        # an error page parses fine but would contribute no book covers
        warnings.warn(f"GET {url} returned HTTP {r.status_code}")
    return r.text

In [None]:
# separate into testing/training data
train_data = []
test_data = []

# visit every award page of every genre in the dictionary
for genre, urls in genre_links.items():
    for url in urls:
        # pages from 2024 (or 2023 for Humor) feed the test set, the rest train
        is_test_page = "2024" in url or (genre == "Humor" and "2023" in url)
        bucket = test_data if is_test_page else train_data

        # download the page and parse its HTML
        htmldata = getdata(url)
        soup = BeautifulSoup(htmldata, 'html.parser')

        # every <img> tag that carries an alt attribute
        img_tags = soup.find_all('img', alt=True)
        # image URLs already recorded for this page
        seen_images = set()

        for img in img_tags:
            # keep only book covers: the tag needs a src, and an alt text
            # containing 'by' is used as the marker for a cover
            if 'src' not in img.attrs or 'by' not in img['alt']:
                continue

            img_url = img['src']
            # skip images that were already recorded for this page
            if img_url in seen_images:
                continue

            seen_images.add(img_url)
            bucket.append((genre, img_url))

# transform and save as csv
columns = ['Genre', 'Book Cover Image URL']
df1 = pd.DataFrame(train_data, columns=columns)
df2 = pd.DataFrame(test_data, columns=columns)

df1.to_csv('train_book_covers.csv', index=False)
df2.to_csv('test_book_covers.csv', index=False)
