## Goodreads details

In [None]:
import requests
from bs4 import BeautifulSoup

In [None]:
BASE_URL = "https://www.goodreads.com/list/show/1.Best_Books_Ever?page={}"
BOOK_URL_PREFIX = "https://www.goodreads.com"

In [None]:
soup = BeautifulSoup(response.text, "html.parser")
soup

<!DOCTYPE html>

<html class="desktop withSiteHeaderTopFullImage">
<head>
<title>Best Books Ever (72651 books)</title>
<meta content="72,651 books based on 266885 votes: The Hunger Games by Suzanne Collins, Harry Potter and the Order of the Phoenix by J.K. Rowling, Pride and Prejudice b..." name="description"/>
<meta content="telephone=no" name="format-detection"/>
<link href="https://www.goodreads.com/list/show/1.Best_Books_Ever" rel="canonical"/>
<script type="text/javascript"> var ue_t0=window.ue_t0||+new Date();
 </script>
<script type="text/javascript">
    var ue_mid = "A1PQBFHBHS6YH1";
    var ue_sn = "www.goodreads.com";
    var ue_furl = "fls-na.amazon.com";
    var ue_sid = "666-6266187-9494708";
    var ue_id = "FJC6XRYJT9S868Q3E3V6";

    (function(e){var c=e;var a=c.ue||{};a.main_scope="mainscopecsm";a.q=[];a.t0=c.ue_t0||+new Date();a.d=g;function g(h){return +new Date()-(h?0:a.t0)}function d(h){return function(){a.q.push({n:h,a:arguments,t:a.d()})}}function b(m,l,h,j,i){v

## Complete goodreads_books_detailed.json

In [None]:
import json
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

In [None]:
REQUEST_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36"
}

JSON_FILE = "goodreads_books_details.json"

books_data = []
target = 5000  # จำนวนหนังสือที่ต้องการดึงข้อมูล
book_count = 0
page = 1

def get_book_details(book_url):
    """เข้าไปยังหน้าของหนังสือเพื่อดึง published_date, genres, ratings, reviews"""
    full_url = BOOK_URL_PREFIX + book_url
    response = requests.get(full_url, headers=REQUEST_HEADERS)

    if response.status_code != 200:
        return "N/A", [], "N/A", "N/A"

    soup = BeautifulSoup(response.text, "html.parser")

    # ดึง published_date
    published_tag = soup.find("p", {"data-testid": "publicationInfo"})
    published_date = published_tag.text.replace("First published ", "").strip() if published_tag else "N/A"

    # ดึง genres (จำกัด 5 genre แรก)
    genre_tags = soup.select("span.BookPageMetadataSection__genreButton a")
    genres = [genre.text.strip() for genre in genre_tags[:5]] if genre_tags else []

    # ดึง ratings และ reviews
    rating_stats = soup.find("div", class_="ReviewsSectionStatistics__ratingStatistics")
    if rating_stats:
        ratings_tag = rating_stats.find("span", {"data-testid": "ratingsCount"})
        reviews_tag = rating_stats.find("span", {"data-testid": "reviewsCount"})
        ratings = ratings_tag.text.strip() if ratings_tag else "N/A"
        reviews = reviews_tag.text.strip() if reviews_tag else "N/A"
    else:
        ratings, reviews = "N/A", "N/A"

    return published_date, genres, ratings, reviews

def process_book(book, book_count):
    title_tag = book.find("a", class_="bookTitle")
    title = title_tag.text.strip() if title_tag else "N/A"
    book_url = title_tag["href"] if title_tag else ""

    author_tag = book.find("a", class_="authorName")
    author = author_tag.text.strip() if author_tag else "N/A"

    rating_tag = book.find("span", class_="minirating")
    avg_rating = rating_tag.text.split(" — ")[0].strip() if rating_tag else "N/A"

    published_date, genres, ratings, reviews = get_book_details(book_url)

    return {
        "rank": book_count + 1,
        "title": title,
        "author": author,
        "avg_rating": avg_rating,
        "ratings": ratings,
        "reviews": reviews,
        "published_date": published_date,
        "genres": genres
    }

while book_count < target:
    url = BASE_URL.format(page)
    print(f"Scraping page {page}...")

    response = requests.get(url, headers=REQUEST_HEADERS)
    if response.status_code != 200:
        print(f"Failed to retrieve page {page}. Status Code: {response.status_code}")
        break

    soup = BeautifulSoup(response.text, "html.parser")
    books = soup.find_all("tr")

    with ThreadPoolExecutor(max_workers=5) as executor:
        future_to_book = {executor.submit(process_book, book, book_count + i): i for i, book in enumerate(books) if book_count + i < target}

        for future in as_completed(future_to_book):
            books_data.append(future.result())
            book_count += 1
            print(f"{book_count}. {future.result()['title']} - {future.result()['author']}")

    # ไปหน้าถัดไป
    page += 1

    # บันทึก JSON ทุกครั้งหลัง scrape แต่ละหน้า
    with open(JSON_FILE, "w", encoding="utf-8") as json_file:
        json.dump(books_data, json_file, ensure_ascii=False, indent=4)

    print(f"Sleeping for 5 seconds...\n")
    time.sleep(5)

print(f"\nScraping Completed! Total books scraped: {book_count}")

[1;30;43mเอาต์พุตของการสตรีมมีการตัดเหลือเพียง 5000 บรรทัดสุดท้าย[0m
149. American Gods - Neil Gaiman
150. Lonesome Dove (Lonesome Dove, #1) - Larry McMurtry
151. The Little House Collection (Little House, #1-9) - Laura Ingalls Wilder
152. Catching Fire (The Hunger Games #2) - Suzanne Collins
153. Good Omens: The Nice and Accurate Prophecies of Agnes Nutter, Witch - Terry Pratchett
154. Peter Pan - J.M. Barrie
155. The Battle of the Labyrinth (Percy Jackson and the Olympians, #4) - Rick Riordan
156. The Song of Achilles - Madeline Miller
157. Love in the Time of Cholera - Gabriel García Márquez
158. The Titan’s Curse (Percy Jackson and the Olympians, #3) - Rick Riordan
159. Gone Girl - Gillian Flynn
160. Insurgent (Divergent, #2) - Veronica Roth
161. The Secret History - Donna Tartt
162. The Addiction Manifesto - Jerry Weaver
163. A Walk to Remember - Nicholas Sparks
164. The Sea of Monsters (Percy Jackson and the Olympians, #2) - Rick Riordan
165. Clockwork Princess (The Infernal De

In [None]:
import pandas as pd
Book = pd.read_json('/content/goodreads_books_details.json')
print(type(Book))
Book

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,rank,title,author,avg_rating,ratings,reviews,published_date,genres
0,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling,4.50 avg rating,"3,612,786 ratings","71,341 reviews","June 21, 2003","[Fantasy, Young Adult, Fiction, Magic, Audiobook]"
1,4,To Kill a Mockingbird,Harper Lee,4.26 avg rating,"6,557,047 ratings","124,967 reviews","July 11, 1960","[Fiction, Historical Fiction, School, Literatu..."
2,1,The Hunger Games (The Hunger Games #1),Suzanne Collins,4.34 avg rating,"9,306,998 ratings","233,187 reviews","September 14, 2008","[Young Adult, Fiction, Fantasy, Science Fictio..."
3,5,The Book Thief,Markus Zusak,4.39 avg rating,"2,743,291 ratings","153,149 reviews","September 1, 2005","[Historical Fiction, Fiction, Young Adult, Cla..."
4,3,Pride and Prejudice,Jane Austen,4.29 avg rating,"4,513,002 ratings","129,806 reviews","January 28, 1813","[Fiction, Historical Fiction, Historical, Lite..."
5,9,The Chronicles of Narnia (The Chronicles of Na...,C.S. Lewis,4.28 avg rating,"685,086 ratings","13,154 reviews","January 1, 1956","[Fantasy, Classics, Fiction, Young Adult, Chil..."
6,6,"Twilight (The Twilight Saga, #1)",Stephenie Meyer,3.66 avg rating,"6,994,911 ratings","139,193 reviews","October 5, 2005","[Fantasy, Young Adult, Romance, Fiction, Vampi..."
7,8,J.R.R. Tolkien 4-Book Boxed Set: The Hobbit an...,J.R.R. Tolkien,4.61 avg rating,"139,574 ratings","2,378 reviews","January 1, 1954","[Fantasy, Fiction, Classics, Adventure, Scienc..."
8,10,The Fault in Our Stars,John Green,4.13 avg rating,"5,475,670 ratings","183,785 reviews","January 10, 2012","[Young Adult, Fiction, Contemporary, Realistic..."
9,11,The Picture of Dorian Gray,Oscar Wilde,4.13 avg rating,"1,716,313 ratings","84,001 reviews","June 1, 1890","[Classics, Fiction, Horror, Gothic, Fantasy]"


#### Sorted Object Rank

In [None]:
def sort_json_by_rank(input_file, output_file):
    try:
        with open(input_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"Error: Input file '{input_file}' not found.")
        return
    except json.JSONDecodeError:
        print(f"Error: Invalid JSON format in '{input_file}'.")
        return

    sorted_data = sorted(data, key=lambda x: x['rank'])

    try:
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(sorted_data, f, ensure_ascii=False, indent=4)
        print(f"Successfully sorted and saved to '{output_file}'.")
    except Exception as e:
        print(f"An error occurred while saving the file: {e}")

sort_json_by_rank('goodreads_books_details.json', 'sorted_goodreads_books_details.json')


Successfully sorted and saved to 'sorted_goodreads_books_details.json'.


In [None]:
rank = pd.read_json('sorted_goodreads_books_details.json')
print(type(rank))
rank

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,rank,title,author,avg_rating,ratings,reviews,published_date,genres
0,1,The Hunger Games (The Hunger Games #1),Suzanne Collins,4.34 avg rating,"9,306,998 ratings","233,187 reviews","September 14, 2008","[Young Adult, Fiction, Fantasy, Science Fictio..."
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling,4.50 avg rating,"3,612,786 ratings","71,341 reviews","June 21, 2003","[Fantasy, Young Adult, Fiction, Magic, Audiobook]"
2,3,Pride and Prejudice,Jane Austen,4.29 avg rating,"4,513,002 ratings","129,806 reviews","January 28, 1813","[Fiction, Historical Fiction, Historical, Lite..."
3,4,To Kill a Mockingbird,Harper Lee,4.26 avg rating,"6,557,047 ratings","124,967 reviews","July 11, 1960","[Fiction, Historical Fiction, School, Literatu..."
4,5,The Book Thief,Markus Zusak,4.39 avg rating,"2,743,291 ratings","153,149 reviews","September 1, 2005","[Historical Fiction, Fiction, Young Adult, Cla..."
5,6,"Twilight (The Twilight Saga, #1)",Stephenie Meyer,3.66 avg rating,"6,994,911 ratings","139,193 reviews","October 5, 2005","[Fantasy, Young Adult, Romance, Fiction, Vampi..."
6,7,Animal Farm,George Orwell,really liked it 4.00 avg rating,"4,221,245 ratings","113,267 reviews","August 17, 1945","[Fiction, Dystopia, Fantasy, School, Literature]"
7,8,J.R.R. Tolkien 4-Book Boxed Set: The Hobbit an...,J.R.R. Tolkien,4.61 avg rating,"139,574 ratings","2,378 reviews","January 1, 1954","[Fantasy, Fiction, Classics, Adventure, Scienc..."
8,9,The Chronicles of Narnia (The Chronicles of Na...,C.S. Lewis,4.28 avg rating,"685,086 ratings","13,154 reviews","January 1, 1956","[Fantasy, Classics, Fiction, Young Adult, Chil..."
9,10,The Fault in Our Stars,John Green,4.13 avg rating,"5,475,670 ratings","183,785 reviews","January 10, 2012","[Young Adult, Fiction, Contemporary, Realistic..."


In [2]:
!git config --global user.name "wittgenstein-byte"
!git config --global user.email "nrvsrpl@gmail.com"

In [6]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [11]:
!ls


drive  sample_data


In [12]:
!cd /content/drive/MyDrive/

In [14]:
!find /content/drive/MyDrive -name "*.ipynb"


/content/drive/MyDrive/CS11 Project Data Engineering/Code/Code หาข้อมูล/goodreads_books_metadata.ipynb
/content/drive/MyDrive/CS11 Project Data Engineering/Code/Code หาข้อมูล/movies_metadata.ipynb
/content/drive/MyDrive/CS11 Project Data Engineering/Code/Code หาข้อมูล/all_books_into_films.ipynb
/content/drive/MyDrive/CS11 Project Data Engineering/Code/Code วิเคราะห์ข้อมูล/Matched.ipynb
/content/drive/MyDrive/CS11 Project Data Engineering/Code/Code วิเคราะห์ข้อมูล/Ranking_movie.ipynb
/content/drive/MyDrive/CS11 Project Data Engineering/Code/Code วิเคราะห์ข้อมูล/Analyst_Quarter.ipynb
/content/drive/MyDrive/Classroom/[CS01] Data Engineering 2024 LAB: Fri 15:00-17:00/LAB02_66338214-4.ipynb
/content/drive/MyDrive/Classroom/[CS01] Data Engineering 2024 LAB: Fri 15:00-17:00/LAB04_663380214_4.ipynb
/content/drive/MyDrive/Classroom/[CS01] Data Engineering 2024 LAB: Fri 15:00-17:00/Lab03_663380214_4.ipynb
/content/drive/MyDrive/Classroom/[CS01] Data Engineering 2024 LAB: Fri 15:00-17:00/66338021

In [21]:
%cd /content/drive/MyDrive/CS11\ Project\ Data\ Engineering/Code/Code\ หาข้อมูล
!ls

/content/drive/MyDrive/CS11 Project Data Engineering/Code/Code หาข้อมูล
all_books_into_films.ipynb	movies_metadata.ipynb
goodreads_books_metadata.ipynb


In [23]:
!git init
!git add goodreads_books_metadata.ipynb

[33mhint: Using 'master' as the name for the initial branch. This default branch name[m
[33mhint: is subject to change. To configure the initial branch name to use in all[m
[33mhint: [m
[33mhint: 	git config --global init.defaultBranch <name>[m
[33mhint: [m
[33mhint: Names commonly chosen instead of 'master' are 'main', 'trunk' and[m
[33mhint: 'development'. The just-created branch can be renamed via this command:[m
[33mhint: [m
[33mhint: 	git branch -m <name>[m
Initialized empty Git repository in /content/drive/MyDrive/CS11 Project Data Engineering/Code/Code หาข้อมูล/.git/


In [24]:
!git status


On branch master

No commits yet

Changes to be committed:
  (use "git rm --cached <file>..." to unstage)
	[32mnew file:   goodreads_books_metadata.ipynb[m

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	[31mall_books_into_films.ipynb[m
	[31mmovies_metadata.ipynb[m



In [25]:
!git commit -m "goodreads_books_metadata.ipynb"
!git branch -M main
!git remote add origin https://github.com/wittgenstein-byte/Scrap_Books-film_metadata.git
!git push -u origin main

[master (root-commit) f14236d] goodreads_books_metadata.ipynb
 1 file changed, 1 insertion(+)
 create mode 100644 goodreads_books_metadata.ipynb
fatal: could not read Username for 'https://github.com': No such device or address


In [27]:
!git remote add origin git@github.com:wittgenstein-byte/Scrap_Books-film_metadata.git
!git branch -M main
!git push -u origin main

error: remote origin already exists.
fatal: could not read Username for 'https://github.com': No such device or address


In [28]:
!git remote -v
!git remote remove origin
!git remote add origin git@github.com:wittgenstein-byte/Scrap_Books-film_metadata.git
!git remote -v


origin	https://github.com/wittgenstein-byte/Scrap_Books-film_metadata.git (fetch)
origin	https://github.com/wittgenstein-byte/Scrap_Books-film_metadata.git (push)
origin	git@github.com:wittgenstein-byte/Scrap_Books-film_metadata.git (fetch)
origin	git@github.com:wittgenstein-byte/Scrap_Books-film_metadata.git (push)
