# Web Scraping

Here is the website that I plan to scrap: http://books.toscrape.com/index.html
It is a website designed for web scrapping practice - though I'm not sure if that means it will be any easier.
My initial goal is to come up with a list of books and prices and look into book price averages.

# Install Libraries

In [None]:
#pip install selenium

In [1]:
# Selenium for webdriver window
from selenium import webdriver

In [2]:
# Beautiful Soup for web scrapping
from bs4 import BeautifulSoup

In [3]:
# For finalized dataframe
import pandas as pd

In [None]:
# Used along with webdriver to limit speed of search
import time

# Website and Driver

In [4]:
# Set the webpage to scrape
books_url = 'http://books.toscrape.com/index.html'

In [5]:
# Initialize the webcriver with Chrome
driver = webdriver.Chrome('/Users/whipxach/Downloads/chromedriver')

In [6]:
# Set the driver to open the webpage
driver.get(books_url)

# Beautiful Soup and Page Source

In [None]:
# Page source html
my_html = driver.page_source

In [None]:
# bs4 scrapes the page html
soup = BeautifulSoup(my_html, 'html.parser')

# Book Prices

At this point you need to look at the open window of the webpage and right-click on 'inspect' button. Then click on the arrown in the square icon (Command-Shift-C). Now highlight the area in the webpage that you wish to focus on. Under 'Elements' in the Inspect window look at the html so you can use it with the .find_all method!

In [None]:
# Look for the price under ALL div classes that say "product_price"
prices = soup.find_all('div', class_ = "product_price")

In [None]:
# View the first one
prices[0]

In [None]:
# Go further with .find() and .text to get just the price info.
prices[0].find('p', class_ = "price_color" ).text

In [None]:
# Get rid of pound symbol and convert to float
# float(price_list[0].strip('£'))

In [None]:
# Put all these prices in a list
price_list = []
for price in prices:
    amount = price.find(class_ = 'price_color').text
    price_list.append(float(amount.strip('£')))
price_list

In [None]:
# Alternatively, I could have done it this way.
# amounts = soup.find_all('p', class_ = "price_color")
# amounts[0].text
# for amount in amounts:
#    print(amount.text)

# Star Rating

In [None]:
# Trying to figure out number of stars
stars = soup.find_all('p', class_ = "star-rating")
stars

In [None]:
# How can I extract the 'class'
stars[1]

In [None]:
# All the icons are the same - it's the CSS that is different
stars[0]

In [None]:
# What is this thing?
type(stars[0])

In [None]:
# THIS TOOK ME FOREVER!
stars[0].get('class')[1]

In [None]:
len(stars)

In [None]:
# Put the ratings into a list
star_list = []
for i in range(len(stars)):
    star_list.append(stars[i].get('class')[1])
star_list

In [None]:
# Create a dictionary of ratings
ratings = {'One':1, 'Two':2, 'Three':3, 'Four':4, 'Five':5}

In [None]:
# Convert list of strings to integers
new_star_list = []
for i in star_list:
        new_star_list.append(ratings[i])
new_star_list

In [None]:
# Another way to do it with list comprehension
#another_star_list = [ratings[ele] for ele in star_list]
#another_star_list

# Book Titles

In [None]:
# Trying to figure out the titles
titles = soup.find_all('h3')
titles

In [None]:
# First title?
titles[0]

In [None]:
# The text won't work, since it's not complete!
for t in titles:
    print(t.text)

In [None]:
# Another good example
titles[4]

In [None]:
# Using what I learned from the star rating
for t in titles:
    print(t.find('a').get('title'))

In [None]:
# Put titles in a list
title_list = []
for t in titles:
    title_list.append(t.find('a').get('title'))
title_list

# Dataframe

In [None]:
import pandas as pd

In [None]:
# Turn our three lists into a dataframe
df = pd.DataFrame(list(zip(title_list, price_list, new_star_list)), 
               columns =['Title', 'Price (£)', 'Number of Stars'])

In [None]:
# Check it out
df.head()

In [None]:
# Check out my columns
df.info()

In [None]:
# Look at the stats
df.describe()

In [None]:
# Group by rating
df.groupby('Number of Stars').mean()

# Loop through all pages

In [None]:
# This is the 'next' button tag.
#<a href="page-3.html">next</a>

In [None]:
# This is the XPath for the button
//*[@id="default"]/div/div/div/div/section/div[2]/div/ul/li[3]/a

In [None]:
# Create the driver for the button
next_button = driver.find_element_by_xpath('//*[@id="default"]/div/div/div/div/section/div[2]/div/ul/li[3]/a')

In [None]:
# Add the click
next_button.click()

In [None]:
1. get three lists
2. click the 'next' button
3. repeat

# Create a function

In [None]:
price_list = []

def get_info():
    # Look for the price under ALL div classes that say "product_price"
    prices = soup.find_all('div', class_ = "product_price")
    # Go further with .find() and .text to get just the price info.
    prices[0].find('p', class_ = "price_color" ).text
    # Put all these prices in a list
    for price in prices:
        amount = price.find(class_ = 'price_color').text
        price_list.append(float(amount.strip('£')))
    
    # Create the driver for the button
    next_button = driver.find_element_by_xpath('/html/body/div/div/div/div/section/div[2]/div/ul/li[3]/a')
    # Add the click
    next_button.click()
    

In [None]:
price_list = []

In [None]:
# Look for the price under ALL div classes that say "product_price"
prices = soup.find_all('div', class_ = "product_price")
# Go further with .find() and .text to get just the price info.
prices[0].find('p', class_ = "price_color" ).text
# Put all these prices in a list
for price in prices:
    amount = price.find(class_ = 'price_color').text
    price_list.append(float(amount.strip('£')))

In [None]:
price_list

In [None]:
# Create the driver for the button
next_button = driver.find_element_by_xpath('//*[@id="default"]/div/div/div/div/section/div[2]/div/ul/li[3]/a')


In [None]:
# Add the click
next_button.click()

In [None]:
# Page source html
my_html = driver.page_source

In [None]:
# bs4 scrapes the page html
soup = BeautifulSoup(my_html, 'html.parser')

# Trying to get it to loop!

In [None]:
price_list = []
star_list = []
title_list = []
# Dictionary of ratings
ratings = {'One':1, 'Two':2, 'Three':3, 'Four':4, 'Five':5}

In [None]:
# Page source html
my_html = driver.page_source
# bs4 scrapes the page html
soup = BeautifulSoup(my_html, 'html.parser')

In [None]:
# All together now...

# Price
prices = soup.find_all('div', class_ = "product_price")
# Go further
prices[0].find('p', class_ = "price_color" ).text
# Put the prices in a list
for price in prices:
    amount = price.find(class_ = 'price_color').text
    # Converts to a float
    price_list.append(float(amount.strip('£')))

# Number of stars
stars = soup.find_all('p', class_ = "star-rating")    
# Put the ratings into a list
for i in range(len(stars)):
    str_rate = stars[i].get('class')[1]
    # Turns it to an integer from the dictionary
    star_list.append(ratings[str_rate])
    
# Titles
titles = soup.find_all('h3')
# Put titles in a list
for t in titles:
    title_list.append(t.find('a').get('title'))

In [None]:
# First time

In [None]:
# Button for the driver #1 (it is different somehow)
next_button = driver.find_element_by_xpath('//*[@id="default"]/div/div/div/div/section/div[2]/div/ul/li[2]/a')
# Click
next_button.click()

In [None]:
# ***Return to this...

In [None]:
# Page source html
my_html = driver.page_source
# bs4 scrapes the page html
soup = BeautifulSoup(my_html, 'html.parser')

In [None]:
# All together now...

# Price
prices = soup.find_all('div', class_ = "product_price")
# Go further
prices[0].find('p', class_ = "price_color" ).text
# Put the prices in a list
for price in prices:
    amount = price.find(class_ = 'price_color').text
    price_list.append(float(amount.strip('£')))

# Number of stars
stars = soup.find_all('p', class_ = "star-rating")    
# Put the ratings into a list
for i in range(len(stars)):
    str_rate = stars[i].get('class')[1]
    star_list.append(ratings[str_rate])
    
# Titles
titles = soup.find_all('h3')
# Put titles in a list
for t in titles:
    title_list.append(t.find('a').get('title'))

In [None]:
#Second time, third time, etc.

In [None]:
# Driver for the button #2 and on... (Seems the same after this)
next_button = driver.find_element_by_xpath('//*[@id="default"]/div/div/div/div/section/div[2]/div/ul/li[3]/a')
# Click
next_button.click()

In [None]:
# Then return to ***

# The Function

In [7]:
# Start with the three lists and the ratings dictionary
price_list = []
star_list = []
title_list = []
# Dictionary of ratings
ratings = {'One':1, 'Two':2, 'Three':3, 'Four':4, 'Five':5}

In [8]:
# This function will start the beautiful soup and populate the three lists.

def get_html():
    # Page source html
    my_html = driver.page_source
    # bs4 scrapes the page html
    soup = BeautifulSoup(my_html, 'html.parser')

    # Price
    prices = soup.find_all('div', class_ = "product_price")
    # Go further
    prices[0].find('p', class_ = "price_color" ).text
    # Put the prices in a list
    for price in prices:
        amount = price.find(class_ = 'price_color').text
        # Converts to a float
        price_list.append(float(amount.strip('£')))

    # Number of stars
    stars = soup.find_all('p', class_ = "star-rating")    
    # Put the ratings into a list
    for i in range(len(stars)):
        str_rate = stars[i].get('class')[1]
        # Turns it to an integer from the dictionary
        star_list.append(ratings[str_rate])

    # Titles
    titles = soup.find_all('h3')
    # Put titles in a list
    for t in titles:
        title_list.append(t.find('a').get('title'))
        
    # Page number    
    page_num = soup.find('li', class_ = 'current').text
        
    return price_list, star_list, title_list, page_num

In [9]:
# This function is used for the first click (next page)
def button_1():
    # Button for the driver #1 (it is different somehow)
    next_button = driver.find_element_by_xpath('//*[@id="default"]/div/div/div/div/section/div[2]/div/ul/li[2]/a')
    # Click
    next_button.click()

In [10]:
# This function is used for all remaining clicks (next page)
def button_etc():
    # Driver for the button #2 and on... (Seems the same after this)
    next_button = driver.find_element_by_xpath('//*[@id="default"]/div/div/div/div/section/div[2]/div/ul/li[3]/a')
    # Click
    next_button.click()

In [None]:
# Trying to figure out each page number
page = soup.find('li', class_ = 'current').text
page

In [None]:
# Length of text
len(page)

In [None]:
# Where is the total number of pages located?
int(page[40:43])

In [None]:
# Where is the page I'm on located?
int(page[35:37])

In [None]:
# Also tried using the number of books
soup.find_all('strong')

# Run it!

In [11]:
# Run the first time
get_html()

([51.77,
  53.74,
  50.1,
  47.82,
  54.23,
  22.65,
  33.34,
  17.93,
  22.6,
  52.15,
  13.99,
  20.66,
  17.46,
  52.29,
  35.02,
  57.25,
  23.88,
  37.59,
  51.33,
  45.17],
 [3, 1, 1, 4, 5, 1, 4, 3, 4, 1, 2, 4, 5, 5, 5, 3, 1, 1, 2, 2],
 ['A Light in the Attic',
  'Tipping the Velvet',
  'Soumission',
  'Sharp Objects',
  'Sapiens: A Brief History of Humankind',
  'The Requiem Red',
  'The Dirty Little Secrets of Getting Your Dream Job',
  'The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull',
  'The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics',
  'The Black Maria',
  'Starving Hearts (Triangular Trade Trilogy, #1)',
  "Shakespeare's Sonnets",
  'Set Me Free',
  "Scott Pilgrim's Precious Little Life (Scott Pilgrim #1)",
  'Rip it Up and Start Again',
  'Our Band Could Be Your Life: Scenes from the American Indie Underground, 1981-1991',
  'Olio',
  'Mesaerion: The Best Science Fiction Stories 1800-1

In [12]:
# Run button 1 
button_1()

In [17]:
# While loop for the next 49 pages
i=1
while i <= 49:
    get_html()
    button_etc()
    time.sleep(1)
    i += 1
    if i == 50:
        break

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48


NoSuchElementException: Message: no such element: Unable to locate element: {"method":"xpath","selector":"//*[@id="default"]/div/div/div/div/section/div[2]/div/ul/li[3]/a"}
  (Session info: chrome=87.0.4280.141)


In [None]:
# I can't get page_num out of the function!
while int(page_num[35:37]) <= 49:
    print(int(page_num[35:37]))
    get_html()
    button_etc()
    time.sleep(1)
    if int(page[35:37]) == 50:
        break

# When done with 50 pages...

In [18]:
# Turn our three lists into a dataframe
df = pd.DataFrame(list(zip(title_list, price_list, star_list)), 
               columns =['Title', 'Price (£)', 'Number of Stars'])
df.head()

Unnamed: 0,Title,Price (£),Number of Stars
0,A Light in the Attic,51.77,3
1,Tipping the Velvet,53.74,1
2,Soumission,50.1,1
3,Sharp Objects,47.82,4
4,Sapiens: A Brief History of Humankind,54.23,5


In [19]:
# Check out my results
df.describe()

Unnamed: 0,Price (£),Number of Stars
count,1000.0,1000.0
mean,35.07035,2.923
std,14.44669,1.434967
min,10.0,1.0
25%,22.1075,2.0
50%,35.98,3.0
75%,47.4575,4.0
max,59.99,5.0


In [20]:
# Group by rating
df.groupby('Number of Stars').mean()

Unnamed: 0_level_0,Price (£)
Number of Stars,Unnamed: 1_level_1
1,34.561195
2,34.810918
3,34.69202
4,36.093296
5,35.37449


# Conclusions

Looks like four star books are the most expensive (by a little) and then five star books - but there really is no correlation between book price and number of stars (according to this website).