# Scraping Comic Book Covers

**Goal**: Scrape comic book covers so that we can use them as visual touchstones for users in the app.


### Libraries

In [1]:
import psycopg2 as psql  # PostgreSQL DBs
from sqlalchemy import create_engine  # SQL helper
import pandas as pd
import requests
import random
import time
import os
import sys
# Selenium
from selenium.webdriver import Firefox
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.options import Options
options = Options()
options.headless = False

# Data storage

In [2]:
sys.path.append("..")

In [3]:
# Custom
import data_fcns as dfc
import keys as keys  # Custom keys lib
import comic_scraper as cs

### Initialize Browser

driver_exe_path = os.path.join(
    os.getcwd(), 'drivers', 'geckodriver-windows.exe')

In [4]:
driver_exe_path = os.path.join(
    os.getcwd(), 'drivers', 'geckodriver')

In [5]:
driver_exe_path

'/Users/werlindo/Dropbox/flatiron/capstone/comics_rx/comrx/dev/drivers/geckodriver'

ls drivers/

In [6]:
# Launch Firefox via the geckodriver binary at driver_exe_path and open the
# comicbookdb.com home page. The search helpers in this notebook look up the
# site's search form on whatever page the browser is currently showing.
browser = Firefox(options=options, executable_path=driver_exe_path)
url = "http://www.comicbookdb.com/"
browser.get(url)

### Make list of Titles!

Get the list of titles whose covers we want to scrape.

In [7]:
# Build the path to the AWS Postgres credentials file kept in the user's
# home directory. os.path.expanduser('~') resolves the home directory on
# every platform, whereas os.environ['HOME'] raises KeyError when HOME is
# not set (e.g. on Windows).
secret_path_aws = os.path.join(os.path.expanduser('~'), '.secret',
                               'aws_ps_flatiron.json')
secret_path_aws

'/Users/werlindo/.secret/aws_ps_flatiron.json'

In [8]:
# Read the database credentials from the secrets file.
aws_keys = keys.get_keys(secret_path_aws)
user, ps, host, db = (aws_keys['user'], aws_keys['password'],
                      aws_keys['host'], aws_keys['db_name'])

# Connection URL string of the form postgresql://user:password@host/db
aws_ps_engine = ''.join(['postgresql://', user, ':', ps, '@', host, '/', db])

# Open the psycopg2 connection to the same database.
conn = psql.connect(host=host, port='5432', database=db,
                    user=user, password=ps)

In [9]:
# Instantiate cursor
cur = conn.cursor()

In [10]:
# Select every row and column of the comic_trans table.
query = """
    SELECT * from comic_trans;
"""

In [11]:
# Execute the query
cur.execute(query)

In [12]:
# Load the query results into a DataFrame, naming the columns from the
# cursor description. Passing the names to the constructor (rather than
# assigning .columns afterwards) also works when the query returns no rows;
# assigning them to an empty, column-less frame raises a length mismatch.
temp_df = pd.DataFrame(cur.fetchall(),
                       columns=[col.name for col in cur.description])

In [13]:
temp_df.head(3)

Unnamed: 0,index,publisher,item_id,title_and_num,qty_sold,date_sold,account_num,comic_title
0,2,Amaze Ink Slave Labor Graphics,DCD151935,Filler Bunny #2,1,2011-08-14 18:01:03,174,Filler Bunny (SLG)
1,3,Amaze Ink Slave Labor Graphics,DCD341726,Gargoyles #6,1,2012-06-22 14:11:37,593,Gargoyles (SLG)
2,4,Amaze Ink Slave Labor Graphics,DCD416182,Royal Historian of Oz #1,1,2010-07-21 14:03:07,226,Royal Historian of Oz (SLG)


In [14]:
# Derive the series title by passing title_and_num through
# dfc.cut_issue_num (e.g. 'Filler Bunny #2' -> 'Filler Bunny', as the
# displayed output shows).
temp_df['title'] = (temp_df['title_and_num'].apply(dfc.cut_issue_num))

In [15]:
temp_df.head()

Unnamed: 0,index,publisher,item_id,title_and_num,qty_sold,date_sold,account_num,comic_title,title
0,2,Amaze Ink Slave Labor Graphics,DCD151935,Filler Bunny #2,1,2011-08-14 18:01:03,174,Filler Bunny (SLG),Filler Bunny
1,3,Amaze Ink Slave Labor Graphics,DCD341726,Gargoyles #6,1,2012-06-22 14:11:37,593,Gargoyles (SLG),Gargoyles
2,4,Amaze Ink Slave Labor Graphics,DCD416182,Royal Historian of Oz #1,1,2010-07-21 14:03:07,226,Royal Historian of Oz (SLG),Royal Historian of Oz
3,5,Amaze Ink Slave Labor Graphics,DCD416182,Royal Historian of Oz #1,1,2010-07-14 19:49:40,399,Royal Historian of Oz (SLG),Royal Historian of Oz
4,6,Amaze Ink Slave Labor Graphics,DCD416182,Royal Historian of Oz #1,1,2010-07-19 10:39:04,237,Royal Historian of Oz (SLG),Royal Historian of Oz


In [16]:
# Normalise punctuation in the titles: '&' becomes 'and', '?' is dropped and
# '/' becomes a space (presumably so the titles are safe to use as search
# text and file names -- confirm). regex=False makes each replacement a
# literal one; '?' on its own is not a valid regular expression. The .str
# accessor passes missing values through instead of raising on them.
temp_df['title'] = (temp_df['title']
                    .str.replace('&', 'and', regex=False)
                    .str.replace('?', '', regex=False)
                    .str.replace('/', ' ', regex=False)
                    )

### We need to track the titles that need scraping.

In [17]:
titles = list(temp_df['title'].unique())

In [18]:
titles_test = titles[:2]

In [19]:
titles_test

['Filler Bunny', 'Gargoyles']

In [21]:
cs.scrape_series_covers(browser, titles_test)

Scraped 0.Filler Bunny!
Scraped 1.Gargoyles!
Total Runtime: 25.25 seconds


In [None]:
titles_test

In [None]:
test_title = 'Vampironica'

In [None]:
search_title(browser, test_title)

In [None]:
click_first_link(browser, test_title, True)

In [None]:
go_cover_gallery(browser)

In [None]:
click_first_image(browser)

In [None]:
click_cover_image(browser)

In [None]:
    """
Find the cover image and click it!"""
cover_img_path = ('/html/body/table/tbody/tr[2]/td[3]/table/tbody/tr/' + 
                  'td/table[1]/tbody/tr[1]/td[1]/a[1]/img')

In [None]:
cover_img = browser.find_element_by_xpath(cover_img_path)

In [None]:
cover_img.click()

In [None]:
#    cover_img.click()
url = cover_img.get_attribute('src')

In [None]:
cover_img.get_attribute

In [None]:
print(url)

In [None]:
cover_box_path = '/html/body/table/tbody/tr[2]/td[3]/table/tbody/tr/td/table[1]/tbody/tr[1]/td[1]/a[1]'

cover_box = browser.find_element_by_xpath(cover_box_path)

In [None]:
url = cover_box.get_attribute('href')

In [None]:
save_large_image(browser, test_title)

### Update the code to scrape the large images.

In [None]:
def scrape_series_covers(browser, titles):
    """
    Use Selenium to scrape the large cover image for each comic book title.

    For every title: run a title search, open the first search result, open
    its cover gallery, click through to the cover and save the large image.
    A title whose pages lack any of the expected elements is reported and
    skipped, so one bad title does not abort the whole run.

    Parameters
    ----------
    browser : Selenium webdriver
        Must currently show a page containing the site's search form.
    titles : iterable of str
        Comic titles to search for.

    Returns
    -------
    None. Progress messages and the total runtime are printed.
    """
    start_time = time.time()

    for idx, title in enumerate(titles):
        # Search for the title
        search_title(browser, title)

        if no_results_found(browser):
            print("{}.{} was skipped. No title matched.".format(idx, title))
            # Wait random time
            time.sleep(2 + random.random()*5)
            continue

        # Every navigation step looks up a page element and can raise
        # NoSuchElementException (no first link, no gallery button, no
        # cover). Guard them all so the loop moves on to the next title.
        try:
            # Once on search results, just select first issue of results
            click_first_link(browser, title, True)

            # Go to the cover gallery of issue page
            go_cover_gallery(browser)

            # Once in cover gallery, just scrape the first image
            click_first_image(browser)
            click_cover_image(browser)
            save_large_image(browser, title)
            print("Scraped {}.{}!".format(idx, title))
        except NoSuchElementException:
            print("{}.{} was skipped. No covers were found."
                  .format(idx, title))

    print('Total Runtime: {:.2f} seconds'.format(time.time() - start_time))

#     print("All done!")

In [None]:
def no_results_found(browser):
    """
    Tell whether the current page reports an empty search result.

    Reads the element at a fixed XPath and returns True when its text is
    exactly 'No results found.', otherwise False.
    """
    message_cell = browser.find_element_by_xpath(
        '/html/body/table/tbody/tr[2]/td[3]')
    return message_cell.text == 'No results found.'

In [None]:
def search_title(browser, title):
    """
    Run a 'Title' search for a comic title through the site's search form.

    Parameters
    ----------
    browser : Selenium webdriver showing a page with the search form
    title : str
        Text to type into the search box.
    """
    # Find search box, empty it and enter search text.
    # clear() is used rather than a Ctrl+A keystroke because the select-all
    # shortcut differs by platform (it is Cmd+A on macOS), which could leave
    # old text in the box.
    text_area = browser.find_element_by_id('form_search')
    text_area.clear()
    text_area.send_keys(title)

    # Find Search type dropdown and make sure it says 'Title'
    search_type = Select(browser.find_element_by_name('form_searchtype'))
    search_type.select_by_value('Title')

    # Push the search button!
    sb_xpath = ('/html/body/table/tbody/tr[2]/td[1]' +
                '/table/tbody/tr[4]/td/form/input[2]')
    search_button = browser.find_element_by_xpath(sb_xpath)
    search_button.click()

In [None]:
def search_site(browser, title):
    """
    Search the site for the given text using the search form.

    Unlike search_title, this does not touch the search-type dropdown, so
    the search runs with whatever type is currently selected on the page.

    Parameters
    ----------
    browser : Selenium webdriver showing a page with the search form
    title : str
        Text to type into the search box.
    """
    # Find search box, empty it and enter search text.
    # clear() is used rather than a Ctrl+A keystroke because the select-all
    # shortcut differs by platform (it is Cmd+A on macOS), which could leave
    # old text in the box.
    text_area = browser.find_element_by_id('form_search')
    text_area.clear()
    text_area.send_keys(title)

    # Push the search button!
    sb_xpath = ('/html/body/table/tbody/tr[2]/td[1]' +
                '/table/tbody/tr[4]/td/form/input[2]')
    search_button = browser.find_element_by_xpath(sb_xpath)
    search_button.click()

In [None]:
def click_first_link(browser, title, title_search_flag):
    """
    Click the first link on the search results page.

    title_search_flag selects which fixed XPath is used: a truthy value
    targets the first anchor directly inside the results cell, a falsy
    value targets the first anchor inside the nested table. `title` is
    accepted but not used.
    """
    results_cell = '/html/body/table/tbody/tr[2]/td[3]'
    link_suffix = '/a[1]' if title_search_flag else '/table/tbody/tr/td/a[1]'

    # Locate the link and follow it
    browser.find_element_by_xpath(results_cell + link_suffix).click()

In [None]:
def go_cover_gallery(browser):
    """
    Open the cover gallery by clicking its button on the current page.

    The button is located as the anchor that wraps the
    'button_title_covergallery.gif' image.
    """
    browser.find_element_by_xpath(
        '//a[img/@src="graphics/button_title_covergallery.gif"]'
    ).click()

In [None]:
def click_first_image(browser):
    """
    Click the first image shown in the cover gallery.

    The image is located through a fixed XPath (first row, first cell of
    the gallery table).
    """
    thumbnail_xpath = ('/html/body/table/tbody/tr[2]/td[3]/table/tbody/'
                       'tr[1]/td[1]/a/img')
    browser.find_element_by_xpath(thumbnail_xpath).click()

def click_cover_image(browser):
    """
    Click the cover image on the current page.

    The image is located through a fixed XPath.
    """
    image_xpath = ('/html/body/table/tbody/tr[2]/td[3]/table/tbody/tr/td/'
                   'table[1]/tbody/tr[1]/td[1]/a[1]/img')
    browser.find_element_by_xpath(image_xpath).click()
#     url = cover_img.get

def save_large_image(browser, title):
    """
    Download the image linked from the cover on the current page.

    Reads the href of the anchor at a fixed XPath and saves that URL's
    content under ./raw_data/covers_large/, named after the title in lower
    case with spaces replaced by underscores and a '.jpg' extension.
    """
    # Anchor wrapping the cover; its href is the image URL to download
    link_xpath = ('/html/body/table/tbody/tr[2]/td[3]/table/tbody/tr/td/'
                  'table[1]/tbody/tr[1]/td[1]/a[1]')
    image_url = browser.find_element_by_xpath(link_xpath).get_attribute('href')

    # Destination file, e.g. 'Filler Bunny' -> filler_bunny.jpg
    slug = title.replace(' ', '_').lower()
    destination = './raw_data/covers_large/' + slug + '.jpg'

    scrape_image_url(image_url, destination)
    

def scrape_image_url(url, filename):
    """Fetch `url` with requests and write the response bytes to `filename`."""
    payload = requests.get(url).content
    with open(filename, 'wb') as out_file:
        out_file.write(payload)

In [None]:
def get_first_image(browser, title):
    """
    Save the first image of the cover gallery to disk.

    The image element is located through a fixed XPath and written under
    ./raw_data/covers/, named after the title in lower case with spaces
    replaced by underscores and a '.jpg' extension.
    """
    # First image in the gallery table
    thumbnail = browser.find_element_by_xpath(
        '/html/body/table/tbody/tr[2]/td[3]/table/tbody/tr[1]/td[1]/a/img')

    # Destination file, e.g. 'Filler Bunny' -> filler_bunny.jpg
    slug = title.replace(' ', '_').lower()
    destination = './raw_data/covers/' + slug + '.jpg'

    scrape_image(thumbnail, destination)

In [None]:
def scrape_image(img, filename):
    """Download the `src` of the image element `img` into `filename`."""
    source_url = img.get_attribute('src')
    payload = requests.get(source_url).content
    with open(filename, 'wb') as out_file:
        out_file.write(payload)

In [None]:
def go_back_home_comicbookdb(browser):
    """Return to the comicbookdb.com home page by clicking the logo link."""
    logo_xpath = ('/html/body/table/tbody/tr[1]/td/table/tbody/tr[1]/td/'
                  'table/tbody/tr/td[1]/a/img')
    browser.find_element_by_xpath(logo_xpath).click()

In [None]:
sample_titles = titles[:300]

In [None]:
sample_titles

Get the list of titles, sorted by quantity sold.

In [None]:
# Total quantity sold per title, largest totals first.
qtys = (
    temp_df
    .groupby(['title'], as_index=False)['qty_sold']
    .sum()
    .sort_values(by=['qty_sold'], ascending=False)
)

In [None]:
qtys.head()

#### ...And scraping periodically fails, so I have manually tracked the 'stopping' point to resume from.

In [None]:
done_titles = titles[:300]

In [None]:
titles_needed_df = qtys.loc[~qtys['title'].isin(done_titles)]

In [None]:
titles_needed_df.shape

In [None]:
titles_need_list = list(titles_needed_df.title.unique())

In [None]:
# Running total of titles already processed across the interrupted scraping
# runs (presumably one term per run -- confirm). The sum is 5352, the value
# assigned to new_start.
# 367+246+151
827+151+376+524+5+47+1662+3+162+155+15+295+927+143+60

In [None]:
# Index into titles_need_list at which scraping resumes; 5352 is the sum of
# the per-run counts tallied by hand. The trailing 1932 is presumably an
# earlier restart point -- confirm.
new_start = 5352  # 1932

In [None]:
titles_searching = titles_need_list[new_start:]

In [None]:
titles_searching

## It's the Scraping.

In [None]:
# for title in sample_titles:
# #     print(title)
cs.scrape_series_covers(browser, titles_searching)