# Scraping Comic Book Covers

**Goal**: Scrape comic covers so we can use them as visual touchstones for users in the app.


### Libraries

In [1]:
import psycopg2 as psql  # PostgreSQL DBs
from sqlalchemy import create_engine  # SQL helper
import pandas as pd
import requests
import random
import time
import os
import sys
# Selenium
from selenium.webdriver import Firefox
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.options import Options
# Firefox options object passed to the Selenium driver when the browser is created.
options = Options()
# headless = False: run the browser with a visible window.
options.headless = False

# Data storage

In [2]:
# Add the parent directory to the import search path (presumably where the
# custom modules data_fcns, keys and comic_scraper live — TODO confirm).
sys.path.append("..")

In [4]:
!pip install boto3

Collecting boto3
  Downloading https://files.pythonhosted.org/packages/72/1a/97ca7494fd268835f2d2ea2c6b6ea3b7cfe271f22c2adb1ef45cf007d7f3/boto3-1.9.199-py2.py3-none-any.whl (128kB)
Collecting jmespath<1.0.0,>=0.7.1 (from boto3)
  Downloading https://files.pythonhosted.org/packages/83/94/7179c3832a6d45b266ddb2aac329e101367fbdb11f425f13771d27f225bb/jmespath-0.9.4-py2.py3-none-any.whl
Collecting s3transfer<0.3.0,>=0.2.0 (from boto3)
  Downloading https://files.pythonhosted.org/packages/16/8a/1fc3dba0c4923c2a76e1ff0d52b305c44606da63f718d14d3231e21c51b0/s3transfer-0.2.1-py2.py3-none-any.whl (70kB)
Collecting botocore<1.13.0,>=1.12.199 (from boto3)
  Downloading https://files.pythonhosted.org/packages/50/f8/dbe656ee191c2d8b471a86fa07f0d37515611d865deaa034fc2b71dd71e4/botocore-1.12.199-py2.py3-none-any.whl (5.6MB)
Installing collected packages: jmespath, botocore, s3transfer, boto3
Successfully installed boto3-1.9.199 botocore-1.12.199 jmespath-0.9.4 s3transfer-0.2.1


In [5]:
# Custom
import data_fcns as dfc
import keys as keys  # Custom keys lib
import comic_scraper as cs

### Initialize Browser

In [6]:
# Select the geckodriver binary that matches the host OS: the Windows build on
# Windows, the plain 'geckodriver' binary everywhere else.
driver_file = 'geckodriver-windows.exe' if os.name == 'nt' else 'geckodriver'
driver_exe_path = os.path.join(os.getcwd(), 'drivers', driver_file)

In [27]:
driver_exe_path

'D:\\dropbox\\flatiron\\capstone\\comics_rx\\comrx\\dev\\drivers\\geckodriver-windows.exe'

ls drivers/

In [28]:
# Start a Selenium-driven Firefox session and open the comicbookdb home page.
url = "http://www.comicbookdb.com/"
browser = Firefox(executable_path=driver_exe_path, options=options)
browser.get(url)

### Make list of Titles!

Get the list of titles whose covers need to be scraped.

In [12]:
# Define path to secret.
# expanduser('~') resolves the user's home directory without requiring a HOME
# environment variable, which is not guaranteed to exist on every platform.
secret_path_aws = os.path.join(os.path.expanduser('~'), '.secret',
                               'aws_ps_flatiron.json')
secret_path_aws

'C:\\Users\\werlindo\\.secret\\aws_ps_flatiron.json'

In [13]:
# Read the database credentials from the secrets file.
aws_keys = keys.get_keys(secret_path_aws)
user = aws_keys['user']
ps = aws_keys['password']
host = aws_keys['host']
db = aws_keys['db_name']

# Connection URL in postgresql://user:password@host/db form.
# NOTE(review): the password is embedded verbatim (no URL-escaping).
aws_ps_engine = ''.join(['postgresql://', user, ':', ps, '@', host, '/', db])

# Open the psycopg2 connection to the same database.
conn = psql.connect(host=host, port='5432',
                    database=db, user=user, password=ps)

In [14]:
# Instantiate cursor
cur = conn.cursor()

In [15]:
# Select every row and column of comic_trans (this pulls the whole table,
# it does not count records).
query = """
    SELECT * from comic_trans;
"""

In [16]:
# Execute the query
cur.execute(query)

In [17]:
# Load the query results into a DataFrame, naming the columns from the cursor
# metadata. Supplying `columns=` to the constructor (rather than assigning
# them afterwards) also works when the query returns zero rows.
temp_df = pd.DataFrame(cur.fetchall(),
                       columns=[col.name for col in cur.description])

In [18]:
temp_df.head(3)

Unnamed: 0,index,publisher,item_id,title_and_num,qty_sold,date_sold,account_num,comic_title
0,2,Amaze Ink Slave Labor Graphics,DCD151935,Filler Bunny #2,1,2011-08-14 18:01:03,174,Filler Bunny (SLG)
1,3,Amaze Ink Slave Labor Graphics,DCD341726,Gargoyles #6,1,2012-06-22 14:11:37,593,Gargoyles (SLG)
2,4,Amaze Ink Slave Labor Graphics,DCD416182,Royal Historian of Oz #1,1,2010-07-21 14:03:07,226,Royal Historian of Oz (SLG)


In [19]:
temp_df['title'] = (temp_df['title_and_num'].apply(dfc.cut_issue_num))

In [20]:
temp_df.head()

Unnamed: 0,index,publisher,item_id,title_and_num,qty_sold,date_sold,account_num,comic_title,title
0,2,Amaze Ink Slave Labor Graphics,DCD151935,Filler Bunny #2,1,2011-08-14 18:01:03,174,Filler Bunny (SLG),Filler Bunny
1,3,Amaze Ink Slave Labor Graphics,DCD341726,Gargoyles #6,1,2012-06-22 14:11:37,593,Gargoyles (SLG),Gargoyles
2,4,Amaze Ink Slave Labor Graphics,DCD416182,Royal Historian of Oz #1,1,2010-07-21 14:03:07,226,Royal Historian of Oz (SLG),Royal Historian of Oz
3,5,Amaze Ink Slave Labor Graphics,DCD416182,Royal Historian of Oz #1,1,2010-07-14 19:49:40,399,Royal Historian of Oz (SLG),Royal Historian of Oz
4,6,Amaze Ink Slave Labor Graphics,DCD416182,Royal Historian of Oz #1,1,2010-07-19 10:39:04,237,Royal Historian of Oz (SLG),Royal Historian of Oz


In [21]:
# Clean up titles: '&' -> 'and', drop '?', '/' -> ' '.
# regex=False makes every pattern a literal ('?' is a regex metacharacter),
# and the vectorised .str accessor passes missing values through instead of
# raising the way str.replace inside a lambda would.
temp_df['title'] = (temp_df['title']
                    .str.replace('&', 'and', regex=False)
                    .str.replace('?', '', regex=False)
                    .str.replace('/', ' ', regex=False)
                    )

### We need to track the titles that need scraping.

In [29]:
titles = list(temp_df['title'].unique())

In [166]:
# Offset into `titles` used to slice off what is still left to scrape;
# presumably each addend is the number of titles covered by one earlier
# (interrupted) run — TODO confirm.
ctr = sum([77, 45, 1318, 1705, 3, 284, 372, 104, 89, 646, 101, 39,
           33, 78, 352, 400, 649])

In [167]:
ctr

6295

In [168]:
titles_test = titles[ctr:]

In [169]:
titles_test

['Hoax Hunters 2015',
 'Holy F*ck',
 'Honey Badger Adventures',
 'Honey West',
 'Hookjaw',
 'Hot Lunch Special',
 'House of Montreso',
 'How To Be a Comic Book Artis',
 'Hugh Howeys Wool',
 'Humbug',
 'Hyper Force Neo',
 'I Mage',
 'I Think Our Friend Dan Might',
 'Ian Linvingstones Freeway Fig',
 'Ian Livingstone Freeway Fight',
 'Ian Livingstones Freeway Figh',
 'Igloo Barbecue Global Warmin',
 'Ignited',
 'Ikebana (One Shot) (Mr',
 'Immortal Brothers Green Knigh',
 'Incursion',
 'Indie Comics',
 'Indie Comics Magazine',
 'Indoctrination',
 'Infernoct',
 'Infinity 8',
 'Injury',
 'Insexts',
 'Intrepid Escapegoat',
 'Iron Maiden Legacy of the Bea',
 'Is This Tomorrow (One Shot',
 'Island 731',
 'It Came Out On a Wednesday',
 'It Will All Hurt',
 'Ivar Timewalker',
 'Jack Hammer',
 'Jack the Lantern',
 'Jackpot',
 'Jade Street Protection Servic',
 'Jaegir One Sho',
 'Jazz Legend',
 'Jazz Maynard',
 'Jazz Maynard VOL 2',
 'Jimmys Bastards',
 'John Carpenters Asylum',
 'Johnny Red',
 'Ju

In [170]:
cs.scrape_series_covers(browser, titles_test)

0.Hoax Hunters 2015 was skipped. No title matched.
1.Holy F*ck was skipped. No covers were found.
2.Honey Badger Adventures was skipped. No title matched.
Scraped 3.Honey West!
4.Hookjaw was skipped. No title matched.
Scraped 5.Hot Lunch Special!
Scraped 6.House of Montreso!
Scraped 7.How To Be a Comic Book Artis!
8.Hugh Howeys Wool was skipped. No title matched.
Scraped 9.Humbug!
Scraped 10.Hyper Force Neo!
Scraped 11.I Mage!
12.I Think Our Friend Dan Might was skipped. No title matched.
13.Ian Linvingstones Freeway Fig was skipped. No title matched.
Scraped 14.Ian Livingstone Freeway Fight!
15.Ian Livingstones Freeway Figh was skipped. No title matched.
16.Igloo Barbecue Global Warmin was skipped. No title matched.
17.Ignited was skipped. No covers were found.
18.Ikebana (One Shot) (Mr was skipped. No title matched.
Scraped 19.Immortal Brothers Green Knigh!
Scraped 20.Incursion!
Scraped 21.Indie Comics!
Scraped 22.Indie Comics Magazine!
Scraped 23.Indoctrination!
24.Infernoct was ski

Scraped 215.Pink Panther!
Scraped 216.Pink Panther Cartoon Hour Spe!
Scraped 217.Pink Panther Pink Winter Spec!
218.Pink Panther Super Special Bl was skipped. No title matched.
Scraped 219.Pink Panther Trick Or Pink!
Scraped 220.Pink Panther Vs Inspector!
Scraped 221.Pinocchio Vampire Slayer and th!
Scraped 222.Pitiful Human Lizard!
223.Planet of Daemons was skipped. No title matched.
Scraped 224.Planet of the Nerds!
225.Plume was skipped. No covers were found.
Scraped 226.Political Power!
Scraped 227.Pood!
Scraped 228.Pope Hats!
Scraped 229.Power Button!
Scraped 230.Powerless!
Scraped 231.Priest Purgatory!
Scraped 232.Princeless Make Yourself!
Scraped 233.Princeless Pirate Princess!
Scraped 234.Princeless Raven Pirate Princ!
Scraped 235.Prisoner!
236.Prometheus Eternal One Sho was skipped. No title matched.
Scraped 237.Ps238!
Scraped 238.Pterodactyl Hunters In the Gi!
Scraped 239.Public Relations!
Scraped 240.Punk Mambo!
241.Q2 Rtn Quantum and Woody was skipped. No title matched.
Scra

Scraped 396.Solar Flare: Season Two!
Scraped 397.Solid State Tank Girl!
Scraped 398.Songs For the Dead!
Scraped 399.Sonitus!
Scraped 400.Source!
401.Southern Dog was skipped. No covers were found.
Scraped 402.Space Riders!
403.Space Riders Galaxy of Brutal was skipped. No covers were found.
404.Spencer and Locke was skipped. No title matched.
405.Spirit Collectors Se was skipped. No title matched.
Scraped 406.Spirit Leaves!
Scraped 407.Spiritus!
Scraped 408.Spookhouse!
409.Spookhouse 2 was skipped. No title matched.
Scraped 410.Squarriors!
411.Squarriors VOL 02 Summer was skipped. No title matched.
412.Srg Presents Wolves of Odin O was skipped. No title matched.
413.Ssb Ii Polybagged Ed (Mr was skipped. No title matched.
414.Stabbity Bunny was skipped. No covers were found.
Scraped 415.Stained!
Scraped 416.Stargate Atlantis Singularit!
Scraped 417.Stargate Universe!
418.Stargate Universe Back To Des was skipped. No title matched.
Scraped 419.Starring Sonya Devereaux!
Scraped 420.Steam 

593.Zombie Tramp Xxxmas Special R was skipped. No title matched.
594.Zombie Tramp Xxxmas Special T was skipped. No title matched.
Scraped 595.Abattoir!
Scraped 596.After Dark!
597.Coil Damaged Flip Boo was skipped. No title matched.
Scraped 598.Damaged!
Scraped 599.Driver For the Dead!
Scraped 600.Fvza!
Scraped 601.Hotwire Deep Cut!
Scraped 602.Last Days of American Crime!
603.Radical Premiere Mata Har was skipped. No title matched.
Scraped 604.Rising!
Scraped 605.Ryder On the Storm!
Scraped 606.Time Bomb!
Scraped 607.After Eden!
Scraped 608.Atomic Robo Deadly Art of Sci!
Scraped 609.Atomic Robo Dogs of War!
Scraped 610.Atomic Robo Flying She Devil!
Scraped 611.Atomic Robo Ghost of Station!
Scraped 612.Atomic Robo Knights O T Golde!
Scraped 613.Atomic Robo Real Science Adv!
Scraped 614.Atomic Robo Revenge O T Vampi!
Scraped 615.Atomic Robo Savage Sword of D!
Scraped 616.Atomic Robo Shadow From Beyon!
Scraped 617.Bad Dreams!
Scraped 618.Bodie Troll!
Scraped 619.Chasing Hitler!
Scraped 6

Scraped 752.Grimm Fairy Tales 2017 Holida!
Scraped 753.Grimm Fairy Tales 2018 Holida!
754.Grimm Fairy Tales 2019 Annua was skipped. No title matched.
Scraped 755.Grimm Fairy Tales Annual 201!
756.Grimm Fairy Tales Annual Las was skipped. No title matched.
757.Hellchild Inferno One Shot was skipped. No title matched.
Scraped 758.Hellchild the Unholy!
Scraped 759.Hit List!
Scraped 760.Hollywood Zombie Apocalypse!
Scraped 761.Jasmine Crown of Kings!
Scraped 762.Musketeers!
Scraped 763.Revenge of Wonderland!
Scraped 764.Robyn Hood the Curse!
765.Salems Daughter was skipped. No title matched.
766.Science Fiction and Fantasy Ill was skipped. No title matched.
Scraped 767.Screwed!
Scraped 768.Spirit Hunters!
Scraped 769.Tales From Neverland!
770.Tfw Red Rose Oneshot a Cvr Qu was skipped. No title matched.
771.Tfw White Knight Oneshot a Cv was skipped. No title matched.
Scraped 772.Theater!
Scraped 773.Van Helsing Vs Robyn Hood!
774.Van Helsing Vs the Werewolf was skipped. No title matched.
Sc

In [None]:
titles_test

In [None]:
test_title = 'Vampironica'

In [None]:
search_title(browser, test_title)

In [None]:
click_first_link(browser, test_title, True)

In [None]:
go_cover_gallery(browser)

In [None]:
click_first_image(browser)

In [None]:
click_cover_image(browser)

In [None]:
    # Find the cover image and click it! (XPath of the cover image element.)
cover_img_path = ('/html/body/table/tbody/tr[2]/td[3]/table/tbody/tr/' +
                  'td/table[1]/tbody/tr[1]/td[1]/a[1]/img')

In [None]:
cover_img = browser.find_element_by_xpath(cover_img_path)

In [None]:
cover_img.click()

In [None]:
#    cover_img.click()
url = cover_img.get_attribute('src')

In [None]:
cover_img.get_attribute

In [None]:
print(url)

In [None]:
# XPath of the link element whose href is read in the next cell.
cover_box_path = ('/html/body/table/tbody/tr[2]/td[3]/table/tbody/tr/'
                  'td/table[1]/tbody/tr[1]/td[1]/a[1]')
cover_box = browser.find_element_by_xpath(cover_box_path)

In [None]:
url = cover_box.get_attribute('href')

In [None]:
save_large_image(browser, test_title)

### Update the code to scrape the large images.

In [None]:
def scrape_series_covers(browser, titles):
    """
    Use Selenium to scrape the large cover image for each comic title.

    For every title: run a title search, open the first search result,
    open its cover gallery, click through to the cover and save it with
    save_large_image(). One progress line is printed per title and the
    total runtime is printed at the end.

    Parameters
    ----------
    browser : Selenium browser object used for all navigation
    titles : iterable of str, titles to search for, processed in order

    Returns
    -------
    None
    """
    start_time = time.time()

    for idx, title in enumerate(titles):
        # Search for the title
        search_title(browser, title)

        if no_results_found(browser):
            print("{}.{} was skipped. No title matched.".format(idx, title))
            # Wait random time
            # NOTE(review): the pause only happens on this branch; successful
            # scrapes are not throttled — confirm that is intended.
            time.sleep(2 + random.random()*5)
            continue

        # The whole navigation chain lives inside the try block: a missing
        # result link or gallery button raises NoSuchElementException just
        # like a missing cover does, and should skip this one title instead
        # of aborting every remaining title.
        try:
            # First search result -> its cover gallery -> first cover
            click_first_link(browser, title, True)
            go_cover_gallery(browser)
            click_first_image(browser)
            click_cover_image(browser)
            save_large_image(browser, title)
        except NoSuchElementException:
            print("{}.{} was skipped. No covers were found."
                  .format(idx, title))
        else:
            print("Scraped {}.{}!".format(idx, title))

    print('Total Runtime: {:.2f} seconds'.format(time.time() - start_time))

#     print("All done!")

In [None]:
def no_results_found(browser):
    """
    Report whether the current page shows the 'No results found.' message.

    Reads the text of the element at a fixed XPath and returns True only
    when it is exactly 'No results found.', otherwise False.
    """
    content_cell = browser.find_element_by_xpath(
        '/html/body/table/tbody/tr[2]/td[3]')
    return 'No results found.' == content_cell.text

In [None]:
def search_title(browser, title):
    """
    Run a 'Title' search on the site for a comic title.

    browser: Selenium browser object showing a page with the search form
    title: text typed into the search box
    """
    # Replace whatever is in the search box: select all, then type the title
    search_box = browser.find_element_by_id('form_search')
    search_box.send_keys(Keys.CONTROL, "a")
    search_box.send_keys(title)

    # Set the search-type dropdown to 'Title'
    dropdown = browser.find_element_by_name('form_searchtype')
    Select(dropdown).select_by_value('Title')

    # Submit with the search button
    button_xpath = ('/html/body/table/tbody/tr[2]/td[1]'
                    '/table/tbody/tr[4]/td/form/input[2]')
    browser.find_element_by_xpath(button_xpath).click()

In [None]:
def search_site(browser, title):
    """
    Type a comic title into the site's search box and submit the search.

    The search-type dropdown is not touched, so the search runs with
    whatever type is currently selected on the page.

    browser: Selenium browser object showing a page with the search form
    title: text typed into the search box
    """
    # Replace whatever is in the search box: select all, then type the title
    search_box = browser.find_element_by_id('form_search')
    search_box.send_keys(Keys.CONTROL, "a")
    search_box.send_keys(title)

    # Submit with the search button
    button_xpath = ('/html/body/table/tbody/tr[2]/td[1]'
                    '/table/tbody/tr[4]/td/form/input[2]')
    browser.find_element_by_xpath(button_xpath).click()

In [None]:
def click_first_link(browser, title, title_search_flag):
    """
    Click the first link on the current results page.

    browser: Selenium browser object
    title: accepted for the caller's convenience; not used here
    title_search_flag: truthy -> use the XPath of the first link directly
        in the results cell; falsy -> use the XPath of the first link
        inside the nested table
    """
    direct_link = '/html/body/table/tbody/tr[2]/td[3]/a[1]'
    nested_link = '/html/body/table/tbody/tr[2]/td[3]/table/tbody/tr/td/a[1]'
    x_path = direct_link if title_search_flag else nested_link

    browser.find_element_by_xpath(x_path).click()

In [None]:
def go_cover_gallery(browser):
    """
    Click on the Cover Gallery button of the current page.

    The button is located by the image it wraps
    (graphics/button_title_covergallery.gif) rather than by an absolute
    position in the page. Raises whatever the browser raises when no such
    link exists.
    """
    gb_xpath = '//a[img/@src="graphics/button_title_covergallery.gif"]'

    gallery_btn = browser.find_element_by_xpath(gb_xpath)
    gallery_btn.click()

In [None]:
def click_first_image(browser):
    """
    Click the first image of the cover gallery.

    The image is located with a fixed XPath (first row, first cell of the
    gallery table).
    """
    gallery_img_xpath = ('/html/body/table/tbody/tr[2]/td[3]/'
                         'table/tbody/tr[1]/td[1]/a/img')
    browser.find_element_by_xpath(gallery_img_xpath).click()

def click_cover_image(browser):
    """
    Click the cover image on the current page.

    The image is located with a fixed XPath.
    """
    cover_xpath = ('/html/body/table/tbody/tr[2]/td[3]/table/tbody/tr/'
                   'td/table[1]/tbody/tr[1]/td[1]/a[1]/img')
    browser.find_element_by_xpath(cover_xpath).click()

def save_large_image(browser, title, out_dir='./raw_data/covers_large/'):
    """
    Save the image linked from the cover on the current page.

    Reads the href of the cover link (fixed XPath) and downloads it with
    scrape_image_url() to '<out_dir><title>.jpg', where the title is
    lower-cased and spaces become underscores.

    Parameters
    ----------
    browser : Selenium browser object showing the cover page
    title : str, comic title used to build the file name
    out_dir : str, folder to write into; created if it does not exist.
        Defaults to the folder this function has always written to.

    NOTE(review): other characters in the title (e.g. '*' or ':') go into
    the file name unchanged — confirm that is acceptable on the target OS.
    """
    cover_box_path = ('/html/body/table/tbody/tr[2]/td[3]/table/tbody/tr/' +
                      'td/table[1]/tbody/tr[1]/td[1]/a[1]')
    cover_box = browser.find_element_by_xpath(cover_box_path)
    url = cover_box.get_attribute('href')

    # Make sure the target folder exists so the write below cannot fail
    # just because this is the first run.
    os.makedirs(out_dir, exist_ok=True)

    # Construct path and file name. join(out_dir, '') only guarantees a
    # trailing separator, so the default yields the same path as before.
    filename = (os.path.join(out_dir, '') + title.replace(' ', '_').lower()
                + '.jpg'
                )

    # Save the file in the file/path
    scrape_image_url(url, filename)
    

def scrape_image_url(url, filename, timeout=30):
    """
    Download `url` and write the response body to `filename`.

    Parameters
    ----------
    url : str, address of the image to download
    filename : str, path of the file to (over)write, opened in binary mode
    timeout : seconds to wait for the server before giving up

    Raises
    ------
    requests.exceptions.RequestException
        On connection problems, timeouts, or a 4xx/5xx response.
    """
    response = requests.get(url, timeout=timeout)
    # Stop on HTTP errors instead of saving the error page as a '.jpg'.
    response.raise_for_status()
    with open(filename, 'wb') as f:
        f.write(response.content)

In [None]:
def get_first_image(browser, title):
    """
    Save the first image of the cover gallery.

    Locates the first gallery image with a fixed XPath and hands it to
    scrape_image(), writing to './raw_data/covers/<title>.jpg' with the
    title lower-cased and spaces replaced by underscores.
    """
    thumb = browser.find_element_by_xpath(
        '/html/body/table/tbody/tr[2]/td[3]/'
        'table/tbody/tr[1]/td[1]/a/img')

    # Build the target path from the title
    stem = title.replace(' ', '_').lower()
    target = ''.join(['./raw_data/covers/', stem, '.jpg'])

    scrape_image(thumb, target)

In [None]:
def scrape_image(img, filename):
    """
    Download the picture an image element points at and save it.

    img: element whose 'src' attribute holds the image address
    filename: path of the file to write, opened in binary mode
    """
    src = img.get_attribute('src')
    payload = requests.get(src).content
    with open(filename, 'wb') as out_file:
        out_file.write(payload)

In [None]:
def go_back_home_comicbookdb(browser):
    """
    Return to the comicbookdb.com home page by clicking the logo link.

    The logo image is located with a fixed XPath.
    """
    logo_xpath = ('/html/body/table/tbody/tr[1]/td/table/tbody'
                  '/tr[1]/td/table/tbody/tr/td[1]/a/img')
    browser.find_element_by_xpath(logo_xpath).click()

In [None]:
sample_titles = titles[:300]

In [None]:
sample_titles

Get the list of titles, sorted by quantity sold.

In [None]:
# Total quantity sold per title, largest totals first.
qtys = (
    temp_df
    .groupby(['title'], as_index=False)['qty_sold']
    .sum()
    .sort_values(by=['qty_sold'], ascending=False)
)

In [None]:
qtys.head()

#### ...And scraping periodically fails, so I have manually tracked the 'stopping' point.

In [None]:
done_titles = titles[:300]

In [None]:
titles_needed_df = qtys.loc[~qtys['title'].isin(done_titles)]

In [None]:
titles_needed_df.shape

In [None]:
titles_need_list = list(titles_needed_df.title.unique())

In [None]:
# 367+246+151
827+151+376+524+5+47+1662+3+162+155+15+295+927+143+60

In [None]:
# 5352 equals the running total summed in the previous cell; the trailing
# 1932 is presumably an earlier resume offset — TODO confirm.
new_start = 5352  # 1932

In [None]:
titles_searching = titles_need_list[new_start:]

In [None]:
titles_searching

## It's the Scraping.

In [None]:
# Scrape covers for the remaining titles using the comic_scraper module
# (imported as cs), not the draft functions defined earlier in this notebook.
cs.scrape_series_covers(browser, titles_searching)