# Scraping Comic Book Covers

**Goal**: Scrape comic book covers so that we can use them as visual touchstones for users in the app.


### Libraries

In [13]:
import pandas as pd
import requests
import random
import time
import os
import sys
# Selenium
from selenium.webdriver import Firefox
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.options import Options
# Firefox options shared by the browser session: headless mode is switched on,
# so the browser is driven without a visible window.
options = Options()
options.headless = True

# Data storage
from sqlalchemy import create_engine # SQL helper
import psycopg2 as psql #PostgreSQL DBs

In [14]:
# Add the parent directory to the import path (presumably where the custom
# modules data_fcns, keys and comic_scraper live -- TODO confirm).
# Guarded so that re-running this cell does not keep appending duplicate
# ".." entries to sys.path.
if ".." not in sys.path:
    sys.path.append("..")

In [15]:
# Custom
import data_fcns as dfc
import keys as keys  # Custom keys lib
import comic_scraper as cs

### Initialize Browser

In [16]:
# Path to the geckodriver binary under ./drivers, resolved against the current
# working directory. NOTE(review): the file name is the Windows build, so other
# platforms would need a different binary here.
driver_exe_path = os.path.join(
    os.getcwd(), 'drivers', 'geckodriver-windows.exe'
)

In [17]:
driver_exe_path

'D:\\dropbox\\flatiron\\capstone\\comics_rx\\comrx\\dev\\drivers\\geckodriver-windows.exe'

In [18]:
# Start Firefox through geckodriver with the options configured above and
# load the site's landing page.
url = "http://www.comicbookdb.com/"
browser = Firefox(executable_path=driver_exe_path, options=options)
browser.get(url)

### Make list of Titles!

Get the list of titles whose covers we want to scrape.

In [19]:
# Path to the JSON secrets file with the AWS PostgreSQL credentials.
# os.path.expanduser('~') is used instead of os.environ['HOME'] because HOME is
# not guaranteed to be set (notably on a default Windows setup), in which case
# the direct lookup raises KeyError before the path is even built.
secret_path_aws = os.path.join(os.path.expanduser('~'), '.secret',
                               'aws_ps_flatiron.json')
secret_path_aws

'C:\\Users\\werlindo\\.secret\\aws_ps_flatiron.json'

In [20]:
# Read the database credentials from the secrets file.
aws_keys = keys.get_keys(secret_path_aws)
user, ps, host, db = (
    aws_keys[field] for field in ('user', 'password', 'host', 'db_name')
)

# Connection URL of the form postgresql://user:password@host/db_name
# (presumably meant for sqlalchemy.create_engine -- TODO confirm; the parts
# are inserted verbatim, with no URL-escaping).
aws_ps_engine = ''.join(['postgresql://', user, ':', ps, '@', host, '/', db])

# Open the psycopg2 connection on port 5432.
conn = psql.connect(host=host, port='5432', database=db, user=user, password=ps)

In [21]:
# Open a cursor on the PostgreSQL connection; queries are executed through it.
cur = conn.cursor()

In [22]:
# Select every column of every row in comic_trans (nothing is counted here).
query = """
    SELECT * from comic_trans;
"""

In [23]:
# Execute the query on the cursor; the rows are fetched in a separate step.
cur.execute(query)

In [24]:
# Load the result set into a DataFrame, taking the column names from the
# cursor metadata. The names are passed to the constructor rather than
# assigned to `.columns` afterwards: with zero rows the frame has no columns,
# so the later assignment fails with a length-mismatch ValueError, whereas
# `columns=` yields an empty frame with the right headers.
temp_df = pd.DataFrame(
    cur.fetchall(),
    columns=[col.name for col in cur.description],
)

In [25]:
temp_df.head(3)

Unnamed: 0,index,publisher,item_id,title_and_num,qty_sold,date_sold,account_num,comic_title
0,2,Amaze Ink Slave Labor Graphics,DCD151935,Filler Bunny #2,1,2011-08-14 18:01:03,174,Filler Bunny (SLG)
1,3,Amaze Ink Slave Labor Graphics,DCD341726,Gargoyles #6,1,2012-06-22 14:11:37,593,Gargoyles (SLG)
2,4,Amaze Ink Slave Labor Graphics,DCD416182,Royal Historian of Oz #1,1,2010-07-21 14:03:07,226,Royal Historian of Oz (SLG)


In [26]:
# Derive a `title` column by passing each `title_and_num` value through
# dfc.cut_issue_num (in the displayed rows, "Gargoyles #6" becomes "Gargoyles").
temp_df['title'] = ( temp_df['title_and_num'].apply(dfc.cut_issue_num) )

In [27]:
temp_df.head()

Unnamed: 0,index,publisher,item_id,title_and_num,qty_sold,date_sold,account_num,comic_title,title
0,2,Amaze Ink Slave Labor Graphics,DCD151935,Filler Bunny #2,1,2011-08-14 18:01:03,174,Filler Bunny (SLG),Filler Bunny
1,3,Amaze Ink Slave Labor Graphics,DCD341726,Gargoyles #6,1,2012-06-22 14:11:37,593,Gargoyles (SLG),Gargoyles
2,4,Amaze Ink Slave Labor Graphics,DCD416182,Royal Historian of Oz #1,1,2010-07-21 14:03:07,226,Royal Historian of Oz (SLG),Royal Historian of Oz
3,5,Amaze Ink Slave Labor Graphics,DCD416182,Royal Historian of Oz #1,1,2010-07-14 19:49:40,399,Royal Historian of Oz (SLG),Royal Historian of Oz
4,6,Amaze Ink Slave Labor Graphics,DCD416182,Royal Historian of Oz #1,1,2010-07-19 10:39:04,237,Royal Historian of Oz (SLG),Royal Historian of Oz


In [28]:
# Normalise characters in the titles: '&' -> 'and', '?' removed, '/' -> ' '.
# The vectorised .str accessor replaces the three per-element lambdas.
# regex=False keeps each pattern literal ('?' would otherwise be read as a
# regex quantifier), and missing values now pass through as NaN instead of
# raising AttributeError on `.replace`.
temp_df['title'] = (
    temp_df['title']
    .str.replace('&', 'and', regex=False)
    .str.replace('?', '', regex=False)
    .str.replace('/', ' ', regex=False)
)

In [29]:
# Distinct titles as a plain list, in order of first appearance.
titles = temp_df['title'].unique().tolist()

In [30]:
# Sample to work with: the first 300 distinct titles.
sample_titles = titles[:300]

In [31]:
sample_titles

['Filler Bunny',
 'Gargoyles',
 'Royal Historian of Oz',
 'Warlord of Io and Other Storie',
 'Afterlife With Archie',
 'Afterlife With Archie Magazin',
 'Archie',
 'Boys',
 'Archie and Friends Double Diges',
 'Archie and Friends Double Doubl',
 'Archie 1941',
 'Archie and Me Comics Digest',
 'Archie and Me Jumbo Comics Di',
 'Archie Collectors E',
 'Archie Comics Annual Digest',
 'Archie Comics Double Digest',
 'Archie Double Digest',
 'Archie Jumbo Comics Digest',
 'Archie Meets Batman 66',
 'Archie Meets Ramones One Sho',
 'Archie Vs Sharknado One Shot',
 'Archies',
 'Archies Halloween Spectacula',
 'Archies One Shot Cvr A Jaime',
 'Archies Superteens Vs Crusade',
 'B and V Friends Double Digest',
 'B and V Friends Halloween Annua',
 'B and V Friends Jumbo Comics Di',
 'Betty and Veronica',
 'Betty and Veronica Comics Annua',
 'Betty and Veronica Comics Diges',
 'Betty and Veronica Comics Doubl',
 'Betty and Veronica Friends Fore',
 'Betty and Veronica Holiday Annu',
 'Betty and Vero

Get the list of titles, sorted by total quantity sold (highest first).

In [32]:
# Total quantity sold per title, best sellers first.
qtys = (
    temp_df
    .groupby(['title'], as_index=False)['qty_sold']
    .sum()
    .sort_values(by='qty_sold', ascending=False)
)

In [33]:
qtys.head()

Unnamed: 0,title,qty_sold
553,Batman,9562
6699,Walking Dead,6856
229,Amazing Spider-Man,5828
5098,Saga,5542
5928,Superman,5197


In [34]:
# Titles to exclude from the remaining work: the first 300 entries of `titles`
# (the same slice as `sample_titles`; presumably already scraped -- TODO confirm).
done_titles = titles[:300]

In [35]:
# Keep only the rows of `qtys` whose title is not in `done_titles`.
titles_needed_df = qtys[~qtys['title'].isin(done_titles)]

In [36]:
titles_needed_df.shape

(6771, 2)

In [37]:
# Remaining titles as a plain list, keeping the order of `titles_needed_df`.
titles_need_list = titles_needed_df['title'].unique().tolist()

In [47]:
# Scratch arithmetic: the running total shown here (5352) is the value used as
# the resume offset `new_start`. NOTE(review): the individual terms look like
# per-session counts of titles already processed -- confirm.
#367+246+151
827+151+376+524+5+47+1662+3+162+155+15+295+927+143+60

5352

In [48]:
# Resume offset into titles_need_list, written as the explicit sum so the
# origin of the value stays visible; it evaluates to 5352.
# (1932 was noted next to the original assignment, presumably an earlier offset.)
new_start = sum((827, 151, 376, 524, 5, 47, 1662, 3, 162, 155, 15, 295, 927, 143, 60))

In [49]:
# Titles still to be searched: everything from index `new_start` onward.
titles_searching = titles_need_list[new_start:]

In [50]:
titles_searching

['Hack Slash Annual 2011 Hatche',
 'Disney Magic Kingdom Comics',
 'Death of Cerebus In Hell',
 'Robocop Beta One Sho',
 'Hellcyon',
 'Bomb Queen All Girl Spec',
 'Iron Man Titanium',
 'True Believers Venom Shiver',
 'Locke and Key Clockworks #1 and',
 'Death of Dracula',
 'Skeptics',
 'Iron Siege',
 'Irredeemable Incorruptible',
 'Batman Doc Savage Special',
 'Bolts',
 'Anthem',
 'Red Sonja Annual',
 'Sif',
 'Super Dinosaur Dlx Coloring B',
 'Die Hard Year One',
 '1 For $1 X',
 'Black Widow and Marvel Girls',
 'Batman Gotham After Midnight',
 'Cavalry Shield 50th Anniversa',
 'Df Nemesis',
 'Richard Starks Parker the Out',
 'Rem 8',
 'Die Kitty Die',
 'Devil Is Due In Dreary',
 'Lcsd 2015 All New All Differe',
 'Mighty Man (One Shot) (Mr',
 'Jim Hensons Labyrinth 2017 Sp',
 'Fall Out Toy Works',
 'Harbinger Wars 2 Prelude',
 'Retailer Summit 2019 Venom',
 'Last Space Race',
 'Disney Muppets Presents Meet',
 'Jingle Belle Gift Wrapped Spe',
 'Black Panther Sound and Fury',
 'Disney Gia

## It's the Scraping.

In [None]:
# Scrape covers for the pending titles. The whole list is handed to
# cs.scrape_series_covers in one call; judging by its output, it reports one
# "Scraped ..." or "... was skipped" line per title.
cs.scrape_series_covers(browser, titles_searching)

0.Hack Slash Annual 2011 Hatche was skipped. No title matched.
Scraped 1.Disney Magic Kingdom Comics!
Scraped 2.Death of Cerebus In Hell!
3.Robocop Beta One Sho was skipped. No title matched.
Scraped 4.Hellcyon!
Scraped 5.Bomb Queen All Girl Spec!
Scraped 6.Iron Man Titanium!
Scraped 7.True Believers Venom Shiver!
8.Locke and Key Clockworks #1 and was skipped. No title matched.
Scraped 9.Death of Dracula!
Scraped 10.Skeptics!
Scraped 11.Iron Siege!
Scraped 12.Irredeemable Incorruptible!
Scraped 13.Batman Doc Savage Special!
Scraped 14.Bolts!
Scraped 15.Anthem!
16.Red Sonja Annual was skipped. No title matched.
Scraped 17.Sif!
18.Super Dinosaur Dlx Coloring B was skipped. No title matched.
Scraped 19.Die Hard Year One!
20.1 For $1 X was skipped. No title matched.
21.Black Widow and Marvel Girls was skipped. No title matched.
Scraped 22.Batman Gotham After Midnight!
23.Cavalry Shield 50th Anniversa was skipped. No title matched.
24.Df Nemesis was skipped. No title matched.
25.Richard Sta

187.Tangled Hair Raising Adventur was skipped. No title matched.
Scraped 188.Millennium Girl Who Kicked th!
189.Cavalry Shield 50th Anniv was skipped. No title matched.
190.Tyrannosaurus Rex One-Shot Cv was skipped. No title matched.
191.Batman the Killing Joke New P was skipped. No title matched.
Scraped 192.Tarzan On the Planet of the A!
Scraped 193.Cave Carson Has a Interstella!
Scraped 194.Tarzan Once and Future Tarzan O!
195.Honey Badger Adventures was skipped. No title matched.
196.Tinseltown was skipped. No title matched.
Scraped 197.Captain Victory Graphite E!
198.Lil Battlestar Galactica was skipped. No title matched.
199.True Believers Venom Sybiosi was skipped. No title matched.
Scraped 200.Iron Man Coming of Melter!
Scraped 201.Iron Man Kiss and Kill!
Scraped 202.True Believers Venom Lethal P!
Scraped 203.Jack the Lantern!
Scraped 204.Blue Hour!
Scraped 205.True Believers Venom Flashpoi!
Scraped 206.Yo-Kai Watch!
Scraped 207.Jazz Legend!
208.Locke and Key Treasury E was ski

Scraped 366.Hellboy Darkness Calls!
367.Shonen Jump December 2011 (C was skipped. No title matched.
368.Ultimate Spider-Man Mgc was skipped. No title matched.
369.Wicked and Divine 1923 Cvr B Ko was skipped. No title matched.
Scraped 370.Punk Mambo!
371.Flash Gordon Mercy Wars was skipped. No title matched.
Scraped 372.Punisher Max Get Castle!
Scraped 373.Vertigo Resurrected the Eater!
374.Snake Eyes (Idw) was skipped. No title matched.
375.Foggy Notions (Mr was skipped. No title matched.
376.Avengers Earths Mightiest Her was skipped. No title matched.
377.Deadlands Death Was Silent On was skipped. No title matched.
Scraped 378.Protocol Orphans!
Scraped 379.Protocol!
Scraped 380.Son of Merlin!
381.Anita Blake Lc Necromancer was skipped. No title matched.
Scraped 382.Vertigo Resurrected Finals!
Scraped 383.Princeless Pirate Princess!
Scraped 384.Avengers Initiative!
Scraped 385.Powers Firsts!
386.Deadpool Mgc was skipped. No title matched.
Scraped 387.Slots #1 Cvr B Walking Dead!
Scrape

542.Disney Big Hero 6 Heroes San was skipped. No title matched.
Scraped 543.Shonen Jump!
544.Vu Handbook 2014 One Dollar D was skipped. No title matched.
545.Sherlock Holmes HC VOL 0 was skipped. No title matched.
546.Fall of Hulks Gamma 2nd Ptg M was skipped. No title matched.
Scraped 547.Voracious!
Scraped 548.Fallen World!
Scraped 549.Reich!
Scraped 550.Regular Show 25 Years Later!
551.Df Green Lantern was skipped. No title matched.
Scraped 552.Shiver Bureau!
Scraped 553.Wolfcop!
Scraped 554.Pilot Season the Beauty!
Scraped 555.Super Hero Squad Spectacular!
556.Gi Joe Infestation was skipped. No title matched.
557.Babybel Wax Bodysuit (One Sho was skipped. No title matched.
558.Gi Joe VOL 2 was skipped. No title matched.
559.Giant Days 2017 Specia was skipped. No title matched.
Scraped 560.Multiple Warheads!
Scraped 561.Giant Days Women Glow Men Plu!
Scraped 562.Super Human Resources!
Scraped 563.Crow Curare!
564.Alias Mgc was skipped. No title matched.
Scraped 565.Critical Millenni