# Scraping Comic Book Covers

**Goal**: Scrape comic book covers so we can use them as visual touchstones for users in the app.


### Libraries

In [1]:
import pandas as pd
import requests
import random
import time
import os

# Selenium
from selenium.webdriver import Firefox
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.options import Options
# Run Firefox without a visible window.
# The "-headless" argument is passed explicitly because the `Options.headless`
# property is deprecated in Selenium 4.8+ and was later removed; on versions
# without it, assigning `options.headless = True` no longer switches headless
# mode on. The explicit argument works on old and new versions alike.
options = Options()
options.add_argument("-headless")

# Data storage
from sqlalchemy import create_engine # SQL helper
import psycopg2 as psql #PostgreSQL DBs

# Custom
import lib.data_fcns as dfc
import lib.keys as keys  # Custom keys lib
import lib.comic_scraper as cs

### Initialize Browser

In [2]:
# Landing page of the site the covers are scraped from
url = "http://www.comicbookdb.com/"

# Start a Firefox session with the module-level `options` and open that page
browser = Firefox(options=options)
browser.get(url)

### Make list of Titles!

Get the list of titles whose covers we want to scrape.

In [3]:
# Build the path to the AWS PostgreSQL credentials file under the user's home
# directory. os.path.expanduser('~') is used instead of os.environ['HOME']
# because HOME is not guaranteed to be set (e.g. on Windows), in which case the
# direct environment lookup raises KeyError.
secret_path_aws = os.path.join(os.path.expanduser('~'), '.secret',
                               'aws_ps_flatiron.json')
secret_path_aws

'/Users/werlindo/.secret/aws_ps_flatiron.json'

In [4]:
# Read the database credentials from the secrets file
aws_keys = keys.get_keys(secret_path_aws)
user, ps, host, db = (
    aws_keys[field] for field in ('user', 'password', 'host', 'db_name')
)

# Connection URL of the form postgresql://user:password@host/db
# (presumably intended for SQLAlchemy's create_engine; not used in the visible cells)
aws_ps_engine = ''.join(['postgresql://', user, ':', ps, '@', host, '/', db])

# Open the psycopg2 connection to the same database
conn = psql.connect(database=db, user=user, password=ps, host=host, port='5432')

In [5]:
# Open a cursor on the PostgreSQL connection; queries are issued through it
cur = conn.cursor()

In [6]:
# Select every row and column of the comic_trans table
# (this fetches the full table; it does not count records).
query = """
    SELECT * from comic_trans;
"""

In [7]:
# Run the query on the open cursor; the rows are retrieved separately via cur.fetchall()
cur.execute(query)

In [8]:
# Load every fetched row into a DataFrame, naming the columns from the cursor's
# result metadata. The names are passed to the constructor rather than assigned
# afterwards: with an empty result set, pd.DataFrame([]) has zero columns and a
# later `temp_df.columns = [...]` raises ValueError (length mismatch), whereas
# the constructor form yields an empty frame with the right columns.
temp_df = pd.DataFrame(
    cur.fetchall(),
    columns=[col.name for col in cur.description],
)

In [9]:
temp_df.head(3)

Unnamed: 0,index,publisher,item_id,title_and_num,qty_sold,date_sold,account_num,comic_title
0,2,Amaze Ink Slave Labor Graphics,DCD151935,Filler Bunny #2,1,2011-08-14 18:01:03,174,Filler Bunny (SLG)
1,3,Amaze Ink Slave Labor Graphics,DCD341726,Gargoyles #6,1,2012-06-22 14:11:37,593,Gargoyles (SLG)
2,4,Amaze Ink Slave Labor Graphics,DCD416182,Royal Historian of Oz #1,1,2010-07-21 14:03:07,226,Royal Historian of Oz (SLG)


In [10]:
# Add a "title" column: each "title_and_num" value run through dfc.cut_issue_num
# (per the displayed rows, 'Filler Bunny #2' becomes 'Filler Bunny').
temp_df['title'] = temp_df['title_and_num'].apply(dfc.cut_issue_num)

In [11]:
temp_df.head()

Unnamed: 0,index,publisher,item_id,title_and_num,qty_sold,date_sold,account_num,comic_title,title
0,2,Amaze Ink Slave Labor Graphics,DCD151935,Filler Bunny #2,1,2011-08-14 18:01:03,174,Filler Bunny (SLG),Filler Bunny
1,3,Amaze Ink Slave Labor Graphics,DCD341726,Gargoyles #6,1,2012-06-22 14:11:37,593,Gargoyles (SLG),Gargoyles
2,4,Amaze Ink Slave Labor Graphics,DCD416182,Royal Historian of Oz #1,1,2010-07-21 14:03:07,226,Royal Historian of Oz (SLG),Royal Historian of Oz
3,5,Amaze Ink Slave Labor Graphics,DCD416182,Royal Historian of Oz #1,1,2010-07-14 19:49:40,399,Royal Historian of Oz (SLG),Royal Historian of Oz
4,6,Amaze Ink Slave Labor Graphics,DCD416182,Royal Historian of Oz #1,1,2010-07-19 10:39:04,237,Royal Historian of Oz (SLG),Royal Historian of Oz


In [12]:
# Spell out ampersands in every title ('&' -> 'and').
# The vectorised .str.replace with regex=False does a literal substitution and
# passes missing titles through as NaN, whereas the previous per-row lambda
# called x.replace on each value and raised AttributeError on a missing value.
temp_df['title'] = temp_df['title'].str.replace('&', 'and', regex=False)

In [13]:
# Distinct titles, in order of first appearance in temp_df
titles = temp_df['title'].unique().tolist()

In [14]:
# First 300 distinct titles, used as a sample batch.
# NOTE(review): 300 is a hard-coded batch size — consider a named constant.
sample_titles = titles[:300]

In [15]:
sample_titles

['Filler Bunny',
 'Gargoyles',
 'Royal Historian of Oz',
 'Warlord of Io and Other Storie',
 'Afterlife With Archie',
 'Afterlife With Archie Magazin',
 'Archie',
 'Boys',
 'Archie and Friends Double Diges',
 'Archie and Friends Double Doubl',
 'Archie 1941',
 'Archie and Me Comics Digest',
 'Archie and Me Jumbo Comics Di',
 'Archie Collectors E',
 'Archie Comics Annual Digest',
 'Archie Comics Double Digest',
 'Archie Double Digest',
 'Archie Jumbo Comics Digest',
 'Archie Meets Batman 66',
 'Archie Meets Ramones One Sho',
 'Archie Vs Sharknado One Shot',
 'Archies',
 'Archies Halloween Spectacula',
 'Archies One Shot Cvr A Jaime',
 'Archies Superteens Vs Crusade',
 'B and V Friends Double Digest',
 'B and V Friends Halloween Annua',
 'B and V Friends Jumbo Comics Di',
 'Betty and Veronica',
 'Betty and Veronica Comics Annua',
 'Betty and Veronica Comics Diges',
 'Betty and Veronica Comics Doubl',
 'Betty and Veronica Friends Fore',
 'Betty and Veronica Holiday Annu',
 'Betty and Vero

Get the list of titles, sorted by total quantity sold (best sellers first).

In [16]:
# Total quantity sold per title, ordered from best seller downwards
qtys = (
    temp_df
    .groupby(['title'], as_index=False)['qty_sold']
    .sum()
    .sort_values('qty_sold', ascending=False)
)

In [17]:
qtys.head()

Unnamed: 0,title,qty_sold
553,Batman,9562
6700,Walking Dead,6856
229,Amazing Spider-Man,5828
5098,Saga,5542
5929,Superman,5197


In [18]:
# Same slice as sample_titles: the first 300 distinct titles.
# NOTE(review): presumably these were already scraped in an earlier run, hence "done" — confirm.
done_titles = titles[:300]

In [19]:
# Keep only the rows of qtys whose title is not in done_titles
titles_needed_df = qtys[~qtys['title'].isin(done_titles)]

In [20]:
titles_needed_df.shape

(6773, 2)

In [21]:
# Remaining titles as a plain list, preserving the quantity-sold ordering of titles_needed_df
titles_need_list = titles_needed_df['title'].unique().tolist()

In [39]:
# Scratch arithmetic left in the notebook.
# NOTE(review): presumably a running tally of titles processed across scraping
# batches — confirm, and replace with a named, documented value if it is needed.
#367+246+151
827+151+376

1354

In [43]:
titles_need_list[1358:]

['Siege',
 'Flashpoint Deadman and the Fl',
 'Bat Mite',
 'Beasts of Burden Hellboy One',
 'Daredevil Annual',
 'Star Wars Han Solo Imperial C',
 'Other Dead',
 'Forever Evil Rogues Rebellio',
 '4 Kids Walk Into a Bank',
 'Batman Lost',
 'Baltimore Cult of the Red Kin',
 'Atomic Robo Flying She Devil',
 'Do Androids Dream Dust To Dus',
 'Occupy Avengers',
 'Dark Tower Gunslinger Man In',
 'Punks the Comic',
 'Abe Sapien Devil Does Not Jes',
 'Abe Sapien Abyssal Plain',
 'Shadow Year One',
 'Superman War of the Supermen',
 'Secret Wars Journal',
 'Chew Warrior Chicken Poyo',
 'Red Wolf',
 'Kevin Smith Green Hornet',
 'Wonder Woman Annual',
 'Kill the Minotaur',
 'Courtney Crumrin Ongoing',
 'Mass Effect Foundation',
 'Gft Red Agent',
 'All New X-Men Annual',
 'Artifacts',
 'Moonstruck',
 'Frankenstein Alive Alive',
 'Discipline',
 'Plastic',
 'Garth Ennis Red Team',
 'Fear Itself Uncanny X-Force',
 'Resident Alien',
 'Hellboy Buster Oakley Gets Hi',
 'True Believers Captain Marve',
 'Li

In [36]:
# Titles to scrape in this run: everything from position 980 onward in titles_need_list.
# NOTE(review): 980 is a hard-coded offset, presumably the point where a previous
# scraping run stopped — confirm. This cell's execution count (In [36]) is also
# lower than that of the cells shown before it, so the notebook was run out of order.
titles_searching = titles_need_list[980:]

In [37]:
titles_searching

['Rick and Morty Vs Dungeons and Dr',
 'Goon Once Upon a Hard Time',
 'Black Bolt',
 'Conan Road of Kings',
 'Constantine Hellblazer City o',
 'Trinity of Sin the Phantom St',
 'X-O Manowar (Ongoing)',
 'Death of Wolverine Logan Lega',
 'Ballad of Sang',
 'Star Wars Thrawn',
 'Phantom Stranger',
 'Larfleeze',
 'Airboy',
 'Superior Spider-Man Annual',
 'Ghost',
 'Suiciders',
 'Rasputin Voice of Dragon',
 'Spider-Man Life Story',
 'Godzilla Gangsters and Goliath',
 'Gft Robyn Hood Ongoing',
 'Animal Man Annual',
 'Axe Cop Bad Guy Earth',
 'Drumhellar',
 'Dark Tower Gunslinger',
 'Now Doctor Strange',
 'Godzilla Kingdom of Monsters',
 'Robin Rises Alpha',
 'Captain Midnight',
 'X-O Manowar',
 'Ad After Death Book 02 (of 3',
 'The Squidder',
 'Fantastic Four Annual',
 'Cry Havoc',
 'Wasteland',
 'Captain Atom',
 'Tales From the Darkside',
 'Bullseye',
 'Captain Marvel and Carol Corp',
 'Action Comics Annual',
 'Five Weapons',
 'Spider-Geddon',
 'Herc',
 'Umbral',
 'Dark Tower Gunslinger Wa

## It's the Scraping.

In [38]:
# Scrape covers for every title in titles_searching using the open browser session.
# NOTE(review): the recorded run aborted with
#   FileNotFoundError: './raw_data/covers/chew_/_revival.jpg'
# presumably because a title containing "/" is turned into a file name inside
# cs.scrape_series_covers, so "chew_" is treated as a missing directory —
# confirm and sanitise the file name there.
cs.scrape_series_covers(browser, titles_searching)

Scraped 0.Rick and Morty Vs Dungeons and Dr!
Scraped 1.Goon Once Upon a Hard Time!
Scraped 2.Black Bolt!
Scraped 3.Conan Road of Kings!
4.Constantine Hellblazer City o was skipped. No title matched.
Scraped 5.Trinity of Sin the Phantom St!
6.X-O Manowar (Ongoing) was skipped. No title matched.
Scraped 7.Death of Wolverine Logan Lega!
Scraped 8.Ballad of Sang!
Scraped 9.Star Wars Thrawn!
Scraped 10.Phantom Stranger!
Scraped 11.Larfleeze!
Scraped 12.Airboy!
13.Superior Spider-Man Annual was skipped. No title matched.
Scraped 14.Ghost!
Scraped 15.Suiciders!
Scraped 16.Rasputin Voice of Dragon!
Scraped 17.Spider-Man Life Story!
Scraped 18.Godzilla Gangsters and Goliath!
19.Gft Robyn Hood Ongoing was skipped. No title matched.
20.Animal Man Annual was skipped. No title matched.
Scraped 21.Axe Cop Bad Guy Earth!
Scraped 22.Drumhellar!
Scraped 23.Dark Tower Gunslinger!
24.Now Doctor Strange was skipped. No title matched.
Scraped 25.Godzilla Kingdom of Monsters!
Scraped 26.Robin Rises Alpha!
S

239.Stray Bullets the Killers was skipped. No title matched.
Scraped 240.Invaders!
Scraped 241.Chew Secret Agent Poyo!
Scraped 242.Night of Living Deadpool!
Scraped 243.Fiction!
Scraped 244.Shadow Roads!
Scraped 245.Bully Wars!
Scraped 246.Star Wars Adventures!
Scraped 247.Ducktales!
Scraped 248.The Damned!
Scraped 249.Jim Henson Labyrinth Coronati!
250.Quantum Age From World of Bla was skipped. No title matched.
Scraped 251.Red One!
252.Godzilla Rulers of the Earth was skipped. No title matched.
Scraped 253.Batman Futures End!
Scraped 254.Capt Victory and Galactic Range!
Scraped 255.Ruse!
256.Bprd Hell On Earth Gods was skipped. No title matched.
Scraped 257.Rai!
Scraped 258.Wolverine Max!
Scraped 259.Naomi!
Scraped 260.Ghostbusters!
Scraped 261.Storm Dogs!
Scraped 262.Cowboy Ninja Viking!
Scraped 263.Flashpoint Project Superman!
264.Star Trek City O/T Edge of Fo was skipped. No title matched.
Scraped 265.Roche Limit Clandestiny!
Scraped 266.The Lone Ranger!
Scraped 267.Stellar!
Scrap

FileNotFoundError: [Errno 2] No such file or directory: './raw_data/covers/chew_/_revival.jpg'