# Scraping Comic Book Covers

**Goal**: Scrape comic covers so can use them as visual touchstones for users in the app.


### Libraries

In [1]:
import pandas as pd
import requests
import random
import time
import os

# Selenium
from selenium.webdriver import Firefox
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.options import Options
options = Options()
options.headless = True

# Data storage
from sqlalchemy import create_engine # SQL helper
import psycopg2 as psql #PostgreSQL DBs

# Custom
import lib.data_fcns as dfc
import lib.keys as keys  # Custom keys lib
import lib.comic_scraper as cs

### Initialize Browser

In [5]:
driver_exe_path = os.path.join(os.getcwd(), 'geckodriver-windows.exe')

In [6]:
driver_exe_path

'D:\\dropbox\\flatiron\\capstone\\comics_rx\\geckodriver-windows.exe'

In [7]:
browser = Firefox(options=options, executable_path=driver_exe_path)
url = "http://www.comicbookdb.com/"
browser.get(url)

### Make list of Titles!

Get list of titles to scrape covers.

In [8]:
# Define path to secret
secret_path_aws = os.path.join(os.environ['HOME'], '.secret', 
                           'aws_ps_flatiron.json')
secret_path_aws

'C:\\Users\\werlindo\\.secret\\aws_ps_flatiron.json'

In [9]:
aws_keys = keys.get_keys(secret_path_aws)
user = aws_keys['user']
ps = aws_keys['password']
host = aws_keys['host']
db = aws_keys['db_name']

aws_ps_engine = ('postgresql://' + user + ':' + ps + '@' + host + '/' + db)

# Setup PSQL connection
conn = psql.connect(
    database=db,
    user=user,
    password=ps,
    host=host,
    port='5432'
)

In [10]:
# Instantiate cursor
cur = conn.cursor()

In [11]:
#  Count records.
query = """
    SELECT * from comic_trans;
"""

In [12]:
# Execute the query
cur.execute(query)

In [13]:
# Check results
temp_df = pd.DataFrame(cur.fetchall())
temp_df.columns = [col.name for col in cur.description]

In [14]:
temp_df.head(3)

Unnamed: 0,index,publisher,item_id,title_and_num,qty_sold,date_sold,account_num,comic_title
0,2,Amaze Ink Slave Labor Graphics,DCD151935,Filler Bunny #2,1,2011-08-14 18:01:03,174,Filler Bunny (SLG)
1,3,Amaze Ink Slave Labor Graphics,DCD341726,Gargoyles #6,1,2012-06-22 14:11:37,593,Gargoyles (SLG)
2,4,Amaze Ink Slave Labor Graphics,DCD416182,Royal Historian of Oz #1,1,2010-07-21 14:03:07,226,Royal Historian of Oz (SLG)


In [15]:
temp_df['title'] = ( temp_df['title_and_num'].apply(dfc.cut_issue_num) )

In [16]:
temp_df.head()

Unnamed: 0,index,publisher,item_id,title_and_num,qty_sold,date_sold,account_num,comic_title,title
0,2,Amaze Ink Slave Labor Graphics,DCD151935,Filler Bunny #2,1,2011-08-14 18:01:03,174,Filler Bunny (SLG),Filler Bunny
1,3,Amaze Ink Slave Labor Graphics,DCD341726,Gargoyles #6,1,2012-06-22 14:11:37,593,Gargoyles (SLG),Gargoyles
2,4,Amaze Ink Slave Labor Graphics,DCD416182,Royal Historian of Oz #1,1,2010-07-21 14:03:07,226,Royal Historian of Oz (SLG),Royal Historian of Oz
3,5,Amaze Ink Slave Labor Graphics,DCD416182,Royal Historian of Oz #1,1,2010-07-14 19:49:40,399,Royal Historian of Oz (SLG),Royal Historian of Oz
4,6,Amaze Ink Slave Labor Graphics,DCD416182,Royal Historian of Oz #1,1,2010-07-19 10:39:04,237,Royal Historian of Oz (SLG),Royal Historian of Oz


In [17]:
temp_df['title'] = temp_df['title'].apply(lambda x : x.replace('&' ,'and'))

In [18]:
titles = list(temp_df['title'].unique())

In [19]:
sample_titles = titles[:300]

In [20]:
sample_titles

['Filler Bunny',
 'Gargoyles',
 'Royal Historian of Oz',
 'Warlord of Io and Other Storie',
 'Afterlife With Archie',
 'Afterlife With Archie Magazin',
 'Archie',
 'Boys',
 'Archie and Friends Double Diges',
 'Archie and Friends Double Doubl',
 'Archie 1941',
 'Archie and Me Comics Digest',
 'Archie and Me Jumbo Comics Di',
 'Archie Collectors E',
 'Archie Comics Annual Digest',
 'Archie Comics Double Digest',
 'Archie Double Digest',
 'Archie Jumbo Comics Digest',
 'Archie Meets Batman 66',
 'Archie Meets Ramones One Sho',
 'Archie Vs Sharknado One Shot',
 'Archies',
 'Archies Halloween Spectacula',
 'Archies One Shot Cvr A Jaime',
 'Archies Superteens Vs Crusade',
 'B and V Friends Double Digest',
 'B and V Friends Halloween Annua',
 'B and V Friends Jumbo Comics Di',
 'Betty and Veronica',
 'Betty and Veronica Comics Annua',
 'Betty and Veronica Comics Diges',
 'Betty and Veronica Comics Doubl',
 'Betty and Veronica Friends Fore',
 'Betty and Veronica Holiday Annu',
 'Betty and Vero

Get list, sorted by qty sold

In [21]:
qtys = temp_df.groupby(['title'], as_index=False).qty_sold.sum().sort_values(by=['qty_sold'], ascending=False)

In [22]:
qtys.head()

Unnamed: 0,title,qty_sold
553,Batman,9562
6700,Walking Dead,6856
229,Amazing Spider-Man,5828
5098,Saga,5542
5929,Superman,5197


In [23]:
done_titles = titles[:300]

In [24]:
titles_needed_df = qtys.loc[~qtys['title'].isin(done_titles)]

In [25]:
titles_needed_df.shape

(6773, 2)

In [26]:
titles_need_list = list(titles_needed_df.title.unique())

In [48]:
#367+246+151
827+151+376+524+5+47

1930

In [53]:
new_start=1932

In [54]:
titles_searching = titles_need_list[new_start:]

In [55]:
titles_searching

['Monocyte',
 'Original Sin Annual',
 'Disney Frozen',
 'Miss Fury',
 'Mighty Thor Gates of Valhall',
 'Secret Empire Omega',
 'Stan Lee Starborn',
 'Few',
 'Hellboy Bprd 1953 Witch Tree',
 'Transformers Sins of Wrecker',
 'Inhumanity Spider-Man',
 'Hellboy and Bprd 1953 Phantom H',
 'Kingsman Red Diamond',
 'Dead Rabbit',
 'Realm of Kings Inhumans',
 'Goners',
 'Batwoman Rebirth',
 'Usagi Yojimbo Senso',
 'Gft Jungle Book Fall Wild',
 'Archer and Armstrong (Vu)',
 'Baltimore Infernal Train',
 'Spider-Island Cloak and Dagge',
 'Hex Wives',
 'Death Vigil',
 'Future Quest Presents',
 'Convergence New Teen Titans',
 'Now Mighty Thor',
 'Batman the Red Death',
 'It Will All Hurt',
 'Mystery Society',
 'My Little Pony Micro Series',
 'Analog',
 'Forever Evil Argus',
 'Fear Itself Spider-Man',
 'Superman Secret Origin',
 'Batman the Drowned',
 'Farscape Ongoing',
 'Batman and Carrie Kelley',
 'Red She-Hulk',
 'Now Captain America Steve Rog',
 'Convergence Adventures of Sup',
 'Transformers P

## It's the Scraping.

In [None]:
# for title in sample_titles:
# #     print(title)
cs.scrape_series_covers(browser, titles_searching)

Scraped 0.Monocyte!
1.Original Sin Annual was skipped. No title matched.
Scraped 2.Disney Frozen!
Scraped 3.Miss Fury!
Scraped 4.Mighty Thor Gates of Valhall!
Scraped 5.Secret Empire Omega!
6.Stan Lee Starborn was skipped. No title matched.
Scraped 7.Few!
8.Hellboy Bprd 1953 Witch Tree was skipped. No title matched.
Scraped 9.Transformers Sins of Wrecker!
Scraped 10.Inhumanity Spider-Man!
11.Hellboy and Bprd 1953 Phantom H was skipped. No title matched.
Scraped 12.Kingsman Red Diamond!
Scraped 13.Dead Rabbit!
Scraped 14.Realm of Kings Inhumans!
Scraped 15.Goners!
Scraped 16.Batwoman Rebirth!
Scraped 17.Usagi Yojimbo Senso!
18.Gft Jungle Book Fall Wild was skipped. No title matched.
19.Archer and Armstrong (Vu) was skipped. No title matched.
Scraped 20.Baltimore Infernal Train!
Scraped 21.Spider-Island Cloak and Dagge!
Scraped 22.Hex Wives!
Scraped 23.Death Vigil!
Scraped 24.Future Quest Presents!
Scraped 25.Convergence New Teen Titans!
26.Now Mighty Thor was skipped. No title matched.


Scraped 216.Rocket!
Scraped 217.Marvel Zombies Supreme!
218.Robert E Howards Savage Swor was skipped. No title matched.
Scraped 219.Convergence Supergirl Matrix!
Scraped 220.Princess Ugg!
Scraped 221.Twilight Zone!
222.Black Hammer Cthu-Louise Cvr was skipped. No title matched.
Scraped 223.Dry County!
Scraped 224.Angel City!
Scraped 225.Misfit City!
Scraped 226.True Believers Miles Morales!
227.Dirk Gentlys Holistic Detecti was skipped. No title matched.
Scraped 228.Royals Masters of War!
Scraped 229.Death of Hawkman!
Scraped 230.American Monster!
231.Army of Darkness Bubba Hotep was skipped. No title matched.
Scraped 232.Battlestar Galactica!
233.Star Wars Galaxys Edge was skipped. No title matched.
Scraped 234.Aquaman Justice League Drowne!
Scraped 235.Age of Conan Belit!
Scraped 236.Vertigo Resurrected!
Scraped 237.What If Age of Ultron!
Scraped 238.Rat God!
239.Batman and Ras Al Ghul was skipped. No title matched.
240.Harley Quinn Annual was skipped. No title matched.
241.Spider-Gw

Scraped 431.Image Firsts Proof!
Scraped 432.Miniature Jesus!
Scraped 433.Multiple Warheads Ghost Thron!
Scraped 434.War of Realms Spider-Man and Le!
Scraped 435.Wolverine Infinity Watch!
436.Wicked and Divine Christmas Ann was skipped. No title matched.
Scraped 437.She Could Fly!
438.Codename Baboushka Conclave o was skipped. No title matched.
439.Btvs Season 11 Giles was skipped. No title matched.
Scraped 440.Thor For Asgard!
Scraped 441.Guarding the Globe!
Scraped 442.Infinity Wars Fallen Guardia!
Scraped 443.X-Files Year Zero!
444.Stephen King Joe Hill Road Ra was skipped. No title matched.
Scraped 445.National Comics Eternity!
Scraped 446.Sweets!
447.Dastardly and Muttley was skipped. No title matched.
Scraped 448.Creator Owned Heroes!
Scraped 449.Transformers Drift Empire of!
Scraped 450.Phantom Lady!
Scraped 451.Fantomex Max!
Scraped 452.Heartthrob!
Scraped 453.Shadowland Power Man!
Scraped 454.Kid Lobotomy!
Scraped 455.Last Gang In Town!
456.Harley Quinn Valentines Day S was ski