# Scraping Comic Book Covers

**Goal**: Scrape comic book covers so we can use them as visual touchstones for users in the app.


### Libraries

In [1]:
import os
import random
import time
from urllib.parse import quote_plus

import pandas as pd
import requests

# Selenium
from selenium.webdriver import Firefox
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.options import Options
# Firefox launch options shared by every browser session in this notebook.
options = Options()
# Run Firefox headless, i.e. without opening a visible browser window.
options.headless = True

# Data storage
from sqlalchemy import create_engine # SQL helper
import psycopg2 as psql #PostgreSQL DBs

# Custom
import lib.data_fcns as dfc
import lib.keys as keys  # Custom keys lib
import lib.comic_scraper as cs

### Initialize Browser

In [2]:
# Path to the Windows geckodriver executable inside the `drivers` folder of
# the current working directory.
driver_exe_path = os.path.join(
    os.getcwd(),
    'drivers',
    'geckodriver-windows.exe',
)

In [3]:
driver_exe_path

'D:\\dropbox\\flatiron\\capstone\\comics_rx\\drivers\\geckodriver-windows.exe'

In [4]:
# Start a Firefox session with the options and geckodriver path configured
# above, then load the ComicBookDB home page.
url = "http://www.comicbookdb.com/"
browser = Firefox(executable_path=driver_exe_path, options=options)
browser.get(url)

### Make list of Titles!

Get the list of titles whose covers we want to scrape.

In [5]:
# Define path to secret
# HOME is not a standard environment variable on Windows, so indexing
# os.environ['HOME'] directly can raise KeyError. Use HOME when it is set and
# otherwise fall back to the platform's home directory for the current user.
secret_path_aws = os.path.join(
    os.environ.get('HOME') or os.path.expanduser('~'),
    '.secret',
    'aws_ps_flatiron.json',
)
secret_path_aws

'C:\\Users\\werlindo\\.secret\\aws_ps_flatiron.json'

In [6]:
# Read the database credentials via the project's keys helper.
# NOTE(review): keys.get_keys is assumed to return a mapping that contains the
# four entries read below — confirm against lib/keys.
aws_keys = keys.get_keys(secret_path_aws)
user = aws_keys['user']
ps = aws_keys['password']
host = aws_keys['host']
db = aws_keys['db_name']

# PostgreSQL connection URL. The user name and password are percent-encoded so
# that characters such as '@', ':' or '/' inside the credentials cannot be
# mistaken for URL delimiters; credentials without such characters produce
# exactly the same string as plain concatenation.
aws_ps_engine = (
    'postgresql://' + quote_plus(user) + ':' + quote_plus(ps)
    + '@' + host + '/' + db
)

# Setup PSQL connection (psycopg2 takes the raw, un-encoded credentials).
conn = psql.connect(
    database=db,
    user=user,
    password=ps,
    host=host,
    port='5432'
)

In [7]:
# Open a cursor on the PostgreSQL connection; the query below is run on it.
cur = conn.cursor()

In [8]:
# Select every row (all columns) from the comic_trans table.
query = """
    SELECT * from comic_trans;
"""

In [9]:
# Execute the query on the open cursor; the rows are fetched in the next cell.
cur.execute(query)

In [10]:
# Load the query results into a DataFrame.
# Column names come from the cursor description and are passed straight to the
# constructor: assigning `.columns` afterwards raises a length-mismatch
# ValueError when the result set is empty, because the frame then has zero
# columns.
temp_df = pd.DataFrame(
    cur.fetchall(),
    columns=[col.name for col in cur.description],
)

In [11]:
temp_df.head(3)

Unnamed: 0,index,publisher,item_id,title_and_num,qty_sold,date_sold,account_num,comic_title
0,2,Amaze Ink Slave Labor Graphics,DCD151935,Filler Bunny #2,1,2011-08-14 18:01:03,174,Filler Bunny (SLG)
1,3,Amaze Ink Slave Labor Graphics,DCD341726,Gargoyles #6,1,2012-06-22 14:11:37,593,Gargoyles (SLG)
2,4,Amaze Ink Slave Labor Graphics,DCD416182,Royal Historian of Oz #1,1,2010-07-21 14:03:07,226,Royal Historian of Oz (SLG)


In [12]:
# Derive a `title` column by running the project helper dfc.cut_issue_num over
# each `title_and_num` value (per the output below, "Gargoyles #6" becomes
# "Gargoyles").
temp_df['title'] = temp_df.title_and_num.apply(dfc.cut_issue_num)

In [13]:
temp_df.head()

Unnamed: 0,index,publisher,item_id,title_and_num,qty_sold,date_sold,account_num,comic_title,title
0,2,Amaze Ink Slave Labor Graphics,DCD151935,Filler Bunny #2,1,2011-08-14 18:01:03,174,Filler Bunny (SLG),Filler Bunny
1,3,Amaze Ink Slave Labor Graphics,DCD341726,Gargoyles #6,1,2012-06-22 14:11:37,593,Gargoyles (SLG),Gargoyles
2,4,Amaze Ink Slave Labor Graphics,DCD416182,Royal Historian of Oz #1,1,2010-07-21 14:03:07,226,Royal Historian of Oz (SLG),Royal Historian of Oz
3,5,Amaze Ink Slave Labor Graphics,DCD416182,Royal Historian of Oz #1,1,2010-07-14 19:49:40,399,Royal Historian of Oz (SLG),Royal Historian of Oz
4,6,Amaze Ink Slave Labor Graphics,DCD416182,Royal Historian of Oz #1,1,2010-07-19 10:39:04,237,Royal Historian of Oz (SLG),Royal Historian of Oz


In [49]:
# Normalise punctuation in the titles: '&' becomes 'and', '?' is dropped and
# '/' becomes a space.
# Vectorised string methods replace the three per-row lambdas. `regex=False`
# keeps the patterns literal ('?' is a regex metacharacter), and missing values
# are propagated instead of raising AttributeError.
temp_df['title'] = (
    temp_df['title']
    .str.replace('&', 'and', regex=False)
    .str.replace('?', '', regex=False)
    .str.replace('/', ' ', regex=False)
)

In [50]:
# Distinct titles as a plain Python list, in order of first appearance.
titles = temp_df['title'].unique().tolist()

In [51]:
# Take at most the first 300 unique titles as a smaller sample.
sample_titles = titles[:300]

In [52]:
sample_titles

['Filler Bunny',
 'Gargoyles',
 'Royal Historian of Oz',
 'Warlord of Io and Other Storie',
 'Afterlife With Archie',
 'Afterlife With Archie Magazin',
 'Archie',
 'Boys',
 'Archie and Friends Double Diges',
 'Archie and Friends Double Doubl',
 'Archie 1941',
 'Archie and Me Comics Digest',
 'Archie and Me Jumbo Comics Di',
 'Archie Collectors E',
 'Archie Comics Annual Digest',
 'Archie Comics Double Digest',
 'Archie Double Digest',
 'Archie Jumbo Comics Digest',
 'Archie Meets Batman 66',
 'Archie Meets Ramones One Sho',
 'Archie Vs Sharknado One Shot',
 'Archies',
 'Archies Halloween Spectacula',
 'Archies One Shot Cvr A Jaime',
 'Archies Superteens Vs Crusade',
 'B and V Friends Double Digest',
 'B and V Friends Halloween Annua',
 'B and V Friends Jumbo Comics Di',
 'Betty and Veronica',
 'Betty and Veronica Comics Annua',
 'Betty and Veronica Comics Diges',
 'Betty and Veronica Comics Doubl',
 'Betty and Veronica Friends Fore',
 'Betty and Veronica Holiday Annu',
 'Betty and Vero

Get the list of titles, sorted by total quantity sold (highest first).

In [53]:
# Total quantity sold per title, best sellers first.
qtys = (
    temp_df
    .groupby(['title'], as_index=False)['qty_sold']
    .sum()
    .sort_values(by='qty_sold', ascending=False)
)

In [54]:
qtys.head()

Unnamed: 0,title,qty_sold
553,Batman,9562
6699,Walking Dead,6856
229,Amazing Spider-Man,5828
5098,Saga,5542
5928,Superman,5197


In [55]:
# At most the first 300 unique titles — the same slice as `sample_titles`.
# NOTE(review): presumably these were already scraped in an earlier run; they
# are excluded from the remaining work in the next cell — confirm.
done_titles = titles[:300]

In [56]:
# Keep only the rows of `qtys` whose title is not in `done_titles`.
titles_needed_df = qtys[~qtys.title.isin(done_titles)]

In [57]:
titles_needed_df.shape

(6771, 2)

In [58]:
# Remaining titles as a plain list, keeping the sorted-by-quantity row order.
titles_need_list = titles_needed_df['title'].unique().tolist()

In [87]:
# Scratch sum whose result (3927) is used as `new_start` in the next cell.
# NOTE(review): the addends look like per-run counts of titles already
# processed, and the commented-out line like an earlier tally — confirm.
#367+246+151
827+151+376+524+5+47+1662+3+162+155+15

3927

In [88]:
# Index into `titles_need_list` at which the next slice starts. The value is
# spelled out as the sum it comes from (3927) instead of a bare literal.
# NOTE(review): 1932 was noted inline next to the old literal, presumably an
# earlier offset — confirm.
new_start = sum((827, 151, 376, 524, 5, 47, 1662, 3, 162, 155, 15))

In [89]:
# Titles from position `new_start` to the end of the list; this is the batch
# handed to the scraper below.
titles_searching = titles_need_list[new_start:]

In [90]:
titles_searching

['DC Comics Presents the Metal',
 'Onslaught Unleashed',
 'Star Trek Year Five',
 'Web of Venom Cult of Carnage',
 'All Star Section 8',
 'Klaus and Crying Snowman',
 '68 Last Rights',
 'DC Comics Presents Green Lant',
 'Oberon',
 'Axis Hobgoblin',
 'Emily and the Strangers Breakin',
 'Pantha',
 'Star Wars Aor Anakin Skywalke',
 'Black Af Widows and Orphans',
 'Kull Eternal',
 'Savage Skullkickers',
 'Peter Parker',
 'Spider-Women Omega',
 'Aphrodite IX Hidden Files',
 'Gwar Orgasmageddon',
 'Fall of Hulks Savage She-Hulk',
 'Muppet King Arthur',
 'DC Comics Presents J H Willia',
 'Agent Carter Shield 50th Anni',
 'Sinestro Annual',
 'Wwe Then Now Forever',
 'New Romancer',
 'Msh Adventures Spider-Man Acr',
 'Rodd Racer One Sho',
 'Star Wars Aor Jabba the Hutt',
 'Night Moves',
 'Now Moon Girl and Devil Dinos',
 'Crypt of Shadows',
 'DC Comics Presents Jack Cros',
 'Resident Alien the Man With N',
 'Rowans Ruin',
 'My Little Pony Annual 2017',
 'Victorie City',
 'Dinosaurs Attack',
 'R

## It's the Scraping.

In [None]:
# Run the project scraper over every title in `titles_searching`, reusing the
# browser session opened above. Per the output below, it reports one line per
# title: either "Scraped <i>.<title>!" or "<i>.<title> was skipped. No title
# matched."
cs.scrape_series_covers(browser, titles_searching)

Scraped 0.DC Comics Presents the Metal!
Scraped 1.Onslaught Unleashed!
Scraped 2.Star Trek Year Five!
Scraped 3.Web of Venom Cult of Carnage!
4.All Star Section 8 was skipped. No title matched.
Scraped 5.Klaus and Crying Snowman!
6.68 Last Rights was skipped. No title matched.
Scraped 7.DC Comics Presents Green Lant!
Scraped 8.Oberon!
Scraped 9.Axis Hobgoblin!
10.Emily and the Strangers Breakin was skipped. No title matched.
Scraped 11.Pantha!
12.Star Wars Aor Anakin Skywalke was skipped. No title matched.
13.Black Af Widows and Orphans was skipped. No title matched.
Scraped 14.Kull Eternal!
Scraped 15.Savage Skullkickers!
Scraped 16.Peter Parker!
Scraped 17.Spider-Women Omega!
Scraped 18.Aphrodite IX Hidden Files!
Scraped 19.Gwar Orgasmageddon!
Scraped 20.Fall of Hulks Savage She-Hulk!
Scraped 21.Muppet King Arthur!
22.DC Comics Presents J H Willia was skipped. No title matched.
23.Agent Carter Shield 50th Anni was skipped. No title matched.
24.Sinestro Annual was skipped. No title ma