# Scraping Comic Book Covers

**Goal**: Scrape comic book covers so we can use them as visual touchstones for users in the app.


### Libraries

In [1]:
import pandas as pd
import requests
import random
import time
import os

# Selenium
from selenium.webdriver import Firefox
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.options import Options
# Run Firefox without a visible window. Passing the '-headless' argument
# directly has the same effect as the `options.headless = True` setter, and
# keeps working on Selenium releases where that setter is deprecated.
options = Options()
options.add_argument('-headless')

# Data storage
from sqlalchemy import create_engine # SQL helper
import psycopg2 as psql #PostgreSQL DBs

# Custom
import lib.data_fcns as dfc
import lib.keys as keys  # Custom keys lib
import lib.comic_scraper as cs

### Initialize Browser

In [2]:
# Path to the geckodriver binary, expected under ./drivers of the current working directory
driver_dir = os.path.join(os.getcwd(), 'drivers')
driver_exe_path = os.path.join(driver_dir, 'geckodriver-windows.exe')

In [3]:
driver_exe_path

'D:\\dropbox\\flatiron\\capstone\\comics_rx\\drivers\\geckodriver-windows.exe'

In [4]:
# Start Firefox through the local geckodriver (using the options configured
# in the imports cell) and open the comicbookdb landing page.
url = "http://www.comicbookdb.com/"
browser = Firefox(executable_path=driver_exe_path, options=options)
browser.get(url)

### Make list of Titles!

Get the list of titles whose covers we want to scrape.

In [5]:
# Define path to secret
# HOME is frequently not defined on Windows; fall back to the home directory
# resolved by os.path.expanduser instead of failing with a KeyError.
home_dir = os.environ.get('HOME', os.path.expanduser('~'))
secret_path_aws = os.path.join(home_dir, '.secret',
                               'aws_ps_flatiron.json')
secret_path_aws

'C:\\Users\\werlindo\\.secret\\aws_ps_flatiron.json'

In [6]:
# Read the database credentials from the secrets file
aws_keys = keys.get_keys(secret_path_aws)
user, ps, host, db = (
    aws_keys[field] for field in ('user', 'password', 'host', 'db_name')
)

# Connection URL string (presumably meant for sqlalchemy.create_engine; not used in this cell)
aws_ps_engine = ''.join(['postgresql://', user, ':', ps, '@', host, '/', db])

# Setup PSQL connection
conn = psql.connect(database=db, user=user, password=ps, host=host, port='5432')

In [7]:
# Open a cursor on the PostgreSQL connection; queries are executed through it
cur = conn.cursor()

In [8]:
# Select every row and column of comic_trans.
# (This fetches the full table; it does not count records.)
query = """
    SELECT * from comic_trans;
"""

In [9]:
# Execute the query through the cursor; the rows are retrieved afterwards with cur.fetchall()
cur.execute(query)

In [10]:
# Load the query result into a DataFrame, naming the columns from the cursor
# metadata. Supplying `columns=` to the constructor (rather than assigning
# them afterwards) also works when the query returns zero rows, where the
# later assignment would fail with a length-mismatch error.
temp_df = pd.DataFrame(
    cur.fetchall(),
    columns=[col.name for col in cur.description],
)

In [11]:
temp_df.head(3)

Unnamed: 0,index,publisher,item_id,title_and_num,qty_sold,date_sold,account_num,comic_title
0,2,Amaze Ink Slave Labor Graphics,DCD151935,Filler Bunny #2,1,2011-08-14 18:01:03,174,Filler Bunny (SLG)
1,3,Amaze Ink Slave Labor Graphics,DCD341726,Gargoyles #6,1,2012-06-22 14:11:37,593,Gargoyles (SLG)
2,4,Amaze Ink Slave Labor Graphics,DCD416182,Royal Historian of Oz #1,1,2010-07-21 14:03:07,226,Royal Historian of Oz (SLG)


In [12]:
# Build the "title" column by running the project helper dfc.cut_issue_num
# over "title_and_num" (e.g. 'Filler Bunny #2' -> 'Filler Bunny' in the preview below).
title_and_num = temp_df['title_and_num']
temp_df['title'] = title_and_num.apply(dfc.cut_issue_num)

In [13]:
temp_df.head()

Unnamed: 0,index,publisher,item_id,title_and_num,qty_sold,date_sold,account_num,comic_title,title
0,2,Amaze Ink Slave Labor Graphics,DCD151935,Filler Bunny #2,1,2011-08-14 18:01:03,174,Filler Bunny (SLG),Filler Bunny
1,3,Amaze Ink Slave Labor Graphics,DCD341726,Gargoyles #6,1,2012-06-22 14:11:37,593,Gargoyles (SLG),Gargoyles
2,4,Amaze Ink Slave Labor Graphics,DCD416182,Royal Historian of Oz #1,1,2010-07-21 14:03:07,226,Royal Historian of Oz (SLG),Royal Historian of Oz
3,5,Amaze Ink Slave Labor Graphics,DCD416182,Royal Historian of Oz #1,1,2010-07-14 19:49:40,399,Royal Historian of Oz (SLG),Royal Historian of Oz
4,6,Amaze Ink Slave Labor Graphics,DCD416182,Royal Historian of Oz #1,1,2010-07-19 10:39:04,237,Royal Historian of Oz (SLG),Royal Historian of Oz


In [49]:
# Normalise awkward characters in the titles: '&' -> 'and', drop '?', '/' -> ' '.
# (Presumably because they cause trouble in search terms or file names - confirm.)
# Vectorised literal replacements: regex=False keeps '?' from being read as a
# regex quantifier, and .str leaves missing values as NaN instead of raising
# the AttributeError a per-element lambda would.
temp_df['title'] = (
    temp_df['title']
    .str.replace('&', 'and', regex=False)
    .str.replace('?', '', regex=False)
    .str.replace('/', ' ', regex=False)
)

In [50]:
# Distinct titles, in order of first appearance in temp_df
titles = temp_df['title'].unique().tolist()

In [51]:
# Smaller working sample: the first 300 unique titles
sample_titles = titles[:300]

In [52]:
sample_titles

['Filler Bunny',
 'Gargoyles',
 'Royal Historian of Oz',
 'Warlord of Io and Other Storie',
 'Afterlife With Archie',
 'Afterlife With Archie Magazin',
 'Archie',
 'Boys',
 'Archie and Friends Double Diges',
 'Archie and Friends Double Doubl',
 'Archie 1941',
 'Archie and Me Comics Digest',
 'Archie and Me Jumbo Comics Di',
 'Archie Collectors E',
 'Archie Comics Annual Digest',
 'Archie Comics Double Digest',
 'Archie Double Digest',
 'Archie Jumbo Comics Digest',
 'Archie Meets Batman 66',
 'Archie Meets Ramones One Sho',
 'Archie Vs Sharknado One Shot',
 'Archies',
 'Archies Halloween Spectacula',
 'Archies One Shot Cvr A Jaime',
 'Archies Superteens Vs Crusade',
 'B and V Friends Double Digest',
 'B and V Friends Halloween Annua',
 'B and V Friends Jumbo Comics Di',
 'Betty and Veronica',
 'Betty and Veronica Comics Annua',
 'Betty and Veronica Comics Diges',
 'Betty and Veronica Comics Doubl',
 'Betty and Veronica Friends Fore',
 'Betty and Veronica Holiday Annu',
 'Betty and Vero

Get the list of titles, sorted by total quantity sold (highest first).

In [53]:
# Total quantity sold per title, best sellers first
qtys = (
    temp_df
    .groupby(['title'], as_index=False)['qty_sold']
    .sum()
    .sort_values(by=['qty_sold'], ascending=False)
)

In [54]:
qtys.head()

Unnamed: 0,title,qty_sold
553,Batman,9562
6699,Walking Dead,6856
229,Amazing Spider-Man,5828
5098,Saga,5542
5928,Superman,5197


In [55]:
# Titles treated as already done: the same first-300 slice used for sample_titles.
# NOTE(review): presumably these were scraped in an earlier run - confirm.
done_titles = titles[:300]

In [56]:
# Drop the titles listed in done_titles; the remaining rows keep the qty_sold ordering of qtys
already_done = qtys['title'].isin(done_titles)
titles_needed_df = qtys.loc[~already_done]

In [57]:
titles_needed_df.shape

(6771, 2)

In [58]:
# Remaining titles as a plain list, in the same order as titles_needed_df
titles_need_list = titles_needed_df['title'].unique().tolist()

In [108]:
#367+246+151
# NOTE(review): scratch arithmetic whose total is used as `new_start` in the next cell.
# Presumably each term is the number of titles processed in one earlier scraping run -
# confirm, and record where these counts come from.
827+151+376+524+5+47+1662+3+162+155+15+295+927

5149

In [109]:
# Resume offset into titles_need_list: the first 5149 entries are skipped.
# NOTE(review): hard-coded from the sum in the previous cell; the trailing 1932
# looks like an earlier offset - confirm.
new_start=5149 #1932

In [110]:
# Titles still to be scraped: everything from index new_start onward
titles_searching = titles_need_list[new_start:]

In [111]:
titles_searching

['Supermansion',
 'Marvel Boy Uranian',
 'Green Lantern #23.4 Sinestro',
 'Uncle Grandpa Good Morning Sp',
 'A Distant Soil',
 'DC Presents Lois and Clark 100',
 'Star Trek Captains Log Pike',
 'Saga of the Swamp Thing',
 'Orson Scott Cards Speaker Fo',
 'Thor Where Walk the Frost Gia',
 'Namora',
 'Daredevil Mgc',
 'Avengers Origins Vision',
 'Avengers Origins Thor',
 'Time Lincoln Jack To the Futu',
 'Dark Knight Iii the Master Ra',
 'Immortal Brothers Green Knigh',
 'Wrath of the Titans Revenge',
 'Avengers Origins Quicksilver',
 'Amber Blake',
 'Ryder On the Storm',
 'Kiss Solo',
 'Street Tiger',
 'Heroic Age Villains',
 'Timely Comics Uncanny Inhuman',
 'Spider-Man Far From Home Prel',
 'Spider-Man Emergence Evil Jac',
 'Tails of Pet Avenger',
 'Grimm V2',
 'Magical Beatdown',
 'True Believers Star Wars Orig',
 'True Believers Hulk Wedding o',
 'Heroic Age X-Men',
 'Gft Dance of the Dead',
 'Spike 100 Page Spectacular (O',
 'Millarworld Annual 2016',
 'Judge Dredd Mega City Two Di

## It's the Scraping.

In [112]:
# Scrape covers for every remaining title using the project scraper.
# NOTE(review): the recorded run below aborted with
#   OSError: [Errno 22] Invalid argument: './raw_data/covers/holy_f*ck.jpg'
# '*' is not a legal file-name character on Windows. The file name is built inside
# cs.scrape_series_covers (not visible here); presumably it needs to strip such
# characters - TODO confirm and fix there. The title clean-up earlier in this
# notebook only handles '&', '?' and '/'.
cs.scrape_series_covers(browser, titles_searching)

Scraped 0.Supermansion!
Scraped 1.Marvel Boy Uranian!
Scraped 2.Green Lantern #23.4 Sinestro!
3.Uncle Grandpa Good Morning Sp was skipped. No title matched.
Scraped 4.A Distant Soil!
5.DC Presents Lois and Clark 100 was skipped. No title matched.
6.Star Trek Captains Log Pike was skipped. No title matched.
Scraped 7.Saga of the Swamp Thing!
8.Orson Scott Cards Speaker Fo was skipped. No title matched.
Scraped 9.Thor Where Walk the Frost Gia!
Scraped 10.Namora!
11.Daredevil Mgc was skipped. No title matched.
Scraped 12.Avengers Origins Vision!
Scraped 13.Avengers Origins Thor!
Scraped 14.Time Lincoln Jack To the Futu!
Scraped 15.Dark Knight Iii the Master Ra!
Scraped 16.Immortal Brothers Green Knigh!
17.Wrath of the Titans Revenge was skipped. No title matched.
Scraped 18.Avengers Origins Quicksilver!
Scraped 19.Amber Blake!
Scraped 20.Ryder On the Storm!
Scraped 21.Kiss Solo!
Scraped 22.Street Tiger!
Scraped 23.Heroic Age Villains!
Scraped 24.Timely Comics Uncanny Inhuman!
25.Spider-Ma

OSError: [Errno 22] Invalid argument: './raw_data/covers/holy_f*ck.jpg'