In [11]:
import pandas as pd
import requests
import random
import time
import os

# Selenium
from selenium.webdriver import Firefox
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.options import Options
options = Options()
options.headless = True

# Data storage
from sqlalchemy import create_engine # SQL helper
import psycopg2 as psql #PostgreSQL DBs

# Custom
import lib.data_fcns as dfc
import lib.keys as keys  # Custom keys lib

## Initialize Stuff

In [12]:
# Launch Firefox with the options built in the import cell (headless mode is
# switched on there), then load the site's home page.
browser = Firefox(options=options)
url = "http://www.comicbookdb.com/"
browser.get(url)

## Make list of Titles!

In [13]:
# Define path to secret
secret_path_aws = os.path.join(os.environ['HOME'], '.secret', 
                           'aws_ps_flatiron.json')
secret_path_aws

'/Users/werlindo/.secret/aws_ps_flatiron.json'

In [14]:
# Read the PostgreSQL credentials from the secrets file
aws_keys = keys.get_keys(secret_path_aws)
user, ps = aws_keys['user'], aws_keys['password']
host, db = aws_keys['host'], aws_keys['db_name']

# Connection URL string (credentials are embedded as-is, without escaping)
aws_ps_engine = ''.join(
    ['postgresql://', user, ':', ps, '@', host, '/', db]
)

# Open the psycopg2 connection on port 5432
conn = psql.connect(
    host=host,
    port='5432',
    database=db,
    user=user,
    password=ps,
)

In [15]:
# Instantiate cursor
cur = conn.cursor()

In [16]:
#  Count records.
query = """
    SELECT * from comic_trans;
"""

In [17]:
# Execute the query
cur.execute(query)

In [18]:
# Check results
temp_df = pd.DataFrame(cur.fetchall())
temp_df.columns = [col.name for col in cur.description]

In [19]:
temp_df.head(3)

Unnamed: 0,index,publisher,item_id,title_and_num,qty_sold,date_sold,account_num,comic_title
0,2,Amaze Ink Slave Labor Graphics,DCD151935,Filler Bunny #2,1,2011-08-14 18:01:03,174,Filler Bunny (SLG)
1,3,Amaze Ink Slave Labor Graphics,DCD341726,Gargoyles #6,1,2012-06-22 14:11:37,593,Gargoyles (SLG)
2,4,Amaze Ink Slave Labor Graphics,DCD416182,Royal Historian of Oz #1,1,2010-07-21 14:03:07,226,Royal Historian of Oz (SLG)


In [20]:
temp_df['title'] = ( temp_df['title_and_num'].apply(dfc.cut_issue_num) )

In [21]:
temp_df.head()

Unnamed: 0,index,publisher,item_id,title_and_num,qty_sold,date_sold,account_num,comic_title,title
0,2,Amaze Ink Slave Labor Graphics,DCD151935,Filler Bunny #2,1,2011-08-14 18:01:03,174,Filler Bunny (SLG),Filler Bunny
1,3,Amaze Ink Slave Labor Graphics,DCD341726,Gargoyles #6,1,2012-06-22 14:11:37,593,Gargoyles (SLG),Gargoyles
2,4,Amaze Ink Slave Labor Graphics,DCD416182,Royal Historian of Oz #1,1,2010-07-21 14:03:07,226,Royal Historian of Oz (SLG),Royal Historian of Oz
3,5,Amaze Ink Slave Labor Graphics,DCD416182,Royal Historian of Oz #1,1,2010-07-14 19:49:40,399,Royal Historian of Oz (SLG),Royal Historian of Oz
4,6,Amaze Ink Slave Labor Graphics,DCD416182,Royal Historian of Oz #1,1,2010-07-19 10:39:04,237,Royal Historian of Oz (SLG),Royal Historian of Oz


In [22]:
# Spell out ampersands in titles. The vectorised, literal (non-regex) string
# replace leaves missing values untouched instead of raising on them.
temp_df['title'] = temp_df['title'].str.replace('&', 'and', regex=False)

In [23]:
titles = list(temp_df['title'].unique())

In [26]:
sample_titles = titles[:300]

In [27]:
sample_titles

['Filler Bunny',
 'Gargoyles',
 'Royal Historian of Oz',
 'Warlord of Io and Other Storie',
 'Afterlife With Archie',
 'Afterlife With Archie Magazin',
 'Archie',
 'Boys',
 'Archie and Friends Double Diges',
 'Archie and Friends Double Doubl',
 'Archie 1941',
 'Archie and Me Comics Digest',
 'Archie and Me Jumbo Comics Di',
 'Archie Collectors E',
 'Archie Comics Annual Digest',
 'Archie Comics Double Digest',
 'Archie Double Digest',
 'Archie Jumbo Comics Digest',
 'Archie Meets Batman 66',
 'Archie Meets Ramones One Sho',
 'Archie Vs Sharknado One Shot',
 'Archies',
 'Archies Halloween Spectacula',
 'Archies One Shot Cvr A Jaime',
 'Archies Superteens Vs Crusade',
 'B and V Friends Double Digest',
 'B and V Friends Halloween Annua',
 'B and V Friends Jumbo Comics Di',
 'Betty and Veronica',
 'Betty and Veronica Comics Annua',
 'Betty and Veronica Comics Diges',
 'Betty and Veronica Comics Doubl',
 'Betty and Veronica Friends Fore',
 'Betty and Veronica Holiday Annu',
 'Betty and Vero

Get the list of titles, sorted by total quantity sold (highest first).

In [28]:
# Total quantity sold per title, best sellers first
qtys = (
    temp_df
    .groupby(['title'], as_index=False)['qty_sold']
    .sum()
    .sort_values(by=['qty_sold'], ascending=False)
)

In [29]:
qtys.head()

Unnamed: 0,title,qty_sold
553,Batman,9562
6700,Walking Dead,6856
229,Amazing Spider-Man,5828
5098,Saga,5542
5929,Superman,5197


In [30]:
# Titles treated as already handled: the first 300 entries of `titles`
# (the same slice as `sample_titles`). NOTE(review): presumably these were
# scraped in an earlier session -- confirm before relying on this cutoff.
done_titles = titles[:300]

In [31]:
titles_needed_df = qtys.loc[~qtys['title'].isin(done_titles)]

In [32]:
titles_needed_df.shape

(6773, 2)

In [33]:
titles_need_list = list(titles_needed_df.title.unique())

In [40]:
titles_need_list[367:]

['Scarlet',
 'Powers',
 'DC Universe Presents',
 'Ultimate Comics Fallout',
 'Despicable Deadpool',
 'Redlands',
 'Batman 66',
 'Batman Arkham Knight',
 'Extremity',
 'Spider-Woman',
 'Green Lantern the Lost Army',
 'Justice League 3001',
 'Irredeemable',
 'Birthright',
 'Fables the Wolf Among Us',
 'Tmnt Universe',
 'Kick-Ass 3',
 'House of Mystery',
 'Journey Star Wars Fase',
 'James Bond',
 'Mighty Avengers',
 'Shade',
 'Cable and X-Force',
 'Loki Agent of Asgard',
 'Copperhead',
 'Hinterkind',
 'Spaceman',
 'Dying and the Dead',
 'Astonishing X-Men Xenogenesi',
 'Green Wake',
 'Trinity',
 'Gwenpool',
 'Men of Wrath By Jason Aaron',
 'Old Man Hawkeye',
 'Power Man and Iron Fist',
 'Young Avengers',
 'Street Fighter Swimsuit Speci',
 'Odyc',
 'She-Hulk',
 'Thors',
 'Cyborg',
 'Avengers No Road Home',
 'Ben Reilly Scarlet Spider',
 'Stumptown V2',
 'Echo',
 'Superior',
 'Clone Conspiracy',
 'Black Panther Man Without Fea',
 'Vision',
 'Judge Dredd',
 'Green Lantern Emerald Warrior',
 

In [41]:
# Skip the first 367 entries of the quantity-sorted list of needed titles.
# NOTE(review): 367 looks like a manually tracked resume offset from earlier
# runs -- confirm before re-running.
titles_searching = titles_need_list[367:]

In [42]:
titles_searching

['Scarlet',
 'Powers',
 'DC Universe Presents',
 'Ultimate Comics Fallout',
 'Despicable Deadpool',
 'Redlands',
 'Batman 66',
 'Batman Arkham Knight',
 'Extremity',
 'Spider-Woman',
 'Green Lantern the Lost Army',
 'Justice League 3001',
 'Irredeemable',
 'Birthright',
 'Fables the Wolf Among Us',
 'Tmnt Universe',
 'Kick-Ass 3',
 'House of Mystery',
 'Journey Star Wars Fase',
 'James Bond',
 'Mighty Avengers',
 'Shade',
 'Cable and X-Force',
 'Loki Agent of Asgard',
 'Copperhead',
 'Hinterkind',
 'Spaceman',
 'Dying and the Dead',
 'Astonishing X-Men Xenogenesi',
 'Green Wake',
 'Trinity',
 'Gwenpool',
 'Men of Wrath By Jason Aaron',
 'Old Man Hawkeye',
 'Power Man and Iron Fist',
 'Young Avengers',
 'Street Fighter Swimsuit Speci',
 'Odyc',
 'She-Hulk',
 'Thors',
 'Cyborg',
 'Avengers No Road Home',
 'Ben Reilly Scarlet Spider',
 'Stumptown V2',
 'Echo',
 'Superior',
 'Clone Conspiracy',
 'Black Panther Man Without Fea',
 'Vision',
 'Judge Dredd',
 'Green Lantern Emerald Warrior',
 

## Run the Scraper

In [43]:
# Run the scraper over the remaining titles.
# NOTE: scrape_series_covers and its helpers are defined in later cells of
# this notebook; run those cells first, otherwise this call raises NameError
# on a fresh kernel.
scrape_series_covers(browser, titles_searching)

NameError: name 'scrape_series_covers' is not defined

## Master Function Testing

In [72]:
def scrape_series_covers(browser, titles):
    """Use Selenium to scrape a cover image for each comic book title.

    For every title: run a title search, open the first search result,
    open its cover gallery and save the first cover image. Progress,
    skipped titles and the total runtime are printed.

    Parameters
    ----------
    browser : Selenium browser obj
        Browser currently on a page that contains the site's search form.
    titles : iterable of str
        Comic titles to search for.

    Returns
    -------
    None
    """
    start_time = time.time()

    for idx, title in enumerate(titles):
        # Search for the title
        search_title(browser, title)

        # no_results_found() is called without arguments, so it inspects the
        # notebook-level `browser` global rather than the `browser` argument.
        if no_results_found():
            print("{}.{} was skipped. No title matched.".format(idx, title))
        else:
            try:
                # Select the first issue in the search results
                click_first_link(browser, title, True)

                # Go to the cover gallery of the issue page
                go_cover_gallery(browser)

                # Once in the cover gallery, scrape the first image
                get_first_image(browser, title)
            except NoSuchElementException:
                # A missing result link, gallery button or gallery image all
                # mean there is no cover to save; skip the title instead of
                # aborting the whole run.
                print("{}.{} was skipped. No covers were found."
                      .format(idx, title))
            else:
                print("Scraped {}.{}!".format(idx, title))

        # Wait a random 2-7 seconds after every title (not only after a
        # failed search) so the site is never hit back-to-back.
        time.sleep(2 + random.random()*5)

    print ('Total Runtime: {:.2f} seconds'.format(time.time() - start_time))

#     print("All done!")

In [29]:
def no_results_found(driver=None):
    """
    Report whether the current page shows the "No results found." message.

    Parameters
    ----------
    driver : Selenium browser obj, optional
        Browser to inspect. When omitted, the notebook-level `browser`
        global is used, so existing zero-argument calls keep working.

    Returns
    -------
    bool
        True when the text of the element at the fixed XPath is exactly
        'No results found.', otherwise False.
    """
    if driver is None:
        driver = browser  # fall back to the notebook-level global
    xpath = '/html/body/table/tbody/tr[2]/td[3]'
    result = driver.find_element_by_xpath(xpath)
    return result.text == 'No results found.'

## Search on Title

In [17]:
def search_title(browser, title):
    """
    Given Selenium browser obj and a comic title to search for
    Enter title into search box, set the search type to 'Title' and Search
    """
    # Find search box, empty it and enter the search text.
    # clear() is used instead of a Ctrl+A select-all because that shortcut
    # is platform dependent (it is not "select all" on macOS).
    text_area = browser.find_element_by_id('form_search')
    text_area.clear()
    text_area.send_keys(title)

    # Find Search type dropdown and make sure it says 'Title'
    search_type = Select(browser.find_element_by_name('form_searchtype'))
    search_type.select_by_value('Title')

    # Push the search button!
    sb_xpath = ('/html/body/table/tbody/tr[2]/td[1]' +
                '/table/tbody/tr[4]/td/form/input[2]')
    search_button = browser.find_element_by_xpath(sb_xpath)
    search_button.click()


In [18]:
def search_site(browser, title):
    """
    Given Selenium browser obj and a comic title to search for
    Enter title into search box and Search

    Unlike search_title, the search-type dropdown is left untouched, so the
    search runs with whatever type is currently selected on the page.
    """
    # Find search box, empty it and enter the search text.
    # clear() is used instead of a Ctrl+A select-all because that shortcut
    # is platform dependent (it is not "select all" on macOS).
    text_area = browser.find_element_by_id('form_search')
    text_area.clear()
    text_area.send_keys(title)

    # Push the search button!
    sb_xpath = ('/html/body/table/tbody/tr[2]/td[1]' +
                '/table/tbody/tr[4]/td/form/input[2]')
    search_button = browser.find_element_by_xpath(sb_xpath)
    search_button.click()


In [42]:
title_search = "Filler Bunny"

In [61]:
title_search = 'Royal Historian of Oz'

In [68]:
title = title_search

In [62]:
text_area = browser.find_element_by_id('form_search')

In [63]:
text_area.send_keys(Keys.CONTROL, "a") 
text_area.send_keys(title_search)

In [64]:
search_type = Select(browser.find_element_by_name('form_searchtype'))

Reference pattern for wrapping a dropdown element: `mySelect = Select(driver.find_element_by_id("mySelectID"))`

In [65]:
search_type.select_by_value('Title')

In [66]:
search_button = browser.find_element_by_xpath('/html/body/table/tbody/tr[2]/td[1]/table/tbody/tr[4]/td/form/input[2]')

In [67]:
search_button.click()

## Click on first result

In [19]:
def click_first_link(browser, title, title_search_flag):
    """
    Click the first link on the search results page.

    The XPath depends on which kind of search produced the page:
    a truthy `title_search_flag` selects the title-search layout,
    a falsy one the nested-table layout. `title` is accepted but not used.
    """
    title_results_xpath = '/html/body/table/tbody/tr[2]/td[3]/a[1]'
    other_results_xpath = ('/html/body/table/tbody/tr[2]/td[3]'
                           '/table/tbody/tr/td/a[1]')

    link_xpath = title_results_xpath if title_search_flag else other_results_xpath

    # Locate the link and follow it
    browser.find_element_by_xpath(link_xpath).click()


In [106]:
first_issue_link = browser.find_element_by_partial_link_text(title_search)

NoSuchElementException: Message: Unable to locate element: Royal Historian of Oz


In [70]:
first_issue_link.click()

## Go to Cover Gallery

In [20]:
def go_cover_gallery(browser):
    """
    Click on Cover Gallery button

    The button is located by the image it wraps (the link whose <img> has
    src "graphics/button_title_covergallery.gif") rather than by its
    position among the page's links.
    """
    gb_xpath = '//a[img/@src="graphics/button_title_covergallery.gif"]'

    gallery_btn = browser.find_element_by_xpath(gb_xpath)
    gallery_btn.click()

In [None]:
/html/body/table/tbody/tr[2]/td[3]/table[1]/tbody/tr/td/a[4]/img

In [52]:
imgy = browser.fin

In [53]:
imgy.click()

In [None]:
brow.find_element_by_xpath('//a[img/@src="resources/img/logout.png"]').click()

In [89]:
gb_xpath="/html/body/table/tbody/tr[2]/td[3]/table[1]/tbody/tr/td/a[3]/img"
gb_xpath="/html/body/table/tbody/tr[2]/td[3]/table[1]/tbody/tr/td/a[4]/img"
gb_xpath='//a[img/@src="graphics/button_title_covergallery.gif"]'
gallery_btn = browser.find_element_by_xpath(gb_xpath)

gallery_btn.click()

## Get First Image

In [32]:
def get_first_image(browser, title):
    """
    Save the first cover shown in the cover gallery to disk.

    The image element is looked up first; the output file is named after
    the title (spaces turned into underscores, lower-cased, '.jpg' suffix)
    and written under ./raw_data/covers/ via scrape_image.
    """
    # Locate the first gallery image
    gallery_img_xpath = ''.join(['/html/body/table/tbody/tr[2]/td[3]/',
                                 'table/tbody/tr[1]/td[1]/a/img'])
    cover_elem = browser.find_element_by_xpath(gallery_img_xpath)

    # Build the destination path from the title
    slug = title.replace(' ', '_').lower()
    out_path = './raw_data/covers/{}.jpg'.format(slug)

    # Download the image into that file
    scrape_image(cover_elem, out_path)

In [72]:
first_img_path = '/html/body/table/tbody/tr[2]/td[3]/table/tbody/tr[1]/td[1]/a/img'

In [73]:
first_img = browser.find_element_by_xpath(first_img_path)

In [77]:
# Construct path and file name
filename = ('./assets/covers/' + title.replace(' ', '_').lower() 
           + '.jpg'
           )

In [22]:
def scrape_image(img, filename):
    """
    Download the resource behind an image element's 'src' and write it to filename.

    The HTTP status is not checked: whatever body the server returns is
    written to the file in binary mode.
    """
    src_url = img.get_attribute('src')
    payload = requests.get(src_url).content
    with open(filename, 'wb') as out_file:
        out_file.write(payload)

In [78]:
scrape_image(first_img, filename)

In [79]:
!ls

LICENSE
README.md
[34m__pycache__[m[m
[34mals_filtered[m[m
[34mals_use[m[m
app.py
[34marchive[m[m
[34massets[m[m
code_archive.py
comic_recs.py
comics_rx-1_data_prep.ipynb
comics_rx-2_eda.ipynb
comics_rx-3_als_all_data.ipynb
comics_rx-4_als_reduced_data.ipynb
comics_rx-5_pseudo_deploy.ipynb
comics_rx-6_als_reduced_data_gs_cv.ipynb
comics_rx-7_mvp_dev.ipynb
comics_rx-8_recommender_poc.ipynb
comics_rx-9_scrape_something.ipynb
data_fcns.py
[34mdev[m[m
geckodriver.log
keys.py
[34mraw_data[m[m
[34mreferences[m[m
scrapes.py
[34mscratch[m[m
spam_model.pkl
[34mstatic[m[m
[34msupport_data[m[m
[34mtemplates[m[m
testimage.jpg
[34mversions[m[m


In [80]:
!open testimage.jpg

In [23]:
def go_back_home_comicbookdb(browser):
    """
    Click the image link at the top of the page to go back to the home page.
    """
    logo_xpath = ('/html/body/table/tbody/tr[1]/td/table/tbody/tr[1]'
                  '/td/table/tbody/tr/td[1]/a/img')

    # Locate the image link and click it
    browser.find_element_by_xpath(logo_xpath).click()
    

In [82]:
home_pg_xpath = '/html/body/table/tbody/tr[1]/td/table/tbody/tr[1]/td/table/tbody/tr/td[1]/a/img'

logo_btn = browser.find_element_by_xpath(home_pg_xpath)

logo_btn.click()

---

# Create Loop for Titles

---

# Graveyard

In [1]:
import lib.comic_scraper as cs

In [1]:
import comic_recs as cr

In [4]:
import lib.comic_scraper as cs