In [4]:
import json
import os
import time
from urllib import request

import numpy as np
import requests
from bs4 import BeautifulSoup

### Coin class
Create a `Coin` class that keeps track of each coin's data, such as the catalog URL, the coin name and the image URLs.

In [5]:
# Class for each coin category.
class Coin:
    """Scraper for the images of one coin category on en.ucoin.net.

    Typical use: ``get_url()`` -> ``get_html()`` -> ``get_image_urls()`` ->
    ``save_to_dir()``.

    Attributes:
        id: Category id; also used as the name of the download sub-folder.
        name: Coin name; its lower-cased words are the matching keywords.
        currency: Currency name (only used in error messages).
        country: Country value placed in the catalog query string.
        currency_url: Url of the catalog page that is going to be scraped.
        image_urls: Image urls ready to download.
        soup: Parsed html of the last page fetched by ``get_html``.
    """

    BASE_URL = "https://en.ucoin.net"
    REQUEST_HEADERS = {'user-agent': 'Mozilla/5.0'}
    DEFAULT_YEAR = "1990-2018"
    # Download root used when save_to_dir() is called without arguments.
    DEFAULT_BASE_DIR = "/home/pablo/Desktop/stuff/coin_cnn/data/train"

    def __init__(self, coin_id, coin_name, coin_currency, coin_country):
        self.id = coin_id
        self.name = coin_name
        self.currency = coin_currency
        self.country = coin_country
        self.currency_url = ""  # Url of the web page we are going to scrape.
        self.image_urls = []  # All the relevant image urls ready to download.
        self.soup = None
        # Remembered so that pagination keeps the year range chosen in get_url.
        self._year = self.DEFAULT_YEAR

    def get_url (self, year="1990-2018", page=0):
        """Build the catalog url for this coin's country and store it.

        Args:
            year: Year range placed in the ``year`` query parameter.
            page: Page number placed in the ``page`` query parameter.

        Returns:
            The url, which is also stored in ``self.currency_url``.
        """
        self._year = year
        url = "{}/catalog/?country={}&year={}&type=1&page={}".format(
            self.BASE_URL, self.country, year, page)

        self.currency_url = url  # Update our currency_url variable.
        return url

    def _fetch_soup(self, url):
        """Download ``url`` and return it parsed; ValueError unless status is 200."""
        page = requests.get(url, headers=self.REQUEST_HEADERS)

        if page.status_code != 200:
            raise ValueError(
                "Status code {} for url: {}".format(page.status_code, url))

        return BeautifulSoup(page.content, 'html.parser')

    def get_html (self):
        """Fetch ``self.currency_url`` and store the parsed html in ``self.soup``.

        Raises:
            ValueError: If the response status code is not 200.
        """
        self.soup = self._fetch_soup(self.currency_url)

    def _collect_catalog_links(self):
        """Return the href of every coin link found on the catalog page(s)."""
        # Be forgiving if the caller skipped get_url()/get_html().
        if self.soup is None:
            if not self.currency_url:
                self.get_url()
            self.get_html()

        # The number of pager links is taken as the number of result pages.
        # NOTE(review): assumes the pager lists every page - confirm on the site.
        pages = len(self.soup.select("div.pages a"))

        if pages == 0:
            # Only one page: take all the coin links from it.
            anchors = self.soup.select("td.coin-img a")
        else:
            anchors = []
            year = self._year  # get_url overwrites it, so read it once.
            for page_n in range(1, pages + 1):
                self.get_url(year=year, page=page_n)
                self.get_html()
                anchors.extend(self.soup.select("td.coin-img a"))

        return [a['href'] for a in anchors]

    @staticmethod
    def _matches_keywords(keywords, words):
        """Return True when every keyword is one of ``words`` (exact match).

        An empty keyword list never matches.
        """
        available = set(words)
        return bool(keywords) and all(keyword in available for keyword in keywords)

    @staticmethod
    def _clean_image_url(image_url):
        """Return the download url for a gallery image, or None for placeholders.

        Images whose parent path segment is ``samples`` are placeholders and are
        skipped. For the others the last character of the parent path segment is
        dropped (presumably a thumbnail-size suffix - TODO confirm).
        """
        parts = image_url.split("/")
        if len(parts) < 2 or parts[-2] == "samples":
            return None

        parts[-2] = parts[-2][:-1]
        return "/".join(parts)

    def get_image_urls (self):
        """Find this coin in the catalog and collect its gallery image urls.

        Steps:
          1. Gather the coin links of every catalog page.
          2. Keep the links whose slug (second to last path segment, split on
             "-") contains every word of ``self.name``.
          3. Open the gallery of the first match and read its image urls.

        The result replaces ``self.image_urls``, so calling this method again
        does not accumulate duplicates.

        Raises:
            ValueError: If no catalog entry matches the coin name, or if a page
                answers with a status code other than 200.
        """
        keywords = self.name.lower().split()  # Our target image keywords.

        # ONLY the links of the coins we want.
        target_links = [
            link for link in self._collect_catalog_links()
            if self._matches_keywords(keywords, link.split("/")[-2].split("-"))
        ]

        # Throw error if no matches were found.
        if not target_links:
            raise ValueError(
                "No matches found for {}, {}, {}, with url {}".format(
                    self.name, self.currency, self.country, self.currency_url))

        # All the matches share the same gallery, so the first one is enough.
        target_coin_id = target_links[0].split("/")[-1]
        gallery_url = self.BASE_URL + "/gallery/" + target_coin_id + "&list=all"

        # Now we have a gallery with our coins, let's get the image urls.
        gallery_soup = self._fetch_soup(gallery_url)
        sources = [img['src'] for img in gallery_soup.select(".coin-img img")]

        cleaned = (self._clean_image_url(src) for src in sources)
        self.image_urls = [url for url in cleaned if url is not None]

    def save_to_dir (self, base_dir=None):
        """Download every url of ``self.image_urls`` into ``<base_dir>/<id>/``.

        Files are named ``NN__<original file name>`` where NN is the 1-based,
        zero-padded position of the url in the list.

        Args:
            base_dir: Root folder of the dataset. Defaults to
                ``Coin.DEFAULT_BASE_DIR``. The coin folder is created if it
                does not exist yet.
        """
        target_dir = os.path.join(base_dir or self.DEFAULT_BASE_DIR, str(self.id))
        os.makedirs(target_dir, exist_ok=True)

        for position, image_url in enumerate(self.image_urls, start=1):
            image_name = "{}__{}".format(
                str(position).zfill(2), image_url.split("/")[-1])
            # Download and save the image.
            request.urlretrieve(image_url, os.path.join(target_dir, image_name))

        print("Done.")

I'll need to scrape more images from other sources. To do this I can extend the `Coin` class and override whatever needs to change. In this case I'll use the Bing Image Search API, which is a lot easier to implement.

In [6]:
# Custom scraper: same interface as Coin, but the image urls come from the
# Bing Image Search API instead of the ucoin.net catalog.
class Coin_Bing (Coin):
    """Coin variant that collects image urls through the Bing Image Search API.

    Only ``get_image_urls`` is needed before ``save_to_dir``; ``get_url`` and
    ``get_html`` build and render a DuckDuckGo image-search page and are not
    used by ``get_image_urls``.
    """

    SEARCH_URL = "https://api.cognitive.microsoft.com/bing/v7.0/images/search"
    # Name of the environment variable that holds the API subscription key.
    SUBSCRIPTION_KEY_ENV = "BING_SUBSCRIPTION_KEY"
    MAX_IMAGES = 12  # Number of search results kept per coin.
    REQUEST_TIMEOUT = 30  # Seconds before the search request is abandoned.

    def __init__(self, coin_id, coin_name, coin_currency, coin_country):
        super().__init__(coin_id, coin_name, coin_currency, coin_country)

    def get_url(self):
        """Build a DuckDuckGo image-search url and store it in ``currency_url``.

        The query is "<name> coin <country>" with spaces replaced by "+".
        The resulting url is also printed.
        """
        base_url = "https://duckduckgo.com/?q="
        parameters = "&atb=v130-7_i&iar=images&iax=images&ia=images"
        query = "+".join(f"{self.name} coin {self.country}".split(" "))

        self.currency_url = base_url + query + parameters
        print(self.currency_url)

    def get_html(self):
        """Render ``self.currency_url`` in a browser and store the parsed html.

        Raises:
            ValueError: If the browser returns an empty page source.
        """
        # Imported here because the notebook's import cell does not provide
        # `webdriver`, which made this method fail with a NameError.
        # NOTE(review): PhantomJS support depends on the installed selenium
        # version - confirm, or switch to another driver.
        from selenium import webdriver

        browser = webdriver.PhantomJS()
        try:
            browser.get(self.currency_url)
            html = browser.page_source
        finally:
            browser.quit()  # Always release the browser process.

        # The driver exposes no http status code, so an empty document is the
        # only failure that can be detected here.
        if not html:
            raise ValueError(
                "Empty page source for url: {}".format(self.currency_url))

        self.soup = BeautifulSoup(html, 'lxml')

    # Override get_image_urls
    def get_image_urls (self):
        """Search Bing for "<name> coin <country>" and keep the thumbnail urls.

        The first ``MAX_IMAGES`` results replace ``self.image_urls``.

        Raises:
            RuntimeError: If the subscription key environment variable is not
                set (the key must never be written in the notebook).
            requests.HTTPError: If the API answers with an error status.
        """
        subscription_key = os.environ.get(self.SUBSCRIPTION_KEY_ENV)
        if not subscription_key:
            raise RuntimeError(
                "Set the {} environment variable to your Bing Image Search "
                "subscription key.".format(self.SUBSCRIPTION_KEY_ENV))

        search_term = "{} coin {}".format(self.name, self.country)

        headers = {"Ocp-Apim-Subscription-Key": subscription_key}
        params = {"q": search_term}
        response = requests.get(
            self.SEARCH_URL, headers=headers, params=params,
            timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()

        results = response.json().get("value", [])
        self.image_urls = [img["thumbnailUrl"] for img in results[:self.MAX_IMAGES]]

### Create a list of Coin instances

We need to create a list of coin instances. First we need to load cat_to_name.json into a dictionary. If entries were added or removed by hand, the (disabled) cell below renumbers the keys so that there are no gaps between indices. Once cat_to_name.json is up to date we load it into a dictionary. Finally we loop through this dictionary to create a class instance for every coin.

In [7]:
# One-off maintenance snippet, disabled by wrapping it in a string literal
# (which is why the cell only echoes the text back).
# Remove the surrounding ''' quotes to run it, and only after cat_to_name.json
# has been edited by hand to add or remove items: it rewrites the file with
# its keys renumbered "1", "2", ... in the order the entries are read.
'''
# Load the json
with open('cat_to_name.json', 'r') as file:
    cat_to_name = json.load(file)
       
corrected_cat_2_name = [] # will contain the updated json

# update indices.
for ii, key in enumerate(cat_to_name):
    new_key = str(ii + 1)
    value = cat_to_name[key]
    corrected_cat_2_name.append((new_key, value))
        
corrected_cat_2_name = dict(corrected_cat_2_name) # Convert list to dict

# Save the updated json.
with open('cat_to_name.json', 'w') as file:   
    json.dump(corrected_cat_2_name, file, indent=4)
'''

"\n# Load the json\nwith open('cat_to_name.json', 'r') as file:\n    cat_to_name = json.load(file)\n       \ncorrected_cat_2_name = [] # will contain the updated json\n\n# update indices.\nfor ii, key in enumerate(cat_to_name):\n    new_key = str(ii + 1)\n    value = cat_to_name[key]\n    corrected_cat_2_name.append((new_key, value))\n        \ncorrected_cat_2_name = dict(corrected_cat_2_name) # Convert list to dict\n\n# Save the updated json.\nwith open('cat_to_name.json', 'w') as file:   \n    json.dump(corrected_cat_2_name, file, indent=4)\n"

In [8]:
# Read the category id -> "name,currency,country" mapping from disk.
with open('cat_to_name.json', 'r') as json_file:
    cat_to_name = json.loads(json_file.read())

# Sanity check: display the entry of category "1".
cat_to_name["1"]

'1 Cent,Australian dollar,australia'

In [9]:
# One scraper instance per entry of cat_to_name.
coin_list = []

for coin_id, raw_record in cat_to_name.items():
    # Each value holds the name, currency and country separated by commas.
    fields = raw_record.split(",")

    # Swap Coin_Bing for Coin here to scrape the ucoin.net catalog instead:
    # Coin(coin_id, fields[0], fields[1], fields[2])
    coin_list.append(Coin_Bing(coin_id, fields[0], fields[1], fields[2]))

### Write data to dir

Time to loop through each coin and save its images in its own folder.

In [10]:
# Collect the image urls of every coin and download them, one coin at a time.
for current_coin in coin_list:
    # current_coin.get_url() # uncomment if using instance of Coin class (not Coin_Bing)
    # current_coin.get_html() # uncomment if using instance of Coin class (not Coin_Bing)
    current_coin.get_image_urls()
    current_coin.save_to_dir()
    time.sleep(1) # Throttle the crawl so that we do not get banned.
