In [131]:
import requests
import json
from bs4 import BeautifulSoup
import numpy as np
from urllib import request

### Coin class
Create a `Coin` class that keeps track of each coin's data, such as the catalogue URL, the coin name and the list of image URLs.

In [132]:
# Class for each coin category.
class Coin:
    """A single coin category plus the state needed to scrape its images.

    The methods are meant to be called in order: ``get_url`` ->
    ``get_html`` -> ``get_image_urls`` -> ``save_to_dir``. Each step stores
    its result on the instance so the next step can use it.
    """

    BASE_URL = "https://en.ucoin.net"  # Site root; hrefs scraped from the catalogue are appended to it.
    REQUEST_TIMEOUT = 30  # Seconds to wait on an HTTP request before giving up.

    def __init__(self, coin_id, coin_name, coin_currency, coin_country):
        self.id = coin_id
        self.name = coin_name
        self.currency = coin_currency
        self.country = coin_country
        self.currency_url = ""  # Url of the page we are going to scrape (set by get_url).
        self.image_urls = []  # Image urls ready to download (filled by get_image_urls).
        self.soup = None  # BeautifulSoup object for the html of currency_url (set by get_html).

    @staticmethod
    def _matches_all_keywords(keywords, name_words):
        """Return True when every keyword is equal to one of ``name_words``."""
        return all(keyword in name_words for keyword in keywords)

    def get_url(self, year="2006-2018"):
        """Build the catalogue url for this coin's country and store it.

        Args:
            year: Value of the ``year`` query parameter (a range such as
                ``"2006-2018"`` by default).

        Returns:
            The url, which is also stored in ``self.currency_url``.
        """
        url = f"{self.BASE_URL}/catalog/?country={self.country}&year={year}&type=1"
        self.currency_url = url
        return url

    def get_html(self):
        """Download ``self.currency_url`` and store the parsed html in ``self.soup``.

        Raises:
            ValueError: If the response status code is not 200.
        """
        page = requests.get(self.currency_url, timeout=self.REQUEST_TIMEOUT)

        if page.status_code != 200:
            raise ValueError(f"Status code {page.status_code} for url: {self.currency_url}")

        self.soup = BeautifulSoup(page.content, 'html.parser')

    def get_image_urls(self):
        """Collect the image urls of this coin into ``self.image_urls``.

        First the coin is located in the catalogue page held in ``self.soup``
        (``get_html`` must have been called), then the gallery page of the
        first matching coin is downloaded and its image urls are extracted.
        Urls already present in ``self.image_urls`` are not added again, so
        re-running the method does not create duplicates.

        Raises:
            ValueError: If the coin name contains no keywords, or the gallery
                page does not answer with status code 200.
            IndexError: If no coin in the catalogue page matches the name.
        """
        keywords = self.name.lower().split()  # Our target image keywords.
        if not keywords:
            raise ValueError("Coin name is empty, so there are no keywords to match against.")

        # Each anchor links to the page of one coin in the catalogue.
        coin_links = [a['href'] for a in self.soup.select("td.coin-img a")]

        # The second-to-last path segment of a link is the dash-separated coin
        # name; keep only the links whose name contains every keyword.
        matching_links = [
            self.BASE_URL + link
            for link in coin_links
            if self._matches_all_keywords(keywords, link.split("/")[-2].split("-"))
        ]
        if not matching_links:
            raise IndexError(f"No coin matching '{self.name}' found at url: {self.currency_url}")

        # All the matches share the same gallery, so the first one is enough.
        # The last segment of the coin link identifies that gallery.
        gallery_url = self.BASE_URL + "/gallery/" + matching_links[0].split("/")[-1]

        gallery_page = requests.get(gallery_url, timeout=self.REQUEST_TIMEOUT)
        if gallery_page.status_code != 200:
            raise ValueError(f"Status code {gallery_page.status_code} for url: {gallery_url}")
        gallery_html = BeautifulSoup(gallery_page.content, 'html.parser')

        already_known = set(self.image_urls)
        for img in gallery_html.select(".coin-img img"):
            parts = img['src'].split("/")
            # Drop the last character of the second-to-last path segment.
            # NOTE(review): presumably a thumbnail-size suffix - confirm against the site.
            parts[-2] = parts[-2][:-1]
            image_url = "/".join(parts)
            if image_url not in already_known:
                already_known.add(image_url)
                self.image_urls.append(image_url)

    def save_to_dir(self, base_dir="."):
        """Download every url in ``self.image_urls`` into ``<base_dir>/<coin id>/``.

        The directory is created when missing. Files are prefixed with their
        position in ``self.image_urls`` because different urls can end in the
        same file name.

        Args:
            base_dir: Directory under which the per-coin directory is created.

        Returns:
            The list of file paths that were written.

        Raises:
            ValueError: If an image url does not answer with status code 200.
        """
        import os  # Local import: only this method needs it.

        target_dir = os.path.join(base_dir, str(self.id))
        os.makedirs(target_dir, exist_ok=True)

        saved_paths = []
        for index, image_url in enumerate(self.image_urls):
            response = requests.get(image_url, timeout=self.REQUEST_TIMEOUT)
            if response.status_code != 200:
                raise ValueError(f"Status code {response.status_code} for url: {image_url}")

            file_name = f"{index:04d}_{image_url.split('/')[-1]}"
            file_path = os.path.join(target_dir, file_name)
            with open(file_path, "wb") as image_file:
                image_file.write(response.content)
            saved_paths.append(file_path)

        return saved_paths

['https://i.ucoin.net/coin/21/375/21375347-1/sweden-10-kronor-2006.jpg', 'https://i.ucoin.net/coin/21/375/21375347-2/sweden-10-kronor-2006.jpg', 'https://i.ucoin.net/coin/21/375/21375249-1/sweden-10-kronor-2009.jpg', 'https://i.ucoin.net/coin/21/375/21375249-2/sweden-10-kronor-2009.jpg', 'https://i.ucoin.net/coin/21/375/21375102-1/sweden-10-kronor-2002.jpg', 'https://i.ucoin.net/coin/21/375/21375102-2/sweden-10-kronor-2002.jpg', 'https://i.ucoin.net/coin/21/375/21375003-1/sweden-10-kronor-2001.jpg', 'https://i.ucoin.net/coin/21/375/21375003-2/sweden-10-kronor-2001.jpg', 'https://i.ucoin.net/coin/18/190/18190372-1/sweden-10-kronor-2004.jpg', 'https://i.ucoin.net/coin/18/190/18190372-2/sweden-10-kronor-2004.jpg', 'https://i.ucoin.net/coin/17/262/17262791-1/sweden-10-kronor-2007.jpg', 'https://i.ucoin.net/coin/17/262/17262791-2/sweden-10-kronor-2007.jpg', 'https://i.ucoin.net/coin/16/345/16345567-1/sweden-10-kronor-2008.jpg', 'https://i.ucoin.net/coin/16/345/16345567-2/sweden-10-kronor-20

We might need to scrape data from different websites. We can do this by overriding some of the methods of our `Coin` class to meet our needs. The methods we need to override are:
* `get_url`: each website is different and takes different parameters, so we adjust the URL for each case.
* `get_image_urls`: when scraping the HTML, we need to select different tags for different sites.

In [133]:
# Create a custom scraper here. This one targets Google Images.
class Custom_Coin_Scrapper (Coin):
    """Coin variant whose url points at a Google Images search.

    Construction is inherited unchanged from ``Coin``; only the url building
    and the image extraction are overridden.
    """

    # Override parent get_url
    def get_url(self, year=None):
        """Build the Google Images search url for this coin and store it.

        The search text is "<name> coin <country>", url-encoded so that
        characters such as ``&`` or ``/`` in a name cannot break the query.

        Args:
            year: Accepted only so the signature stays compatible with
                ``Coin.get_url``; it is not used for the search.

        Returns:
            The url, which is also stored in ``self.currency_url``.
        """
        from urllib.parse import quote_plus  # Local import: only this method needs it.

        # Website url. Eg. Google. The query-string values are reproduced as
        # found. NOTE(review): "ei" looks like a captured session value - confirm it is needed.
        base_url = "https://www.google.com/search?tbm=isch&sa=1&ei=txYYXNaPFYO5sQH99aPwBg&q="
        query = quote_plus(f"{self.name} coin {self.country}")  # Spaces become "+".

        self.currency_url = base_url + query
        return self.currency_url

    # Override get_image_urls
    def get_image_urls(self):
        """Placeholder: extraction for this site is not implemented yet.

        Does nothing and leaves ``self.image_urls`` untouched.
        """
        return None

# Working example: build the search url for one coin, fetch the page
# and show the url that was requested.
example_coin = Custom_Coin_Scrapper(
    coin_id=2,
    coin_name="25 kurus",
    coin_currency="Turkish lira",
    coin_country="turkey",
)
example_coin.get_url()
example_coin.get_html()
print(example_coin.currency_url)

https://www.google.com/search?tbm=isch&sa=1&ei=txYYXNaPFYO5sQH99aPwBg&q=25+kurus+coin+turkey


### Create a list of Coin instances

Create a list with all the data needed for each coin so we can start scraping.

In [134]:
# Create a dict containing the json: it maps each category id to a
# "name,currency,country" string.
# JSON text is UTF-8, so decode it explicitly instead of relying on the
# platform's default encoding (which would garble non-ASCII coin names).
with open('cat_to_name.json', 'r', encoding='utf-8') as f:
    cat_to_name = json.load(f)

# Check that it worked
cat_to_name["1"]

'1 Cent,Australian dollar,australia'

In [136]:
# Build one Coin instance per entry of cat_to_name.
coin_list = []

for coin_id, coin_description in cat_to_name.items():
    # Each value is a comma-separated string: name, currency, country.
    fields = coin_description.split(",")
    coin_list.append(Coin(coin_id, fields[0], fields[1], fields[2]))