In [None]:
# default_exp scraper

In [None]:
#export 
import os
from io import BytesIO
import re
import pickle
import logging
from pathlib import Path

from IPython.core.display import display, HTML
from tqdm import tqdm
import requests
from bs4 import BeautifulSoup
import boto3


# Bulbapedia Scraper
> A collection of functions to help scrape Pokemon Card content from the Bulbapedia Site 

Bulbapedia is an encyclopedia about Pokémon to which anyone can contribute and a useful source for building a labelled dataset of trading card images. 

For [The Pokedexr Project](link), Bulbapedia has images of the Japanese cards and the card descriptions in English. 

There are 4 basic parts here:

 1. Some general convenience functions

 1. Tools for extracting lists of cards from the [Deck lists](https://bulbapedia.bulbagarden.net/wiki/GX_Starter_Decks_(TCG)) or Set Lists, including a link to the [detailed description page](https://bulbapedia.bulbagarden.net/wiki/Heatmor_(TCG)) for each card.
 
 1. Tools for extracting meta-data, including links to card images, from the detailed description pages.
 
 1. Tools for fetching the card images themselves and storing them.
 
For most of the extraction rules I used the browser's developer tools to find any tags and attributes that would allow me to scrape a list of cards from the deck list pages. 



### 1. Utility functions

In [None]:
# export
def bulba_url_for(resource):
    """Build the full Bulbapedia wiki url for a page.

    resource -- the last part of the page url, e.g. 'Heatmor_(TCG)'
    """
    return "https://bulbapedia.bulbagarden.net/wiki/{}".format(resource)

In [None]:
# export
def sanitize_name(name):
    """Return a cleaned up version of a card name as it appears in Deck or Set lists.

    Spaces, '&', '.', ':', '/', '-' and the possessive "'s" are removed, e.g.
    "Professor's Letter" -> "ProfessorLetter" and "Porygon-Z" -> "PorygonZ".
    The result is used to match image `alt` attributes and as a folder name.

    name -- the card name to clean up
    """
    # The hyphen is escaped on purpose: an unescaped '.-:' inside a character
    # class is a *range* (covering './0123456789:'), which strips digits and
    # leaves hyphens in place. '/' is listed explicitly so the result stays
    # safe to use as a single path component.
    return re.sub(r"[ &.:/\-]|'s", '', name)

In [None]:
# export
def fetch_page_soup(resource):
    """Download a bulbapedia page and parse it into a beautiful soup.

    resource -- the last part of the page url (see bulba_url_for)

    Returns the parsed soup, or None (after logging an error) when the
    response status is not ok.
    """
    page_url = bulba_url_for(resource)
    response = requests.get(page_url)
    if not response.ok:
        logging.error(f"Couldn't fetch resource {page_url}.")
        return None
    return BeautifulSoup(response.content, 'html.parser')

In [None]:
# export
def save_card_list(cards, fname='cards.pkl'):
    """Write the card list to `fname` as a pickle.

    cards -- the object to pickle
    fname -- destination file, overwritten if it exists
    """
    with open(fname, mode='wb') as pickle_file:
        pickle.dump(cards, pickle_file)

In [None]:
# export       
def load_card_list(cards=None, fname='cards.pkl'):
    """Load a pickled card list and return it.

    cards -- unused; kept (now optional) so existing positional calls still work
    fname -- path of the pickle file to read
    """
    # NOTE: unpickling can execute arbitrary code, so only load files that
    # were written by save_card_list() or another trusted source.
    with open(fname, 'rb') as f:
        return pickle.load(f)

In [None]:
# Name of the S3 bucket, read from the environment (None when the variable is unset).
BUCKET_NAME = os.getenv('POKE_S3_BUCKETNAME')
# Folder for local storage: the current working directory.
LOCAL_FOLDER = Path('.')

def store_image(image, fname, location):
    """Store image bytes either in S3 or on the local disk, depending on `location`.

    image -- the image content (bytes)
    fname -- file name to store the image under
    location -- a local directory, or an S3 target written as 's3:<bucket>/<folder>/...'
    """
    if location[:2] != 's3':
        save_image_local(image, fname, dr=location)
        return

    # Drop the 3-character prefix; the first path part is the bucket,
    # the remainder is the folder inside it.
    bucket, *folder_parts = location[3:].split('/')
    push_image_to_s3(image, fname, bucket_name=bucket, dr='/'.join(folder_parts))
        
def push_image_to_s3(image_data, fname, dr='',bucket_name=''):
    """Upload image bytes to an S3 bucket and return the result of put_object.

    image_data -- the image content (bytes)
    fname -- object name
    dr -- optional folder prefix for the object key
    bucket_name -- name of the target bucket
    """
    s3 = boto3.resource('s3')
    # Skip empty parts so an empty `dr` doesn't produce a leading '/'
    key = '/'.join(part for part in (dr, fname) if part != '')
    target_bucket = s3.Bucket(bucket_name)
    result = target_bucket.put_object(Key=key, Body=BytesIO(image_data))
    if not result:
        logging.warning(f"Failed to upload {fname} to S3/{bucket_name}.")
    return result

def save_image_local(image_data, fname, dr=''):
    """Write image bytes to `dr`/`fname`, creating the directory `dr` if needed.

    image_data -- the image content (bytes)
    fname -- file name
    dr -- target directory; empty means the current directory
    """
    Path(dr).mkdir(parents=True, exist_ok=True)
    # Skip empty parts so an empty `dr` doesn't produce a leading '/'
    target = '/'.join([part for part in (dr, fname) if part != ''])
    with open(target, 'wb') as out:
        out.write(image_data)
    return None

None


## 2. Extracting cards from Deck or Set Lists

A function to fetch a list of card name, numbers and links to details page from bulbapedia.
Requires a valid Deck or Set List reference to work. Here are the assumptions about how the page is structured:

* For `set lists` the links to the details pages are contained in the `href` attribute of elements that have the class `mw-redirect`.
* `Deck Lists` are buried in nested tables. The best I could come up with was to note the headings of the tables are wrapped in \<b> tags, find those. Then work back up the parents until you find the correct table parent. Now iterate over the rows of that table to extract information

In [None]:
# export       
def fetch_card_list(list_name, req_decks=()):
    """Returns a dict of cards (name, number, link to details page) from the given list page.

    list_name -- reference to the page containing the list on bulbapedia
    req_decks -- names of the decks to take from a deck list page, ignored for a set list

    The result maps card name -> {'number', 'name', 'details_page_ref', 'sname'}.
    An empty dict is returned (and an error logged) when the page can't be
    fetched or contains neither a deck list nor a set list.
    """
    cards = {}
    soup = fetch_page_soup(list_name)
    if soup is None:
        # fetch_page_soup returns None on a failed request
        logging.error(f'Could not fetch deck or set list {list_name}')
        return cards

    # is this a Deck List or Set List?
    deck_heading = soup.find(id='Deck_lists')
    set_heading = soup.find(id='Set_list')

    if deck_heading is not None:
        for table in soup.find_all('table', {"class": "multicol"}):
            for deck in req_decks:
                headings = table.find_all('b', text=deck)
                if not headings:
                    continue

                # The deck title sits in a <b> tag nested 7 levels below the
                # table that holds the deck's rows; walk back up to it.
                deck_table = headings[0]
                for _ in range(7):
                    deck_table = getattr(deck_table, 'parent', None)
                if deck_table is None:
                    logging.warning(f'Unexpected page structure for deck {deck} in {list_name}')
                    continue

                # the first 4 rows are skipped; card rows follow
                for row in deck_table.find_all('tr')[4:]:
                    try:
                        number, name, _ = row.find_all('td')
                        durl = name.a.attrs.get('href').split('/')[-1]
                        card = {
                            'number': number.text.strip(),
                            'name': name.a.text,
                            'details_page_ref': durl,
                        }
                    except (ValueError, AttributeError):
                        # not a card row: wrong number of cells, or no link in the name cell
                        continue
                    cards[card['name']] = card
    elif set_heading is not None:
        set_list_table = set_heading.findNext('table')
        if set_list_table is None:
            logging.error(f'Could not fetch deck or set list {list_name}')
        else:
            for link in set_list_table.find_all('a', {'class': 'mw-redirect'}):
                try:
                    card = {
                        'number': link.parent.parent.find('td').text.strip(),
                        'name': link.text,
                        'details_page_ref': link.attrs.get('href').split('/')[-1],
                    }
                except AttributeError:
                    # link without an href, or not inside a table row with cells
                    continue
                cards[card['name']] = card
    else:
        logging.error(f'Could not fetch deck or set list {list_name}')

    # add sanitized names
    for card_name, card in cards.items():
        card.update({'sname': sanitize_name(card_name)})

    return cards

In [None]:
# NOTE(review): `cards` is not defined by any cell above; this cell assumes it
# was populated earlier, presumably with fetch_card_list(...) — TODO confirm.
#r = requests.get(bulba_url_for(cards['Charmander'].get('details_page_ref','notexist')))
#soup = BeautifulSoup(r.content,'html.parser')
bulba_url_for(cards['Charmander'].get('details_page_ref','notexist'))

'https://bulbapedia.bulbagarden.net/wiki/Charmander_(GX_Starter_Deck_11)'

The content we want is in a nested table structure under (well, after) the heading 'Card Text'.

In [None]:
# NOTE: assumes `cards` was populated earlier (e.g. with fetch_card_list).
charmander_ref = cards['Charmander'].get('details_page_ref','notexist')
# Fetch the details page here so the cell doesn't depend on a leftover `soup`.
soup = fetch_page_soup(charmander_ref)
pokemon_card_text_table = soup.find(id='Card_text').findNext()
bulba_url_for(charmander_ref)

'https://bulbapedia.bulbagarden.net/wiki/Charmander_(GX_Starter_Deck_11)'

## 3. Updating a list of cards with information from the details pages

Also want some other information from the details page, namely the card text and the numbers of the cards.

Extracting the card text was a bit interesting as the format of the html is different for the Trainer cards than for the Pokemon cards. (We're not interested in the energy cards here.)

First an example of the Pokemon card. Let's stick with Charmander.

### Image Urls

[TODO] Description of Extract the image...

The card page can, but doesn't always, include a 'gallery' section where images are displayed if more than one version of the artwork exists. For example different editions of the cards use different artists. At first I'm not sure of what to do with this. On the one hand I know that in my specific case I only have 1 version of the artwork to recognise. But as a human I would still be able to recognise a pokemon across the different depictions. I think it is best to fetch the images and then experiment later with the effect of including/excluding them later.

Some of the pages have a gallery section with multiple artworks. The gallery section, if it exists, looks like this:

```<span id="Gallery"> ... </span>```

But the table of images that we want is not encapsulated in a `<div>` so we can't use that section heading to get the images embedded as children within it. 

Instead we can extract all `<img>` tags from the page and then filter out unwanted images. Using developer tools we can see that the pokemon/entity name is embedded in the `alt` attribute of the `img` tag for the images we want. So we can use the name we collected earlier to extract the images we want.

A slight complication is that the name as it appears in the attribute differs from the name we've recorded. It has any underscore, dash, spaces, punctuation removed. 


#### De-duplicating the list of image urls

The list of image urls extracted often has inexact duplicates: they are just different sizes of the same image. The `dedupe_image_urls` function aims to keep only one copy of each image by examining the path but ignoring the image size part of the file name. It is implemented using a dictionary, so we keep the last version of a file that is in the list.

In [None]:
# export
def dedupe_image_urls(urls):
    """Keep one url per directory path, since urls that differ only in the file name
    are treated as differently sized copies of the same image.

    urls -- iterable of image urls

    When several urls share a directory the last one in `urls` wins; the
    result keeps the order in which each directory was first seen.
    """
    last_file_by_dir = {}
    for url in urls:
        directory, _, file_name = url.rpartition('/')
        last_file_by_dir[directory] = file_name

    return [f"{directory}/{file_name}" for directory, file_name in last_file_by_dir.items()]

Now do it

In [None]:
# export
# NOTE(review): not referenced by the code in this notebook; presumably maps a
# details page ref to the name used in its image `alt` text — TODO confirm.
IMG_EXCEPTIONS = {'Devoured_Field_(GX_Starter_Deck_128)':'Decaying_Wasteland_()'}

def extract_image_urls(soup, name):
    """Given the details page soup, extract the urls of the card images.

    soup -- soup of the card details page
    name -- sanitized card name; images whose `alt` text contains it are kept

    Returns the de-duplicated list of `src` values (see dedupe_image_urls).
    """
    # Escape the name so it is matched literally: card names may contain
    # characters such as '(' or '?' that are special in a regex.
    alt_pattern = re.compile(re.escape(name))
    card_images = soup.findAll('img', {'alt': alt_pattern})
    sources = (img.get('src') for img in card_images)
    # dict.fromkeys drops exact duplicates while keeping document order, so
    # the "last one wins" rule in dedupe_image_urls is deterministic.
    unique_sources = list(dict.fromkeys(src for src in sources if src))
    return dedupe_image_urls(unique_sources)

### Card Text

The 'Card Text' is the English translation of the main text on the card (i.e. moves, instructions). I couldn't find any useful tags to extract this data so I've just had to create a loop to parse the tables, relying on an assumption about the order of the content. It was just trial and error to develop this.

The 'Trainer' cards and 'Pokemon' cards have different structures so each has a different function to extract information. Energy cards have no text. Here is a function that determines the difference from the soup

In [None]:
#export 
def isTrainerCard(soup):
    """Tell whether a card details page belongs to a trainer card.

    soup -- soup of the card details page

    Returns True when the page contains an element whose href is the
    'Trainer card (TCG)' wiki page, False otherwise.
    """
    return soup.find(href='/wiki/Trainer_card_(TCG)') is not None

Now a function to extract the card text from a Pokemon card. [Charmander](https://bulbapedia.bulbagarden.net/wiki/Charmander_(GX_Starter_Deck_11)) is an example of a Pokemon card.

In [None]:
# export
# Any character from the Unicode Hiragana (U+3040-U+309F) or Katakana (U+30A0-U+30FF) blocks.
_JAPANESE_KANA_RE = re.compile('[\u3040-\u30ff]')

def extract_pokemon_card_text(card_text_table):
    """Given the 'Card text' table of a details page, return the card text for a Pokemon card.

    card_text_table -- the element following the 'Card text' heading

    Returns a list of dicts, in page order:
      {'type': 'move', 'description', 'energy_items', 'name': {'jp', 'en'}, 'points'}
      {'type': 'info', 'description'}

    The parsing relies on the order of the nested tables: a table whose text
    contains Japanese kana is a title (name, optionally followed by points),
    and the next table is the description belonging to that title.
    """
    items = []
    current_title = None
    energy_items = []

    for table in card_text_table.find_all('table'):
        table_text = table.text.strip().replace('\n', '')

        if _JAPANESE_KANA_RE.search(table_text):
            # a title: "<english name> <japanese name> [<points>]"
            energy_items = [img.attrs.get('alt') for img in table.find_all('img')]
            tokens = table_text.split(' ')
            try:
                points = int(tokens[-1])
                name = {'jp': tokens[-2], 'en': ' '.join(tokens[:-2])}
            except (ValueError, IndexError):
                # Some moves have no points associated
                points = None
                name = {'jp': tokens[-1], 'en': ' '.join(tokens[:-1])}
            current_title = {'name': name, 'points': points}
            continue

        desc = table_text if table_text else None
        if current_title:
            # the description of the move announced by the previous table
            move = {'type': 'move', 'description': desc, 'energy_items': energy_items}
            move.update(current_title)
            items.append(move)
        elif table_text:
            # free-standing text that doesn't follow a title
            items.append({'type': 'info', 'description': desc})

        current_title = None

    return items


In [None]:
# How to use it
#pokemon_card_text_table = soup.find(id='Card_text').findNext()
#extract_pokemon_card_text(pokemon_card_text_table)

And the equivalent for a Trainer card. Trainer cards are slightly different and thankfully, simpler. [Professor Kukui](https://bulbapedia.bulbagarden.net/wiki/Professor_Kukui_(GX_Starter_Deck_118)) is an example trainer card

In [None]:
#export
def extract_trainer_card_text(card_text_table):
    """Given the 'Card text' table of a details page, return the card text for a trainer card.

    card_text_table -- the element following the 'Card text' heading

    Returns a list of {'type': 'info', 'description': <text>} dicts, one for
    each nested table that has a cell, taken from the visible rows.
    """
    items = []
    for node in card_text_table:
        # skip anything whose markup contains 'display:none'
        if 'display:none' in str(node):
            continue
        if node.name != 'tr':
            continue
        for inner_table in node.findChildren('table'):
            cell = inner_table.find('td')
            if not cell:
                continue
            items.append({'type': 'info', 'description': cell.text.strip()})
    return items


In [None]:
# example
# Prof. Kukui is card 12 in our list
prof_kukui = requests.get(bulba_url_for(cards['Professor Kukui'].get('details_page_ref','notexist')))
soup = BeautifulSoup(prof_kukui.content,'html.parser')


# Don't call this variable `isTrainerCard`: that would replace the function of
# the same name, which extract_card_text() relies on.
trainer_link = soup.find(href='/wiki/Trainer_card_(TCG)')
print(trainer_link) # This is None if not a trainer card

if trainer_link:
    trainer_card_text_table = soup.find(id='Card_text').findNext()
    print(extract_trainer_card_text(trainer_card_text_table))


<a href="/wiki/Trainer_card_(TCG)" title="Trainer card (TCG)"><span style="color:#000;">Trainer</span></a>
[{'type': 'info', 'description': "Draw 2 cards. During this turn, your Pokémon's attacks do 20 more damage to your opponent's Active Pokémon (before applying Weakness and Resistance)."}, {'type': 'info', 'description': 'You may play only 1 Supporter card during your turn (before your attack).'}]


Wrap those functions for each card type up into something more convenient.

In [None]:
#export
def extract_card_text(soup):
    """Return the card text found on a details page, or None when the page has no
    'Card text' section.

    soup -- soup of the card details page
    """
    heading = soup.find(id='Card_text')
    if heading is None:
        return None

    # Trainer and Pokemon cards are laid out differently, so each has its own parser
    if isTrainerCard(soup):
        extractor = extract_trainer_card_text
    else:
        extractor = extract_pokemon_card_text
    return extractor(heading.findNext())

### Alternative Card Numbers

I'm not quite sure how they work yet, but I noticed a list of card numbers on the details page too. These feel like they may be useful in the future so let's extract those too. 

In [None]:
# export
def get_card_numbers(soup):
    """Collect the English and Japanese card numbers listed on a details page.

    soup -- soup of the card details page

    Returns {'alt_card_num': {'jp': [...], 'en': [...]}}; each list holds the
    distinct values found, in no particular order.
    """
    def distinct_numbers(label):
        # the number is the text of the element right after the label
        found = [node.find_next().text.strip() for node in soup.find_all(text=label)]
        return list(set(found))

    english = distinct_numbers('English card no.')
    japanese = distinct_numbers('Japanese card no.')
    return {'alt_card_num': {'jp': japanese, 'en': english}}


In [None]:
# NOTE: assumes `cards` was populated earlier (e.g. with fetch_card_list).
prof_kukui = requests.get(bulba_url_for(cards['Professor Kukui'].get('details_page_ref','notexist')))
soup = BeautifulSoup(prof_kukui.content,'html.parser')
get_card_numbers(soup)

{'alt_card_num': {'jp': ['096/SM-P',
   '004/018',
   '059/060',
   '054/059',
   '066/060',
   '118/131'],
  'en': ['128/149', '148/149']}}

#### wrapped up

In [None]:
# export
def update_card_details(cards):
    """Procedure: Given a dict of cards (output of fetch_card_list()) augment each card in place
    with details from its details page.

    cards -- dict of card name -> card dict

    For every card whose details page can be fetched these keys are added:
      'img_urls'     -- list of image urls
      'card_text'    -- list of card text items, or None
      'alt_card_num' -- {'jp': [...], 'en': [...]}
    Cards whose page can't be fetched are left unchanged and a warning is logged.
    Returns None.
    """
    for card_name, card in tqdm(cards.items()):

        # fetch details page
        page_ref = card.get('details_page_ref', 'notexist')
        soup = fetch_page_soup(page_ref)

        if soup is None:
            logging.warning(f"Couldn't fetch page: {bulba_url_for(page_ref)}")
            continue

        card.update({
            'img_urls': extract_image_urls(soup, card.get('sname')),
            'card_text': extract_card_text(soup),
        })

        # get_card_numbers() already returns {'alt_card_num': {...}}; merge it
        # directly rather than nesting it under a second 'alt_card_num' key.
        card.update(get_card_numbers(soup))

    return None
     

## Fetching card images and storing

In [None]:
# export
def fetch_image(img_url):
    """Download an image given a url produced by extract_image_urls().

    img_url -- the url without its scheme; 'https:' is put in front of it

    Returns (content bytes, file name), or (None, None) after logging a
    warning when the response status is not ok.
    """
    response = requests.get(f'https:{img_url}')
    if not response.ok:
        logging.warning(f'Failed to fetch image {img_url}')
        return (None, None)
    # the file name is the last part of the url path
    return (response.content, img_url.split('/')[-1])



### Saving images

In [None]:
#export
def save_image(image_data, fname):
    """Save image bytes to a local file, or to S3 when `fname` starts with 's3'.

    image_data -- the image content (bytes)
    fname -- a local file path, or an S3 target written as 's3:<bucket>/<key>'

    Returns the result of put_object for S3 targets, None for local files.
    """
    if 's3' not in fname[:2]:
        # assume local file: make sure its folder exists, then write it
        Path(os.path.dirname(fname)).mkdir(parents=True, exist_ok=True)
        with open(fname, 'wb') as out:
            out.write(image_data)
        return None

    # push to an s3 bucket: drop the 3-character prefix, the first path part
    # is the bucket and the remainder is the object key
    bucket_name, *key_parts = fname[3:].split('/')
    s3 = boto3.resource('s3')
    bucket = s3.Bucket(bucket_name)
    result = bucket.put_object(Key='/'.join(key_parts), Body=BytesIO(image_data))
    if not result:
        logging.error(f"Failed to upload {fname} to S3.")
    return result
        

In [None]:
# export 
# Root folder for downloaded images. expanduser() is used rather than
# os.environ['HOME'] so that importing the module doesn't raise a KeyError
# when HOME is not set.
PROJ_HOME = f"{os.path.expanduser('~')}/projects/pokemon/"
def fetch_images_for_cards(cards):
    """Download every image listed for each card and save it below PROJ_HOME.

    cards -- dict of card name -> card dict, as filled in by update_card_details()

    Images are saved as <PROJ_HOME>/original/<sname>/<filename>. Cards
    without image urls and images that fail to download are skipped.
    """
    for card_name, card in tqdm(cards.items()):
        # 'img_urls' is missing when the details page couldn't be fetched
        for url in card.get('img_urls') or []:
            image_data, fname = fetch_image(url)
            if image_data is None:
                # fetch_image has already logged the failure
                continue
            # folder structure: bucket / card_images / original / <class> / <filename>
            #location = f"s3:{BUCKET_NAME}/card_images/original/{card.get('name')}"
            lfname = os.path.join(PROJ_HOME, 'original', str(card.get('sname')), fname)
            save_image(image_data, lfname)


In [None]:
# export
import PIL
from PIL import Image
def fetch_card_img_s3(name,bucket=os.environ.get('POKEDEXR_S3')):
    """Fetch a card image from S3 and return it as a PIL image.

    name -- object name below 'card_images/original/' in the bucket
    bucket -- bucket name; the default is read from the POKEDEXR_S3
              environment variable when the module is loaded
    """
    s3 = boto3.resource('s3')
    response = s3.Object(bucket, f'card_images/original/{name}').get()
    # PIL has no `imread`; Image.open reads from a file-like object.
    # NOTE(review): the old call passed a second argument `0`, which in
    # OpenCV-style readers selects grayscale. If that was intended, use
    # `.convert('L')` on the result — TODO confirm.
    src_img = Image.open(BytesIO(response['Body'].read()))
    # Image.open is lazy; decode now so errors surface here
    src_img.load()
    return src_img

In [None]:
# Run nbdev's notebook2script(); the output below lists the notebooks it converted.
from nbdev.export import notebook2script
notebook2script()

Converted 01_scraper.ipynb.
Converted 02_augmentation.ipynb.
Converted Evaluation.ipynb.
Converted ExampleBlog.ipynb.
Converted Training.ipynb.
Converted WebService.ipynb.
Converted index.ipynb.
