## Demian's Gamebook - WebScraping Project

### Libraries

In [1]:
import requests

from bs4 import BeautifulSoup
from tqdm.notebook import tqdm

### Retrieving Items ID

In [2]:
URL = 'https://gamebooks.org/Items'

In [3]:
def get_items():
    req = requests.get(URL)
    if req.status_code != 200:
        raise requests.ConnectionError(f'Something went wrong. [{req.status_code}]')
    
    soup = BeautifulSoup(req.text, 'html.parser')
    links = soup.find('div', class_='content').findAll('a', recursive=False)
    return [a['href'].split('/')[-1] for a in links if a.get('href')]

In [4]:
items = get_items()
print('Total Number of Entries:', len(items))

Total Number of Entries: 13798


### Collect Item Data

In [5]:
URL_ITEM = 'https://gamebooks.org/Item/{}/Editions'

In [6]:
def get_table_information(table, all_text=None, only_links=None, secondary_information=None):
    if not all_text:
        all_text = []
    
    if not only_links:
        only_links = []
    
    if not secondary_information:
        secondary_information = {}
    
    data = dict()
    rows = table.findAll('tr')
    for row in rows:
        key = row.th.text.replace(':', '').strip()
        if secondary_information and key in secondary_information.keys():
            raw = [i.text.strip() for i in row.td.children if i.name != 'a']
            if raw:
                val = ' '.join(raw).strip()
                data[secondary_information[key]] = val
            
        if key in all_text:
            raw = [i.text.strip() for i in row.td.children]
            val = ' '.join(raw).strip()
        
        elif key in only_links:
            val = [i.text.strip() for i in row.td.findAll('a')]
            if len(val) == 1:
                val = val[0] 
        
        else:
            continue
        
        data[key] = val
    
    return data

In [7]:
ED_TEXT = [
    'Date',
    'Length',
    'Number of Endings',
]

ED_LINK = [
    'Item', 
    'Series',
    'Author',
    'Illustrators',
    'ISBN'
]

ED_OTHER = {
    'Series': 'Series Number'
}

In [8]:
def get_edition_information(edition):
    name = edition[0]
    
    images = []
    tables = []
    for row in edition[1:]:
        row_images = ['https://gamebooks.org' + i['src'] for i in row.findAll('img')]
        row_tables = row.find('table')
        images.extend(row_images)
        if row_tables:
            tables.append(get_table_information(row_tables, ED_TEXT, ED_LINK, ED_OTHER))
    
    if len(tables) > 1:
        tables = {key: val for key, val in t for t in tables}
    else:
        tables = tables[0]
    
    tables['Edition'] = name
    tables['Images'] = images
    
    return tables

In [9]:
DETAILS_TEXT = [
    'Advertisement Blurb'
]

DETAILS_LINK = [
    'Translated From',
    'Translated Into',
    'Contained Into'
]

DETAILS_OTHER = {
    'Translated Into': 'Countries'
}

In [10]:
def get_gamebook_information(item):
    url = URL_ITEM.format(item)
    req = requests.get(url)
    if req.status_code != 200:
        raise requests.ConnectionError(f'Something went wrong - {item} [{req.status_code}]')

    soup = BeautifulSoup(req.text, 'html.parser')
    
    rows = soup.find('div', class_='content').findAll('div', class_='row')
    details = get_table_information(rows.pop(0), DETAILS_TEXT, DETAILS_LINK, DETAILS_OTHER)
    if 'Translated From' in details.keys():
        return
    
    editions_raw = []
    for row in rows:
        sibling = row.previous_sibling
        if sibling.name != 'h2':
            editions_raw[-1].append(row)
        else:
            editions_raw.append([sibling.text.strip(), row])
    
    editions = [get_edition_information(i) for i in editions_raw]
    
    first_edition = editions.pop(0)
    details.update(first_edition)
    del details['Edition']
    
    return details, editions

In [16]:
gamebooks = {}
editions = {}

In [17]:
for i in tqdm(items):
    if i in gamebooks.keys():
        continue 
        
    gm = get_gamebook_information(i)
    if gm:
        gamebooks[i] = gm[0]
        editions[i] = gm[1]

  0%|          | 0/13798 [00:00<?, ?it/s]

IndexError: pop from empty list