# Import Dependencies

In [1]:
import time
from io import StringIO

import pandas as pd
from bs4 import BeautifulSoup
from splinter import Browser

# Init Splinter Browser

In [3]:
# Path of the chromedriver binary that is passed to Splinter.
# MAC alternative:
# executable_path = { 'executable_path': '/usr/local/bin/chromedriver' }
# WINDOWS:
executable_path = dict(executable_path='chromedriver.exe')

# Start a headless Chrome session; the scraping cells below drive `browser`.
browser = Browser('chrome', headless=True, **executable_path)

# Scrape the List of Genres

In [9]:
# Page whose <select name="genre"> element lists the available genres
url = 'https://www.vgchartz.com/gamedb/?page='

# Open webpage
browser.visit(url)

# Retrieve HTML webpage source
html = browser.html

# Parse HTML webpage source using BeautifulSoup
soup = BeautifulSoup(html, 'html.parser')

# Locate the genre drop-down. Fail with an explicit message instead of an
# AttributeError on None when the element is not in the retrieved HTML.
result_select = soup.find('select', {'name': 'genre'})
if result_select is None:
    raise ValueError(f"No <select name='genre'> element found at {url}")

# Scrape the list of genres: keep every non-empty option value. Options
# without a `value` attribute are skipped rather than raising KeyError.
result_options = result_select.find_all('option')
genre_list = [
    option['value'] for option in result_options if option.get('value')
]
genre_list

['Action',
 'Action-Adventure',
 'Adventure',
 'Board Game',
 'Education',
 'Fighting',
 'Misc',
 'MMO',
 'Music',
 'Party',
 'Platform',
 'Puzzle',
 'Racing',
 'Role-Playing',
 'Sandbox',
 'Shooter',
 'Simulation',
 'Sports',
 'Strategy',
 'Visual Novel']

# Scrape Games data for each Genre

In [20]:
# Scrape the games table of every genre, page by page, into `game_df`.

# Fixed parts of the listing URL (independent of genre and page number):
# the endpoint plus the sort order and the show* query-string flags.
url_base = "https://www.vgchartz.com/games/games.php?"
url_tail = "&order=Sales&ownership=Both&direction=DESC"
url_tail += "&showtotalsales=1&shownasales=1&showpalsales=1"
url_tail += "&showjapansales=1&showothersales=1&showpublisher=1"
url_tail += "&showdeveloper=1&showreleasedate=1&showlastupdate=1"
url_tail += "&showvgchartzscore=1&showcriticscore=1&showuserscore=1"
url_tail += "&showshipped=1"


def _page_url(genre, page_num):
    """Return the listing URL for page *page_num* (200 results) of *genre*."""
    url_dyn = f"page={page_num}&results=200&genre={genre.replace(' ', '%20')}"
    return url_base + url_dyn + url_tail


def _has_page_link(container, page_num):
    """Return True if *container* holds a pagination link for *page_num*.

    The links are the <a> tags inside the second <th> of the first <tr>
    found in *container*.
    """
    page_anchors = container.find("tr").find_all("th")[1].find_all("a")
    # NOTE(review): `find(...) > 0` ignores a match at index 0 and accepts
    # substring hits (e.g. "1" inside "21"). Presumably the link text has
    # leading characters before the number -- confirm against the page
    # markup before tightening this test.
    return any(a.text.find(str(page_num)) > 0 for a in page_anchors)


# One DataFrame per scraped page. They are combined once at the end, since
# DataFrame.append was removed in pandas 2.0 and re-copies all rows on each
# call. Starting from an empty list also keeps a re-run of this cell from
# appending onto a `game_df` left over from a previous run.
page_frames = []

# Loop and scrape games info for each genre
for genre in genre_list:

    # Variable to hold page numbers
    page_num = 1

    # Open the first page of the genre and check whether the page exists
    url = _page_url(genre, page_num)
    browser.visit(url)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    soup_div = soup.find("div", id="generalBody")
    page_exist = _has_page_link(soup_div, page_num)

    while page_exist:
        url = _page_url(genre, page_num)

        # Open webpage
        browser.visit(url)

        # Pause before reading the source (presumably to let the page
        # finish loading)
        time.sleep(2)

        # Retrieve HTML webpage source
        html = browser.html

        # Parse HTML webpage source using BeautifulSoup
        soup = BeautifulSoup(html, 'html.parser')

        # Scrape the game info into Dataframe.
        # The markup between the opening <tbody> and the first <tr> whose
        # <th> carries a background-image style (presumably the column
        # header row) is cut out, so read_html sees that row first. The
        # search strings rely on the indentation produced by prettify().
        result = soup.find('div', id='generalBody')
        html_string = result.prettify()
        tbody_start_index = html_string.find("<tbody>\n   <tr>")
        tbody_end_index = html_string.find('<tr>\n    <th style="background-image:url(')
        # +11 keeps "<tbody>\n   " and stops right before the first "<tr>"
        html_string_start = html_string[0:tbody_start_index + 11]
        html_string_end = html_string[tbody_end_index:]
        new_html_string = html_string_start + html_string_end
        # Wrapped in StringIO because passing literal HTML text to
        # read_html is deprecated in recent pandas versions.
        dfs = pd.read_html(StringIO(new_html_string))
        df = dfs[0]

        # Add Genre Column
        df['Genre'] = genre

        # Scrape the console info into a list: skip the first 3 TR tags,
        # then take the alt text of the image in the 4th cell of each row
        all_trs = result.find('table').find_all('tr')
        console_list = [
            tr.find_all("td")[3].find('img').attrs['alt'] for tr in all_trs[3:]
        ]

        # Update Console info in Dataframe
        df['Console'] = console_list

        # Record this page; the message matches the original wording
        # ("Created" for the very first page, "Appended" afterwards)
        action = 'Appended' if page_frames else 'Created'
        page_frames.append(df)
        print(f'{action} Dataframe with page {page_num} of Genre {genre}')

        # Increment page number and check if link to next page exists
        page_num += 1
        page_exist = _has_page_link(result, page_num)

# Create the Video Games Dataframe (empty if no page was scraped)
if page_frames:
    game_df = pd.concat(page_frames, ignore_index=True)
else:
    game_df = pd.DataFrame()

game_df

Created Dataframe with page 1 of Genre Action
Appended Dataframe with page 2 of Genre Action
Appended Dataframe with page 3 of Genre Action
Appended Dataframe with page 4 of Genre Action
Appended Dataframe with page 5 of Genre Action
Appended Dataframe with page 6 of Genre Action
Appended Dataframe with page 7 of Genre Action
Appended Dataframe with page 8 of Genre Action
Appended Dataframe with page 9 of Genre Action
Appended Dataframe with page 10 of Genre Action
Appended Dataframe with page 11 of Genre Action
Appended Dataframe with page 12 of Genre Action
Appended Dataframe with page 13 of Genre Action
Appended Dataframe with page 14 of Genre Action
Appended Dataframe with page 15 of Genre Action
Appended Dataframe with page 16 of Genre Action
Appended Dataframe with page 17 of Genre Action
Appended Dataframe with page 18 of Genre Action
Appended Dataframe with page 19 of Genre Action
Appended Dataframe with page 20 of Genre Action
Appended Dataframe with page 21 of Genre Action
Ap

Appended Dataframe with page 13 of Genre Puzzle
Appended Dataframe with page 14 of Genre Puzzle
Appended Dataframe with page 15 of Genre Puzzle
Appended Dataframe with page 16 of Genre Puzzle
Appended Dataframe with page 17 of Genre Puzzle
Appended Dataframe with page 1 of Genre Racing
Appended Dataframe with page 2 of Genre Racing
Appended Dataframe with page 3 of Genre Racing
Appended Dataframe with page 4 of Genre Racing
Appended Dataframe with page 5 of Genre Racing
Appended Dataframe with page 6 of Genre Racing
Appended Dataframe with page 7 of Genre Racing
Appended Dataframe with page 8 of Genre Racing
Appended Dataframe with page 9 of Genre Racing
Appended Dataframe with page 10 of Genre Racing
Appended Dataframe with page 11 of Genre Racing
Appended Dataframe with page 12 of Genre Racing
Appended Dataframe with page 13 of Genre Racing
Appended Dataframe with page 14 of Genre Racing
Appended Dataframe with page 15 of Genre Racing
Appended Dataframe with page 16 of Genre Racing
A

Unnamed: 0,Pos,Game,Game.1,Console,Publisher,Developer,VGChartz Score,Critic Score,User Score,Total Shipped,Total Sales,NA Sales,PAL Sales,Japan Sales,Other Sales,Release Date,Last Update,Genre
0,1,,God of War,Series,Sony Computer Entertainment,SIE Santa Monica Studio,,,,51.00m,,,,,,22nd Mar 05,04th Mar 20,Action
1,2,,Warriors,Series,KOEI,Omega Force,,,,45.26m,,,,,,30th Jun 97,24th Mar 20,Action
2,3,,Devil May Cry,Series,Capcom,Capcom,,,,22.00m,,,,,,16th Oct 01,03rd Feb 20,Action
3,4,,Dynasty Warriors,Series,Unknown,Omega Force,,,,21.00m,,,,,,,24th Mar 20,Action
4,5,,Grand Theft Auto V,PS3,Rockstar Games,Rockstar North,,9.4,,,20.32m,6.37m,9.85m,0.99m,3.12m,17th Sep 13,,Action
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58892,436,,World End Syndrome,PS4,Arc System Works,Arc System Works,,,,,,,,,,26th Apr 18,03rd Apr 19,Visual Novel
58893,437,,XBlaze Lost: Memories,PC,Aksys Games,Arc System Works,,,,,,,,,,11th Aug 16,28th Jan 19,Visual Novel
58894,438,,"Yoru, Tomosu",NS,Nippon Ichi Software,Nippon Ichi Software,,,,,,,,,,30th Jul 20,09th May 20,Visual Novel
58895,439,,"Yoru, Tomosu",PS4,Nippon Ichi Software,Nippon Ichi Software,,,,,,,,,,30th Jul 20,09th May 20,Visual Novel
