In [1]:
#Importing required libraries.
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

#Building the list of page urls that together cover the whole game database: urls.
#Each url requests 10,000 results; the page number is placed between the two url fragments.
url_1 = 'https://www.vgchartz.com/games/games.php?page='
url_2 = '&results=10000&order=TotalSales&ownership=Both&direction=DESC&showtotalsales=1&shownasales=1&showpalsales=1&showjapansales=1&showothersales=1&showpublisher=1&showdeveloper=0&showreleasedate=1&showlastupdate=0&showvgchartzscore=0&showcriticscore=1&showuserscore=1&showshipped=1'

#Page numbers 1 through 6, kept as strings so they can be concatenated directly.
pages = [str(page_number) for page_number in range(1, 7)]

urls = [url_1 + page + url_2 for page in pages]
    
#Scraping data into a list: responses
#Each url is downloaded and parsed into a BeautifulSoup tree; the trees are stored in the same order as urls.
responses = []

for url in urls:
    #Without a timeout a stalled connection would hang the notebook indefinitely.
    #The limit is generous because each url asks for 10,000 results.
    r = requests.get(url, timeout=120)
    #Stop on an HTTP error rather than silently parsing an error page that contains no games.
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')
    responses.append(soup)
    
#Extracting game data to a list: response_games_only.
#A game is recognised by a link whose href points at a game page; the element two levels
#above that link is what gets stored.
response_games_only = []

for response in responses:
    #href=True limits the search to anchors that actually have an href attribute, so no
    #exception handling is needed and genuine errors are no longer hidden by a bare except.
    for i in response.find_all('a', href=True):
        if i['href'].startswith('https://www.vgchartz.com/game/'):
            response_games_only.append(i.parent.parent)
            
#Creating one empty list per variable, gathered in column order in master_list.
master_list = [[] for _ in range(13)]

#Each name below is bound to the list at the matching position of master_list,
#so appending through a name also fills master_list.
(rank, game, console, publisher,
 critic_score, user_score,
 total_shipped, total_sales,
 na_sales, pal_sales, jap_sales, other_sales,
 release_date) = master_list

#Extracting each game's variable data into the appropriate list, removing unnecessary elements.
for response in response_games_only:
    #Collect the row's cells once instead of searching the row again for every column.
    cells = response.find_all('td')
    rank.append(cells[0].get_text().strip())
    game.append(cells[2].get_text().strip())
    #The console is read from the alt text of the image inside the cell.
    console.append(cells[3].find('img', alt=True)['alt'].strip())
    publisher.append(cells[4].get_text().strip())
    critic_score.append(cells[5].get_text().strip())
    user_score.append(cells[6].get_text().strip())
    #Shipped and sales cells have every 'm' removed (presumably a millions suffix) and are
    #stripped like the score cells, so a padded 'N/A' still matches the exact comparison
    #used later when converting to numbers.
    total_shipped.append(cells[7].get_text().replace('m', '').strip())
    total_sales.append(cells[8].get_text().replace('m', '').strip())
    na_sales.append(cells[9].get_text().replace('m', '').strip())
    pal_sales.append(cells[10].get_text().replace('m', '').strip())
    jap_sales.append(cells[11].get_text().replace('m', '').strip())
    other_sales.append(cells[12].get_text().replace('m', '').strip())
    #Only the last two characters are kept: a two-digit year, or '/A' when the cell reads 'N/A'.
    release_date.append(cells[13].get_text().strip()[-2:])
    
#Pairing each column name with its list of values to form a dictionary: game_dict.
#The names are listed in the same order as the lists inside master_list.
keys = [
    'rank', 'game', 'console', 'publisher',
    'critic_score', 'user_score',
    'total_shipped', 'total_sales',
    'na_sales', 'pal_sales', 'jap_sales', 'other_sales',
    'release_date',
]

game_dict = {column_name: values for column_name, values in zip(keys, master_list)}

#Correcting game titles: a trailing 'Read the review' label is cut off and the remainder stripped.
review_label = 'Read the review'
game_dict['game'] = [
    title[:-len(review_label)].strip() if title.endswith(review_label) else title
    for title in game_dict['game']
]

#Correcting release dates: two-digit year strings become four-digit integers and missing data becomes 0.
release_years = []
for item in game_dict['release_date']:
    if item == '/A':
        #'/A' is the two-character tail left over from an 'N/A' cell.
        release_years.append(0)
    elif int(item) >= 70:
        #Two-digit years from 70 upwards are read as 19xx.
        release_years.append(np.int16('19' + item))
    else:
        #Everything below 70 is read as 20xx.
        release_years.append(np.int16('20' + item))
game_dict['release_date'] = release_years

#Converting the score and sales columns from strings to floats, with missing data ('N/A') set to 0.
numeric = [
    'critic_score', 'user_score',
    'total_shipped', 'total_sales',
    'na_sales', 'pal_sales', 'jap_sales', 'other_sales',
]

for key in numeric:
    converted = []
    for item in game_dict[key]:
        if item == 'N/A':
            converted.append(0)
        else:
            converted.append(np.float64(item))
    game_dict[key] = converted

#Creating a data frame with one column per dictionary key: df
df = pd.DataFrame(game_dict)

In [2]:
#Previewing the first rows of the data frame to check the scraped and cleaned columns.
df.head()

Unnamed: 0,rank,game,console,publisher,critic_score,user_score,total_shipped,total_sales,na_sales,pal_sales,jap_sales,other_sales,release_date
0,1,Wii Sports,Wii,Nintendo,7.7,0.0,82.9,0.0,0.0,0.0,0.0,0.0,2006
1,2,Super Mario Bros.,NES,Nintendo,10.0,8.2,40.24,0.0,0.0,0.0,0.0,0.0,1985
2,3,Mario Kart Wii,Wii,Nintendo,8.2,9.1,37.38,0.0,0.0,0.0,0.0,0.0,2008
3,4,Wii Sports Resort,Wii,Nintendo,8.0,8.8,33.14,0.0,0.0,0.0,0.0,0.0,2009
4,5,Pokémon Red / Green / Blue Version,GB,Nintendo,9.4,0.0,31.38,0.0,0.0,0.0,0.0,0.0,1998
