In [None]:
from bs4 import BeautifulSoup
import pandas as pd
import requests
import time

def extract_game_details(url):
    """Download a game detail page and scrape its release date, genres and publisher.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        A three-element list ``[date_text, genre_text, publisher_text]``.
        ``date_text`` and ``publisher_text`` are whitespace-stripped strings,
        ``genre_text`` is a list of whitespace-stripped strings. Any field
        whose markup is not found on the page is ``None`` and a message
        naming the URL is printed.
    """
    review_page = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
    review_soup = BeautifulSoup(review_page.text, 'html.parser')

    # BeautifulSoup's find() returns None when nothing matches, so a missing
    # element shows up as AttributeError on the chained call. Catching only
    # that keeps the best-effort behaviour without swallowing unrelated
    # failures (or a Ctrl-C during a long scrape) the way a bare except did.
    try:
        date = review_soup.find(class_='summary_detail release_data').find(class_='data')
        date_text = date.text.strip()
    except AttributeError:
        print(f"{url} has no date")
        date_text = None

    try:
        publisher = review_soup.find(class_='summary_detail publisher')
        publisher_text = publisher.find('a').text.strip()
    except AttributeError:
        print(f"{url} has no publisher details")
        publisher_text = None

    try:
        genres = review_soup.find(class_='product_genre').find_all(class_='data')
        genre_text = [genre.text.strip() for genre in genres]
    except AttributeError:
        print(f"{url} has no genre details")
        genre_text = None

    # Order matters: the caller unpacks this positionally into its row.
    return [date_text, genre_text, publisher_text]

def title_exists(title, reviews=None):
    """Report whether a review row with the given title has already been collected.

    Args:
        title: Title to look for.
        reviews: Optional iterable of review rows whose first element is the
            title. When omitted, the module-level ``all_reviews`` list is
            searched, which is the original behaviour.

    Returns:
        ``True`` if any row's first element equals ``title``, else ``False``.
    """
    if reviews is None:
        # Looked up at call time so a rebound global list is always honoured.
        reviews = all_reviews
    return any(review[0] == title for review in reviews)


# Rows accumulated for the outlet currently being scraped:
# [title, score, date, genres, publisher].
all_reviews = []
# One entry per review outlet: its slug in the publication URL and how many
# listing pages to walk.
game_publishers = [{'company_name': 'ign', 'page_num': 501}, {'company_name': 'gamespot', 'page_num': 405}]

for game_media_company in game_publishers:
    company = game_media_company['company_name']
    for page_num in range(0, game_media_company['page_num']):
        url = f"https://www.metacritic.com/publication/{company}?filter=games&page={page_num}"
        page = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
        soup = BeautifulSoup(page.text, 'html.parser')

        for idx, review in enumerate(soup.find_all(class_='critic_review')):
            try:
                review_anchor = review.find('a')
                review_title = review_anchor.text.strip()
                review_url = f"https://www.metacritic.com{review_anchor['href']}"
                review_score = int(review.find(class_='indiv').text)
            except (AttributeError, KeyError, TypeError, ValueError):
                # Missing anchor/score element, missing href, or a score that
                # is not an integer: skip this entry, keep the rest of the page.
                print(f"{url} number {idx} skipped")
                continue

            # Keep only the first row seen for each title.
            if title_exists(review_title):
                continue
            all_reviews.append([review_title, review_score, *extract_game_details(review_url)])
        print(f"Done with {company} page {page_num}")

    df = pd.DataFrame(all_reviews, columns=['title', 'score', 'date', 'genres', 'publisher'])
    # The dict key is 'company_name'; indexing with 'company' raised KeyError
    # here, after all pages had been scraped, so no CSV was ever written.
    df.to_csv(f"{company}_final.csv", index=False)

    # Start the next outlet with an empty accumulator.
    all_reviews = []
    


In [None]:
import matplotlib.pyplot as plt

# Per-year review count and mean score for the gamespot scrape.
df = pd.read_csv('gamespot_final.csv')

# Dates that do not match the expected format become NaT and are dropped by
# the range filter below.
df['date'] = pd.to_datetime(df['date'], format='%b %d, %Y', errors='coerce')
# .copy() so that adding the 'year' column does not write into a slice of the
# unfiltered frame (SettingWithCopyWarning).
df = df[(df['date'] < '2022-01-01') & (df['date'] > '2000-01-01')].copy()

df['year'] = df.date.dt.year
by_year = df.groupby('year')
df_count = by_year['title'].count().to_frame()
# Select 'score' before aggregating: mean() over the whole group also tries
# the string columns, which raises TypeError on pandas >= 2.0.
df_score = by_year['score'].mean().round().to_frame()

df = pd.merge(df_count, df_score, left_on='year', right_on='year')
df = df.rename(columns={'title': 'count'})
df.to_csv('blah_gamespot.csv')

In [None]:
# Per-year review count and mean score for the ign scrape.
df = pd.read_csv('ign_final.csv')

# Dates that do not match the expected format become NaT and are dropped by
# the range filter below.
df['date'] = pd.to_datetime(df['date'], format='%b %d, %Y', errors='coerce')
# .copy() so that adding the 'year' column does not write into a slice of the
# unfiltered frame (SettingWithCopyWarning).
df = df[(df['date'] < '2022-01-01') & (df['date'] > '2000-01-01')].copy()

df['year'] = df.date.dt.year
by_year = df.groupby('year')
df_count = by_year['title'].count().to_frame()
# Select 'score' before aggregating: mean() over the whole group also tries
# the string columns, which raises TypeError on pandas >= 2.0.
df_score = by_year['score'].mean().round().to_frame()

df = pd.merge(df_count, df_score, left_on='year', right_on='year')
df = df.rename(columns={'title': 'count'})
df.to_csv('blah_ign.csv')

In [None]:
from plotnine import *
import seaborn as sns
import pandas as pd

# Ten best-scoring publishers per outlet, restricted to publishers with more
# than 15 reviewed games so that a handful of reviews cannot top the ranking.
df = pd.read_csv('ign_final.csv')
df = df.groupby('publisher').filter(lambda x: len(x) > 15)
# Select 'score' before aggregating: mean() over the whole group also tries
# the string columns, which raises TypeError on pandas >= 2.0.
ign_top_10 = df.groupby('publisher')['score'].mean().round(1).sort_values(ascending=False).head(n=10).to_frame()
ign_top_10['company'] = 'ign'

df = pd.read_csv('gamespot_final.csv')
df = df.groupby('publisher').filter(lambda x: len(x) > 15)
gamespot_top_10 = df.groupby('publisher')['score'].mean().round(1).sort_values(ascending=False).head(n=10).to_frame()
gamespot_top_10['company'] = 'gamespot'

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the two frames
# the same way.
final_df = pd.concat([ign_top_10, gamespot_top_10])
final_df = final_df.reset_index()

print(final_df)

# Hand-picked publishers left out of the chart.
final_df = final_df[~final_df['publisher'].isin(['Stardock', 'Microsoft Game Studios', 'EA Games', 'Telltale Games'])]

sns.set(rc = {'figure.figsize':(15,12)})
# Map each hue level to its colour. A per-row colour list is read as one colour
# per hue level, so only its first entries were used and the outlets could not
# be told apart.
palette = {'ign': 'red', 'gamespot': 'black'}
plot = sns.barplot(data=final_df, x='score', y='publisher', hue='company', orient='h', palette=palette)
plot

In [234]:
# Ten best-scoring genres in the gamespot scrape, among genres with more than
# 20 reviewed games.
df_gamespot = pd.read_csv('gamespot_final.csv')
# The CSV stores each genre list as its string form, e.g. "['Action', 'Shooter']".
# Stripping the brackets and splitting on ', ' yields pieces that still carry
# their quote characters; rows without a genre string become ''.
df_gamespot.genres = df_gamespot.genres.apply(lambda x: x.strip('][').split(', ') if isinstance(x, str) else '')
# One row per (game, genre) pair.
df_gamespot = df_gamespot.explode('genres')
df_gamespot = df_gamespot.groupby('genres').filter(lambda x: len(x) > 20)
# Select 'score' before aggregating: mean() over the whole group also tries
# the string columns, which raises TypeError on pandas >= 2.0.
df_gamespot.groupby('genres')['score'].mean().sort_values(ascending=False).head(n=10)

genres
'4X'               77.368421
'Soccer'           76.468085
'Trainer'          74.394737
'Stock Car'        74.258065
'Sim'              74.141649
'Ice Hockey'       73.934783
'Baseball'         73.321839
'Visual Novel'     73.088235
'Football'         73.054945
'Western-Style'    72.808696
Name: score, dtype: float64

In [246]:
def _split_genre_cell(cell):
    """Split a stringified genre list into its pieces; non-strings become ''."""
    if not isinstance(cell, str):
        return ''
    # Pieces keep their surrounding quote characters, e.g. "'Action'".
    return cell.strip('][').split(', ')


df_ign = pd.read_csv('ign_final.csv')
df_ign['genres'] = df_ign['genres'].apply(_split_genre_cell)
# One row per (game, genre) pair.
df_ign_explode = df_ign.explode('genres')


In [260]:
# Mean ign score per year for one genre, among genres with more than 15
# exploded rows inside the 2000-2022 date window.
df_ign_explode['date'] = pd.to_datetime(df_ign_explode['date'], format='%b %d, %Y', errors='coerce')
df_ign_explode['year'] = df_ign_explode.date.dt.year
df_ign_explode = df_ign_explode[(df_ign_explode['date'] < '2022-01-01') & (df_ign_explode['date'] > '2000-01-01')]
df_ign_explode = df_ign_explode.groupby('genres').filter(lambda x: len(x) > 15)

print(df_ign_explode.genres.unique())
# Genre values still carry their quote characters, hence the quoted literal.
# Select [['score']] before aggregating: mean() over the whole group also tries
# the string columns, which raises TypeError on pandas >= 2.0. The result is
# still a one-column DataFrame. The earlier all-genre groupby mean was
# computed and then discarded, so it is dropped.
df_ign_explode[df_ign_explode['genres'] == "'Role-Playing'"].groupby(['year', 'genres'])[['score']].mean()

["'Action'" "'Shooter'" "'First-Person'" "'Tactical'" "'Role-Playing'"
 "'Massively Multiplayer'" "'Action Adventure'" "'Open-World'"
 "'Platformer'" "'3D'" "'Simulation'" "'Flight'" "'Combat'" "'Puzzle'"
 "'Arcade'" "'Space'" "'Strategy'" "'Real-Time'" "'Tactics'"
 "'Turn-Based'" "'Card Battle'" "'Virtual'" "'Virtual Life'" "'General'"
 "'Trainer'" "'Miscellaneous'" "'Compilation'" "'Management'"
 "'Business / Tycoon'" "'Adventure'" "'Racing'" "'Automobile'" "'Sports'"
 "'Individual'" "'Other'" "'Survival'" "'Party / Minigame'" "'Action RPG'"
 "'Linear'" "'Command'" "'Team'" "'Ice Hockey'" "'Sim'" "'Fighting'"
 "'2D'" "'Soccer'" "'Japanese-Style'" "'Third-Person'"
 "'Skate / Skateboard'" "'Basketball'" "'Boxing / Martial Arts'"
 '"Shoot-\'Em-Up"' "'Top-Down'" '"Beat-\'Em-Up"' "'Football'"
 "'Board / Card Game'" "'MOBA'" "'4X'" "'Light Gun'" "'Golf'"
 "'Application'" "'Sandbox'" "'Rail'" "'Baseball'" "'Western-Style'"
 "'Vehicle'" "'Rhythm'" "'Music'" "'Civilian'" "'Visual Novel'"
 "'W

Unnamed: 0_level_0,Unnamed: 1_level_0,score
year,genres,Unnamed: 2_level_1
2000,'Role-Playing',77.64
2001,'Role-Playing',75.541667
2002,'Role-Playing',76.210526
2003,'Role-Playing',75.636364
2004,'Role-Playing',80.836735
2005,'Role-Playing',73.58
2006,'Role-Playing',71.656716
2007,'Role-Playing',70.097222
2008,'Role-Playing',71.521127
2009,'Role-Playing',72.369863
