In [1]:
import pandas as pd
import re
import sys
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import datetime
from datetime import datetime
import calendar
from sklearn.metrics import jaccard_score as jscore
from scipy.spatial.distance import pdist, squareform
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords as sw
import chart_studio.plotly as py
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import cufflinks as cf
import plotly.express as px
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# One-time display setup for the rest of the notebook.
# Initialise plotly's notebook mode so figures render in cell outputs.
init_notebook_mode(connected=True)
# Switch cufflinks to offline mode.
cf.go_offline()
# Draw matplotlib figures inline, below the cell that creates them.
%matplotlib inline

In [2]:
# Read the games table and discard the 'Unnamed: 0' column
# (presumably a row index written out when the CSV was saved - confirm).
games = pd.read_csv('games_eda.csv').drop(columns='Unnamed: 0')
games.head()

Unnamed: 0,title,release_date,platforms,developer,esrb_rating,ESRBs,metascore,userscore,critic_reviews,user_reviews,num_players,summary,genres
0,Burnout 3: Takedown,2004-09-07,Xbox,Criterion Games,T,Mild Language Mild Violence,94,7.7,76,191.0,"1-2 Players, 8 Players Online",Burnout 3 challenges you to crash into (and th...,"Arcade, Racing, Driving"
1,Jet Grind Radio,2000-10-30,Dreamcast,Smilebit,T,Animated Violence Mild Language,94,8.3,24,105.0,1 Player,"Join a graffiti crew, stamp your territory and...","3D, Platformer, Action"
2,Metal Gear Solid 4: Guns of the Patriots,2008-06-12,PlayStation 3,Kojima Productions,M,Blood Crude Humor Strong Language Suggestive T...,94,8.8,82,4231.0,"1 Player, 16 Players Online Up to 16 Players",Metal Gear Solid 4: Guns of the Patriots featu...,"General, Modern, Action Adventure"
3,Tom Clancy's Splinter Cell Chaos Theory,2005-03-28,Xbox,Ubisoft Montreal,M,Blood Strong Language Violence,94,9.1,70,233.0,"1-4 Players, 4 Players Online","As Sam Fisher, Third Echelon's most skilled Sp...","Modern, Action Adventure"
4,Call of Duty: Modern Warfare 2,2009-11-10,Xbox 360,Infinity Ward,M,Blood Drug Reference Intense Violence Language,94,6.6,100,3055.0,Up to 18 Players,Modern Warfare 2 continues the gripping and he...,"Arcade, First-Person, Modern, Shooter, Action"


In [3]:
# Print each column's dtype and non-null count to see where values are missing.
games.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19317 entries, 0 to 19316
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   title           19317 non-null  object 
 1   release_date    19317 non-null  object 
 2   platforms       19317 non-null  object 
 3   developer       19298 non-null  object 
 4   esrb_rating     17202 non-null  object 
 5   ESRBs           7855 non-null   object 
 6   metascore       19317 non-null  int64  
 7   userscore       19317 non-null  object 
 8   critic_reviews  19317 non-null  int64  
 9   user_reviews    17953 non-null  float64
 10  num_players     19304 non-null  object 
 11  summary         19210 non-null  object 
 12  genres          19317 non-null  object 
dtypes: float64(1), int64(2), object(10)
memory usage: 1.9+ MB


#### Platform Counts

In [4]:
# Number of games per platform as a two-column frame: 'platforms', 'counts'.
# The column names are set explicitly with rename_axis / reset_index(name=...)
# instead of renaming whatever labels value_counts() happens to emit: those
# labels changed in pandas 2.0 (the result is named 'count' and the index
# carries the original column name), which made the old rename produce a frame
# without a 'platforms' column.
platform_counts = (
    games.platforms.value_counts()
    .rename_axis('platforms')
    .reset_index(name='counts')
)

# Horizontal bar per platform, labelled with its count.
fig = px.bar(platform_counts, x='counts', y='platforms', orientation='h', title='Platform Counts',
             color='platforms', template='seaborn', text='counts')
fig.update_traces(textposition='outside')
# The counts are printed on the bars, so the legend, gridlines, axis titles and
# x tick labels are switched off.
fig.update_layout(title_x=.5, title_y=.87, showlegend=False,
                  yaxis=dict(showgrid=False, title=dict(text='')),
                  xaxis=dict(showgrid=False, showticklabels=False, title=dict(text='')),
                  plot_bgcolor='white')

#### Genre Counts

In [5]:
# Genre names to count, one per line. The block starts and ends with a newline,
# so consumers that split on '\n' get an empty first and last element.
# Entries must not carry surrounding whitespace: they are compared for exact
# equality against genre tokens (a trailing space after "Wrestling" previously
# kept that genre from ever matching).
meta_genres = """
Action
Adventure
Fighting Games
First-Person Shooters
Flight/Flying
Party
Platformer
Puzzle
Racing
Real-Time Strategy
Role-Playing
Simulation
Sports
Strategy
Third-Person Shooter
Turn-Based Strategy
Wargames
Wrestling
"""

In [6]:
# Convert the newline-delimited genre block into a list of clean names.
# - The isinstance guard keeps this cell re-runnable: once meta_genres is a
#   list, calling .split on it again would raise.
# - Each entry is stripped so stray whitespace in the block cannot prevent an
#   exact match against a genre token.
if isinstance(meta_genres, str):
    meta_genres = [name.strip() for name in meta_genres.splitlines() if name.strip()]

# Flatten every game's genre string into one list of whitespace-separated words.
g = ' '.join(games.genres.dropna().tolist()).replace(' ,', ' ').strip().split(' ')
# Drop empty and '/' tokens, then remove the commas left attached to words.
gs = [word.replace(',', '') for word in g if word not in ('', '/')]
# Keep only words that are exactly one of the meta genres.
# NOTE: matching is per single word, so meta genres that contain a space
# (e.g. 'Fighting Games', 'Real-Time Strategy') can never match here.
wanted_genres = set(meta_genres)
Gs = [genre for genre in gs if genre in wanted_genres]

# Two-column frame: 'genre', 'counts'. Column names are set explicitly because
# the labels value_counts() emits differ between pandas releases, so renaming
# a positional column 0 is not reliable.
genre_counts = (
    pd.Series(Gs, name='genre', dtype='object')
    .value_counts()
    .rename_axis('genre')
    .reset_index(name='counts')
)

fig = px.bar(genre_counts, x='counts', y='genre', orientation='h', title='Genre Counts', color='genre',
             template='seaborn', text='counts')
# The counts are printed on the bars, so the legend, gridlines, axis titles and
# x tick labels are switched off.
fig.update_layout(title_x=.5, title_y=.87, showlegend=False,
                  yaxis=dict(showgrid=False, title=dict(text='')),
                  xaxis=dict(showgrid=False, showticklabels=False, title=dict(text='')),
                  plot_bgcolor='white')

#### ESRB Rating Counts

In [7]:
# Number of games per ESRB rating as a two-column frame: 'esrb', 'counts'.
# Games without a rating are excluded (value_counts skips NaN by default).
# Column names are set explicitly rather than by renaming the labels
# value_counts() emits, because those labels changed in pandas 2.0 and the old
# rename then left the frame without an 'esrb' column.
esrb_rating_counts = (
    games.esrb_rating.value_counts()
    .rename_axis('esrb')
    .reset_index(name='counts')
)

fig = px.bar(esrb_rating_counts, x='counts', y='esrb', orientation='h', title='ESRB Rating Counts',
             color='esrb', template='seaborn', text='counts')
# Larger y tick font for the rating labels; the legend, gridlines, axis titles
# and x tick labels are switched off.
fig.update_layout(title_x=.45, title_y=.87, showlegend=False,
                  yaxis=dict(showgrid=False, title=dict(text=''), tickfont=dict(size=20)),
                  xaxis=dict(showgrid=False, showticklabels=False, title=dict(text='')),
                  plot_bgcolor='white')

#### ESRB Descriptor Counts

In [8]:
stopwords = sw.words('english')

# Every word that appears in any game's ESRB descriptor text.
# split() with no argument is used so runs of whitespace do not produce
# empty-string "descriptors".
e = ' '.join(games.ESRBs.dropna().tolist()).split()

# Word frequencies indexed by 'descriptor', with a single 'counts' column.
# Names are set explicitly because the labels value_counts() emits differ
# between pandas releases, so renaming a positional column 0 is not reliable.
esrbs = (
    pd.Series(e, name='descriptor', dtype='object')
    .value_counts()
    .rename_axis('descriptor')
    .to_frame('counts')
)

# Words to leave out of the chart: English stopwords plus a hand-picked list.
stopword_set = set(stopwords)
eliminate = [word for word in esrbs.index.values if word in stopword_set]
eliminate.extend(['Mild', 'Themes', 'Reference', 'Intense', 'Use', 'Lyrics', 'Drug', 'Partial', 'Content', '&',
                  'References', 'Users', 'All', 'Strong', 'Humor', 'Violent', 'Mature'])
# Set lookups instead of repeatedly scanning the list.
eliminate_set = set(eliminate)
keep = [word for word in esrbs.index.values if word not in eliminate_set]
descriptors = esrbs.loc[keep].reset_index()

fig = px.bar(descriptors, x='counts', y='descriptor', orientation='h', title='ESRB Descriptor Counts',
             color='descriptor', template='seaborn', text='counts')
# The counts are printed on the bars, so the legend, gridlines, axis titles and
# x tick labels are switched off.
fig.update_layout(title_x=.5, title_y=.87, showlegend=False,
                  yaxis=dict(showgrid=False, title=dict(text='')),
                  xaxis=dict(showgrid=False, showticklabels=False, title=dict(text='')),
                  plot_bgcolor='white')
fig.update_traces(textposition='outside')

#### Metascore and User Score by Release Date

In [9]:
# Critic and user scores by release date.
# 'tbd' user scores are treated as missing, and rows with any missing value are dropped.
meta_and_user_scores = (
    games[['release_date', 'metascore', 'userscore']]
    .replace('tbd', np.nan)
    .dropna()
    .reset_index(drop=True)
)
# User scores are multiplied by 10 (truncated to int) so both series can share one y axis.
meta_and_user_scores['userscore'] = (pd.to_numeric(meta_and_user_scores['userscore']) * 10).astype('int64')
meta_and_user_scores['release_date'] = pd.to_datetime(meta_and_user_scores['release_date'])
meta_and_user_scores = meta_and_user_scores.sort_values('release_date').set_index('release_date')

# Monthly mean, then a 12-month rolling mean; the dropna removes windows that
# are incomplete or contain a month with no data.
# An explicit MonthEnd offset replaces the '1m' string: the lowercase alias is
# undocumented and newer pandas releases deprecate it.
resampled_scores = (
    meta_and_user_scores
    .resample(pd.offsets.MonthEnd())
    .mean()
    .rolling(12)
    .mean()
    .dropna()
)

fig = px.line(resampled_scores, x=resampled_scores.index, y=['metascore', 'userscore'],
              title='MetaScores vs UserScores 1999-2022')
fig.update_layout(title_x=.5, title_y=.87, showlegend=True, legend_title='',
                  yaxis=dict(linecolor='black', title=dict(text='')),
                  xaxis=dict(linecolor='black', showline=True, showticklabels=True, title=dict(text='')),
                  plot_bgcolor='white')

#### Ratings by Platform

In [10]:
# Mean metascore and user score for each platform.
# 'tbd' user scores are treated as missing, and rows with any missing value are dropped.
platform_scores = (
    games[['platforms', 'metascore', 'userscore']]
    .replace('tbd', np.nan)
    .dropna()
    .reset_index(drop=True)
)
# User scores are multiplied by 10 (truncated to int) before averaging so they
# can be plotted next to metascores.
platform_scores['userscore'] = (pd.to_numeric(platform_scores['userscore']) * 10).astype('int64')
platform_scores = platform_scores.groupby('platforms').mean().reset_index().round(2)

# Grouped bars: one metascore bar and one user-score bar per platform.
fig = px.bar(
    platform_scores,
    x='platforms',
    y=['metascore', 'userscore'],
    barmode='group',
    title='Ratings per Platform',
    template='seaborn',
)
# The y axis is limited to 62-82.
# (Per-bar value labels are not drawn; they could be set through fig.data[i].text.)
fig.update_layout(
    title_x=.5,
    title_y=.87,
    showlegend=True,
    legend_title='',
    yaxis=dict(showgrid=True, gridcolor='grey', title=dict(text=''), range=[62, 82]),
    xaxis=dict(showgrid=False, showticklabels=True, title=dict(text='')),
    plot_bgcolor='white',
)

#### Number of Reviews over Time

In [11]:
# Critic and user review counts by release date; rows missing either count are dropped.
reviews_time = games[['release_date', 'critic_reviews', 'user_reviews']].dropna().sort_values('release_date')
reviews_time['user_reviews'] = reviews_time['user_reviews'].astype('int64')
reviews_time['release_date'] = pd.to_datetime(reviews_time['release_date'])
# Monthly totals, then a 12-month rolling sum (the first 11 months stay NaN).
# An explicit MonthEnd offset replaces the '1m' string: the lowercase alias is
# undocumented and newer pandas releases deprecate it.
reviews_time = (
    reviews_time
    .set_index('release_date')
    .resample(pd.offsets.MonthEnd())
    .sum()
    .rolling(12)
    .sum()
)

# User reviews go on the primary y axis and critic reviews on a secondary one,
# so each series gets its own scale.
fig = make_subplots(specs=[[{"secondary_y": True}]])

fig.add_trace(go.Scatter(x=reviews_time.index, y=reviews_time.user_reviews, name="User Reviews"), secondary_y=False)

fig.add_trace(go.Scatter(x=reviews_time.index, y=reviews_time.critic_reviews, name="Critic Reviews"), secondary_y=True)

fig.update_layout(title_text="Amount of Reviews over Time", title_x=.43, plot_bgcolor='white',
                  xaxis=dict(linecolor='black', showline=True, showticklabels=True, title=dict(text='')))

# Axis title colours: blue for user reviews, red for critic reviews
# (presumably chosen to match the default trace colours - confirm).
fig.update_yaxes(title_font_color='blue', title_text="User Reviews", linecolor='black', secondary_y=False)
fig.update_yaxes(title_font_color='red', title_text="Critic Reviews", linecolor='black', secondary_y=True)