# Exploratory Data Analysis (EDA): Steam Game Data

## Preliminary

### Import Modules

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

### Set Options

In [2]:
# Show up to 70 rows and 50 columns before pandas truncates its output.
pd.options.display.max_rows = 70
pd.options.display.max_columns = 50
# Render floats with exactly two decimal places (e.g. 96.39).
pd.options.display.float_format = '{:.2f}'.format

## Load Data

In [10]:
# Load the combined game dataset. index_col=0 uses the first CSV column as
# the row index instead of loading it as a regular data column.
df = pd.read_csv(
    filepath_or_buffer='../../data/game_data_all.csv',
    index_col=0,
)

## Inspect Data

In [11]:
# (rows, columns) of the loaded frame.
df.shape

(67571, 19)

In [12]:
# Preview the first three games, transposed so each column name becomes a row
# (easier to read for a wide frame).
df.iloc[:3].transpose()

Unnamed: 0,0,1,2
game,Pizza Tower,Resident Evil 4,The Murder of Sonic the Hedgehog
link,/app/2231450/,/app/2050650/,/app/2324650/
release,2023-01-26,2023-03-24,2023-03-31
peak_players,4529,168191,15543
positive_reviews,19807,61752,12643
negative_reviews,227,1616,213
total_reviews,20034,63368,12856
rating,96.39,95.75,95.54
primary_genre,Action (1),Action (1),Casual (4)
store_genres,"Action (1), Indie (23)","Action (1), Adventure (25)","Casual (4), Free to Play (37), Indie (23), Sim..."


In [6]:
# Summarise each column's non-null count and dtype for the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67571 entries, 0 to 67570
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Unnamed: 0             67571 non-null  int64  
 1   game                   67571 non-null  object 
 2   link                   67571 non-null  object 
 3   release                67571 non-null  object 
 4   peak_players           67571 non-null  int64  
 5   positive_reviews       67571 non-null  int64  
 6   negative_reviews       67571 non-null  int64  
 7   total_reviews          67571 non-null  int64  
 8   rating                 67571 non-null  float64
 9   primary_genre          67561 non-null  object 
 10  store_genres           67514 non-null  object 
 11  publisher              67110 non-null  object 
 12  developer              67443 non-null  object 
 13  detected_technologies  60265 non-null  object 
 14  store_asset_mod_time   67275 non-null  object 
 15  re

In [7]:
# Summary statistics for every column (numeric and non-numeric alike),
# transposed so each original column is shown as one row.
summary_stats = df.describe(include='all')
summary_stats.transpose()

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
Unnamed: 0,67571.0,,,,33785.0,19506.21,0.0,16892.5,33785.0,50677.5,67570.0
game,67571.0,65948.0,Alone,36.0,,,,,,,
link,67571.0,66427.0,/app/1835250/,6.0,,,,,,,
release,67571.0,4023.0,2023-03-31,129.0,,,,,,,
peak_players,67571.0,,,,952.87,19790.93,0.0,3.0,7.0,46.0,3257248.0
positive_reviews,67571.0,,,,1273.53,29551.63,0.0,5.0,19.0,99.0,6307931.0
negative_reviews,67571.0,,,,216.89,5434.96,0.0,1.0,6.0,29.0,927317.0
total_reviews,67571.0,,,,1490.42,34009.23,1.0,7.0,26.0,132.0,7117776.0
rating,67571.0,,,,65.29,14.06,15.09,56.53,66.76,75.64,97.54
primary_genre,67561.0,28.0,Indie (23),17862.0,,,,,,,


## Format Data

### Set Datetime Format

In [15]:
# Convert the two date columns to pandas datetime dtype so they can be used
# for date-based operations later on.
for date_col in ('release', 'all_time_peak_date'):
    df[date_col] = pd.to_datetime(df[date_col])

### Check Formatted Data

In [16]:
# Re-inspect the dtypes to confirm the date columns are now datetime64.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 67571 entries, 0 to 67570
Data columns (total 19 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   game                   67571 non-null  object        
 1   link                   67571 non-null  object        
 2   release                67571 non-null  datetime64[ns]
 3   peak_players           67571 non-null  int64         
 4   positive_reviews       67571 non-null  int64         
 5   negative_reviews       67571 non-null  int64         
 6   total_reviews          67571 non-null  int64         
 7   rating                 67571 non-null  float64       
 8   primary_genre          67561 non-null  object        
 9   store_genres           67514 non-null  object        
 10  publisher              67110 non-null  object        
 11  developer              67443 non-null  object        
 12  detected_technologies  60265 non-null  object        
 13  store_