In [3]:
#import statements
import pandas as pd

### Data Loading

Data sourced from https://www.kaggle.com/datasets/nikdavis/steam-store-games?select=steamspy_tag_data.csv

The author of the dataset documented their process of using Steam APIs here:
https://nik-davis.github.io/posts/2019/steam-data-collection/

The dataset is stored under a data folder in my local repository. 

The filepath is ./data/Steam_store_data

There are 6 CSV files in the dataset.

In [1]:
#list the current working directory (the data folder should appear here)
!ls

README.md
data
notebook.ipynb
scratch_notebook.ipynb


In [4]:
#load the main steam.csv table from the local data folder
steam_csv_path = './data/Steam_store_data/steam.csv'
steam_df = pd.read_csv(steam_csv_path)

In [5]:
#column names, non-null counts and dtypes for the loaded table
steam_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27075 entries, 0 to 27074
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   appid             27075 non-null  int64  
 1   name              27075 non-null  object 
 2   release_date      27075 non-null  object 
 3   english           27075 non-null  int64  
 4   developer         27075 non-null  object 
 5   publisher         27075 non-null  object 
 6   platforms         27075 non-null  object 
 7   required_age      27075 non-null  int64  
 8   categories        27075 non-null  object 
 9   genres            27075 non-null  object 
 10  steamspy_tags     27075 non-null  object 
 11  achievements      27075 non-null  int64  
 12  positive_ratings  27075 non-null  int64  
 13  negative_ratings  27075 non-null  int64  
 14  average_playtime  27075 non-null  int64  
 15  median_playtime   27075 non-null  int64  
 16  owners            27075 non-null  object

### Exploring the data

In [23]:
#(number of rows, number of columns)
steam_df.shape

(27075, 18)

There are 27,075 rows - each representing a game in Steam's store - and 18 columns.

Let's go through each column one-by-one:

In [9]:
#appid: unique identifier for each game
steam_df.loc[:, 'appid']

0             10
1             20
2             30
3             40
4             50
          ...   
27070    1065230
27071    1065570
27072    1065650
27073    1066700
27074    1069460
Name: appid, Length: 27075, dtype: int64

In [10]:
#name of game
#(the frame is steam_df; the previous `team_df` spelling raises NameError on a fresh kernel)
steam_df['name']

0                    Counter-Strike
1             Team Fortress Classic
2                     Day of Defeat
3                Deathmatch Classic
4         Half-Life: Opposing Force
                    ...            
27070               Room of Pandora
27071                     Cyber Gun
27072              Super Star Blast
27073    New Yankee 7: Deer Hunters
27074                     Rune Lord
Name: name, Length: 27075, dtype: object

In [11]:
#date of release, formatted as YYYY-MM-DD
steam_df.loc[:, 'release_date']

0        2000-11-01
1        1999-04-01
2        2003-05-01
3        2001-06-01
4        1999-11-01
            ...    
27070    2019-04-24
27071    2019-04-23
27072    2019-04-24
27073    2019-04-17
27074    2019-04-24
Name: release_date, Length: 27075, dtype: object

In [12]:
#each release_date entry is held as a plain string; check the first row
type(steam_df.loc[0, 'release_date'])

str

In [13]:
#english: integer flag column (see the value counts in the next cell)
steam_df.loc[:, 'english']

0        1
1        1
2        1
3        1
4        1
        ..
27070    1
27071    1
27072    1
27073    1
27074    1
Name: english, Length: 27075, dtype: int64

In [14]:
#how many games carry each value of the english flag
steam_df.loc[:, 'english'].value_counts()

1    26564
0      511
Name: english, dtype: int64

There are 26,564 English games and 511 non-English games.

In [16]:
#developer: name of the game's developer
steam_df.loc[:, 'developer']

0                     Valve
1                     Valve
2                     Valve
3                     Valve
4          Gearbox Software
                ...        
27070           SHEN JIAWEI
27071        Semyon Maximov
27072           EntwicklerX
27073    Yustas Game Studio
27074      Adept Studios GD
Name: developer, Length: 27075, dtype: object

In [17]:
#publisher: name of the game's publisher
steam_df.loc[:, 'publisher']

0                       Valve
1                       Valve
2                       Valve
3                       Valve
4                       Valve
                 ...         
27070             SHEN JIAWEI
27071        BekkerDev Studio
27072             EntwicklerX
27073    Alawar Entertainment
27074    Alawar Entertainment
Name: publisher, Length: 27075, dtype: object

In [18]:
#platforms: semicolon-separated list of platforms the game is available on
steam_df.loc[:, 'platforms']

0        windows;mac;linux
1        windows;mac;linux
2        windows;mac;linux
3        windows;mac;linux
4        windows;mac;linux
               ...        
27070              windows
27071              windows
27072              windows
27073          windows;mac
27074          windows;mac
Name: platforms, Length: 27075, dtype: object

In [19]:
#number of games for each platform combination
steam_df.loc[:, 'platforms'].value_counts()

windows              18398
windows;mac;linux     4623
windows;mac           3439
windows;linux          610
mac                      3
mac;linux                1
linux                    1
Name: platforms, dtype: int64

In [20]:
#games whose platforms value is exactly 'mac' (i.e. mac-only titles)
is_mac_only = steam_df['platforms'].eq('mac')
steam_df.loc[is_mac_only]

Unnamed: 0,appid,name,release_date,english,developer,publisher,platforms,required_age,categories,genres,steamspy_tags,achievements,positive_ratings,negative_ratings,average_playtime,median_playtime,owners,price
1413,214630,Call of Duty: Black Ops - Mac Edition,2012-09-27,1,Aspyr,Aspyr,mac,18,Single-player;Multi-player;Co-op;Steam Achieve...,Action,Action;Zombies;Multiplayer,68,168,105,0,0,50000-100000,15.49
12479,569050,Paul Pixel - The Awakening,2017-01-09,1,Xoron GmbH,Xoron GmbH,mac,0,Single-player,Adventure;Indie,Adventure;Indie;Point & Click,0,5,0,0,0,0-20000,2.89
16662,694180,MobileZombie,2017-10-13,1,YIMING ZHANG,YIMING ZHANG,mac,0,Single-player;Partial Controller Support,Adventure;Casual;Free to Play;Indie,Free to Play;Adventure;Indie,0,14,11,0,0,0-20000,0.0


In [21]:
#required_age: age requirement for the game
steam_df.loc[:, 'required_age']

0        0
1        0
2        0
3        0
4        0
        ..
27070    0
27071    0
27072    0
27073    0
27074    0
Name: required_age, Length: 27075, dtype: int64

In [22]:
#number of games at each age requirement
steam_df.loc[:, 'required_age'].value_counts()

0     26479
18      308
16      192
12       73
7        12
3        11
Name: required_age, dtype: int64

In [24]:
#categories: semicolon-separated category labels for each game
steam_df.loc[:, 'categories']

0        Multi-player;Online Multi-Player;Local Multi-P...
1        Multi-player;Online Multi-Player;Local Multi-P...
2                    Multi-player;Valve Anti-Cheat enabled
3        Multi-player;Online Multi-Player;Local Multi-P...
4        Single-player;Multi-player;Valve Anti-Cheat en...
                               ...                        
27070                     Single-player;Steam Achievements
27071                                        Single-player
27072    Single-player;Multi-player;Co-op;Shared/Split ...
27073                            Single-player;Steam Cloud
27074                            Single-player;Steam Cloud
Name: categories, Length: 27075, dtype: object

In [25]:
#categories come in a large number of combinations
#might have to clean this up separately to pull out the most meaningful categories
steam_df.loc[:, 'categories'].value_counts()

Single-player                                                                                                                                                            6110
Single-player;Steam Achievements                                                                                                                                         2334
Single-player;Steam Achievements;Steam Trading Cards                                                                                                                      848
Single-player;Partial Controller Support                                                                                                                                  804
Single-player;Steam Trading Cards                                                                                                                                         792
                                                                                                                                  

In [6]:
#number of games for each genre combination
steam_df.loc[:, 'genres'].value_counts()

Action;Indie                                           1852
Casual;Indie                                           1482
Action;Adventure;Indie                                 1229
Adventure;Indie                                        1170
Action;Casual;Indie                                    1004
                                                       ... 
Violent;Action;Casual;Indie;Simulation;Early Access       1
Casual;Racing;Early Access                                1
Adventure;Indie;Simulation;Sports                         1
Massively Multiplayer;Early Access                        1
Utilities;Web Publishing                                  1
Name: genres, Length: 1552, dtype: int64

Some of these items aren't games; some are developer tools or asset packs. I'll remove those items from the dataset to focus solely on games.

In [26]:
#steamspy_tags: content genres pulled using the SteamSpy API
steam_df.loc[:, 'steamspy_tags']

0              Action;FPS;Multiplayer
1              Action;FPS;Multiplayer
2        FPS;World War II;Multiplayer
3              Action;FPS;Multiplayer
4                   FPS;Action;Sci-fi
                     ...             
27070          Adventure;Indie;Casual
27071          Action;Indie;Adventure
27072             Action;Indie;Casual
27073          Indie;Casual;Adventure
27074          Indie;Casual;Adventure
Name: steamspy_tags, Length: 27075, dtype: object

In [27]:
#number of games for each combination of content-genre tags
steam_df.loc[:, 'steamspy_tags'].value_counts()

Action;Indie;Casual                     845
Action;Adventure;Indie                  714
Early Access;Action;Indie               507
Adventure;Indie;Casual                  442
Indie;Casual                            378
                                       ... 
Action;Sports;Hunting                     1
Stealth;Female Protagonist;Cyberpunk      1
Casual;Space                              1
Horses;Casual;Family Friendly             1
Simulation;Realistic;Naval                1
Name: steamspy_tags, Length: 6423, dtype: int64

In [28]:
#achievements: number of achievements available in the game
steam_df.loc[:, 'achievements']

0         0
1         0
2         0
3         0
4         0
         ..
27070     7
27071     0
27072    24
27073     0
27074     0
Name: achievements, Length: 27075, dtype: int64

In [29]:
#number of games at each achievement count
steam_df.loc[:, 'achievements'].value_counts()

0       11864
10        679
12        618
20        574
15        490
        ...  
184         1
168         1
1487        1
319         1
4094        1
Name: achievements, Length: 410, dtype: int64

In [30]:
#is achievements a meaningful consideration for buyers?
#inspect the game that reports 4094 achievements
has_4094_achievements = steam_df['achievements'].eq(4094)
steam_df.loc[has_4094_achievements]

Unnamed: 0,appid,name,release_date,english,developer,publisher,platforms,required_age,categories,genres,steamspy_tags,achievements,positive_ratings,negative_ratings,average_playtime,median_playtime,owners,price
13740,604490,Running Through Russia,2017-05-21,1,Spell Helix,Spell Helix,windows;mac;linux,0,Single-player;Steam Achievements,Adventure;Casual;Indie;Early Access,Early Access;Indie;Casual,4094,751,335,22,22,50000-100000,0.79


In [31]:
#positive_ratings column
steam_df.loc[:, 'positive_ratings']

0        124534
1          3318
2          3416
3          1273
4          5250
          ...  
27070         3
27071         8
27072         0
27073         2
27074         4
Name: positive_ratings, Length: 27075, dtype: int64

In [32]:
#negative_ratings column
steam_df.loc[:, 'negative_ratings']

0        3339
1         633
2         398
3         267
4         288
         ... 
27070       0
27071       1
27072       1
27073       0
27074       0
Name: negative_ratings, Length: 27075, dtype: int64

In [39]:
#games with zero positive ratings AND zero negative ratings
zero_positive = steam_df['positive_ratings'].eq(0)
zero_negative = steam_df['negative_ratings'].eq(0)
no_ratings = steam_df.loc[zero_positive & zero_negative]
no_ratings
#the selection is empty, so every game in this dataset has at least 1 rating
#there shouldn't be any divide-by-0 errors if I create a % positive/% negative ratings feature

Unnamed: 0,appid,name,release_date,english,developer,publisher,platforms,required_age,categories,genres,steamspy_tags,achievements,positive_ratings,negative_ratings,average_playtime,median_playtime,owners,price


In [40]:
#average_playtime: average user playtime (minutes)
steam_df.loc[:, 'average_playtime']

0        17612
1          277
2          187
3          258
4          624
         ...  
27070        0
27071        0
27072        0
27073        0
27074        0
Name: average_playtime, Length: 27075, dtype: int64

In [41]:
#games whose average playtime is recorded as 0
has_zero_playtime = steam_df['average_playtime'].eq(0)
steam_df.loc[has_zero_playtime]

Unnamed: 0,appid,name,release_date,english,developer,publisher,platforms,required_age,categories,genres,steamspy_tags,achievements,positive_ratings,negative_ratings,average_playtime,median_playtime,owners,price
26,1002,Rag Doll Kung Fu,2005-10-12,1,Mark Healey,Mark Healey,windows,0,Single-player;Multi-player,Indie,Indie;Fighting;Multiplayer,0,40,17,0,0,20000-50000,5.99
29,1300,SiN Episodes: Emergence,2006-05-10,1,Ritual Entertainment,Ritual Entertainment,windows,0,Single-player;Stats,Action,Action;FPS;Cyberpunk,0,468,61,0,0,100000-200000,7.19
34,1600,Dangerous Waters,2006-02-07,1,Sonalysts,Strategy First,windows,0,Single-player;Multi-player,Strategy,Strategy;Simulation;Naval,0,140,44,0,0,50000-100000,22.99
35,1610,Space Empires IV Deluxe,2006-02-07,1,Malfador Machinations,Strategy First,windows,0,Single-player;Multi-player,Strategy,Strategy;4X;Sci-fi,0,112,26,0,0,50000-100000,6.99
36,1630,Disciples II: Rise of the Elves,2006-07-06,1,Strategy First,Strategy First,windows,0,Single-player;Multi-player;Co-op,Strategy,Strategy;Turn-Based Strategy;Fantasy,0,451,108,0,0,100000-200000,4.99
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27070,1065230,Room of Pandora,2019-04-24,1,SHEN JIAWEI,SHEN JIAWEI,windows,0,Single-player;Steam Achievements,Adventure;Casual;Indie,Adventure;Indie;Casual,7,3,0,0,0,0-20000,2.09
27071,1065570,Cyber Gun,2019-04-23,1,Semyon Maximov,BekkerDev Studio,windows,0,Single-player,Action;Adventure;Indie,Action;Indie;Adventure,0,8,1,0,0,0-20000,1.69
27072,1065650,Super Star Blast,2019-04-24,1,EntwicklerX,EntwicklerX,windows,0,Single-player;Multi-player;Co-op;Shared/Split ...,Action;Casual;Indie,Action;Indie;Casual,24,0,1,0,0,0-20000,3.99
27073,1066700,New Yankee 7: Deer Hunters,2019-04-17,1,Yustas Game Studio,Alawar Entertainment,windows;mac,0,Single-player;Steam Cloud,Adventure;Casual;Indie,Indie;Casual;Adventure,0,2,0,0,0,0-20000,5.19


In [51]:
#games with no recorded average playtime, ordered by ownership bracket (largest first)
no_playtime = steam_df[steam_df['average_playtime'] == 0]
#'owners' is a "low-high" range string (e.g. '0-20000'), so order on its numeric
#lower bound; sorting the raw strings would compare them character by character
owners_lower_bound = no_playtime['owners'].str.split('-').str[0].astype(int)
#sort_values returns a new object rather than sorting in place, so the ordered
#index has to be applied back to the frame for the sort to take effect
no_playtime = no_playtime.loc[owners_lower_bound.sort_values(ascending=False).index]
no_playtime.shape

(20905, 18)

In [7]:
#number of games at each median playtime value
steam_df.loc[:, 'median_playtime'].value_counts()

0       20905
1         155
3          72
2          52
9          48
        ...  
636         1
684         1
4164        1
748         1
6061        1
Name: median_playtime, Length: 1312, dtype: int64

In [8]:
#owners: number of users who own this game, given as a "low-high" range string
steam_df.loc[:, 'owners']

0        10000000-20000000
1         5000000-10000000
2         5000000-10000000
3         5000000-10000000
4         5000000-10000000
               ...        
27070              0-20000
27071              0-20000
27072              0-20000
27073              0-20000
27074              0-20000
Name: owners, Length: 27075, dtype: object

In [59]:
#price of game
#NOTE(review): this was labelled USD, but the Kaggle dataset description appears to
#list prices in GBP - confirm the currency before using price in any analysis
steam_df.loc[:, 'price']

0        7.19
1        3.99
2        3.99
3        3.99
4        3.99
         ... 
27070    2.09
27071    1.69
27072    3.99
27073    5.19
27074    5.19
Name: price, Length: 27075, dtype: float64

In [60]:
steam_df['price'].value_counts()

3.99     3211
0.79     2892
0.00     2560
6.99     2050
7.19     1304
         ... 
7.42        1
3.03        1
6.10        1
11.75       1
4.70        1
Name: price, Length: 282, dtype: int64

In [63]:
#first row of the table, displayed as column/value pairs
steam_df.iloc[0]

appid                                                              10
name                                                   Counter-Strike
release_date                                               2000-11-01
english                                                             1
developer                                                       Valve
publisher                                                       Valve
platforms                                           windows;mac;linux
required_age                                                        0
categories          Multi-player;Online Multi-Player;Local Multi-P...
genres                                                         Action
steamspy_tags                                  Action;FPS;Multiplayer
achievements                                                        0
positive_ratings                                               124534
negative_ratings                                                 3339
average_playtime    