In [27]:
import pandas as pd
import numpy as np

In [28]:
steam = pd.read_csv('steam_top_100.csv')
display(steam.head())

Unnamed: 0,Steam id,Game,Current players,Peak players today,Release date,Review summary,Total reviews,Tags
0,730,Counter-Strike: Global Offensive,736875,775266,21 Aug 2012,Very Positive,5993058.0,FPS:Shooter:Multiplayer:Competitive:Action:Tea...
1,570,Dota 2,580933,649690,9 Jul 2013,Very Positive,1599180.0,Free to Play:MOBA:Multiplayer:Strategy:eSports...
2,1063730,New World,243815,307146,28 Sep 2021,Mostly Positive,145931.0,Massively Multiplayer:Open World:MMORPG:Advent...
3,1172470,Apex Legends,121313,245235,4 Nov 2020,Very Positive,338236.0,Free to Play:Battle Royale:Multiplayer:Shooter...
4,440,Team Fortress 2,104477,111884,10 Oct 2007,Very Positive,821568.0,Free to Play:Hero Shooter:Multiplayer:FPS:Shoo...


In [29]:
#checking data type of each variable
print(steam.shape)
steam.dtypes

(100, 8)


Steam id                int64
Game                   object
Current players         int64
Peak players today      int64
Release date           object
Review summary         object
Total reviews         float64
Tags                   object
dtype: object

### observations
- Review summary is qualitative ordinal variable therefore its better to catagorize it. Also making new column of encoding.
- Change release date to 'datetime' data type
- One-Hot-Encoding to do the tags column

### Review summary

In [30]:
#check the unique value of Review Summary
print(steam['Review summary'].unique())

['Very Positive' 'Mostly Positive' 'Mixed' 'Overwhelmingly Positive' nan]


In [31]:
#check the NaN values
print(steam.isnull().sum())

Steam id              0
Game                  0
Current players       0
Peak players today    0
Release date          1
Review summary        1
Total reviews         1
Tags                  2
dtype: int64


- Getting rid of null values in Review summary
- Replacing the null value to the most frequent value in 'Review summary' variable.

In [32]:

#first categorize this variable
steam['Review summary'] = pd.Categorical(steam['Review summary'], ['Mixed','Very Positive', 'Mostly Positive', 'Overwhelmingly Positive'], ordered = True)

#make new variable for Categorical encoding
steam['Review summary encoded'] = steam['Review summary'].cat.codes

print(steam['Review summary encoded'].mode())
print(steam['Review summary encoded'].unique())


0    1
dtype: int8
[ 1  2  0  3 -1]


- 'Very Positive' turn out to be the mode in this variable.

In [33]:
#Replacing nan value to 'Very positive'
steam = steam.fillna(value = {'Review summary': 'Very Positive'})
steam['Review summary encoded'] = steam['Review summary encoded'].replace('-1',1)
print(steam['Review summary'].unique())
print(steam['Review summary encoded'].unique())

['Very Positive', 'Mostly Positive', 'Mixed', 'Overwhelmingly Positive']
Categories (4, object): ['Mixed' < 'Very Positive' < 'Mostly Positive' < 'Overwhelmingly Positive']
[ 1  2  0  3 -1]


In [34]:
print(steam.dtypes)
steam.head()


Steam id                     int64
Game                        object
Current players              int64
Peak players today           int64
Release date                object
Review summary            category
Total reviews              float64
Tags                        object
Review summary encoded        int8
dtype: object


Unnamed: 0,Steam id,Game,Current players,Peak players today,Release date,Review summary,Total reviews,Tags,Review summary encoded
0,730,Counter-Strike: Global Offensive,736875,775266,21 Aug 2012,Very Positive,5993058.0,FPS:Shooter:Multiplayer:Competitive:Action:Tea...,1
1,570,Dota 2,580933,649690,9 Jul 2013,Very Positive,1599180.0,Free to Play:MOBA:Multiplayer:Strategy:eSports...,1
2,1063730,New World,243815,307146,28 Sep 2021,Mostly Positive,145931.0,Massively Multiplayer:Open World:MMORPG:Advent...,2
3,1172470,Apex Legends,121313,245235,4 Nov 2020,Very Positive,338236.0,Free to Play:Battle Royale:Multiplayer:Shooter...,1
4,440,Team Fortress 2,104477,111884,10 Oct 2007,Very Positive,821568.0,Free to Play:Hero Shooter:Multiplayer:FPS:Shoo...,1


### Total Reviews
- Getting rid of null values in total reviews.
- Also changing data type to int.

In [35]:

steam = steam.fillna(value={'Total reviews':steam['Total reviews'].median()})
print(steam.isnull().sum())
steam['Total reviews'] = steam['Total reviews'].astype('int')
# steam.dtypes

#Total review datatype is now int

Steam id                  0
Game                      0
Current players           0
Peak players today        0
Release date              1
Review summary            0
Total reviews             0
Tags                      2
Review summary encoded    0
dtype: int64


### Release date
- Changing release date variable to datetime datatype.
- Make new column just for release year.

In [36]:
#filling na values in release date to unknown

steam = steam.fillna(value={'Release date':'2012-01-01'})
steam['Release date'] = pd.to_datetime(steam['Release date'])

print(steam.isna().sum())
print(steam.dtypes)

Steam id                  0
Game                      0
Current players           0
Peak players today        0
Release date              0
Review summary            0
Total reviews             0
Tags                      2
Review summary encoded    0
dtype: int64
Steam id                           int64
Game                              object
Current players                    int64
Peak players today                 int64
Release date              datetime64[ns]
Review summary                  category
Total reviews                      int32
Tags                              object
Review summary encoded              int8
dtype: object


In [37]:
#making new column just for release year
steam['Release year'] = steam['Release date'].dt.year
display(steam.head())

Unnamed: 0,Steam id,Game,Current players,Peak players today,Release date,Review summary,Total reviews,Tags,Review summary encoded,Release year
0,730,Counter-Strike: Global Offensive,736875,775266,2012-08-21,Very Positive,5993058,FPS:Shooter:Multiplayer:Competitive:Action:Tea...,1,2012
1,570,Dota 2,580933,649690,2013-07-09,Very Positive,1599180,Free to Play:MOBA:Multiplayer:Strategy:eSports...,1,2013
2,1063730,New World,243815,307146,2021-09-28,Mostly Positive,145931,Massively Multiplayer:Open World:MMORPG:Advent...,2,2021
3,1172470,Apex Legends,121313,245235,2020-11-04,Very Positive,338236,Free to Play:Battle Royale:Multiplayer:Shooter...,1,2020
4,440,Team Fortress 2,104477,111884,2007-10-10,Very Positive,821568,Free to Play:Hero Shooter:Multiplayer:FPS:Shoo...,1,2007


In [38]:
#reordering columns
name = ['Steam id','Game','Current players','Peak players today','Release date','Release year','Review summary','Review summary encoded','Total reviews','Tags']
steam = steam.reindex(columns = name)
display(steam.head())

Unnamed: 0,Steam id,Game,Current players,Peak players today,Release date,Release year,Review summary,Review summary encoded,Total reviews,Tags
0,730,Counter-Strike: Global Offensive,736875,775266,2012-08-21,2012,Very Positive,1,5993058,FPS:Shooter:Multiplayer:Competitive:Action:Tea...
1,570,Dota 2,580933,649690,2013-07-09,2013,Very Positive,1,1599180,Free to Play:MOBA:Multiplayer:Strategy:eSports...
2,1063730,New World,243815,307146,2021-09-28,2021,Mostly Positive,2,145931,Massively Multiplayer:Open World:MMORPG:Advent...
3,1172470,Apex Legends,121313,245235,2020-11-04,2020,Very Positive,1,338236,Free to Play:Battle Royale:Multiplayer:Shooter...
4,440,Team Fortress 2,104477,111884,2007-10-10,2007,Very Positive,1,821568,Free to Play:Hero Shooter:Multiplayer:FPS:Shoo...


### Tags
- One-Hot-Encoding for tags variable. (Optional)

In [39]:
# replacing nan values to 'no_genre'
only_tags=steam.loc[:,"Tags"]
only_tags.fillna(value='no_genre',inplace=True)
only_tags.head(15)

0     FPS:Shooter:Multiplayer:Competitive:Action:Tea...
1     Free to Play:MOBA:Multiplayer:Strategy:eSports...
2     Massively Multiplayer:Open World:MMORPG:Advent...
3     Free to Play:Battle Royale:Multiplayer:Shooter...
4     Free to Play:Hero Shooter:Multiplayer:FPS:Shoo...
5     Survival:Shooter:Multiplayer:Battle Royale:FPS...
6     Survival:Crafting:Multiplayer:Open World:Open ...
7     PvE:PvP:Demons:MMORPG:Martial Arts:RPG:Action ...
8     Open World:Action:Multiplayer:Automobile Sim:C...
9     Free to Play:Action RPG:Hack and Slash:RPG:Mul...
10    Horror:Survival Horror:Multiplayer:Online Co-O...
11    Free to Play:PvP:Open World:FPS:Looter Shooter...
12    Multiplayer:Soccer:Competitive:Sports:Racing:T...
13                                             no_genre
14    Open World Survival Craft:Survival:Open World:...
Name: Tags, dtype: object

In [40]:
all_tags=set()
for tag_list in only_tags:
    iter = tag_list.split(':')
    for new_tag in iter:
        all_tags.add(new_tag)
all_tags=list(all_tags)
all_tags.sort()
all_tags.remove('no_genre')
print(all_tags)

['1980s', "1990's", '2D', '2D Fighter', '2D Platformer', '3D', '3D Platformer', '3D Vision', '4 Player Local', '4X', 'Action', 'Action RPG', 'Action RTS', 'Action Roguelike', 'Action-Adventure', 'Addictive', 'Adventure', 'Agriculture', 'Alternate History', 'Animation & Modeling', 'Anime', 'Arcade', 'Artificial Intelligence', 'Assassin', 'Atmospheric', 'Audio Production', 'Automation', 'Automobile Sim', 'Base Building', 'Basketball', 'Battle Royale', 'Beautiful', 'Blood', 'Building', 'Bullet Hell', 'CRPG', 'Capitalism', 'Card Game', 'Cartoon', 'Cartoony', 'Casual', 'Character Customization', 'Choices Matter', 'Choose Your Own Adventure', 'Cinematic', 'City Builder', 'Class-Based', 'Classic', 'Clicker', 'Co-op', 'Co-op Campaign', 'Cold War', 'Collectathon', 'Colony Sim', 'Colorful', 'Combat', 'Comedy', 'Competitive', 'Controller', 'Cooking', 'Crafting', 'Crime', 'Cute', 'Cyberpunk', 'Dark', 'Dark Comedy', 'Dark Fantasy', 'Dating Sim', 'Deckbuilding', 'Demons', 'Design & Illustration', 'D

In [46]:
tag_df = pd.DataFrame(columns=all_tags)
tag_df.tail(10)
print(steam.isna().sum())

Steam id                  0
Game                      0
Current players           0
Peak players today        0
Release date              0
Release year              0
Review summary            0
Review summary encoded    0
Total reviews             0
Tags                      0
dtype: int64


In [50]:

steam_new = pd.concat([steam,tag_df],axis=1)
steam_new = steam_new.replace(np.nan, '0')
display(steam_new.head(10))
print(steam_new.dtypes)
print(steam_new.isna().sum())


Unnamed: 0,Steam id,Game,Current players,Peak players today,Release date,Release year,Review summary,Review summary encoded,Total reviews,Tags,...,VR,Violent,Voxel,Walking Simulator,War,Warhammer 40K,Western,World War II,Zombies,eSports
0,730,Counter-Strike: Global Offensive,736875,775266,2012-08-21,2012,Very Positive,1,5993058,FPS:Shooter:Multiplayer:Competitive:Action:Tea...,...,0,0,0,0,0,0,0,0,0,0
1,570,Dota 2,580933,649690,2013-07-09,2013,Very Positive,1,1599180,Free to Play:MOBA:Multiplayer:Strategy:eSports...,...,0,0,0,0,0,0,0,0,0,0
2,1063730,New World,243815,307146,2021-09-28,2021,Mostly Positive,2,145931,Massively Multiplayer:Open World:MMORPG:Advent...,...,0,0,0,0,0,0,0,0,0,0
3,1172470,Apex Legends,121313,245235,2020-11-04,2020,Very Positive,1,338236,Free to Play:Battle Royale:Multiplayer:Shooter...,...,0,0,0,0,0,0,0,0,0,0
4,440,Team Fortress 2,104477,111884,2007-10-10,2007,Very Positive,1,821568,Free to Play:Hero Shooter:Multiplayer:FPS:Shoo...,...,0,0,0,0,0,0,0,0,0,0
5,578080,PUBG: BATTLEGROUNDS,103354,293759,2017-12-21,2017,Mixed,0,1530629,Survival:Shooter:Multiplayer:Battle Royale:FPS...,...,0,0,0,0,0,0,0,0,0,0
6,252490,Rust,86156,110489,2018-02-08,2018,Very Positive,1,554535,Survival:Crafting:Multiplayer:Open World:Open ...,...,0,0,0,0,0,0,0,0,0,0
7,1623660,MIR4,82361,91872,2021-08-25,2021,Mixed,0,5344,PvE:PvP:Demons:MMORPG:Martial Arts:RPG:Action ...,...,0,0,0,0,0,0,0,0,0,0
8,271590,Grand Theft Auto V,82189,102182,2015-04-13,2015,Very Positive,1,1102837,Open World:Action:Multiplayer:Automobile Sim:C...,...,0,0,0,0,0,0,0,0,0,0
9,238960,Path of Exile,56781,66571,2013-10-23,2013,Very Positive,1,172981,Free to Play:Action RPG:Hack and Slash:RPG:Mul...,...,0,0,0,0,0,0,0,0,0,0


Steam id                       int64
Game                          object
Current players                int64
Peak players today             int64
Release date          datetime64[ns]
                           ...      
Warhammer 40K                 object
Western                       object
World War II                  object
Zombies                       object
eSports                       object
Length: 259, dtype: object
Steam id              0
Game                  0
Current players       0
Peak players today    0
Release date          0
                     ..
Warhammer 40K         0
Western               0
World War II          0
Zombies               0
eSports               0
Length: 259, dtype: int64


In [52]:
for ind,row in enumerate(steam_new.loc[:,'Tags']):
    par = row.split(':')
    for tag in par:
        if(tag in steam_new.columns[8:]):
            steam_new.iloc[ind,steam_new.columns.get_loc(tag)]=1
steam_new.tail(10)


# steam_new.to_excel("some_data.xlsx")

Unnamed: 0,Steam id,Game,Current players,Peak players today,Release date,Release year,Review summary,Review summary encoded,Total reviews,Tags,...,VR,Violent,Voxel,Walking Simulator,War,Warhammer 40K,Western,World War II,Zombies,eSports
90,632360,Risk of Rain 2,8220,11724,2020-08-11,2020,Overwhelmingly Positive,3,112385,Third-Person Shooter:Action Roguelike:Multipla...,...,0,0,0,0,0,0,0,0,0,0
91,242760,The Forest,7918,11706,2018-04-30,2018,Very Positive,1,264327,Open World Survival Craft:Survival:Open World:...,...,0,0,0,0,0,0,0,0,1,0
92,284160,BeamNG.drive,7874,10572,2015-05-29,2015,Overwhelmingly Positive,3,85999,Simulation:Driving:Physics:Destruction:Automob...,...,0,0,0,0,0,0,0,0,0,0
93,1293830,Forza Horizon 4,7606,9552,2021-03-09,2021,Very Positive,1,69892,Racing:Open World:Driving:Multiplayer:Online C...,...,0,0,0,0,0,0,0,0,0,0
94,1184370,Pathfinder: Wrath of the Righteous,7182,10242,2021-09-02,2021,Very Positive,1,9550,CRPG:Story Rich:Tactical RPG:RPG:Character Cus...,...,0,0,0,0,0,0,0,0,0,0
95,1286830,STAR WARS™: The Old Republic™,7104,9345,2020-07-21,2020,Very Positive,1,33098,Free to Play:MMORPG:Character Customization:Mu...,...,0,0,0,0,0,0,0,0,0,0
96,1644960,NBA 2K22,7020,13139,2021-09-09,2021,Mixed,0,6266,Sports:Basketball:Simulation:eSports:Realistic...,...,0,0,0,0,0,0,0,0,0,1
97,244210,Assetto Corsa,6965,8707,2014-12-19,2014,Very Positive,1,42497,Racing:Automobile Sim:Simulation:Driving:Reali...,...,1,0,0,0,0,0,0,0,0,0
98,275850,No Man's Sky,6963,10122,2016-08-12,2016,Mostly Positive,2,168792,Open World:Open World Survival Craft:Space:Exp...,...,0,0,0,0,0,0,0,0,0,0
99,1238810,Battlefield™ V,6724,33310,2020-10-22,2020,Mostly Positive,2,68109,FPS:World War II:Shooter:Multiplayer:War:Singl...,...,0,1,0,0,1,0,0,1,0,0
