In [1]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('steam-200k.csv')

In [3]:
df.head()

Unnamed: 0,151603712,The Elder Scrolls V Skyrim,purchase,1.0,0
0,151603712,The Elder Scrolls V Skyrim,play,273.0,0
1,151603712,Fallout 4,purchase,1.0,0
2,151603712,Fallout 4,play,87.0,0
3,151603712,Spore,purchase,1.0,0
4,151603712,Spore,play,14.9,0


In [4]:
df.shape

(199999, 5)

In [7]:
cols = {'151603712':'user_id', 'The Elder Scrolls V Skyrim': 'game_name', 'purchase':'action', '1.0':'quantity'}
df.rename(columns = cols, inplace = True)

In [8]:
df.head()

Unnamed: 0,user_id,game_name,action,quantity,0
0,151603712,The Elder Scrolls V Skyrim,play,273.0,0
1,151603712,Fallout 4,purchase,1.0,0
2,151603712,Fallout 4,play,87.0,0
3,151603712,Spore,purchase,1.0,0
4,151603712,Spore,play,14.9,0


In [9]:
df['game_name'].nunique()

5155

In [11]:
games_histg = df['game_name'].value_counts()
games_histg

Dota 2                             9682
Team Fortress 2                    4646
Counter-Strike Global Offensive    2789
Unturned                           2632
Left 4 Dead 2                      1752
                                   ... 
Putt-Putt Joins the Parade            1
Ducati World Championship             1
Chunk of Change Knight                1
STASIS                                1
Soccertron                            1
Name: game_name, Length: 5155, dtype: int64

In [14]:
pop_games_m10 = df['game_name'].value_counts()[df['game_name'].value_counts() > 10]
pop_games_m10

Dota 2                             9682
Team Fortress 2                    4646
Counter-Strike Global Offensive    2789
Unturned                           2632
Left 4 Dead 2                      1752
                                   ... 
The Maw                              11
Air Conflicts Pacific Carriers       11
X-Tension                            11
Dogfight 1942                        11
Sam & Max 105 Reality 2.0            11
Name: game_name, Length: 2099, dtype: int64

In [15]:
df['user_id'].nunique()

12393

In [19]:
df1 = df['action'].value_counts()
print(df1)

purchase    129510
play         70489
Name: action, dtype: int64


In [21]:
df.groupby('action').count()

Unnamed: 0_level_0,user_id,game_name,quantity,0
action,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
play,70489,70489,70489,70489
purchase,129510,129510,129510,129510


In [41]:
unpop_games_u10 = df['game_name'].value_counts()[df['game_name'].value_counts() <= 10]

unpop_games_u10 = unpop_games_u10.rename_axis("game_name").reset_index(name='counts')

unpop_games_u10

Unnamed: 0,game_name,counts
0,Space Empires IV Deluxe,10
1,Contrast,10
2,Cities XXL,10
3,Fortix,10
4,Out of the Park Baseball 15,10
...,...,...
3051,Putt-Putt Joins the Parade,1
3052,Ducati World Championship,1
3053,Chunk of Change Knight,1
3054,STASIS,1


In [32]:
# how many with purchase == 1? Same as number of purchases
len(df[(df['action'] == 'purchase') & 
             (df['quantity'] == 1)])

129510

In [33]:
# how many with purchase == 0? None -- so purchase row is redundant info
len(df[(df['action'] == 'purchase') & 
             (df['quantity'] == 0)])

0

In [43]:
# create a new df2 that removes all the unpopular games (less than 11 owners)
df2 = df[~df['game_name'].isin(unpop_games_u10['game_name'])]

In [44]:
games_histg2 = df2['game_name'].value_counts()

In [45]:
games_histg2

Dota 2                                      9682
Team Fortress 2                             4646
Counter-Strike Global Offensive             2789
Unturned                                    2632
Left 4 Dead 2                               1752
                                            ... 
Crusader Kings II Hymns of Abraham            11
Europa Universalis IV American Dream DLC      11
Her Story                                     11
Black Fire                                    11
Jagged Alliance Online - Steam Edition        11
Name: game_name, Length: 2099, dtype: int64

In [46]:
df2['game_name'].nunique()

2099

In [47]:
df2.shape

(188780, 5)

In [48]:
df2.to_csv("steam_pop.csv")

In [60]:
df3 = df2.head(100)

In [61]:
df3

Unnamed: 0,user_id,game_name,action,quantity,0
0,151603712,The Elder Scrolls V Skyrim,play,273.0,0
1,151603712,Fallout 4,purchase,1.0,0
2,151603712,Fallout 4,play,87.0,0
3,151603712,Spore,purchase,1.0,0
4,151603712,Spore,play,14.9,0
...,...,...,...,...,...
97,59945701,Orcs Must Die!,purchase,1.0,0
98,59945701,Orcs Must Die!,play,0.7,0
99,59945701,THE KING OF FIGHTERS XIII STEAM EDITION,purchase,1.0,0
100,59945701,THE KING OF FIGHTERS XIII STEAM EDITION,play,0.6,0


In [63]:
df3.drop(df3.columns[[4]], inplace=True, axis=1)
df3

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3.drop(df3.columns[[4]], inplace=True, axis=1)


Unnamed: 0,user_id,game_name,action,quantity
0,151603712,The Elder Scrolls V Skyrim,play,273.0
1,151603712,Fallout 4,purchase,1.0
2,151603712,Fallout 4,play,87.0
3,151603712,Spore,purchase,1.0
4,151603712,Spore,play,14.9
...,...,...,...,...
97,59945701,Orcs Must Die!,purchase,1.0
98,59945701,Orcs Must Die!,play,0.7
99,59945701,THE KING OF FIGHTERS XIII STEAM EDITION,purchase,1.0
100,59945701,THE KING OF FIGHTERS XIII STEAM EDITION,play,0.6


In [64]:
df3.to_csv("steam_test.csv")

In [82]:
df2["user_id"].nunique()

12299