In [68]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import cross_val_score
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns

In [69]:
%matplotlib notebook

In [70]:
path = '../datasets/steam-200k/steam-200k.csv'

In [71]:
names = ['UserID', 'Game', 'Action', 'Hours', 'Not Needed']
data = pd.read_csv(path, header=0, names = names)

In [72]:
data.columns

Index(['UserID', 'Game', 'Action', 'Hours', 'Not Needed'], dtype='object')

In [73]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199999 entries, 0 to 199998
Data columns (total 5 columns):
UserID        199999 non-null int64
Game          199999 non-null object
Action        199999 non-null object
Hours         199999 non-null float64
Not Needed    199999 non-null int64
dtypes: float64(1), int64(2), object(2)
memory usage: 7.6+ MB


In [74]:
data.head()

Unnamed: 0,UserID,Game,Action,Hours,Not Needed
0,151603712,The Elder Scrolls V Skyrim,play,273.0,0
1,151603712,Fallout 4,purchase,1.0,0
2,151603712,Fallout 4,play,87.0,0
3,151603712,Spore,purchase,1.0,0
4,151603712,Spore,play,14.9,0


In [75]:
data.shape

(199999, 5)

In [76]:
data.describe(include=np.number)

Unnamed: 0,UserID,Hours,Not Needed
count,199999.0,199999.0,199999.0
mean,103655600.0,17.874468,0.0
std,72080840.0,138.057292,0.0
min,5250.0,0.1,0.0
25%,47384200.0,1.0,0.0
50%,86912010.0,1.0,0.0
75%,154230900.0,1.3,0.0
max,309903100.0,11754.0,0.0


In [77]:
#查看Hours列的值数量情况
s = data['Hours'].value_counts()

In [78]:
sns.lineplot(data = s[s>1])

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x1a2ea04c50>

In [41]:
#查看离散数据的分布情况
data.describe(include=np.object)

Unnamed: 0,Game,Action
count,199999,199999
unique,5155,2
top,Dota 2,purchase
freq,9682,129510


In [84]:
#查看Action 与 hours的关系
data.loc[(data['Action'] == 'purchase') & (data['Hours'] == 1),['Action','Hours']]

Unnamed: 0,Action,Hours
1,purchase,1.0
3,purchase,1.0
5,purchase,1.0
7,purchase,1.0
9,purchase,1.0
...,...,...
199989,purchase,1.0
199991,purchase,1.0
199993,purchase,1.0
199995,purchase,1.0


In [80]:
(data['Action'] == 'purchase').value_counts()

True     129510
False     70489
Name: Action, dtype: int64

In [107]:
# 由以上可以看出purchase与Hours高度相关
#创建Hours_Played字段，替代原有的Action和Hours，0表示仅购买，大于0表示购买且游戏时长
data['Hours_Played'] = data['Hours'].astype('float32')
# 如果字段Action=purchase，并且Hours=1.0，将设置Hours_Played=0
data.loc[(data['Action'] == 'purchase') & (data['Hours'] == 1), 'Hours_Played'] = 0

In [110]:
data['Hours_Played'].value_counts()

0.0       129510
0.2         3016
0.3         2517
0.4         2129
0.5         1813
           ...  
897.0          1
2563.0         1
1238.0         1
1536.0         1
2047.0         1
Name: Hours_Played, Length: 1594, dtype: int64

In [134]:
#删除无用的列
data.drop(columns='Hours_played', inplace=True)

In [135]:
#对数据从小到大进行排序，data下表也会发生变化
data.UserID = data.UserID.astype('int')
data = data.sort_values(['UserID', 'Game', 'Hours_Played'], ascending= True)


In [138]:
data.head()

Unnamed: 0,UserID,Game,Action,Hours,Not Needed,Hours_Played
65428,5250,Alien Swarm,purchase,1.0,0,0.0
65429,5250,Alien Swarm,play,4.9,0,4.9
65422,5250,Cities Skylines,purchase,1.0,0,0.0
65423,5250,Cities Skylines,play,144.0,0,144.0
65434,5250,Counter-Strike,purchase,1.0,0,0.0


In [140]:
#删除重复项，并保留最后一项出现的项（因为最后一项是用户游戏时间，第一项为购买）
#按照顺序排序后最后一项为要保留的值。
clean_data = data.drop_duplicates(['UserID', 'Game'], keep = 'last')

In [142]:
#去掉不用的列，Action, Hours， Not Need
clean_data = clean_data.drop(['Action', 'Hours', 'Not Needed'], axis= 1)

In [145]:
clean_data.head()

Unnamed: 0,UserID,Game,Hours_Played
65429,5250,Alien Swarm,4.9
65423,5250,Cities Skylines,144.0
65434,5250,Counter-Strike,0.0
65435,5250,Counter-Strike Source,0.0
65436,5250,Day of Defeat,0.0


In [147]:
# 探索下数据集的特征
n_users = len(clean_data.UserID.unique())
n_games = len(clean_data.Game.unique())
print('数据集中包含了 {0} 玩家，{1} 游戏'.format(n_users, n_games))

数据集中包含了 12393 玩家，5155 游戏


In [152]:
clean_data.shape[0]/float(n_users * n_games)

0.0020161564563957487

In [150]:
# 矩阵的稀疏性
sparsity = clean_data.shape[0] / float(n_users * n_games)
print('用户行为矩阵的稀疏性（填充比例）为{:.2%} '.format(sparsity))

用户行为矩阵的稀疏性（填充比例）为0.20% 
