# 用户互动信息

In [11]:
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import time
import re

In [2]:
# Load the raw user-interaction log; the cell's output is its (rows, columns) shape.
interaction_csv = '../data/user_interaction_data.csv'
inter = pd.read_csv(interaction_csv)
inter.shape

(198608, 4)

In [3]:
# Preview the first five rows to see the columns and some sample values.
inter.head(n=5)

Unnamed: 0,user_id,item_id,interact_type,date
0,10243056,22635954,1,213
1,10203565,24723827,3,213
2,10317559,20413036,2,213
3,10158940,23833050,1,213
4,10376271,22218154,1,213


In [4]:
# Frequency of each interaction type, followed by the number of distinct types.
# The recorded run shows 11 distinct interaction types in total.
type_counts = inter['interact_type'].value_counts()
type_counts, len(type_counts)

(interact_type
 1     87379
 10    63958
 7     18540
 9      9433
 2      5142
 11     4275
 4      4000
 8      2256
 3      1906
 5      1647
 6        72
 Name: count, dtype: int64,
 11)

In [5]:
# Number of distinct users and distinct items (videos) appearing in the interaction log.
# nunique() is the pandas idiom for this; dropna=False keeps the result identical to
# len(Series.unique()), which also counts a missing value as one distinct value.
inter['user_id'].nunique(dropna=False), inter['item_id'].nunique(dropna=False)

(19946, 45536)

## 重复值与缺失值

In [6]:
# Count missing values per column (the recorded run shows none).
inter.isna().sum()

user_id          0
item_id          0
interact_type    0
date             0
dtype: int64

In [7]:
# Compare the total row count with the row count after removing exact duplicate rows.
# The recorded run shows a large number of duplicates, but they must not simply be
# deleted: more interactions suggest a more loyal user.
# Instead, consider keeping the information carried by the duplicate rows in another form.
len(inter), len(inter.drop_duplicates())

(198608, 86257)

In [8]:
# Preview: collapse duplicate rows into one row per
# (user_id, item_id, interact_type, date) and record how often each occurred.
# Every raw row stands for one interaction. The guard keeps this cell safe to re-run:
# once `inter` already carries aggregated counts they are not reset back to 1.
if 'inter_count' not in inter.columns:
    inter['inter_count'] = 1
# Summing the per-row counts equals counting rows on the raw data, and leaves
# already-aggregated counts unchanged when the cell is executed again.
inter.groupby(by=['user_id','item_id','interact_type','date'],as_index=False)['inter_count'].sum()

Unnamed: 0,user_id,item_id,interact_type,date,inter_count
0,10000069,23376803,9,193,2
1,10000078,20130667,10,175,1
2,10000107,20193958,1,211,2
3,10000107,20208525,1,218,3
4,10000107,20503550,1,221,5
...,...,...,...,...,...
86252,10599985,25023986,7,198,1
86253,10599996,20665255,10,187,3
86254,10599996,22129667,10,210,4
86255,10599996,23775525,10,185,1


In [9]:
# Replace `inter` with one row per (user_id, item_id, interact_type, date), where
# inter_count holds the number of raw rows that were merged into it.
# sum() is used instead of count() so that re-running this cell on the already
# aggregated frame keeps the counts; count() would turn every inter_count into 1.
inter = inter.groupby(by=['user_id','item_id','interact_type','date'],as_index=False)['inter_count'].sum()

In [10]:
# Persist the current `inter` frame to a new CSV file next to the raw data.
# NOTE(review): no `index=` argument is passed, so pandas' default of also writing the
# row index as an unnamed first column applies — confirm downstream readers expect that.
aggregated_csv = r"../data/user_interaction_data1.csv"
inter.to_csv(aggregated_csv)

# 视频播放数据

In [13]:
# Load the raw playback log; the cell's output is its (rows, columns) shape.
playback_csv = '../data/user_playback_data.csv'
playback = pd.read_csv(playback_csv)
playback.shape

(71046035, 4)

In [14]:
# Preview the first five playback rows.
playback.head(n=5)

Unnamed: 0,user_id,item_id,playtime,date
0,10057286,20628283.0,2208.612,145
1,10522615,23930557.0,31.054,145
2,10494028,20173699.0,115.952,145
3,10181987,21350426.0,1.585,145
4,10439175,22946929.0,51.726,145


## 观看时间判断

In [None]:
# Compare the largest single playtime value with the number of seconds in one day.
# In the recorded run the maximum exceeds 86400, i.e. more than 24 hours if playtime
# is measured in seconds.
# Possible explanation (unverified): the same user playing on several devices at once.
seconds_per_day = 24 * 60 * 60
playback['playtime'].max(), seconds_per_day

(np.float64(251348.689), 86400)

In [None]:
# Rows whose playtime is above 86400 (24 hours, assuming playtime is in seconds).
# The recorded run shows only 7 such rows, so they could probably just be dropped.
over_one_day = playback['playtime'] > 86400
playback.loc[over_one_day]

Unnamed: 0,user_id,item_id,playtime,date
8568194,10513181,,251348.689,117
14870825,10016747,,181748.021,155
28404843,10467206,21809864.0,129378.451,191
43315479,10408573,,147897.974,162
46079312,10223051,,106300.974,154
54792331,10408573,,109552.894,161
70355834,10074406,23924688.0,138763.622,143


## 重复值与缺失值

In [23]:
# Missing values per column of the playback log.
# Only item_id has gaps in the recorded run; a missing video id can be ignored and left unfilled.
playback.isna().sum()

user_id         0
item_id     60483
playtime        0
date            0
dtype: int64

In [None]:
# Row count after dropping exact duplicate rows, next to the original row count.
# The recorded run differs by only 5 rows, so those duplicates can be removed.
len(playback.drop_duplicates()), len(playback)

(71046030, 71046035)

## 涉及用户、视频

In [24]:
# Number of distinct users in the playback log.
# nunique() replaces the direct __len__() call on unique(); dropna=False keeps the
# result identical by still counting a missing value as one distinct value.
playback['user_id'].nunique(dropna=False)

544536

### 单个用户看过的视频数量

In [25]:
# Per user, count the playback rows that have a non-null item_id, then summarise
# the distribution of those per-user counts.
# NOTE(review): count() tallies rows, so a video played several times is counted each
# time; nunique() would give distinct videos per user — confirm which one is intended.
rows_per_user = playback.groupby('user_id')[['item_id']].count()
rows_per_user.describe()

Unnamed: 0,item_id
count,544536.0
mean,130.359704
std,653.054375
min,0.0
25%,4.0
50%,17.0
75%,77.0
max,56150.0


### 单个用户一共看了多长时间视频(分钟)

In [28]:
# Total playtime per user divided by 60 (seconds -> minutes, assuming playtime is
# recorded in seconds), then summarised across all users.
minutes_per_user = playback.groupby('user_id')[['playtime']].sum().div(60)
minutes_per_user.describe()

Unnamed: 0,playtime
count,544536.0
mean,881.578446
std,2237.300825
min,1.7e-05
25%,5.637638
50%,105.233392
75%,748.77825
max,105398.483683


### 单个用户平均每天看视频时长(分钟)

In [30]:
# Mean playtime of the individual playback rows within each (user_id, date) pair,
# divided by 60 (seconds -> minutes, assuming playtime is recorded in seconds).
# NOTE(review): this is the average length of one playback row on that day, not the
# day's total watch time; aggregate with sum() if the daily total is wanted — confirm intent.
mean_minutes_per_user_day = playback.groupby(by=['user_id','date'])[['playtime']].mean().div(60)
mean_minutes_per_user_day

Unnamed: 0_level_0,Unnamed: 1_level_0,playtime
user_id,date,Unnamed: 2_level_1
10000000,131,30.885050
10000000,132,10.410033
10000000,164,9.293433
10000000,195,21.559750
10000001,117,41.927900
...,...,...
10599997,164,16.450492
10599997,165,22.307025
10599997,166,18.988150
10599997,173,1.088408
