In [1]:
import pandas as pd
from pandas import DataFrame
from pandas import Series

import numpy as np
import matplotlib.pyplot as plt

### 设置路径

In [2]:
# Dataset directories, all built from one shared relative prefix.
path_pre = '../../'
path_original_dataset = '{}original-dataset/'.format(path_pre)
path_intermediate_dataset = '{}intermediate-dataset/'.format(path_pre)

### 加载数据

In [3]:
# Read the user table from the intermediate-dataset directory and preview it.
user = pd.read_hdf('{}user.h5'.format(path_intermediate_dataset))
user.head()

Unnamed: 0,userID,age,gender,education,marriageStatus,haveBaby,hometown,residence
0,1,42,1,0,2,0,5,0
1,2,18,1,5,1,0,14,0
2,3,0,2,4,0,0,0,0
3,4,21,2,5,3,0,6,0
4,5,22,2,0,0,0,0,0


### 对 age 分段

In [4]:
# Largest raw age in the table; the top age-bin edge has to lie above it.
user['age'].max()

80

In [5]:
# Edges of the age bins handed to pd.cut; consecutive edges delimit one bin.
age_interval = [
    0, 1, 4, 14,
    29, 44, 59, 74, 84,
]

In [6]:
# Replace each raw age with the integer code of its bin (labels=False).
# right=False makes every bin left-closed: [edge_i, edge_{i+1}).
# NOTE: this overwrites the raw ages, so re-running the cell on the same
# frame would bin the codes a second time.
user['age'] = pd.cut(
    user['age'],
    bins=age_interval,
    right=False,
    include_lowest=True,
    labels=False,
)
user.head()

Unnamed: 0,userID,age,gender,education,marriageStatus,haveBaby,hometown,residence
0,1,4,1,0,2,0,5,0
1,2,3,1,5,1,0,14,0
2,3,0,2,4,0,0,0,0
3,4,3,2,5,3,0,6,0
4,5,3,2,0,0,0,0,0


In [7]:
# Number of users per age-bin code, most frequent first (NaN rows, if any, are omitted by default).
user['age'].value_counts()

3    1545002
4     489875
2     360063
0     294271
5      92414
1      17106
6       5754
7        633
Name: age, dtype: int64

0: [0, 1)    
1: [1, 4)    
2: [4, 14)   
3: [14, 29)  
4: [29, 44)  
5: [44, 59)  
6: [59, 74)  
7: [74, 84)  

### 提取用户的活跃度特征

In [3]:
# Load the raw (userID, appID) installed-apps table and preview it.
user_installedapps_df = pd.read_csv(
    '{}user_installedapps.csv'.format(path_original_dataset)
)
user_installedapps_df.head()

Unnamed: 0,userID,appID
0,1,357
1,1,360
2,1,362
3,1,365
4,1,375


In [8]:
# Count the non-null appID entries per userID; the resulting column is named 'count'.
count_avg_user = (
    user_installedapps_df
    .groupby('userID')
    .count()
    .rename(columns={'appID': 'count'})
)
count_avg_user.head()

Unnamed: 0_level_0,count
userID,Unnamed: 1_level_1
1,79
9,3
10,97
12,38
14,62


In [9]:
# Sanity check: show any users whose app count is exactly zero.
count_avg_user[count_avg_user['count'].eq(0)]

Unnamed: 0_level_0,count
userID,Unnamed: 1_level_1


In [10]:
# Six log-spaced bin edges between 10**0 and 10**3, each rounded up to a whole number.
count_interval = np.ceil(np.logspace(start=0, stop=3, num=6))
count_interval

array([    1.,     4.,    16.,    64.,   252.,  1000.])

In [11]:
# Bucket each user's app count into an integer bin code using the log-spaced
# edges. Bins are right-closed; include_lowest=True keeps a count equal to the
# first edge in bin 0. Counts outside the edges come out of pd.cut as NaN.
count_avg_user['activity_user'] = pd.cut(
    count_avg_user['count'],
    bins=count_interval,
    include_lowest=True,
    labels=False,
)
# Turn userID back into an ordinary column (needed as the merge key) and
# drop the raw count, leaving only (userID, activity_user).
count_avg_user = count_avg_user.reset_index().drop('count', axis=1)
count_avg_user.head()

Unnamed: 0,userID,activity_user
0,1,3
1,9,0
2,10,3
3,12,2
4,14,2


In [12]:
# Distribution of the activity codes. dropna=False also reports users whose
# app count fell outside the bin edges (pd.cut marks those as NaN); the
# default value_counts() would silently leave them out of this check.
count_avg_user['activity_user'].value_counts(dropna=False)

2    867249
3    516892
1     46474
0     15124
4       366
Name: activity_user, dtype: int64

0: [1, 4]  
1: (4, 16]  
2: (16, 64]  
3: (64, 252]  
4: (252, 1000]  

### 添加用户的活跃度特征

In [20]:
# Left join on userID: every user row is kept, and users absent from
# count_avg_user get NaN in activity_user.
user = pd.merge(user, count_avg_user, on='userID', how='left')
user.head()

Unnamed: 0,userID,age,gender,education,marriageStatus,haveBaby,hometown,residence,activity_user
0,1,4,1,0,2,0,5,0,3.0
1,2,3,1,5,1,0,14,0,
2,3,0,2,4,0,0,0,0,
3,4,3,2,5,3,0,6,0,
4,5,3,2,0,0,0,0,0,


In [21]:
# Distribution of activity_user after the merge; dropna=False keeps the NaN row visible.
user['activity_user'].value_counts(dropna=False)

NaN     1359013
 2.0     867249
 3.0     516892
 1.0      46474
 0.0      15124
 4.0        366
Name: activity_user, dtype: int64

可见，有 1359013 个用户没有活跃度特征（这些用户在 user_installedapps 中没有安装记录）。

### 将 activity_user 的 NaN 填充为 5

In [22]:
# Give users without an activity code their own code, 5.
# The filled column is assigned back rather than calling
# fillna(..., inplace=True) on the column selection: that form is chained
# assignment, which pandas warns about and which does not update `user`
# when Copy-on-Write is active.
user['activity_user'] = user['activity_user'].fillna(5)

In [24]:
# Re-check the distribution of activity_user; dropna=False would still show a NaN row if any remained.
user['activity_user'].value_counts(dropna=False)

5.0    1359013
2.0     867249
3.0     516892
1.0      46474
0.0      15124
4.0        366
Name: activity_user, dtype: int64

### 加载用户对app的品类偏好特征

In [4]:
# Read the per-user category-preference table from the intermediate-dataset directory and preview it.
user_pref_cat = pd.read_hdf('{}user_pref_cat.h5'.format(path_intermediate_dataset))
user_pref_cat.head()

Unnamed: 0,userID,cat_pref
3,1,201
13,9,210
29,10,503
33,12,301
49,14,503


### 添加用户对app的品类偏好特征

In [5]:
# Left join on userID: every user row is kept, and users absent from
# user_pref_cat get NaN in cat_pref.
user = pd.merge(user, user_pref_cat, on='userID', how='left')
user.head()

Unnamed: 0,userID,age,gender,education,marriageStatus,haveBaby,hometown,residence,cat_pref
0,1,42,1,0,2,0,5,0,201.0
1,2,18,1,5,1,0,14,0,
2,3,0,2,4,0,0,0,0,
3,4,21,2,5,3,0,6,0,
4,5,22,2,0,0,0,0,0,


### 将 cat_pref 的 NaN 填充为 0

In [7]:
# Distribution of cat_pref after the merge; dropna=False keeps the NaN row visible.
user['cat_pref'].value_counts(dropna=False)

NaN       1363278
 503.0     786264
 301.0     314506
 201.0     129453
 407.0      58119
 106.0      41139
 402.0      26931
 203.0      20816
 209.0      17319
 405.0      14123
 401.0       9441
 104.0       5083
 105.0       4852
 108.0       4263
 408.0       2373
 403.0       1187
 109.0       1028
 103.0        953
 211.0        937
 409.0        865
 303.0        685
 210.0        593
 406.0        385
 110.0        315
 2.0          193
 204.0         17
Name: cat_pref, dtype: int64

In [9]:
# Mark users without a category preference with 0.
# The filled column is assigned back rather than calling
# fillna(..., inplace=True) on the column selection: that form is chained
# assignment, which pandas warns about and which does not update `user`
# when Copy-on-Write is active.
user['cat_pref'] = user['cat_pref'].fillna(0)
# Confirm that no NaN row is left.
user['cat_pref'].value_counts(dropna=False)

0.0      1363278
503.0     786264
301.0     314506
201.0     129453
407.0      58119
106.0      41139
402.0      26931
203.0      20816
209.0      17319
405.0      14123
401.0       9441
104.0       5083
105.0       4852
108.0       4263
408.0       2373
403.0       1187
109.0       1028
103.0        953
211.0        937
409.0        865
303.0        685
210.0        593
406.0        385
110.0        315
2.0          193
204.0         17
Name: cat_pref, dtype: int64

### 品类对应于用户的权值特征