In [1]:
import pandas as pd
from pandas import DataFrame
from pandas import Series

import numpy as np
import matplotlib.pyplot as plt

### 设置路径

In [2]:
# Common prefix: two directory levels above the working directory.
path_pre = '../../'
# Original and intermediate dataset directories under that prefix. The
# trailing '/' is kept so file names can be appended by string concatenation.
path_original_dataset = '{}original-dataset/'.format(path_pre)
path_intermediate_dataset = '{}intermediate-dataset/'.format(path_pre)

### 加载 ad.h5 和 app_cat.h5

In [3]:
# Read ad.h5 from the intermediate-dataset directory and preview the first rows.
ad = pd.read_hdf('{}ad.h5'.format(path_intermediate_dataset))
ad.head()

Unnamed: 0,creativeID,adID,camgaignID,advertiserID,appID,appPlatform
0,4079,2318,147,80,14,2
1,4565,3593,632,3,465,1
2,3170,1593,205,54,389,1
3,6566,2390,205,54,389,1
4,5187,411,564,3,465,1


In [4]:
# Read app_cat.h5 (appID -> appCategory) from the intermediate-dataset
# directory and preview the first rows.
app_cat = pd.read_hdf('{}app_cat.h5'.format(path_intermediate_dataset))
app_cat.head()

Unnamed: 0,appID,appCategory
0,14,2
1,25,203
2,68,104
3,75,402
4,83,203


### 合并 ad.h5 和 app_cat.h5

In [5]:
# Left join on appID: rows of `ad` are kept and pick up the appCategory
# column from app_cat.
ad = pd.merge(ad, app_cat, how='left', on='appID')
ad.head()

Unnamed: 0,creativeID,adID,camgaignID,advertiserID,appID,appPlatform,appCategory
0,4079,2318,147,80,14,2,2
1,4565,3593,632,3,465,1,209
2,3170,1593,205,54,389,1,108
3,6566,2390,205,54,389,1,108
4,5187,411,564,3,465,1,209


### 加载 f_app_popularity.h5

In [6]:
# Read f_app_popularity.h5 from the intermediate-dataset directory and
# preview the first rows.
app_popularity = pd.read_hdf('{}f_app_popularity.h5'.format(path_intermediate_dataset))
app_popularity.head()

Unnamed: 0_level_0,app_popularity
appID,Unnamed: 1_level_1
354,14298
355,7278
356,8180
357,311677
358,4189


### 将 app_popularity 合并到 ad（按 appID 左连接）

In [7]:
# Turn the appID index into an ordinary column so it can serve as the join
# key. The reset is guarded and non-inplace so that re-running this cell does
# not add a spurious 'index' column to app_popularity.
if 'appID' not in app_popularity.columns:
    app_popularity = app_popularity.reset_index()

# Left join on appID: ad rows are kept, and appIDs that do not occur in
# app_popularity end up with NaN. Any 'app_popularity' column left over from a
# previous run is dropped first; otherwise a second run would produce
# app_popularity_x / app_popularity_y instead of a single column.
ad = (
    ad.drop('app_popularity', axis=1, errors='ignore')
      .merge(app_popularity, how='left', on='appID')
)
ad.head()

Unnamed: 0,creativeID,adID,camgaignID,advertiserID,appID,appPlatform,appCategory,app_popularity
0,4079,2318,147,80,14,2,2,
1,4565,3593,632,3,465,1,209,1879.0
2,3170,1593,205,54,389,1,108,6240.0
3,6566,2390,205,54,389,1,108,6240.0
4,5187,411,564,3,465,1,209,1879.0


In [8]:
# Frequency of each app_popularity value; dropna=False also counts the rows
# that are still NaN after the left join.
ad.app_popularity.value_counts(dropna=False)

NaN          2959
 6240.0      1400
 763.0        803
 102.0        763
 15958.0      193
 282777.0     146
 1577.0        93
 8180.0        84
 1879.0        56
 999.0         42
 69835.0       35
 1.0            7
 15.0           1
Name: app_popularity, dtype: int64

### 将 app_popularity 的 NaN 填充为取整后的均值

In [14]:
# Mean of the app_popularity column (Series.mean skips NaN by default),
# truncated to an int; used as the fill value for the missing entries.
mean = int(ad.app_popularity.mean())

In [16]:
# Replace missing app_popularity values with `mean` and assign the result back
# to the column. Calling fillna(inplace=True) on ad['app_popularity'] operates
# on an intermediate object: newer pandas warns about this chained in-place
# form, and with Copy-on-Write enabled it leaves `ad` unchanged.
ad['app_popularity'] = ad['app_popularity'].fillna(mean)
ad.head()

Unnamed: 0,creativeID,adID,camgaignID,advertiserID,appID,appPlatform,appCategory,app_popularity
0,4079,2318,147,80,14,2,2,15792.0
1,4565,3593,632,3,465,1,209,1879.0
2,3170,1593,205,54,389,1,108,6240.0
3,6566,2390,205,54,389,1,108,6240.0
4,5187,411,564,3,465,1,209,1879.0


In [17]:
# Re-check the value frequencies after the fill; dropna=False would still show
# a NaN bucket if any missing values remained.
ad.app_popularity.value_counts(dropna=False)

15792.0     2959
6240.0      1400
763.0        803
102.0        763
15958.0      193
282777.0     146
1577.0        93
8180.0        84
1879.0        56
999.0         42
69835.0       35
1.0            7
15.0           1
Name: app_popularity, dtype: int64

### 将 app_popularity 缩放到 [0, 1]

In [18]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale app_popularity with a default-configured MinMaxScaler.
min_max_scaler = MinMaxScaler()
# fit_transform is given a single-column 2-D array, hence reshape(-1, 1).
popularity_2d = ad['app_popularity'].astype(float).values.reshape(-1, 1)
ad['app_popularity'] = min_max_scaler.fit_transform(popularity_2d)
ad.head()

Unnamed: 0,creativeID,adID,camgaignID,advertiserID,appID,appPlatform,appCategory,app_popularity
0,4079,2318,147,80,14,2,2,0.055843
1,4565,3593,632,3,465,1,209,0.006641
2,3170,1593,205,54,389,1,108,0.022063
3,6566,2390,205,54,389,1,108,0.022063
4,5187,411,564,3,465,1,209,0.006641


In [22]:
# Value frequencies of the scaled column (NaN included via dropna=False).
ad.app_popularity.value_counts(dropna=False)

0.055843    2959
0.022063    1400
0.002695     803
0.000357     763
0.056430     193
1.000000     146
0.005573      93
0.028924      84
0.006641      56
0.003529      42
0.246959      35
0.000000       7
0.000050       1
Name: app_popularity, dtype: int64

## 提取出部分特征

In [19]:
# Show the column names currently present in `ad` as a plain Python list.
print(list(ad.columns))

['creativeID', 'adID', 'camgaignID', 'advertiserID', 'appID', 'appPlatform', 'appCategory', 'app_popularity']


In [20]:
# Columns kept for the selected-feature table. adID, camgaignID and appID are
# left out of the selection (they were commented out of the original list).
selected_feature = [
    'creativeID',
    'advertiserID',
    'appPlatform',
    'appCategory',
    'app_popularity',
]

In [21]:
# Keep only the columns named in selected_feature, in that order, and preview
# the first rows of the result.
ad_selected = ad[selected_feature]
ad_selected.head()

Unnamed: 0,creativeID,advertiserID,appPlatform,appCategory,app_popularity
0,4079,80,2,2,0.055843
1,4565,3,1,209,0.006641
2,3170,54,1,108,0.022063
3,6566,54,1,108,0.022063
4,5187,3,1,209,0.006641
