In [1]:
import numpy as np
import pandas as pd
import time
import ast
from tqdm import tqdm
import datetime
from multiprocessing import Pool, cpu_count
from itertools import zip_longest
from collections import defaultdict, OrderedDict, Counter
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn import metrics
from sklearn import model_selection
from sklearn.model_selection import train_test_split,StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import chi2,SelectKBest
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from scipy import stats
from scipy.stats import skew, norm
from itertools import product
%matplotlib inline

In [2]:
base_dir = './dataset/'

### 使用的app数据中各种类数量、每个用户app使用列表

In [3]:
app_info_df = pd.read_csv(base_dir+'app_info.csv',header=None, names=['appId','category'])
app_cate_list = app_info_df['category'].unique()
app_info_df.head()

Unnamed: 0,appId,category
0,a005179,运动健康
1,a001010,实用工具
2,a004865,新闻阅读
3,a002786,图书阅读
4,a002905,金融理财


In [14]:
train_id_set = pd.read_csv(base_dir+'age_train.csv',header=None, names=['uId', 'age_group'])
test_id_set = pd.read_csv(base_dir+'age_test.csv', header=None, names=['uId'])
print(train_id_set.shape, test_id_set.shape)

(4000000, 2) (1000000, 1)


In [5]:
user_app_usage=pd.read_csv(base_dir+'user_app_usage.csv',header=None,names=['uId','appId','duration','times','use_date'],
                           dtype={'uId':int,'duration':np.uint32,'times':np.uint32,'use_date':'category'})
user_app_usage.shape

(840560515, 5)

In [14]:
user_app_usage[valuese_date'].value_counts()

2019-07-12    30510730
2019-06-28    30452543
2019-06-29    30393199
2019-07-05    30366799
2019-07-11    30355598
2019-07-01    30317849
2019-07-06    30311028
2019-07-02    30293211
2019-07-10    30247427
2019-06-27    30221032
2019-07-03    30206266
2019-07-09    30194885
2019-07-13    30156177
2019-07-04    30155633
2019-07-08    30113619
2019-07-15    30067021
2019-06-30    29927895
2019-07-07    29768238
2019-07-17    25356987
2019-07-16    25312336
2019-07-18    25217468
2019-07-19    25189873
2019-07-14    24823793
2019-07-23    24556803
2019-07-22    24538047
2019-07-25    24511305
2019-07-24    24500719
2019-07-26    24498104
2019-07-20    24122996
2019-07-21    23872934
Name: use_date, dtype: int64

In [6]:
user_app_usage_appid_count=user_app_usage.appId.value_counts()
user_app_usage_appid_name_list=[]
user_app_usage_appid_count_list=[]
for k,v in user_app_usage_appid_count.items():
    user_app_usage_appid_name_list.append(k)
    user_app_usage_appid_count_list.append(v)
app_usage_count_df=pd.DataFrame({'appId':user_app_usage_appid_name_list,'count':user_app_usage_appid_count_list})
app_usage_count_df.to_csv(base_dir+'pd_user_app_usage_appid_count.csv',index=False)

In [7]:
app_usage_count_df.head()

Unnamed: 0,appId,count
0,a007439,101196408
1,a007440,52133265
2,a007331,37471688
3,a007104,34890613
4,a001062,31468206


In [10]:
sum(app_usage_count_df.loc[:1000,'count'])/sum(app_usage_count_df['count'])

0.9483854520575475

In [25]:
app_usage_count_df.shape

NameError: name 'app_usage_count_df' is not defined

In [None]:
### 10000 需视决赛数据集而定
# app_usage_top_10000_set = set(list(app_usage_count_df.loc[:10000, 'appId']))

In [11]:
user_app_usage_list = []
for uId,group in user_app_usage.groupby('uId'):
    all_app_set = list(group['appId'].unique())
#     top_app_list = list(app_usage_top_10000_set.intersection(all_app_set))
    appIds = '#'.join(all_app_set)
    user_app_usage_map = {}
    user_app_usage_map['uId'] = uId
    user_app_usage_map['appIds'] = appIds
    user_app_usage_list.append(user_app_usage_map)
    

In [12]:
user_app_usage_df = pd.DataFrame(user_app_usage_list)
user_app_usage_df.head()

Unnamed: 0,uId,appIds
0,1000009,a008224#a001055#a002392#a003386#a003659#a00548...
1,1000010,a006085#a003327#a001578#a007488#a00451#a001050...
2,1000011,a007629#a006461#a00604#a006400#a007439#a00326#...
3,1000012,a003570#a008587#a007439#a005893#a007447#a00588...
4,1000014,a001671#a007445#a007088#a008879#a001062


In [13]:
h5 = pd.HDFStore(base_dir+'user_app_usage_df.h5','w',complevel=4,complib='blosc')
h5['data'] = user_app_usage_df 
h5.close()

In [5]:
user_app_usage_df = pd.read_hdf(base_dir+'user_app_usage_df.h5', key='data')
user_app_usage_df.shape

(4020281, 2)

In [6]:
def calc_app_nums_each_cate(df_row): 
    index, row = df_row
    uId = row['uId']
    appIds = row['appIds'].split('#')
    cate_map = OrderedDict()
    for cate in app_cate_list:
        cate_map[cate] = 0
    cate_map['appNums'] = len(appIds)
    categories = list(app_info_df.loc[app_info_df['appId'].isin(appIds), 'category'])
    for c in categories:
        cate_map[c] += 1
    return cate_map

In [7]:
pool = Pool(8)
results = pool.map(calc_app_nums_each_cate, user_app_usage_df[['uId','appIds']].iterrows())
pool.close()
pool.join()

In [8]:
app_usage_category = pd.DataFrame(results)
app_usage_category = pd.concat([user_app_usage_df[['uId']], app_usage_category], axis=1)
app_usage_category.rename(columns=lambda x:str(x)+'_usage',inplace=True)
app_usage_category.rename(columns={'uId_usage':'uId'},inplace=True)
app_usage_category.shape

(4020281, 34)

In [9]:
h5 = pd.HDFStore(base_dir+'app_usage_category.h5','w',complevel=4,complib='blosc')
h5['data'] = app_usage_category 
h5.close()

In [7]:
app_usage_category = pd.read_hdf(base_dir+'app_usage_category.h5',key='data')
app_usage_category.shape

(4020281, 34)

In [8]:
app_actived_category = pd.read_hdf(base_dir+'app_actived_category.h5', key='data')
app_actived_category.shape

(4999341, 34)

### 转化率

In [9]:
app_stat = app_actived_category.merge(app_usage_category,on='uId',how='left')
app_stat.fillna(0,inplace=True)
app_stat.shape

(4999341, 67)

In [11]:
for col in app_actived_category.columns[1:]:
    col_usage = col+'_usage'
    app_stat[col+'_rate'] = app_stat.apply(lambda x:x[col_usage]/x[col] if x[col]!=0 else x[col_usage],axis=1)
    print(col)

运动健康
实用工具
新闻阅读
图书阅读
金融理财
社交通讯
便捷生活
休闲益智
拍摄美化
经营策略
儿童
汽车
教育
主题个性
影音娱乐
棋牌桌游
购物比价
旅游住宿
出行导航
商务
角色扮演
动作射击
体育竞速
美食
休闲娱乐
表盘个性
学习办公
网络游戏
主题铃声
动漫
休闲游戏
资讯生活
appNums


In [12]:
for col in app_actived_category.columns[1:]:
    _ = app_stat.pop(col)

In [15]:
train_app_trans_rate = train_id_set[['uId']].merge(app_stat,on='uId',how='left')
test_app_trans_rate = test_id_set.merge(app_stat,on='uId',how='left')
train_app_trans_rate.fillna(0,inplace=True)
test_app_trans_rate.fillna(0,inplace=True)
_ = train_app_trans_rate.pop('uId')
_ = test_app_trans_rate.pop('uId')
print(test_app_trans_rate.shape,train_app_trans_rate.shape)

(1000000, 66) (4000000, 66)


In [16]:
train_app_rate = train_app_trans_rate[train_app_trans_rate.columns[33:]]
train_app_rate[train_app_rate>1] = 1
train_app_trans_rate_with_usage = train_app_trans_rate[train_app_trans_rate.columns[:33]]
train_app_trans_rate_with_usage = pd.concat([train_app_trans_rate_with_usage,train_app_rate],axis=1)
train_app_trans_rate_with_usage.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._where(-key, value, inplace=True)


Unnamed: 0,运动健康_usage,实用工具_usage,新闻阅读_usage,图书阅读_usage,金融理财_usage,社交通讯_usage,便捷生活_usage,休闲益智_usage,拍摄美化_usage,经营策略_usage,...,美食_rate,休闲娱乐_rate,表盘个性_rate,学习办公_rate,网络游戏_rate,主题铃声_rate,动漫_rate,休闲游戏_rate,资讯生活_rate,appNums_rate
0,0.0,2.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.666667
1,0.0,8.0,1.0,0.0,0.0,2.0,3.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.571429
2,0.0,9.0,0.0,0.0,0.0,3.0,1.0,0.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.875
3,0.0,20.0,9.0,0.0,2.0,3.0,6.0,0.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.777778
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
test_app_rate = test_app_trans_rate[test_app_trans_rate.columns[33:]]
test_app_rate[test_app_rate>1]=1
test_app_trans_rate_with_usage = test_app_trans_rate[test_app_trans_rate.columns[:33]]
test_app_trans_rate_with_usage = pd.concat([test_app_trans_rate_with_usage,test_app_rate],axis=1)
test_app_trans_rate_with_usage.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,运动健康_usage,实用工具_usage,新闻阅读_usage,图书阅读_usage,金融理财_usage,社交通讯_usage,便捷生活_usage,休闲益智_usage,拍摄美化_usage,经营策略_usage,...,美食_rate,休闲娱乐_rate,表盘个性_rate,学习办公_rate,网络游戏_rate,主题铃声_rate,动漫_rate,休闲游戏_rate,资讯生活_rate,appNums_rate
0,1.0,10.0,1.0,0.0,2.0,4.0,8.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.583333
1,0.0,18.0,5.0,0.0,4.0,5.0,17.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.653333
2,0.0,4.0,0.0,0.0,0.0,2.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.170732
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,7.0,1.0,0.0,4.0,4.0,8.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.676471


In [18]:
scaler = StandardScaler().fit(train_app_rate)
train_app_trans_rate_scaler = scaler.transform(train_app_rate)  
test_app_trans_rate_scaler = scaler.transform(test_app_rate)
print(test_app_trans_rate_scaler.shape,train_app_trans_rate_scaler.shape)

(1000000, 33) (4000000, 33)


In [20]:
h5 = pd.HDFStore(base_dir+'train_app_trans_rate.h5','w',complevel=4,complib='blosc')
h5['data'] = pd.DataFrame(train_app_trans_rate_scaler,dtype=np.float32)
h5.close()

In [21]:
h5 = pd.HDFStore(base_dir+'test_app_trans_rate.h5','w',complevel=4,complib='blosc')
h5['data'] = pd.DataFrame(test_app_trans_rate_scaler,dtype=np.float32)
h5.close()

In [22]:
scaler = StandardScaler().fit(train_app_trans_rate_with_usage)
train_app_trans_rate_with_usage_scaler = scaler.transform(train_app_trans_rate_with_usage)  
test_app_trans_rate_with_usage_scaler = scaler.transform(test_app_trans_rate_with_usage)
print(test_app_trans_rate_with_usage_scaler.shape,train_app_trans_rate_with_usage_scaler.shape)

(1000000, 66) (4000000, 66)


In [23]:
h5 = pd.HDFStore(base_dir+'train_app_trans_rate_with_usage.h5','w',complevel=4,complib='blosc')
h5['data'] = pd.DataFrame(train_app_trans_rate_with_usage_scaler,dtype=np.float32)
h5.close()

In [24]:
h5 = pd.HDFStore(base_dir+'test_app_trans_rate_with_usage.h5','w',complevel=4,complib='blosc')
h5['data'] = pd.DataFrame(test_app_trans_rate_with_usage_scaler,dtype=np.float32)
h5.close()

### 比例

In [27]:
for col in app_actived_category.columns[1:-1]:
    app_actived_category[col+'_actived_rate'] = app_actived_category.apply(lambda x:x[col]/x['appNums'] 
                                                                            if x['appNums']!=0 else 0,axis=1)
    print(col)

运动健康
实用工具
新闻阅读
图书阅读
金融理财
社交通讯
便捷生活
休闲益智
拍摄美化
经营策略
儿童
汽车
教育
主题个性
影音娱乐
棋牌桌游
购物比价
旅游住宿
出行导航
商务
角色扮演
动作射击
体育竞速
美食
休闲娱乐
表盘个性
学习办公
网络游戏
主题铃声
动漫
休闲游戏
资讯生活


In [64]:
app_actived_category.columns

Index(['uId', '运动健康', '实用工具', '新闻阅读', '图书阅读', '金融理财', '社交通讯', '便捷生活', '休闲益智',
       '拍摄美化', '经营策略', '儿童', '汽车', '教育', '主题个性', '影音娱乐', '棋牌桌游', '购物比价',
       '旅游住宿', '出行导航', '商务', '角色扮演', '动作射击', '体育竞速', '美食', '休闲娱乐', '表盘个性',
       '学习办公', '网络游戏', '主题铃声', '动漫', '休闲游戏', '资讯生活', 'appNums',
       '运动健康_actived_rate', '实用工具_actived_rate', '新闻阅读_actived_rate',
       '图书阅读_actived_rate', '金融理财_actived_rate', '社交通讯_actived_rate',
       '便捷生活_actived_rate', '休闲益智_actived_rate', '拍摄美化_actived_rate',
       '经营策略_actived_rate', '儿童_actived_rate', '汽车_actived_rate',
       '教育_actived_rate', '主题个性_actived_rate', '影音娱乐_actived_rate',
       '棋牌桌游_actived_rate', '购物比价_actived_rate', '旅游住宿_actived_rate',
       '出行导航_actived_rate', '商务_actived_rate', '角色扮演_actived_rate',
       '动作射击_actived_rate', '体育竞速_actived_rate', '美食_actived_rate',
       '休闲娱乐_actived_rate', '表盘个性_actived_rate', '学习办公_actived_rate',
       '网络游戏_actived_rate', '主题铃声_actived_rate', '动漫_actived_rate',
       '休闲游戏_actived_r

In [28]:
app_actived_rate = app_actived_category[app_actived_category.columns[34:]]
app_actived_rate = pd.concat([app_actived_category[['uId']],app_actived_rate],axis=1)
app_actived_rate.shape

(4999341, 33)

In [29]:
app_actived_rate.head()

Unnamed: 0,uId,运动健康_actived_rate,实用工具_actived_rate,新闻阅读_actived_rate,图书阅读_actived_rate,金融理财_actived_rate,社交通讯_actived_rate,便捷生活_actived_rate,休闲益智_actived_rate,拍摄美化_actived_rate,...,体育竞速_actived_rate,美食_actived_rate,休闲娱乐_actived_rate,表盘个性_actived_rate,学习办公_actived_rate,网络游戏_actived_rate,主题铃声_actived_rate,动漫_actived_rate,休闲游戏_actived_rate,资讯生活_actived_rate
0,1000110,0.0,0.583333,0.083333,0.0,0.0,0.166667,0.166667,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1000542,0.0,0.441176,0.088235,0.0,0.088235,0.058824,0.147059,0.0,0.058824,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1000866,0.0,0.6,0.4,0.0,0.0,0.0,0.0,0.0,0.2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1001028,0.071429,0.371429,0.028571,0.0,0.028571,0.114286,0.257143,0.0,0.071429,...,0.0,0.042857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1001190,0.0,0.361702,0.085106,0.0,0.234043,0.191489,0.148936,0.0,0.085106,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
train_app_actived_rate = train_id_set[['uId']].merge(app_actived_rate,on='uId',how='left')
test_app_actived_rate = test_id_set[['uId']].merge(app_actived_rate,on='uId',how='left')
train_app_actived_rate.fillna(0,inplace=True)
test_app_actived_rate.fillna(0,inplace=True)
_ = train_app_actived_rate.pop('uId')
_ = test_app_actived_rate.pop('uId')
print(test_app_actived_rate.shape,train_app_actived_rate.shape)

(1000000, 32) (4000000, 32)


In [31]:
scaler = StandardScaler().fit(train_app_actived_rate)
train_app_actived_rate_scaler = scaler.transform(train_app_actived_rate)  
test_app_actived_rate_scaler = scaler.transform(test_app_actived_rate)
print(test_app_actived_rate_scaler.shape,train_app_actived_rate_scaler.shape)

(1000000, 32) (4000000, 32)


In [32]:
h5 = pd.HDFStore(base_dir+'train_app_actived_rate.h5','w',complevel=4,complib='blosc')
h5['data'] = pd.DataFrame(train_app_actived_rate_scaler,dtype=np.float32)
h5.close()

In [33]:
h5 = pd.HDFStore(base_dir+'test_app_actived_rate.h5','w',complevel=4,complib='blosc')
h5['data'] = pd.DataFrame(test_app_actived_rate_scaler,dtype=np.float32)
h5.close()

In [34]:
for col in app_usage_category.columns[1:-1]:
    app_usage_category[col+'_rate'] = app_usage_category.apply(lambda x:x[col]/x['appNums_usage'] 
                                                                            if x['appNums_usage']!=0 else 0,axis=1)
    print(col)

运动健康_usage
实用工具_usage
新闻阅读_usage
图书阅读_usage
金融理财_usage
社交通讯_usage
便捷生活_usage
休闲益智_usage
拍摄美化_usage
经营策略_usage
儿童_usage
汽车_usage
教育_usage
主题个性_usage
影音娱乐_usage
棋牌桌游_usage
购物比价_usage
旅游住宿_usage
出行导航_usage
商务_usage
角色扮演_usage
动作射击_usage
体育竞速_usage
美食_usage
休闲娱乐_usage
表盘个性_usage
学习办公_usage
网络游戏_usage
主题铃声_usage
动漫_usage
休闲游戏_usage
资讯生活_usage


In [35]:
app_usage_category.shape

(4020281, 66)

In [54]:
app_usage_category.columns

Index(['uId', '运动健康_usage', '实用工具_usage', '新闻阅读_usage', '图书阅读_usage',
       '金融理财_usage', '社交通讯_usage', '便捷生活_usage', '休闲益智_usage', '拍摄美化_usage',
       '经营策略_usage', '儿童_usage', '汽车_usage', '教育_usage', '主题个性_usage',
       '影音娱乐_usage', '棋牌桌游_usage', '购物比价_usage', '旅游住宿_usage', '出行导航_usage',
       '商务_usage', '角色扮演_usage', '动作射击_usage', '体育竞速_usage', '美食_usage',
       '休闲娱乐_usage', '表盘个性_usage', '学习办公_usage', '网络游戏_usage', '主题铃声_usage',
       '动漫_usage', '休闲游戏_usage', '资讯生活_usage', 'appNums_usage',
       '运动健康_usage_rate', '实用工具_usage_rate', '新闻阅读_usage_rate',
       '图书阅读_usage_rate', '金融理财_usage_rate', '社交通讯_usage_rate',
       '便捷生活_usage_rate', '休闲益智_usage_rate', '拍摄美化_usage_rate',
       '经营策略_usage_rate', '儿童_usage_rate', '汽车_usage_rate', '教育_usage_rate',
       '主题个性_usage_rate', '影音娱乐_usage_rate', '棋牌桌游_usage_rate',
       '购物比价_usage_rate', '旅游住宿_usage_rate', '出行导航_usage_rate',
       '商务_usage_rate', '角色扮演_usage_rate', '动作射击_usage_rate',
       '体育竞速_usage_rate', '美食_us

In [36]:
app_usage_rate = app_usage_category[app_usage_category.columns[34:]]
app_usage_rate = pd.concat([app_usage_category[['uId']],app_usage_rate],axis=1)
app_usage_rate.shape

(4020281, 33)

In [37]:
app_usage_rate.head()

Unnamed: 0,uId,运动健康_usage_rate,实用工具_usage_rate,新闻阅读_usage_rate,图书阅读_usage_rate,金融理财_usage_rate,社交通讯_usage_rate,便捷生活_usage_rate,休闲益智_usage_rate,拍摄美化_usage_rate,...,体育竞速_usage_rate,美食_usage_rate,休闲娱乐_usage_rate,表盘个性_usage_rate,学习办公_usage_rate,网络游戏_usage_rate,主题铃声_usage_rate,动漫_usage_rate,休闲游戏_usage_rate,资讯生活_usage_rate
0,1000009,0.0,0.285714,0.047619,0.0,0.095238,0.142857,0.404762,0.0,0.02381,...,0.0,0.071429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1000010,0.032787,0.344262,0.147541,0.0,0.04918,0.114754,0.229508,0.0,0.065574,...,0.0,0.032787,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1000011,0.0,0.181818,0.090909,0.0,0.090909,0.090909,0.272727,0.0,0.0,...,0.0,0.090909,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1000012,0.0,0.45,0.1,0.0,0.3,0.05,0.2,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1000014,0.0,0.6,0.2,0.0,0.0,0.0,0.2,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [38]:
train_app_usage_rate = train_id_set[['uId']].merge(app_usage_rate,on='uId',how='left')
train_app_usage_rate.fillna(0,inplace=True)
test_app_usage_rate = test_id_set[['uId']].merge(app_usage_rate,on='uId',how='left')
test_app_usage_rate.fillna(0,inplace=True)
_ = train_app_usage_rate.pop('uId')
_ = test_app_usage_rate.pop('uId')
print(test_app_usage_rate.shape,train_app_usage_rate.shape)

(1000000, 32) (4000000, 32)


In [39]:
train_app_usage_rate.head()

Unnamed: 0,运动健康_usage_rate,实用工具_usage_rate,新闻阅读_usage_rate,图书阅读_usage_rate,金融理财_usage_rate,社交通讯_usage_rate,便捷生活_usage_rate,休闲益智_usage_rate,拍摄美化_usage_rate,经营策略_usage_rate,...,体育竞速_usage_rate,美食_usage_rate,休闲娱乐_usage_rate,表盘个性_usage_rate,学习办公_usage_rate,网络游戏_usage_rate,主题铃声_usage_rate,动漫_usage_rate,休闲游戏_usage_rate,资讯生活_usage_rate
0,0.0,0.5,0.25,0.0,0.25,0.25,0.25,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.666667,0.083333,0.0,0.0,0.166667,0.25,0.0,0.083333,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.642857,0.0,0.0,0.0,0.214286,0.071429,0.0,0.142857,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.571429,0.257143,0.0,0.057143,0.085714,0.171429,0.0,0.142857,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
scaler = StandardScaler().fit(train_app_usage_rate)
train_app_usage_rate_scaler = scaler.transform(train_app_usage_rate)  
test_app_usage_rate_scaler = scaler.transform(test_app_usage_rate)
print(test_app_usage_rate_scaler.shape,train_app_usage_rate_scaler.shape)

(1000000, 32) (4000000, 32)


In [41]:
h5 = pd.HDFStore(base_dir+'train_app_usage_rate.h5','w',complevel=4,complib='blosc')
h5['data'] = pd.DataFrame(train_app_usage_rate_scaler,dtype=np.float32)
h5.close()

In [42]:
h5 = pd.HDFStore(base_dir+'test_app_usage_rate.h5','w',complevel=4,complib='blosc')
h5['data'] = pd.DataFrame(test_app_usage_rate_scaler,dtype=np.float32)
h5.close()

In [38]:
print(0)

0


In [43]:
a = pd.read_hdf(base_dir+'train_app_trans_rate_with_usage.h5',key='data')
a.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,56,57,58,59,60,61,62,63,64,65
0,-0.372728,-0.900221,-0.207556,-0.061412,-0.080507,-0.710023,-0.780397,-0.180043,-0.753423,-0.126972,...,-0.429068,-0.010631,-0.001225,-0.036363,-0.070608,0.0,0.0,-0.010112,-0.02255,0.651762
1,-0.372728,0.215917,-0.207556,-0.061412,-0.541586,-0.269268,-0.271433,-0.180043,-0.058341,-0.126972,...,-0.429068,-0.010631,-0.001225,-0.036363,-0.070608,0.0,0.0,-0.010112,-0.02255,0.317707
2,-0.372728,0.40194,-0.820125,-0.061412,-0.541586,0.171486,-0.780397,-0.180043,0.63674,-0.126972,...,-0.429068,-0.010631,-0.001225,-0.036363,-0.070608,0.0,0.0,-0.010112,-0.02255,1.382507
3,-0.372728,2.448193,4.693001,-0.061412,0.380573,0.171486,0.492014,-0.180043,2.721986,-0.126972,...,-0.429068,-0.010631,-0.001225,-0.036363,-0.070608,0.0,0.0,-0.010112,-0.02255,1.041493
4,-0.372728,-1.272267,-0.820125,-0.061412,-0.541586,-1.150777,-1.034879,-0.180043,-0.753423,-0.126972,...,-0.429068,-0.010631,-0.001225,-0.036363,-0.070608,0.0,0.0,-0.010112,-0.02255,-1.686622


In [45]:
for i in a.isna().sum():
    print(i)

0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
