In [2]:
import numpy as np
import pandas as pd
import time
import ast
from tqdm import tqdm
import datetime
from multiprocessing import Pool, cpu_count
from itertools import zip_longest
from collections import defaultdict, OrderedDict, Counter
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn import metrics
from sklearn import model_selection
from sklearn.model_selection import train_test_split,StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import chi2,SelectKBest
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from scipy import stats
from scipy.stats import skew, norm
from itertools import product
%matplotlib inline

In [3]:
# Directory prefix for every file this notebook reads or writes; it is joined
# by plain string concatenation, so the trailing slash is required.
base_dir = './dataset/'

### 统计安装的app数据中各种类数量

In [4]:
# Load the app catalogue. The CSV has no header row, so the column names
# are supplied explicitly.
app_info_df = pd.read_csv(
    base_dir + 'app_info.csv',
    names=['appId', 'category'],
    header=None,
)
# Distinct category labels; these become the count columns later on.
app_cate_list = app_info_df['category'].unique()
app_info_df.head()

Unnamed: 0,appId,category
0,a005179,运动健康
1,a001010,实用工具
2,a004865,新闻阅读
3,a002786,图书阅读
4,a002905,金融理财


In [6]:
# Display the distinct category labels found in the app catalogue.
app_info_df['category'].unique()

array(['运动健康', '实用工具', '新闻阅读', '图书阅读', '金融理财', '社交通讯', '便捷生活', '休闲益智',
       '拍摄美化', '经营策略', '儿童', '汽车', '教育', '主题个性', '影音娱乐', '棋牌桌游', '购物比价',
       '旅游住宿', '出行导航', '商务', '角色扮演', '动作射击', '体育竞速', '美食', '休闲娱乐', '表盘个性',
       '学习办公', '网络游戏', '主题铃声', '动漫', '休闲游戏', '资讯生活'], dtype=object)

In [4]:
# Training users come with an age_group column; test users are ids only.
# Neither file has a header row.
train_id_set = pd.read_csv(
    base_dir + 'age_train.csv',
    names=['uId', 'age_group'],
    header=None,
)
test_id_set = pd.read_csv(
    base_dir + 'age_test.csv',
    names=['uId'],
    header=None,
)
print(train_id_set.shape, test_id_set.shape)

(4000000, 2) (1000000, 1)


In [5]:
# One row per user with the apps they have activated (headerless CSV).
app_actived_df = pd.read_csv(
    base_dir + 'user_app_actived.csv',
    names=['uId', 'appId'],
    header=None,
)
app_actived_df.shape

(4999341, 2)

In [6]:
# Preview the raw rows: the appId column packs all of a user's apps into a
# single '#'-separated string.
app_actived_df.head()

Unnamed: 0,uId,appId
0,1000110,a001048#a003072#a004443#a006024#a007087#a00743...
1,1000542,a001010#a00158#a001671#a002450#a003484#a003577...
2,1000866,a001048#a00108#a004622#a007104#a0075
3,1001028,a001012#a001055#a001062#a001275#a001403#a00158...
4,1001190,a001012#a00107#a001304#a001403#a001533#a001679...


In [7]:
def calc_app_nums_each_cate(df_row):
    """Count one user's activated apps per category.

    Intended to be mapped over ``DataFrame.iterrows()``. Reads two
    module-level objects: ``app_cate_list`` (the category labels) and
    ``app_info_df`` (the table with ``appId`` and ``category`` columns).

    Parameters
    ----------
    df_row : tuple
        ``(index, row)`` pair as yielded by ``DataFrame.iterrows()``.
        ``row['appId']`` is a '#'-separated string of app ids; a missing
        value (NaN) or an empty string is treated as "no apps".

    Returns
    -------
    OrderedDict
        One integer counter per entry of ``app_cate_list`` (in that order),
        followed by ``'appNums'``, the number of app ids listed for the
        user. Ids absent from ``app_info_df`` count towards ``'appNums'``
        but not towards any category.
    """
    _, row = df_row
    raw_app_ids = row['appId']
    # pandas represents a missing cell as NaN (a float), which has no
    # .split(); empty tokens ('' or a stray '#') are not real app ids and
    # must not inflate the total.
    if isinstance(raw_app_ids, str):
        appIds = [app_id for app_id in raw_app_ids.split('#') if app_id]
    else:
        appIds = []

    cate_map = OrderedDict((cate, 0) for cate in app_cate_list)
    cate_map['appNums'] = len(appIds)

    # NOTE(review): this membership test scans every row of app_info_df on
    # each call; a prebuilt appId -> category lookup would avoid that.
    matched_categories = app_info_df.loc[app_info_df['appId'].isin(appIds), 'category']
    for cate in matched_categories:
        cate_map[cate] += 1
    return cate_map

In [8]:
# Fan the per-user counting out over worker processes. The worker count is
# capped at the number of available cores (at most 30), and the context
# manager tears the pool down even if a worker raises. pool.map returns the
# results in input order, so they stay aligned with app_actived_df's rows.
with Pool(min(30, cpu_count())) as pool:
    results = pool.map(calc_app_nums_each_cate, app_actived_df.iterrows())

In [9]:
# Turn the list of per-user count dicts into a frame and put the uId column
# in front of it. Both pieces carry the default positional index, so the
# column-wise concat pairs each user with their own counts.
app_actived_category = pd.concat(
    [app_actived_df[['uId']], pd.DataFrame(results)],
    axis=1,
)
app_actived_category.shape

(4999341, 34)

In [10]:
# Preview the per-user feature table: uId, one count column per category,
# and the appNums total.
app_actived_category.head()

Unnamed: 0,uId,运动健康,实用工具,新闻阅读,图书阅读,金融理财,社交通讯,便捷生活,休闲益智,拍摄美化,...,美食,休闲娱乐,表盘个性,学习办公,网络游戏,主题铃声,动漫,休闲游戏,资讯生活,appNums
0,1000110,0,7,1,0,0,2,2,0,0,...,0,0,0,0,0,0,0,0,0,12
1,1000542,0,15,3,0,3,2,5,0,2,...,0,0,0,0,0,0,0,0,0,34
2,1000866,0,3,2,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,5
3,1001028,5,26,2,0,2,8,18,0,5,...,3,0,0,0,0,0,0,0,0,70
4,1001190,0,17,4,0,11,9,7,0,4,...,0,0,0,0,0,0,0,0,0,47


In [12]:
# Persist the feature table under the key 'data' in a blosc-compressed HDF5
# file (mode 'w' overwrites any existing file). The context manager closes
# the store even if the write fails, so the file handle is never leaked.
with pd.HDFStore(base_dir+'app_actived_category.h5', 'w', complevel=4, complib='blosc') as h5:
    h5['data'] = app_actived_category

In [1]:
# NOTE(review): looks like a leftover scratch cell; it only prints 0 and
# touches none of the data above, so it is probably safe to delete.
print(0)

0
