In [1]:
import os
import time
from collections import Counter
from datetime import datetime
from multiprocessing import Pool as ThreadPool

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.feature_selection import chi2,SelectKBest
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.utils import shuffle
from tqdm import tqdm

In [2]:
# Dataset location. Defaults to the original hard-coded directory; set the
# HW_DATA_DIR environment variable to run the notebook against another copy
# of the data. os.path.join(..., '') guarantees a trailing path separator so
# that code which appends file names by plain string concatenation still works.
data_dir=os.path.join(os.environ.get('HW_DATA_DIR','/home/uniml/work/hw/dataset/'),'')
# Directory for intermediate artifacts. The trailing slash is required: file
# names are appended to this value by string concatenation.
temp_save_dir='./temp_data/'
# Input files, all located directly under data_dir.
age_train_file_path=os.path.join(data_dir,'age_train.csv')
age_test_file_path=os.path.join(data_dir,'age_test.csv')
app_info_file_path=os.path.join(data_dir,'app_info.csv')
user_app_actived_file_path=os.path.join(data_dir,'user_app_actived.csv')
user_basic_info_file_path=os.path.join(data_dir,'user_basic_info.csv')
user_behavior_info_file_path=os.path.join(data_dir,'user_behavior_info.csv')
user_app_usage_file_path=os.path.join(data_dir,'user_app_usage.csv')

In [3]:
# User label data.
# Each label is shifted down by 1 here to make it easier for the model to
# work with; per the original note, the 1 is added back at prediction time.
age_train = pd.read_csv(
    age_train_file_path,
    header=None,
    names=['uId', 'age_group'],
)
age_train['age_group'] = (age_train['age_group'] - 1).astype('category')
age_test = pd.read_csv(
    age_test_file_path,
    header=None,
    names=['id'],
)

In [4]:
# App category data: the file has no header row, so column names are supplied.
app_info = pd.read_csv(
    app_info_file_path,
    header=None,
    names=['appId', 'category'],
)
app_info['category'] = app_info['category'].astype('category')
# Distinct category values, in the order unique() reports them.
app_category_list = list(app_info['category'].unique())

In [12]:
# User app-activation file (no header row; columns are named uId and appId).
user_app_actived = pd.read_csv(
    user_app_actived_file_path,
    header=None,
    names=['uId', 'appId'],
)

In [9]:
# Build the app popularity table: how many rows (users) list each app id.
def get_top_app(data):
    """Count how often each app id occurs across all rows of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must have an ``appId`` column whose values are '#'-separated strings
        of app ids.

    Returns
    -------
    pandas.DataFrame
        Columns ``appId`` and ``count``. Rows are in first-seen order; they
        are NOT sorted by count, so sort the result if a ranking is needed.

    Side effects
    ------------
    Writes the same table to ``temp_save_dir + 'app_use_count.csv'``,
    creating the target directory first if it does not exist.
    """
    app_use_count = Counter()
    # Iterate over the column values directly: iterrows() builds a Series per
    # row, which is needlessly slow when only one field is read.
    for app_ids in tqdm(data['appId'], total=len(data)):
        app_use_count.update(app_ids.split('#'))

    pd_app_use_count = pd.DataFrame({
        'appId': list(app_use_count.keys()),
        'count': list(app_use_count.values()),
    })

    out_path = temp_save_dir + 'app_use_count.csv'
    # to_csv does not create missing parent directories.
    os.makedirs(os.path.dirname(out_path) or '.', exist_ok=True)
    pd_app_use_count.to_csv(out_path, index=False)
    return pd_app_use_count

In [10]:
# Tally activations per app id over all rows of user_app_actived; the call also
# writes the table to app_use_count.csv under temp_save_dir.
pd_app_use_count=get_top_app(user_app_actived)

4999341it [08:04, 10308.64it/s]


In [11]:
# Per-user app-category counts (the worker function mapped over rows).
# NOTE: this reuses the name get_top_app, so from this point on it replaces
# the earlier app-popularity function of the same name.
def get_top_app(data):
    """Count one user's activated apps per app category.

    Parameters
    ----------
    data : tuple
        An ``(index, row)`` pair as produced by ``DataFrame.iterrows()``.
        ``row['appId']`` is a '#'-separated string of app ids.

    Returns
    -------
    dict
        One key per entry of ``app_category_list`` holding the number of
        ``app_info`` rows of that category whose ``appId`` is in the user's
        list, plus ``'app_count'`` with the length of the split list. App
        ids absent from ``app_info`` add to ``'app_count'`` only.
    """
    _, record = data
    category_counts = dict.fromkeys(app_category_list, 0)

    activated_ids = record['appId'].split('#')
    is_activated = app_info['appId'].isin(activated_ids)
    for category_name in list(app_info.loc[is_activated, 'category']):
        category_counts[category_name] += 1

    # Total number of apps listed for this user.
    category_counts['app_count'] = len(activated_ids)
    return category_counts

In [12]:
def multiprocess_handle_user_app_category_actived(data, processes=18):
    """Apply ``get_top_app`` to every row of ``data`` using a worker pool.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose ``iterrows()`` pairs are passed one by one to
        ``get_top_app``.
    processes : int, optional
        Number of workers in the pool. Defaults to 18, the value that was
        previously hard-coded.

    Returns
    -------
    list
        One ``get_top_app`` result per row, in row order.

    Notes
    -----
    ``ThreadPool`` is this file's alias for ``multiprocessing.Pool``, so the
    workers are processes, not threads. The pool is always shut down: it is
    closed after a successful map, terminated if the map raises (including
    on interrupt), and joined in both cases so no workers are left behind.
    """
    pool = ThreadPool(processes)
    try:
        result = pool.map(get_top_app, data.iterrows())
    except BaseException:
        # Stop outstanding work right away instead of leaking the workers.
        pool.terminate()
        raise
    else:
        pool.close()
    finally:
        pool.join()
    return result

In [13]:
# Run the per-user category counting over every row of user_app_actived via the
# worker pool; the result is a list with one dict of counts per row.
user_app_actived_to_category_count=multiprocess_handle_user_app_category_actived(user_app_actived)

In [15]:
# Pivot the list of per-user dicts into one list per column name
# (every app category, followed by 'app_count').
user_app_actived_map = {category_name: [] for category_name in app_category_list}
user_app_actived_map['app_count'] = []
for per_user_counts in user_app_actived_to_category_count:
    for column_name in per_user_counts:
        user_app_actived_map[column_name].append(per_user_counts[column_name])

In [16]:
# Assemble the per-column lists into a frame (one column per key of
# user_app_actived_map), stored as int32.
pd_user_app_actived=pd.DataFrame(user_app_actived_map,dtype=np.int32)

In [17]:
# For every category add a '<category>_rate' column: that category's count
# divided by the user's total app count.
for app_category in app_category_list:
    rate_column = app_category + '_rate'
    category_count = pd_user_app_actived[app_category]
    total_count = pd_user_app_actived['app_count']
    pd_user_app_actived[rate_column] = category_count / total_count

In [18]:
# Attach the user ids. Column assignment aligns on the index, so this relies on
# both frames sharing the same row index and order — NOTE(review): confirm
# neither frame has been re-indexed or filtered in between.
pd_user_app_actived['uId']=user_app_actived['uId']

In [19]:
# Persist the per-user feature frame to an HDF5 file under key 'data'.
# Make sure the output directory exists; opening the store does not create
# missing parent directories.
os.makedirs(temp_save_dir or '.', exist_ok=True)
# The context manager closes the store even if the write raises, so the file
# handle is not left open in the kernel.
with pd.HDFStore(temp_save_dir+'final_user_app_actived.h5','w',complevel=4,complib='blosc') as h5:
    h5['data'] = pd_user_app_actived

In [5]:
# Reload the feature frame from final_user_app_actived.h5 (key 'data') under
# temp_save_dir; presumably this lets a fresh session skip the counting cells
# once the file has been written — confirm the file is up to date before relying on it.
pd_user_app_actived=pd.read_hdf(temp_save_dir+'final_user_app_actived.h5',key='data')