In [1]:
import numpy as np
import pandas as pd
import os
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from multiprocessing import Pool as ThreadPool
from datetime import datetime
import time
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import chi2,SelectKBest
from sklearn.utils import shuffle
from tqdm import tqdm

In [2]:
# Location of the raw competition files and of the scratch directory that
# receives intermediate artifacts produced by this notebook.
data_dir='/home/uniml/work/hw/dataset/'
temp_save_dir='./temp_data/'

# Later cells write CSV / HDF5 files into temp_save_dir; neither DataFrame.to_csv
# nor pd.HDFStore creates a missing parent directory, so make sure it exists
# before any expensive computation starts.
os.makedirs(temp_save_dir, exist_ok=True)

# os.path.join keeps the paths valid even if data_dir loses its trailing slash.
age_train_file_path=os.path.join(data_dir,'age_train.csv')
age_test_file_path=os.path.join(data_dir,'age_test.csv')
app_info_file_path=os.path.join(data_dir,'app_info.csv')
user_app_actived_file_path=os.path.join(data_dir,'user_app_actived.csv')
user_basic_info_file_path=os.path.join(data_dir,'user_basic_info.csv')
user_behavior_info_file_path=os.path.join(data_dir,'user_behavior_info.csv')
user_app_usage_file_path=os.path.join(data_dir,'user_app_usage.csv')

In [3]:
# User label data.
# The labels are shifted down by 1 here to make them convenient for the model;
# the offset is added back at prediction time.
age_train = pd.read_csv(
    age_train_file_path,
    header=None,
    names=['uId', 'age_group'],
)
age_train['age_group'] = (age_train['age_group'] - 1).astype('category')

# Test users: a single id column, no labels.
age_test = pd.read_csv(age_test_file_path, header=None, names=['id'])

In [4]:
# App category data: one (appId, category) pair per row.
app_info = pd.read_csv(
    app_info_file_path,
    header=None,
    names=['appId', 'category'],
)
app_info['category'] = app_info['category'].astype('category')

# Distinct category values, in order of first appearance in app_info.
app_category_list = list(app_info['category'].unique())

In [5]:
# User app usage log (duration and number of uses per record).
# Compact dtypes are declared up front; appId is left at the parser's default dtype.
user_app_usage_columns = ['uId', 'appId', 'duration', 'times', 'use_date']
user_app_usage_dtypes = {
    'uId': int,
    'duration': np.uint32,
    'times': np.uint32,
    'use_date': 'category',
}
user_app_usage = pd.read_csv(
    user_app_usage_file_path,
    header=None,
    names=user_app_usage_columns,
    dtype=user_app_usage_dtypes,
)

In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric
# columns of the usage log.
# NOTE(review): this cell's execution count (10) is higher than that of the
# clipping cell (9), so the stored output presumably shows `times` after it was
# capped at the 99th percentile — confirm by re-running in order.
user_app_usage.describe()

Unnamed: 0,uId,duration,times
count,840560500.0,840560500.0,840560500.0
mean,10592220.0,2062.945,71.69924
std,7249383.0,4495.257,136.5172
min,1000009.0,0.0,1.0
25%,4669176.0,60.0,6.0
50%,8975902.0,336.0,18.0
75%,15173110.0,2016.0,65.0
max,44550890.0,86400.0,806.0


In [6]:
# Count how many usage records each app has, to see which apps are the popular ones.
# value_counts() returns the apps ordered from most to least frequent.
user_app_usage_appid_count = user_app_usage['appId'].value_counts()

# Split the counts into parallel lists of app ids and their record counts.
user_app_usage_appid_name_list = user_app_usage_appid_count.index.tolist()
user_app_usage_appid_count_list = user_app_usage_appid_count.tolist()

pd_user_app_usage_appid_count = pd.DataFrame(
    {
        'appId': user_app_usage_appid_name_list,
        'count': user_app_usage_appid_count_list,
    }
)
pd_user_app_usage_appid_count.to_csv(
    temp_save_dir+'pd_user_app_usage_appid_count.csv',
    index=False,
)

In [9]:
# Cap `times` at its 99th percentile so that extreme values do not dominate
# the per-user sums computed later.
# Series.quantile returns a float; writing a fractional float into the uint32
# `times` column would upcast the whole column to float64 (and newer pandas
# versions warn about / reject such dtype-incompatible assignments). Rounding the
# threshold up to a whole number keeps the column integral and gives exactly the
# same result whenever the quantile is already integral.
p99 = int(np.ceil(user_app_usage['times'].quantile(0.99)))
user_app_usage.loc[user_app_usage['times'] > p99, 'times'] = p99

In [11]:
def get_user_app_usage_info(data):
    """Aggregate one user's usage rows into a flat feature dictionary.

    Relies on the notebook-level globals ``app_category_list`` (all category
    names) and ``app_info`` (the appId -> category table).

    Parameters
    ----------
    data : tuple
        ``(uId, group)`` pair as produced by iterating over
        ``user_app_usage.groupby('uId')``. ``group`` must provide the columns
        ``appId``, ``times`` and ``duration``.

    Returns
    -------
    dict
        ``uId`` plus, for every category in ``app_category_list``,
        ``<category>_times`` and ``<category>_duration`` (summed over the
        user's apps of that category), and the totals ``all_times``,
        ``all_duration``, ``use_days`` and ``app_usage_count`` (number of
        distinct apps in the group).
    """
    name, group = data

    user_app_usage_map = {'uId': int(name)}
    for app_category_name in app_category_list:
        user_app_usage_map[app_category_name+'_times'] = 0
        user_app_usage_map[app_category_name+'_duration'] = 0
    user_app_usage_map['all_times'] = 0
    user_app_usage_map['all_duration'] = 0
    user_app_usage_map['use_days'] = 0

    # Distinct apps of this user, computed once and reused below.
    app_ids = group['appId'].unique()
    user_app_usage_map['app_usage_count'] = len(app_ids)

    # Build an appId -> [category, ...] lookup once, instead of re-filtering the
    # category frame for every app inside the loop. An app listed several times
    # in app_info keeps all of its entries, exactly like the row filter would.
    user_app_category_rows = app_info.loc[app_info['appId'].isin(app_ids), ['appId', 'category']]
    categories_by_app = {}
    for app_id, app_category_name in zip(user_app_category_rows['appId'],
                                         user_app_category_rows['category']):
        categories_by_app.setdefault(app_id, []).append(app_category_name)

    for app_id, single_app_group in group.groupby('appId'):
        # NOTE(review): 'use_days' is the largest number of rows any single app
        # has for this user, not the number of distinct use_date values —
        # confirm this is the intended definition.
        user_app_usage_map['use_days'] = max(user_app_usage_map['use_days'], len(single_app_group))

        sum_of_app_use_times = single_app_group['times'].sum()
        sum_of_app_use_duration = single_app_group['duration'].sum()

        # Apps missing from app_info contribute to the totals only.
        for app_category_name in categories_by_app.get(app_id, ()):
            user_app_usage_map[app_category_name+'_times'] += sum_of_app_use_times
            user_app_usage_map[app_category_name+'_duration'] += sum_of_app_use_duration
        user_app_usage_map['all_times'] += sum_of_app_use_times
        user_app_usage_map['all_duration'] += sum_of_app_use_duration

    return user_app_usage_map

In [12]:
def multiprocess_handle_user_app_usage(data, processes=24):
    """Compute the per-user usage features in parallel.

    ``ThreadPool`` is this notebook's import alias for ``multiprocessing.Pool``,
    so the work is spread over worker *processes*, not threads.

    Parameters
    ----------
    data : pandas.DataFrame
        Usage log; it is grouped by ``uId`` and every ``(uId, group)`` pair is
        handed to ``get_user_app_usage_info``.
    processes : int, optional
        Number of worker processes (default 24, the previously hard-coded value).

    Returns
    -------
    list
        One result of ``get_user_app_usage_info`` per user, in the iteration
        order of ``data.groupby('uId')``.
    """
    # The context manager shuts the pool down even when a worker raises, so a
    # failed run does not leave orphaned worker processes behind.
    with ThreadPool(processes) as pool:
        return pool.map(get_user_app_usage_info, data.groupby('uId'))

In [None]:
# Run the per-user aggregation over the whole usage log in worker processes;
# the result holds one feature dict per uId.
user_app_usage_statistic=multiprocess_handle_user_app_usage(user_app_usage)

In [15]:
# Convert the per-user feature dicts into a DataFrame and store it.

# Column order: <category>_times / <category>_duration pairs first, then the
# per-user totals.
user_app_usage_feature_names = []
for app_category_name in app_category_list:
    user_app_usage_feature_names.append(app_category_name+'_times')
    user_app_usage_feature_names.append(app_category_name+'_duration')
user_app_usage_feature_names += ['uId', 'all_times', 'all_duration', 'use_days', 'app_usage_count']

# One list per column; a feature that is missing (or None) for a user becomes 0.
user_app_usage_statistic_map = {
    feature_name: [
        0 if single_user.get(feature_name) is None else single_user[feature_name]
        for single_user in user_app_usage_statistic
    ]
    for feature_name in user_app_usage_feature_names
}
pd_user_app_usage = pd.DataFrame(user_app_usage_statistic_map)

# Average duration per use for every category. Categories a user never used
# have 0 duration and 0 times, so their average is 0/0 = NaN.
for category_name in app_category_list:
    pd_user_app_usage[category_name+'_avg'] = (
        pd_user_app_usage[category_name+'_duration'] / pd_user_app_usage[category_name+'_times']
    ).astype(np.float32)

# The context manager closes the HDF5 file even if writing the frame fails,
# so no open handle on a half-written store is left behind.
with pd.HDFStore(temp_save_dir+'user_app_usage_statistic.h5','w',complevel=4,complib='blosc') as h5:
    h5['data'] = pd_user_app_usage

In [14]:
# NOTE(review): looks like a leftover scratch cell (possibly used to check that
# the kernel was still responsive) — it has no effect on the analysis; consider
# removing it.
print(0)

0
