In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.decomposition import PCA,NMF,TruncatedSVD,SparsePCA
from keras.utils import np_utils
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from scipy.sparse import csr_matrix
from sklearn.utils import shuffle
from sklearn.feature_selection import chi2,SelectKBest
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import LabelEncoder, StandardScaler,MinMaxScaler

Using TensorFlow backend.


In [2]:
# Root folder holding the raw input files. The default is the original
# author's location; set the HW_DATA_DIR environment variable to run the
# notebook against a different folder without editing this cell.
# os.path.join(..., '') guarantees a trailing separator, which matters because
# paths are built below (and in a later cell) by plain string concatenation.
data_dir=os.path.join(os.environ.get('HW_DATA_DIR','/home/uniml/work/hw/dataset/'),'')
# Folder that receives the intermediate feature files written by this notebook.
temp_save_dir='./temp_data/'
# Full paths of the individual input files.
age_train_file_path=data_dir+'age_train.csv'
age_test_file_path=data_dir+'age_test.csv'
app_info_file_path=data_dir+'app_info.csv'
user_app_actived_file_path=data_dir+'user_app_actived.csv'
user_basic_info_file_path=data_dir+'user_basic_info.csv'
user_behavior_info_file_path=data_dir+'user_behavior_info.csv'
user_app_usage_file_path=data_dir+'user_app_usage.csv'

In [3]:
# User label data.
# Every label is shifted down by one here to make it convenient for the model;
# the offset is meant to be added back at prediction time.
age_train=pd.read_csv(age_train_file_path,names=['uId','age_group'],header=None)
age_train['age_group']=(age_train['age_group']-1).astype('category')
# The test file is read as a single column, named 'id' here (the label file
# calls its first column 'uId').
age_test=pd.read_csv(age_test_file_path,names=['id'],header=None)

In [4]:
# Size of the hashing space for activated-app ids, and a sequence-length cap
# (the cap is not used anywhere in this cell).
actived_app_size = 10010
actived_max_length = 150

# One row per record: a user id and a '#'-separated string of app ids.
user_app_actived=pd.read_csv(user_app_actived_file_path,names=['uId','appId'],header=None)

# Hash every '#'-separated token of each row into an integer id (Keras one_hot).
# NOTE(review): Keras documents one_hot as using Python's built-in hash by
# default, which is not stable across interpreter runs - confirm whether the
# encoded ids need to be reproducible between kernel restarts.
# NOTE(review): the app-usage cell assigns the same name `encoded_apps`;
# whichever of the two cells ran last decides what the matrix cells consume.
encoded_apps = [
    one_hot(app_string, actived_app_size, split='#')
    for app_string in user_app_actived['appId'].tolist()
]


In [4]:
# Size of the hashing space for used-app ids, and a sequence-length cap
# (the cap is not used anywhere in this cell).
usage_app_size = 10010
usage_max_length = 200
# Pre-built usage table stored under the HDF key 'data'.
app_usage_info=pd.read_hdf(data_dir+'user_app_usage_df.h5',key='data')
# Rows with a missing app list get an empty string so one_hot receives text.
app_usage_info.loc[app_usage_info['appIds'].isna(),'appIds']=''

# Hash every '#'-separated token of each row into an integer id (Keras one_hot).
# NOTE(review): this rebinds `encoded_apps`, replacing the list built from
# user_app_actived if that cell was run before this one.
encoded_apps = [
    one_hot(app_string, usage_app_size, split='#')
    for app_string in app_usage_info['appIds'].tolist()
]


In [6]:
def get_app_crs_matrix(data_index,encoded_data,app_size):
    """Build a sparse user-by-app count matrix.

    Parameters
    ----------
    data_index : pandas.DataFrame
        One row per output row. Its 'target_index' column holds the position
        of that row's entry in ``encoded_data``, or NaN when there is none.
    encoded_data : sequence of sequences of int
        For every source record, the integer app ids (column numbers) it
        contains. Every id must be smaller than ``app_size``.
    app_size : int
        Number of columns of the resulting matrix.

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix of shape ``(len(data_index), app_size)``. Rows whose
        'target_index' is NaN stay empty. An app id that occurs several times
        for one row is counted that many times, because duplicate coordinates
        are summed when the matrix is constructed.
    """
    row_index=[]
    column_index=[]
    # Read the column positionally. The previous label-based lookup
    # (data_index.loc[i, ...]) only worked for a default 0..n-1 index and
    # paid the cost of a scalar .loc access on every row.
    target_indices=data_index['target_index'].values
    for row,target_index in enumerate(target_indices):
        if pd.isna(target_index):
            continue
        apps=encoded_data[int(target_index)]
        row_index.extend([row]*len(apps))
        column_index.extend(apps)
    matrix=csr_matrix(([1]*len(row_index),(row_index,column_index)),shape=(data_index.shape[0],app_size))
    return matrix

In [7]:
# Record each row's index label so that, after the joins below, every user
# can be traced back to its entry in the encoded app list.
user_app_actived['target_index']=user_app_actived.index
# Left joins keep every labelled / test user; users without a record in
# user_app_actived end up with NaN in 'target_index'.
train_index=pd.merge(age_train,user_app_actived[['uId','target_index']],how='left',on='uId')
test_index=pd.merge(age_test,user_app_actived[['uId','target_index']],how='left',left_on='id',right_on='uId')
# The join keeps both key columns; drop the test-side 'id' column.
del test_index['id']

In [5]:
# Record each row's index label so that, after the joins below, every user
# can be traced back to its entry in the encoded app list.
# NOTE(review): get_app_crs_matrix uses this value as a list position, which
# presumes app_usage_info carries a 0..n-1 index - confirm for the HDF file.
app_usage_info['target_index']=app_usage_info.index
# Left joins keep every labelled / test user; users without a record in
# app_usage_info end up with NaN in 'target_index'.
# NOTE(review): these assignments rebind train_index / test_index, replacing
# the versions built from user_app_actived if that cell was run earlier.
train_index=pd.merge(age_train,app_usage_info[['uId','target_index']],how='left',on='uId')
test_index=pd.merge(age_test,app_usage_info[['uId','target_index']],how='left',left_on='id',right_on='uId')
# The join keeps both key columns; drop the test-side 'id' column.
del test_index['id']

In [9]:
# Sparse count matrix for the training users.
# NOTE(review): 10010 repeats the literal assigned to actived_app_size and
# usage_app_size in the encoding cells; keep the three in sync.
train_matrix=get_app_crs_matrix(data_index=train_index,encoded_data=encoded_apps,app_size=10010)

In [10]:
# Sparse count matrix for the test users, built with the same column count
# as the training matrix so the fitted transformers can be applied to it.
test_matrix=get_app_crs_matrix(data_index=test_index,encoded_data=encoded_apps,app_size=10010)

In [11]:
# Output file names. The commented-out pair is the alternative used when the
# matrices were built from the activated-app table instead of the usage table.
# train_data_save_name='train_app_actived_tfidf_input_csr.h5'
# test_data_save_name='test_app_actived_tfidf_input_csr.h5'
train_data_save_name='train_app_usage_tfidf_input_csr.h5'
test_data_save_name='test_app_usage_tfidf_input_csr.h5'


def _save_feature_matrix(features,file_name):
    """Store a dense feature array as a float32 DataFrame under HDF key 'data'.

    The file is written to temp_save_dir + file_name, replacing any existing
    file. The directory is created first if it is missing, and the store is
    opened in a ``with`` block so the handle is closed even when the write
    raises.
    """
    if not os.path.isdir(temp_save_dir):
        os.makedirs(temp_save_dir)
    with pd.HDFStore(temp_save_dir+file_name,'w',complevel=4,complib='blosc') as store:
        store['data'] = pd.DataFrame(features,dtype=np.float32)


tfidf = TfidfTransformer()
# Training set: fit the TF-IDF weighting on the training counts.
print('start')
train_tfidf = tfidf.fit_transform(train_matrix)
age_group=age_train['age_group']
# Chi-squared selection: keep the 500 columns scoring highest against the label.
print('start 卡方')
top_feature_select=SelectKBest(chi2,k=500)
train_app_actived_tfidf=top_feature_select.fit_transform(train_tfidf,age_group)
# Scaling: densify, then standardise each selected column with a
# StandardScaler fitted on the training rows.
print('start 归一化')
train_app_actived_tfidf=train_app_actived_tfidf.toarray()
scaler = StandardScaler()
train_app_actived_tfidf=scaler.fit_transform(train_app_actived_tfidf)
# Save the training features.
_save_feature_matrix(train_app_actived_tfidf,train_data_save_name)

# Test set: reuse the transformers fitted above, never refit on test data.
print('start')
test_tfidf=tfidf.transform(test_matrix)
# Chi-squared selection (same columns as chosen on the training set).
print('start 卡方')
test_app_actived_tfidf=top_feature_select.transform(test_tfidf)
# Scaling with the training-set statistics.
print('start 归一化')
test_app_actived_tfidf=test_app_actived_tfidf.toarray()
test_app_actived_tfidf=scaler.transform(test_app_actived_tfidf)
# Save the test features.
_save_feature_matrix(test_app_actived_tfidf,test_data_save_name)

start
start 卡方
start 归一化
start
start 卡方
start 归一化


In [None]:
# NOTE(review): leftover scratch cell - a bare string literal that only echoes
# its own value when run; nothing else in the visible notebook refers to it.
'a'