In [1]:
import numpy as np
import pandas as pd
import os
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from multiprocessing import Pool as ThreadPool
from datetime import datetime
import time
from sklearn.preprocessing import LabelEncoder, StandardScaler
import catboost as cat
from sklearn.feature_selection import chi2,SelectKBest
from sklearn.utils import shuffle

In [2]:
# Input locations. Raw CSV inputs are read from data_dir, cached intermediate
# feature files from temp_save_dir; both prefixes end with a path separator.
data_dir = '/home/uniml/work/hw/dataset/'
temp_save_dir = './temp_data/'

age_train_file_path = os.path.join(data_dir, 'age_train.csv')
age_test_file_path = os.path.join(data_dir, 'age_test.csv')
app_info_file_path = os.path.join(data_dir, 'app_info.csv')
user_app_actived_file_path = os.path.join(data_dir, 'user_app_actived.csv')
user_basic_info_file_path = os.path.join(data_dir, 'user_basic_info.csv')
user_behavior_info_file_path = os.path.join(data_dir, 'user_behavior_info.csv')
user_app_usage_file_path = os.path.join(data_dir, 'user_app_usage.csv')

In [3]:
############# User label data ################

In [4]:
# Training labels: one (uId, age_group) pair per row; the CSV has no header row.
age_train = pd.read_csv(
    age_train_file_path,
    header=None,
    names=['uId', 'age_group'],
)
# Shift every label down by one, then store the label as a categorical.
# (The submission cell adds the 1 back after argmax.)
age_train['age_group'] = (age_train['age_group'] - 1).astype('category')

# Test ids only; the single column is named 'id' here, not 'uId'.
age_test = pd.read_csv(age_test_file_path, header=None, names=['id'])

In [5]:
########### User basic info #################

In [6]:
# Per-user profile / device table. The CSV has no header row, so the column
# names are supplied here in file order.
user_basic_info = pd.read_csv(
    user_basic_info_file_path,
    header=None,
    names=[
        'uId',
        'gender',
        'city',
        'prodName',       # phone model
        'ramCapacity',    # phone RAM capacity
        'ramLeftRation',  # share of RAM remaining
        'romCapacity',    # ROM capacity
        'romLeftRation',  # share of ROM remaining
        'color',          # phone colour
        'fontSize',       # font size
        'ct',             # network connection type
        'carrier',        # mobile carrier
        'os',             # phone OS version
    ],
)

In [7]:
# Basic cleaning and indicator features for the user profile table.
# All flags are computed with vectorised comparisons instead of per-element
# Python lambdas; the resulting values are the same.
user_basic_info['gender'] = user_basic_info['gender'].astype('int')

# 1 when the user's city is one of the 15 most frequent cities, else 0.
big_city = list(user_basic_info['city'].value_counts().iloc[:15].index)
user_basic_info['is_in_big_city'] = user_basic_info['city'].isin(big_city).astype('int')

# A missing font size is replaced by 1, then flagged as larger / smaller than 1.0.
user_basic_info['fontSize'] = user_basic_info['fontSize'].fillna(1)
user_basic_info['is_bigger'] = (user_basic_info['fontSize'] > 1.0).astype('int')
user_basic_info['is_smaller'] = (user_basic_info['fontSize'] < 1.0).astype('int')

# OS version <= 6.0 is flagged as old. 'os' has not been imputed yet at this
# point, so a missing version compares False and yields 0.
user_basic_info['is_old_system'] = (user_basic_info['os'] <= 6.0).astype('int')

# romLeftRation values above 1 are divided by romCapacity.
# NOTE(review): presumably those rows hold an absolute amount rather than a
# ratio - confirm against the data description.
rom_ratio_too_big = user_basic_info['romLeftRation'] > 1
user_basic_info.loc[rom_ratio_too_big, 'romLeftRation'] = (
    user_basic_info.loc[rom_ratio_too_big, 'romLeftRation']
    / user_basic_info.loc[rom_ratio_too_big, 'romCapacity']
)

# The network connection type column is removed from the table.
_ = user_basic_info.pop('ct')

# Map specific reported ROM sizes onto the neighbouring power-of-two capacity.
user_basic_info['romCapacity'] = user_basic_info['romCapacity'].replace(
    {137: 128, 68: 64, 34: 32, 275: 256, 550: 512, 17: 16, 3: 4}
)
# OS version 12.1 is recoded as 8.
user_basic_info['os'] = user_basic_info['os'].replace(12.1, 8)

# Store the nominal profile columns as pandas categoricals.
user_basic_info = user_basic_info.astype(
    {
        nominal_column: 'category'
        for nominal_column in ['gender', 'city', 'prodName', 'color', 'carrier']
    }
)
    
# Per-phone-model statistics: mean RAM/ROM capacity and remaining ratio, plus
# the min/max RAM and ROM capacity seen for that model.
user_phone_info_map = {
    prod_name: {
        'ramCapacity': prod_rows['ramCapacity'].mean(),
        'romCapacity': prod_rows['romCapacity'].mean(),
        'ramLeftRation': prod_rows['ramLeftRation'].mean(),
        'romLeftRation': prod_rows['romLeftRation'].mean(),
        'ramMax': prod_rows['ramCapacity'].max(),
        'ramMin': prod_rows['ramCapacity'].min(),
        'romMax': prod_rows['romCapacity'].max(),
        'romMin': prod_rows['romCapacity'].min(),
    }
    for prod_name, prod_rows in user_basic_info.groupby('prodName')
}


def fill_na(y, column_name):
    """Return the statistic `column_name` recorded for phone model `y`.

    Raises KeyError when `y` or `column_name` is not in user_phone_info_map.
    """
    model_stats = user_phone_info_map[y]
    return model_stats[column_name]


# Missing device measurements are replaced by the mean for the same phone model.
for column_name in ['ramCapacity', 'romCapacity', 'ramLeftRation', 'romLeftRation']:
    is_missing = pd.isna(user_basic_info[column_name])
    user_basic_info.loc[is_missing, column_name] = (
        user_basic_info.loc[is_missing, 'prodName']
        .apply(lambda x: fill_na(x, column_name))
    )

# Attach the per-model capacity extremes to every row as new columns.
for column_name in ['ramMax', 'ramMin', 'romMax', 'romMin']:
    user_basic_info[column_name] = (
        user_basic_info['prodName'].apply(lambda x: fill_na(x, column_name))
    )
# Whatever is still missing after the per-model imputation is filled with the
# column's most frequent value.
for column_name in ['ramCapacity','romCapacity','ramLeftRation','romLeftRation','ramMax','ramMin','romMax','romMin','city','os']:
    user_basic_info[column_name] = user_basic_info[column_name].fillna(user_basic_info[column_name].mode()[0])

# ramLeftRation values above 1 are divided by ramCapacity. This is a vectorised
# division on the affected rows instead of a Python-level row-wise apply.
ram_ratio_too_big = user_basic_info['ramLeftRation'] > 1
user_basic_info.loc[ram_ratio_too_big, 'ramLeftRation'] = (
    user_basic_info.loc[ram_ratio_too_big, 'ramLeftRation']
    / user_basic_info.loc[ram_ratio_too_big, 'ramCapacity']
)

# NOTE(review): despite the "Left" names, both columns are computed as
# (1 - left ratio) * capacity. Kept unchanged so features stay identical;
# confirm which quantity is intended.
user_basic_info['ramLeft'] = (1 - user_basic_info['ramLeftRation']) * user_basic_info['ramCapacity']
user_basic_info['romLeft'] = (1 - user_basic_info['romLeftRation']) * user_basic_info['romCapacity']

# Capacities are stored as 32-bit integers (fractional imputed means are truncated).
for i in ['ramMax','ramMin','romMax','romMin','ramCapacity','romCapacity']:
    user_basic_info[i] = user_basic_info[i].astype(np.int32)

In [8]:
############ User behavior data ################

In [9]:
# Per-user behaviour counters: boot count plus usage counters for functions A-G.
# The CSV has no header row. A-F are read as float32, G as int32, and uId and
# bootTimes as the platform integer.
user_behavior_info = pd.read_csv(
    user_behavior_info_file_path,
    header=None,
    names=['uId', 'bootTimes'] + [letter + 'FuncTimes' for letter in 'ABCDEFG'],
    dtype={
        'uId': int,
        'bootTimes': int,
        **{letter + 'FuncTimes': np.float32 for letter in 'ABCDEF'},
        'GFuncTimes': np.int32,
    },
)

In [10]:
# Clamp negative counters to 0. This replaces an element-wise
# applymap(max(0, x)): missing values are set to 0 first because max(0, NaN)
# evaluates to 0, so the values are unchanged. Unlike applymap, the vectorised
# form does not upcast the float32 columns and does not rely on the deprecated
# DataFrame.applymap.
user_behavior_info = user_behavior_info.fillna(0).clip(lower=0)

# Cap every counter (all columns except the id) at its own 99th percentile.
for column_name in user_behavior_info.columns:
    if column_name == 'uId':
        continue
    p99 = user_behavior_info[column_name].quantile(0.99)
    user_behavior_info[column_name] = user_behavior_info[column_name].clip(upper=p99)

In [11]:
##################### App category data ##################

In [12]:
# App -> category lookup; appId is kept as a string, category as a categorical.
app_info = pd.read_csv(
    app_info_file_path,
    header=None,
    names=['appId', 'category'],
    dtype={'appId': str, 'category': 'category'},
)
# Distinct category values in order of first appearance.
app_category_list = app_info['category'].unique().tolist()

In [13]:
# pd_app_use_count=pd.read_csv('./temp_data/app_use_count.csv')
# temp_sorted_app_use_count=pd_app_use_count.sort_values(by='count',ascending=False).iloc[0:2000,:]
# temp_sorted_app_use_count=temp_sorted_app_use_count.reset_index()
# def get_top_app_id(data):
#     app_rank_map={}
#     for index,row in data.iterrows():
#         app_rank_map[row['appId']]=index
#     return app_rank_map
# top_500_app=get_top_app_id(temp_sorted_app_use_count)

In [14]:
# final_user_app_actived_read_map={}
# for name in app_category_list:
#     final_user_app_actived_read_map[name]=np.int32
#     final_user_app_actived_read_map[name+'_rate']=np.float32
# for app_name in top_500_app.keys():
#     final_user_app_actived_read_map[app_name]=np.int8
# final_user_app_actived_read_map['uId']=np.int32
# final_user_app_actived_read_map['app_count']=np.int32
# final_user_app_actived=pd.read_csv(temp_save_dir+'final_user_app_actived.csv',dtype=final_user_app_actived_read_map)

In [15]:
# Cached per-user activated-app features; merged on 'uId' further down, where
# its 'app_count' column is used.
final_user_app_actived = pd.read_hdf(
    os.path.join(temp_save_dir, 'final_user_app_actived.h5'),
    key='data',
)

In [16]:
# Cached per-user app-usage statistics; merged on 'uId' further down, where its
# 'app_usage_count' column is used.
pd_user_app_usage = pd.read_hdf(
    os.path.join(temp_save_dir, 'user_app_usage_statistic.h5'),
    key='data',
)

In [17]:
#####embedding

In [18]:
# dtype table for an embedding file: an int32 'uId' plus float32 '<k>_sum' and
# '<k>_avg' columns for k = 0..31.
embedding_dtype_map = {'uId': np.int32}
embedding_dtype_map.update(
    (str(dim) + suffix, np.float32)
    for dim in range(32)
    for suffix in ('_sum', '_avg')
)

In [19]:
# Cached word2vec-based app embeddings per user (keyed by 'uId' in the merges below).
# NOTE(review): the activated-app file name says "avg" while the variable is
# named "*_sum" - confirm which aggregation is intended.
app_train_embedding_sum = pd.read_hdf(
    os.path.join(temp_save_dir, 'app_actived_embedding_by_w2v_avg_windows_5_size_128_iter_20_1.h5'),
    key='data',
)
app_usage_embedding_sum = pd.read_hdf(
    os.path.join(temp_save_dir, 'app_usage_embedding_by_w2v_sum_windows_5_size_128_iter_20_1.h5'),
    key='data',
)

In [20]:
# for emb in [app_train_embedding_sum,
#             app_usage_embedding_sum,
#            ]:
#     need_norm=list(emb.columns)
#     need_norm.remove('uId')
#     norm=np.linalg.norm(emb[need_norm],axis=1,keepdims=True)
#     print('start divide norm')
#     emb[need_norm]=np.array(emb[need_norm])/norm
#     emb['uId']=emb['uId'].astype(np.int32)
#     print('finish')
    

In [21]:
# app_train_embedding_sum.fillna(0,inplace=True)
# h5 = pd.HDFStore(temp_save_dir+'app_actived_embedding_by_w2v_avg_windows_5_size_128_iter_20_1_norm.h5','w',complevel=4,complib='blosc')
# h5['data'] = app_train_embedding_sum
# h5.close()

In [22]:
# app_usage_embedding_sum.fillna(0,inplace=True)
# h5 = pd.HDFStore(temp_save_dir+'app_usage_embedding_by_w2v_sum_windows_5_size_128_iter_20_1_norm.h5','w',complevel=4,complib='blosc')
# h5['data'] = app_usage_embedding_sum
# h5.close()

In [23]:
# Leftover smoke-test cell: only prints a fixed string, nothing downstream uses it.
print('test')

test


In [24]:
####### Chi-square + TF-IDF

In [25]:
# Cached TF-IDF features of activated apps, one file per split. These frames are
# later concatenated column-wise, i.e. aligned on the row index rather than on uId.
train_actived_tfidf = pd.read_hdf(
    os.path.join(temp_save_dir, 'train_app_actived_tfidf_input_csr.h5'),
    key='data',
)
test_actived_tfidf = pd.read_hdf(
    os.path.join(temp_save_dir, 'test_app_actived_tfidf_input_csr.h5'),
    key='data',
)

In [26]:
# Prefix every TF-IDF column label with 'tfidf' (labels are stringified first).
# Done in place on both splits.
train_actived_tfidf.columns = ['tfidf' + str(label) for label in train_actived_tfidf.columns]
test_actived_tfidf.columns = ['tfidf' + str(label) for label in test_actived_tfidf.columns]

In [27]:
##### yc_static
# Pre-computed app-usage statistics per split, read from data_dir and merged on
# 'uId' further down.
train_yc_static = pd.read_hdf(
    os.path.join(data_dir, 'train_app_usage_stat_df.h5'),
    key='data',
)
test_yc_static = pd.read_hdf(
    os.path.join(data_dir, 'test_app_usage_stat_df.h5'),
    key='data',
)

In [28]:
####### Usage categories and conversion rates

In [29]:
# Pre-computed conversion-rate features per split. These frames are later
# concatenated column-wise, i.e. aligned on the row index rather than on uId.
train_app_trans_rate_with_usage = pd.read_hdf(
    os.path.join(data_dir, 'train_app_trans_rate_with_usage.h5'),
    key='data',
)
test_app_trans_rate_with_usage = pd.read_hdf(
    os.path.join(data_dir, 'test_app_trans_rate_with_usage.h5'),
    key='data',
)

In [30]:
# Prefix every conversion-rate column label with 'trans_rate' (labels are
# stringified first). Done in place on both splits.
train_app_trans_rate_with_usage.columns = [
    'trans_rate' + str(label) for label in train_app_trans_rate_with_usage.columns
]
test_app_trans_rate_with_usage.columns = [
    'trans_rate' + str(label) for label in test_app_trans_rate_with_usage.columns
]

In [31]:
# Assemble the training matrix: start from the label table and left-join every
# uId-keyed feature table onto it.
temp = (
    age_train
    .merge(user_basic_info, on='uId', how='left')
    .merge(app_train_embedding_sum, on='uId', how='left')
    .merge(app_usage_embedding_sum, on='uId', how='left')
    .merge(train_yc_static, on='uId', how='left')
)
print('start concat')
# Column-wise concat aligns on the row index, not on uId.
# NOTE(review): this assumes the TF-IDF and conversion-rate frames are in the
# same row order as age_train - confirm.
temp = pd.concat(
    [temp, train_actived_tfidf, train_app_trans_rate_with_usage],
    axis=1,
)
print('finish concat')
temp_with_behavior = temp.merge(user_behavior_info, on='uId', how='left')
temp_with_behavior_with_actived = (
    temp_with_behavior
    .merge(final_user_app_actived, on='uId', how='left')
    .merge(pd_user_app_usage, on='uId', how='left')
)
print('finish merge,start concat')
# Ratio of app_usage_count to app_count. Missing counts are treated as 0 and a
# 0/0 result becomes 0; a non-zero usage count over a zero app count stays inf.
temp_with_behavior_with_actived['app_use_rate'] = (
    temp_with_behavior_with_actived['app_usage_count'].fillna(0)
    .div(temp_with_behavior_with_actived['app_count'].fillna(0))
    .fillna(0)
)

start concat
finish concat
finish merge,start concat


In [32]:
# Display the (rows, columns) shape of the assembled training frame.
temp_with_behavior_with_actived.shape

(4000000, 1044)

In [33]:

# Down-sample the training rows per age class: shuffle the rows of each class
# and keep at most the configured number, then shuffle the combined selection.
# The shuffles are seeded so that Restart & Run All reproduces the same sample.
pd_Y = age_train['age_group']

SAMPLING_SEED = 2019
# (class id, maximum number of rows kept for that class)
CLASS_SAMPLE_CAPS = [
    (0, 190000),
    (1, 300000),
    (2, 750000),
    (3, 750000),
    (4, 380000),
    (5, 150000),
]

# index_set holds index labels of age_train. It is later used both as labels
# (.loc on age_train) and as positions (.iloc on the merged frame).
# NOTE(review): that only lines up while both frames keep a default 0..n-1 index.
index_set = []
for class_id, max_rows in CLASS_SAMPLE_CAPS:
    class_rows = shuffle(pd_Y[pd_Y == class_id], random_state=SAMPLING_SEED)
    index_set.extend(list(class_rows.index)[:max_rows])
print(len(index_set))
index_set = shuffle(index_set, random_state=SAMPLING_SEED)

2520000


In [None]:
# No-op: rebinds the frame to itself. This cell has no execution count and can be removed.
temp_with_behavior_with_actived=temp_with_behavior_with_actived

In [34]:
# Hyper-parameters for each supported library, keyed by the `name` argument of
# train_model. All three are configured for 6 classes.
params = {
    'lightgbm': {
        'objective': 'multiclass',
        'metric': 'multi_error',
        'num_class': 6,
        'learning_rate': 0.1,
        'subsample': 0.9,
        'subsample_freq': 1,
        'feature_fraction': 0.8,
        'num_thread': 60,
        # Previously tried, currently disabled:
        #   boosting='dart', num_leaves=395, lambda_l1=3, lambda_l2=5,
        #   min_child_weight=4, min_child_samples=5, min_split_gain=0,
        #   min_sum_hessian_in_leaf=4, min_data_in_leaf=80, max_bin=200
    },
    'xgboost': {
        'learning_rate': 0.3,
        'objective': 'multi:softmax',
        'eval_metric': 'merror',
        'num_class': 6,
        'subsample': 0.9,
        'colsample_bytree': 0.9,
        'tree_method': 'approx',
        # Previously tried, currently disabled: max_depth=20, silent=True
    },
    'catboost': {
        'loss_function': 'MultiClass',
        'custom_metric': 'Accuracy:use_weights=false',
        'eval_metric': 'Accuracy:use_weights=false',
        'use_best_model': True,
        'thread_count': 60,
        'classes_count': 6,
        'iterations': 500,
        'learning_rate': 0.5,
        # Previously tried, currently disabled: bagging_temperature=0, depth=8
    },
}

def test_model(df,cv=5):
    """Run a shuffled k-fold evaluation of the LightGBM configuration on `df`.

    The rows are shuffled and cut into consecutive folds of
    ceil(len(df) / cv) rows. For each fold a model is trained on the remaining
    rows (with an inner stratified 80/20 train/validation split used for early
    stopping) and scored by accuracy on the held-out fold.

    Args:
        df: frame with a 'uId' column, the 'age_group' label column and the
            feature columns. The caller's frame is not modified.
        cv: requested number of folds.

    Returns:
        None. The per-fold accuracies, the list of accuracies and their mean
        are printed.
    """
    scores = []
    # sample(frac=1.0) shuffles the rows and returns a new frame, so the pops
    # below do not touch the caller's frame.
    df = df.sample(frac=1.0)
    _ = df.pop('uId')
    size = (len(df) + cv - 1) // cv
    for left in range(0, len(df), size):
        right = min(left + size, len(df))
        df_train = pd.concat([df.iloc[:left, :], df.iloc[right:, :]])
        # Copy so the label column is popped from an independent frame rather
        # than from a slice of df.
        df_test = df.iloc[left:right, :].copy()
        df_train_y = df_train.pop('age_group')
        df_test_y = df_test.pop('age_group')

        model_train_x, model_valid_x, model_train_y, model_valid_y = train_test_split(
            df_train, df_train_y, test_size=0.2, stratify=df_train_y)
        lgb_train = lgb.Dataset(model_train_x, model_train_y)
        lgb_valid = lgb.Dataset(model_valid_x, model_valid_y)
        # NOTE(review): calc_sample_weight is not defined in the visible cells -
        # confirm where it comes from before running this function.
        lgb_train.set_weight(calc_sample_weight(model_train_y))
        # Pass the LightGBM sub-dict; the top-level `params` also holds the
        # XGBoost and CatBoost configurations.
        model = lgb.train(params['lightgbm'], lgb_train, 500,
                          valid_sets=[lgb_train, lgb_valid],
                          valid_names=['train', 'valid'], early_stopping_rounds=100)
        y_ = np.argmax(model.predict(df_test), axis=1)
        score = accuracy_score(df_test_y, y_)
        print(score)
        scores.append(score)
    print(scores)
    # Average over the folds actually evaluated; for small frames that can be
    # fewer than `cv`.
    print(sum(scores) / len(scores))
    
def train_model(df,y_index,name='lightgbm'):
    """Fit one of the configured boosting models on a stratified 90/10 split.

    Args:
        df: feature frame. 'uId' and 'age_group' columns, if present, are
            removed from this frame in place.
        y_index: index labels used to look the targets up in
            age_train['age_group']; they must line up row for row with `df`.
        name: which configuration in `params` to use: 'lightgbm', 'xgboost'
            or 'catboost'.

    Returns:
        The trained model: the result of lgb.train, the result of xgb.train,
        or a fitted cat.CatBoostClassifier.

    Raises:
        ValueError: if `name` is not one of the supported model names
            (previously this case silently returned None).
    """
    supported_names = ('lightgbm', 'xgboost', 'catboost')
    if name not in supported_names:
        raise ValueError('unsupported model name %r, expected one of %r' % (name, supported_names))

    if 'uId' in df.columns:
        _ = df.pop('uId')
    if 'age_group' in df.columns:
        _ = df.pop('age_group')
    y = age_train['age_group'].loc[y_index]
    train_x, valid_x, train_y, valid_y = train_test_split(df, y, test_size=0.1, stratify=y)
    print('finish split')

    if name == 'lightgbm':
        lgb_train = lgb.Dataset(train_x, train_y)
        lgb_valid = lgb.Dataset(valid_x, valid_y)
        return lgb.train(params[name], lgb_train, 300, valid_sets=[lgb_train, lgb_valid],
                         valid_names=['train', 'valid'], early_stopping_rounds=50)

    if name == 'xgboost':
        xgb_train = xgb.DMatrix(data=train_x, label=train_y)
        xgb_valid = xgb.DMatrix(data=valid_x, label=valid_y)
        watchlist = [(xgb_train, 'train'), (xgb_valid, 'valid')]
        return xgb.train(params[name], xgb_train, 1000, watchlist, maximize=False, early_stopping_rounds=50)

    # name == 'catboost': columns listed here are passed to CatBoost as
    # categorical features, by position.
    # NOTE(review): the empty-string entry looks like a forgotten feature name;
    # it is kept so the selection is unchanged.
    category_feature_names = {
        'gender', 'city', 'carrier', 'prodName', 'color',
        'is_bigger', 'is_smaller', 'is_in_big_city', 'is_old_system',
        'ramCapacity', 'romCapacity', 'ramMax', 'ramMin', 'romMax', 'romMin',
        '是否有教育', '是否有儿童', '是否有汽车', '是否有商务', '是否有运动', '是否有游戏', '',
        '游戏总量and儿童', '游戏总量and教育', '儿童and金融理财', '儿童and教育',
    }
    category_feature_index = [
        position
        for position, column in enumerate(train_x.columns)
        if column in category_feature_names
    ]
    print(len(category_feature_index))
    model = cat.CatBoostClassifier(**params[name])
    model.fit(train_x, train_y, cat_features=category_feature_index, eval_set=(valid_x, valid_y))
    return model
    
    

In [None]:
# Train on the down-sampled rows with the default model name ('lightgbm').
# index_set is used positionally here (.iloc) and as index labels inside
# train_model (.loc on age_train).
# NOTE(review): the two agree only while both frames keep a default 0..n-1 index.
model=train_model(temp_with_behavior_with_actived.iloc[index_set],index_set)
# Disabled CatBoost variant. As written it omits the required y_index argument.
# model=train_model(temp_with_behavior_with_actived.loc[index_set],name='catboost')

finish split




[1]	train's multi_error: 0.506484	valid's multi_error: 0.509563
Training until validation scores don't improve for 50 rounds.
[2]	train's multi_error: 0.496295	valid's multi_error: 0.498726
[3]	train's multi_error: 0.491362	valid's multi_error: 0.493679
[4]	train's multi_error: 0.48801	valid's multi_error: 0.490492
[5]	train's multi_error: 0.48616	valid's multi_error: 0.488996
[6]	train's multi_error: 0.483981	valid's multi_error: 0.486258
[7]	train's multi_error: 0.481836	valid's multi_error: 0.484274
[8]	train's multi_error: 0.479608	valid's multi_error: 0.482107
[9]	train's multi_error: 0.477728	valid's multi_error: 0.479877
[10]	train's multi_error: 0.476214	valid's multi_error: 0.478373
[11]	train's multi_error: 0.474957	valid's multi_error: 0.477079
[12]	train's multi_error: 0.473645	valid's multi_error: 0.475913
[13]	train's multi_error: 0.472398	valid's multi_error: 0.47454
[14]	train's multi_error: 0.471015	valid's multi_error: 0.47296
[15]	train's multi_error: 0.469745	valid'

In [43]:
# NOTE(review): the saved output of this cell (2700000) does not match the
# 2520000 printed when index_set was built, and its execution count is out of
# order. The output is stale; re-run to refresh it.
len(index_set)

2700000

In [36]:
# Assemble the test matrix with the same joins, in the same order, as the
# training matrix. age_test names its key 'id', so the first join maps it onto 'uId'.
test_sample = (
    age_test
    .merge(user_basic_info, left_on=['id'], right_on=['uId'], how='left')
    .merge(app_train_embedding_sum, on='uId', how='left')
    .merge(app_usage_embedding_sum, on='uId', how='left')
    .merge(test_yc_static, on='uId', how='left')
)
# Column-wise concat aligns on the row index, not on uId.
# NOTE(review): this assumes the TF-IDF and conversion-rate frames are in the
# same row order as age_test - confirm.
test_sample = pd.concat(
    [test_sample, test_actived_tfidf, test_app_trans_rate_with_usage],
    axis=1,
)

test_sample = (
    test_sample
    .merge(user_behavior_info, on=['uId'], how='left')
    .merge(final_user_app_actived, on=['uId'], how='left')
    .merge(pd_user_app_usage, on=['uId'], how='left')
)

# Same ratio feature as in the training frame: missing counts count as 0, 0/0 becomes 0.
test_sample['app_use_rate'] = (
    test_sample['app_usage_count'].fillna(0)
    .div(test_sample['app_count'].fillna(0))
    .fillna(0)
)
# Both id columns are removed so that only feature columns remain.
_ = test_sample.pop('id')
_ = test_sample.pop('uId')

In [21]:
# Display the (rows, columns) shape of the assembled test frame.
test_sample.shape

(1000000, 1042)

In [37]:
# Per-class probabilities for the test rows.
# `prediction_type` is a CatBoost argument. The training cell above fits a
# LightGBM booster by default, whose predict() already returns one probability
# per class for the multiclass objective, so the argument is only passed to
# non-LightGBM models.
if isinstance(model, lgb.Booster):
    pred_y = model.predict(test_sample)
else:
    pred_y = model.predict(test_sample, prediction_type='Probability')

In [38]:
# Dump the raw prediction matrix to CSV without a header row or index column.
pd.DataFrame(pred_y).to_csv(
    'pred_cat_guess.csv',
    header=None,
    index=False,
)

In [39]:
# Turn the prediction matrix into labels: argmax gives the column position,
# and adding 1 undoes the -1 shift applied to age_group when the labels were loaded.
result = pd.DataFrame(np.argmax(pred_y, axis=1) + 1)

# Put the predicted label next to the test ids (aligned on the row index).
submission = pd.concat([age_test, result], axis=1)
submission = submission.rename(columns={0: 'label'})
submission.to_csv('submission_catboost.csv', index=False)

# Share of each predicted label, as a quick sanity check on the class balance.
print(submission['label'].value_counts() / len(submission))

3    0.275054
4    0.272491
5    0.161481
2    0.117835
1    0.097495
6    0.075644
Name: label, dtype: float64


In [23]:
# Persist the trained model to 'lgb_model.txt' via its save_model method.
# The saved output of this cell shows a lightgbm Booster.
model.save_model('lgb_model.txt')

<lightgbm.basic.Booster at 0x7fc14a838b00>

In [52]:
# NOTE(review): the saved output of this cell reports 2010000 rows while
# test_sample has 1000000 rows, and its execution count is out of order.
# The output is stale; re-run to refresh it.
pred_y.shape

(2010000, 6)