In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb

In [2]:
pd.set_option('display.max_columns',None)

In [7]:
#读取数据
age_train = pd.read_csv("data/age_train.csv", names=['uid','age_group'])
age_test = pd.read_csv("data/age_test.csv", names=['uid'])
user_basic_info = pd.read_csv("data/user_basic_info.csv", names=['uid','gender','city','prodName','ramCapacity','ramLeftRation','romCapacity','romLeftRation','color','fontSize','ct','carrier','os'])
user_behavior_info = pd.read_csv("data/user_behavior_info.csv", names=['uid','bootTimes','AFuncTimes','BFuncTimes','CFuncTimes','DFuncTimes','EFuncTimes','FFuncTimes','FFuncSum'])
user_app_actived = pd.read_csv("data/user_app_actived.csv", names=['uid','appId'])
#user_app_usage = pd.read_csv("data/user_app_usage.csv")
app_info = pd.read_csv("data/app_info.csv", names=['appId', 'category'])

In [8]:
#处理数据量较大的user_app_usage.csv，结合app_info.csv简单统计得到appuseProcessed.csv作为特征
def f(x):
    s = x.value_counts()
    return np.nan if len(s) == 0 else s.index[0]
def processUserAppUsage():
    resTable = pd.DataFrame()
    reader = pd.read_csv("data/user_app_usage.csv", names=['uid','appId','duration','times','use_date'], iterator=True)
    last_df = pd.DataFrame()
    
    app_info = pd.read_csv("data/app_info.csv", names=['appId','category'])
    cats = list(set(app_info['category']))
    category2id = dict(zip(sorted(cats), range(0,len(cats))))
    id2category = dict(zip(range(0,len(cats)), sorted(cats)))
    app_info['category'] = app_info['category'].apply(lambda x: category2id[x])
    i = 1
    
    while True:
        try:
            print("index: {}".format(i))
            i+=1
            df = reader.get_chunk(1000000)
            df = pd.concat([last_df, df])
            idx = df.shape[0]-1
            last_user = df.iat[idx,0]
            while(df.iat[idx,0]==last_user):
                idx-=1
            last_df = df[idx+1:]
            df = df[:idx+1]

            now_df = pd.DataFrame()
            now_df['uid'] = df['uid'].unique()
            now_df = now_df.merge(df.groupby('uid')['appId'].count().to_frame(), how='left', on='uid')
            now_df = now_df.merge(df.groupby('uid')['appId','use_date'].agg(['nunique']), how='left', on='uid')
            now_df = now_df.merge(df.groupby('uid')['duration','times'].agg(['mean','max','std']), how='left', on='uid')    

            now_df.columns = ['uid','usage_cnt','usage_appid_cnt','usage_date_cnt','duration_mean','duration_max','duration_std','times_mean','times_max','times_std']


            df = df.merge(app_info, how='left', on='appId')
            now_df = now_df.merge(df.groupby('uid')['category'].nunique().to_frame(), how='left', on='uid')
            #print(df.groupby(['uid'])['category'].value_counts().index[0])
            now_df['usage_most_used_category'] = df.groupby(['uid'])['category'].transform(f)
            resTable = pd.concat([resTable, now_df])
        except StopIteration:
            break
    
    resTable.to_csv("data/appuseProcessed.csv",index=0)
    
    print("Iterator is stopped")

In [None]:
processUserAppUsage()

In [9]:
#将user_basic_info.csv 和 user_behavior_info.csv中的字符值编码成可以训练的数值类型，合并
class2id = {}
id2class = {}
def mergeBasicTables(baseTable):
    resTable = baseTable.merge(user_basic_info, how='left', on='uid', suffixes=('_base0', '_ubaf'))
    resTable = resTable.merge(user_behavior_info, how='left', on='uid', suffixes=('_base1', '_ubef'))
    cat_columns = ['city','prodName','color','carrier','os','ct']
    for c in cat_columns:
        resTable[c] = resTable[c].apply(lambda x: x if type(x)==str else str(x))
        sort_temp = sorted(list(set(resTable[c])))  
        class2id[c+'2id'] = dict(zip(sort_temp, range(1, len(sort_temp)+1)))
        id2class['id2'+c] = dict(zip(range(1,len(sort_temp)+1), sort_temp))
        resTable[c] = resTable[c].apply(lambda x: class2id[c+'2id'][x])
        
    return resTable

In [12]:
#处理app使用相关数据
#对user_app_actived.csv简单统计
#将之前训练的appuseProcess.csv进行合并
def mergeAppData(baseTable):
    resTable = baseTable.merge(user_app_actived, how='left', on='uid')
    resTable['appId'] = resTable['appId'].apply(lambda x: len(list(x.split('#'))))
    user_app_detail_list = pd.read_csv("data/user_app_detail_list.csv")
    resTable = resTable.merge(user_app_detail_list, how='left', on='uid')
    appusedTable = pd.read_csv("data/appuseProcessed.csv")
    resTable = resTable.merge(appusedTable, how='left', on='uid')
    resTable[['category', 'usage_most_used_category']] = resTable[['category', 'usage_most_used_category']].fillna(41)
    resTable = resTable.fillna(0)
    print(resTable[:5])
    return resTable

In [13]:
#合并用户基本特征以及app使用相关特征，作为训练集和测试集
df_train = mergeAppData(mergeBasicTables(age_train))
df_test = mergeAppData(mergeBasicTables(age_test))
print(df_train.shape)
print(df_test.shape)

       uid  age_group  gender  city  prodName  ramCapacity  ramLeftRation  \
0  1000001          4       0    51        78          3.0           0.43   
1  1000011          3       0    30       138          0.0           0.00   
2  1000015          5       1   228        78          3.0           0.34   
3  1000019          3       0    57       166          2.0           0.00   
4  1000023          2       1   293       164          2.0           0.34   

   romCapacity  romLeftRation  color  fontSize  ct  carrier  os  bootTimes  \
0         32.0           0.46     80      1.15   5        1  14        108   
1          0.0           0.00     16      0.00   6        1  15          0   
2         32.0           0.06     80      1.30   8        2  14         12   
3         17.0           0.00    100      0.00   7        3  15          0   
4         16.0           0.06    119      1.00   8        2  12          5   

   AFuncTimes  BFuncTimes  CFuncTimes  DFuncTimes  EFuncTimes  FFunc

In [14]:
#训练模型

from sklearn.feature_selection import SelectFromModel, VarianceThreshold, SelectKBest, chi2, mutual_info_classif, f_classif
from sklearn.preprocessing import Imputer
from sklearn.ensemble import ExtraTreesClassifier

import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score
from sklearn.model_selection import StratifiedKFold

In [16]:
print("训练模型：")
param = {
        'learning_rate': 0.1,
        'lambda_l1': 0.1,
        'lambda_l2': 0.2,
        'max_depth': 20,
        'objective': 'multiclass',
        'num_class': 7,
        'num_leaves': 31,
        'min_data_in_leaf': 50,
        'max_bin': 230,
        'feature_fraction': 0.8,
        'metric': 'multi_error'
        }

X = df_train.drop(['age_group','uid'], axis=1)
y = df_train['age_group']
uid = df_test['uid']
test = df_test.drop('uid', axis=1)

xx_score = []
cv_pred = []
skf = StratifiedKFold(n_splits=3, random_state=1030, shuffle=True)
for index, (train_index, vali_index) in enumerate(skf.split(X, y)):
    print(index)
    x_train, y_train, x_vali, y_vali = np.array(X)[train_index], np.array(y)[train_index], np.array(X)[vali_index], np.array(y)[vali_index]
    train = lgb.Dataset(x_train, y_train)
    vali =lgb.Dataset(x_vali, y_vali)
    print("training start...")
    model = lgb.train(param, train, num_boost_round=1500, valid_sets=[vali], early_stopping_rounds=50)
    xx_pred = model.predict(x_vali,num_iteration=model.best_iteration)
    xx_pred = [np.argmax(x) for x in xx_pred]
    xx_score.append(f1_score(y_vali,xx_pred,average='weighted'))
    y_test = model.predict(test,num_iteration=model.best_iteration)
    y_test = [np.argmax(x) for x in y_test]
    if index == 0:
        cv_pred = np.array(y_test).reshape(-1, 1)
    else:
        cv_pred = np.hstack((cv_pred, np.array(y_test).reshape(-1, 1)))
        
submit = []
for line in cv_pred:
    submit.append(np.argmax(np.bincount(line)))
df = pd.DataFrame({'id':uid.as_matrix(),'label':submit})
df.to_csv('submission.csv',index=False)

训练模型：
0
training start...
[1]	valid_0's multi_error: 0.701493
Training until validation scores don't improve for 50 rounds.
[2]	valid_0's multi_error: 0.6878
[3]	valid_0's multi_error: 0.664481
[4]	valid_0's multi_error: 0.651689
[5]	valid_0's multi_error: 0.640986
[6]	valid_0's multi_error: 0.628098
[7]	valid_0's multi_error: 0.616032
[8]	valid_0's multi_error: 0.60513
[9]	valid_0's multi_error: 0.597095
[10]	valid_0's multi_error: 0.589619
[11]	valid_0's multi_error: 0.583939
[12]	valid_0's multi_error: 0.579678
[13]	valid_0's multi_error: 0.576062
[14]	valid_0's multi_error: 0.573019
[15]	valid_0's multi_error: 0.569848
[16]	valid_0's multi_error: 0.56738
[17]	valid_0's multi_error: 0.564974
[18]	valid_0's multi_error: 0.562889
[19]	valid_0's multi_error: 0.561111
[20]	valid_0's multi_error: 0.559199
[21]	valid_0's multi_error: 0.557436
[22]	valid_0's multi_error: 0.556141
[23]	valid_0's multi_error: 0.554213
[24]	valid_0's multi_error: 0.552766
[25]	valid_0's multi_error: 0.551484


[218]	valid_0's multi_error: 0.497037
[219]	valid_0's multi_error: 0.496931
[220]	valid_0's multi_error: 0.496844
[221]	valid_0's multi_error: 0.496771
[222]	valid_0's multi_error: 0.496643
[223]	valid_0's multi_error: 0.496562
[224]	valid_0's multi_error: 0.496475
[225]	valid_0's multi_error: 0.496354
[226]	valid_0's multi_error: 0.496332
[227]	valid_0's multi_error: 0.496254
[228]	valid_0's multi_error: 0.496107
[229]	valid_0's multi_error: 0.496149
[230]	valid_0's multi_error: 0.495987
[231]	valid_0's multi_error: 0.49586
[232]	valid_0's multi_error: 0.495801
[233]	valid_0's multi_error: 0.495746
[234]	valid_0's multi_error: 0.495623
[235]	valid_0's multi_error: 0.495604
[236]	valid_0's multi_error: 0.495549
[237]	valid_0's multi_error: 0.495437
[238]	valid_0's multi_error: 0.495357
[239]	valid_0's multi_error: 0.49526
[240]	valid_0's multi_error: 0.495165
[241]	valid_0's multi_error: 0.495122
[242]	valid_0's multi_error: 0.495129
[243]	valid_0's multi_error: 0.495086
[244]	valid_0'

[435]	valid_0's multi_error: 0.488014
[436]	valid_0's multi_error: 0.488017
[437]	valid_0's multi_error: 0.488014
[438]	valid_0's multi_error: 0.487999
[439]	valid_0's multi_error: 0.487875
[440]	valid_0's multi_error: 0.487905
[441]	valid_0's multi_error: 0.487947
[442]	valid_0's multi_error: 0.487886
[443]	valid_0's multi_error: 0.487866
[444]	valid_0's multi_error: 0.487838
[445]	valid_0's multi_error: 0.487805
[446]	valid_0's multi_error: 0.487829
[447]	valid_0's multi_error: 0.487801
[448]	valid_0's multi_error: 0.48782
[449]	valid_0's multi_error: 0.48782
[450]	valid_0's multi_error: 0.48778
[451]	valid_0's multi_error: 0.487799
[452]	valid_0's multi_error: 0.487787
[453]	valid_0's multi_error: 0.487759
[454]	valid_0's multi_error: 0.487746
[455]	valid_0's multi_error: 0.487714
[456]	valid_0's multi_error: 0.487684
[457]	valid_0's multi_error: 0.487687
[458]	valid_0's multi_error: 0.487684
[459]	valid_0's multi_error: 0.487713
[460]	valid_0's multi_error: 0.48762
[461]	valid_0's 

[652]	valid_0's multi_error: 0.485238
[653]	valid_0's multi_error: 0.485226
[654]	valid_0's multi_error: 0.48525
[655]	valid_0's multi_error: 0.485231
[656]	valid_0's multi_error: 0.485214
[657]	valid_0's multi_error: 0.485199
[658]	valid_0's multi_error: 0.485211
[659]	valid_0's multi_error: 0.485219
[660]	valid_0's multi_error: 0.485213
[661]	valid_0's multi_error: 0.485207
[662]	valid_0's multi_error: 0.485208
[663]	valid_0's multi_error: 0.485229
[664]	valid_0's multi_error: 0.48521
[665]	valid_0's multi_error: 0.48516
[666]	valid_0's multi_error: 0.485163
[667]	valid_0's multi_error: 0.48518
[668]	valid_0's multi_error: 0.485087
[669]	valid_0's multi_error: 0.485084
[670]	valid_0's multi_error: 0.485089
[671]	valid_0's multi_error: 0.485098
[672]	valid_0's multi_error: 0.485134
[673]	valid_0's multi_error: 0.485114
[674]	valid_0's multi_error: 0.485116
[675]	valid_0's multi_error: 0.485086
[676]	valid_0's multi_error: 0.485069
[677]	valid_0's multi_error: 0.485054
[678]	valid_0's 

[869]	valid_0's multi_error: 0.483749
[870]	valid_0's multi_error: 0.483747
[871]	valid_0's multi_error: 0.483728
[872]	valid_0's multi_error: 0.483734
[873]	valid_0's multi_error: 0.483744
[874]	valid_0's multi_error: 0.483732
[875]	valid_0's multi_error: 0.483731
[876]	valid_0's multi_error: 0.483734
[877]	valid_0's multi_error: 0.483735
[878]	valid_0's multi_error: 0.483738
[879]	valid_0's multi_error: 0.483774
[880]	valid_0's multi_error: 0.483762
[881]	valid_0's multi_error: 0.483755
[882]	valid_0's multi_error: 0.483744
[883]	valid_0's multi_error: 0.483771
[884]	valid_0's multi_error: 0.483786
[885]	valid_0's multi_error: 0.48376
[886]	valid_0's multi_error: 0.483753
[887]	valid_0's multi_error: 0.48371
[888]	valid_0's multi_error: 0.483743
[889]	valid_0's multi_error: 0.483713
[890]	valid_0's multi_error: 0.483683
[891]	valid_0's multi_error: 0.483677
[892]	valid_0's multi_error: 0.483662
[893]	valid_0's multi_error: 0.483628
[894]	valid_0's multi_error: 0.483595
[895]	valid_0'

[59]	valid_0's multi_error: 0.526067
[60]	valid_0's multi_error: 0.525657
[61]	valid_0's multi_error: 0.525073
[62]	valid_0's multi_error: 0.524749
[63]	valid_0's multi_error: 0.524348
[64]	valid_0's multi_error: 0.523916
[65]	valid_0's multi_error: 0.523309
[66]	valid_0's multi_error: 0.522848
[67]	valid_0's multi_error: 0.522527
[68]	valid_0's multi_error: 0.522136
[69]	valid_0's multi_error: 0.521767
[70]	valid_0's multi_error: 0.521443
[71]	valid_0's multi_error: 0.521113
[72]	valid_0's multi_error: 0.520666
[73]	valid_0's multi_error: 0.520297
[74]	valid_0's multi_error: 0.519963
[75]	valid_0's multi_error: 0.519524
[76]	valid_0's multi_error: 0.519128
[77]	valid_0's multi_error: 0.5188
[78]	valid_0's multi_error: 0.518485
[79]	valid_0's multi_error: 0.518104
[80]	valid_0's multi_error: 0.517755
[81]	valid_0's multi_error: 0.517452
[82]	valid_0's multi_error: 0.517158
[83]	valid_0's multi_error: 0.516843
[84]	valid_0's multi_error: 0.516664
[85]	valid_0's multi_error: 0.516372
[86

[277]	valid_0's multi_error: 0.492024
[278]	valid_0's multi_error: 0.491958
[279]	valid_0's multi_error: 0.491878
[280]	valid_0's multi_error: 0.491861
[281]	valid_0's multi_error: 0.491766
[282]	valid_0's multi_error: 0.491697
[283]	valid_0's multi_error: 0.49164
[284]	valid_0's multi_error: 0.491572
[285]	valid_0's multi_error: 0.491491
[286]	valid_0's multi_error: 0.491472
[287]	valid_0's multi_error: 0.491475
[288]	valid_0's multi_error: 0.491357
[289]	valid_0's multi_error: 0.491342
[290]	valid_0's multi_error: 0.491355
[291]	valid_0's multi_error: 0.491349
[292]	valid_0's multi_error: 0.491242
[293]	valid_0's multi_error: 0.49117
[294]	valid_0's multi_error: 0.491124
[295]	valid_0's multi_error: 0.491149
[296]	valid_0's multi_error: 0.491097
[297]	valid_0's multi_error: 0.491036
[298]	valid_0's multi_error: 0.490925
[299]	valid_0's multi_error: 0.49091
[300]	valid_0's multi_error: 0.490799
[301]	valid_0's multi_error: 0.490734
[302]	valid_0's multi_error: 0.490645
[303]	valid_0's

[494]	valid_0's multi_error: 0.485437
[495]	valid_0's multi_error: 0.485384
[496]	valid_0's multi_error: 0.485369
[497]	valid_0's multi_error: 0.485363
[498]	valid_0's multi_error: 0.485352
[499]	valid_0's multi_error: 0.485369
[500]	valid_0's multi_error: 0.485351
[501]	valid_0's multi_error: 0.48536
[502]	valid_0's multi_error: 0.485309
[503]	valid_0's multi_error: 0.485309
[504]	valid_0's multi_error: 0.485273
[505]	valid_0's multi_error: 0.485276
[506]	valid_0's multi_error: 0.485263
[507]	valid_0's multi_error: 0.485269
[508]	valid_0's multi_error: 0.485245
[509]	valid_0's multi_error: 0.485257
[510]	valid_0's multi_error: 0.485213
[511]	valid_0's multi_error: 0.485212
[512]	valid_0's multi_error: 0.48521
[513]	valid_0's multi_error: 0.485194
[514]	valid_0's multi_error: 0.485188
[515]	valid_0's multi_error: 0.485185
[516]	valid_0's multi_error: 0.485164
[517]	valid_0's multi_error: 0.485139
[518]	valid_0's multi_error: 0.485101
[519]	valid_0's multi_error: 0.485116
[520]	valid_0'

[711]	valid_0's multi_error: 0.483546
[712]	valid_0's multi_error: 0.483546
[713]	valid_0's multi_error: 0.483545
[714]	valid_0's multi_error: 0.483506
[715]	valid_0's multi_error: 0.48353
[716]	valid_0's multi_error: 0.483499
[717]	valid_0's multi_error: 0.48343
[718]	valid_0's multi_error: 0.483445
[719]	valid_0's multi_error: 0.483458
[720]	valid_0's multi_error: 0.483421
[721]	valid_0's multi_error: 0.483437
[722]	valid_0's multi_error: 0.483337
[723]	valid_0's multi_error: 0.48336
[724]	valid_0's multi_error: 0.48334
[725]	valid_0's multi_error: 0.483318
[726]	valid_0's multi_error: 0.483296
[727]	valid_0's multi_error: 0.483297
[728]	valid_0's multi_error: 0.483266
[729]	valid_0's multi_error: 0.483251
[730]	valid_0's multi_error: 0.483227
[731]	valid_0's multi_error: 0.483212
[732]	valid_0's multi_error: 0.483248
[733]	valid_0's multi_error: 0.483166
[734]	valid_0's multi_error: 0.483143
[735]	valid_0's multi_error: 0.483148
[736]	valid_0's multi_error: 0.483152
[737]	valid_0's 

[928]	valid_0's multi_error: 0.482284
[929]	valid_0's multi_error: 0.482299
[930]	valid_0's multi_error: 0.482291
[931]	valid_0's multi_error: 0.482285
[932]	valid_0's multi_error: 0.482269
[933]	valid_0's multi_error: 0.482248
[934]	valid_0's multi_error: 0.482255
[935]	valid_0's multi_error: 0.482201
[936]	valid_0's multi_error: 0.482191
[937]	valid_0's multi_error: 0.482206
[938]	valid_0's multi_error: 0.482207
[939]	valid_0's multi_error: 0.482206
[940]	valid_0's multi_error: 0.482169
[941]	valid_0's multi_error: 0.482191
[942]	valid_0's multi_error: 0.4822
[943]	valid_0's multi_error: 0.482178
[944]	valid_0's multi_error: 0.482163
[945]	valid_0's multi_error: 0.482181
[946]	valid_0's multi_error: 0.482173
[947]	valid_0's multi_error: 0.482187
[948]	valid_0's multi_error: 0.482204
[949]	valid_0's multi_error: 0.482212
[950]	valid_0's multi_error: 0.482225
[951]	valid_0's multi_error: 0.482222
[952]	valid_0's multi_error: 0.482216
[953]	valid_0's multi_error: 0.4822
[954]	valid_0's 

[1141]	valid_0's multi_error: 0.481651
[1142]	valid_0's multi_error: 0.481646
[1143]	valid_0's multi_error: 0.481624
[1144]	valid_0's multi_error: 0.481657
[1145]	valid_0's multi_error: 0.481666
[1146]	valid_0's multi_error: 0.481661
[1147]	valid_0's multi_error: 0.481664
[1148]	valid_0's multi_error: 0.481633
[1149]	valid_0's multi_error: 0.481618
[1150]	valid_0's multi_error: 0.481613
[1151]	valid_0's multi_error: 0.481613
[1152]	valid_0's multi_error: 0.481612
[1153]	valid_0's multi_error: 0.481596
[1154]	valid_0's multi_error: 0.481596
[1155]	valid_0's multi_error: 0.481584
[1156]	valid_0's multi_error: 0.481584
[1157]	valid_0's multi_error: 0.481554
[1158]	valid_0's multi_error: 0.481525
[1159]	valid_0's multi_error: 0.481521
[1160]	valid_0's multi_error: 0.48153
[1161]	valid_0's multi_error: 0.481499
[1162]	valid_0's multi_error: 0.481469
[1163]	valid_0's multi_error: 0.481466
[1164]	valid_0's multi_error: 0.481484
[1165]	valid_0's multi_error: 0.481427
[1166]	valid_0's multi_err

[25]	valid_0's multi_error: 0.554543
[26]	valid_0's multi_error: 0.553058
[27]	valid_0's multi_error: 0.551541
[28]	valid_0's multi_error: 0.550586
[29]	valid_0's multi_error: 0.549446
[30]	valid_0's multi_error: 0.548455
[31]	valid_0's multi_error: 0.547307
[32]	valid_0's multi_error: 0.546061
[33]	valid_0's multi_error: 0.545134
[34]	valid_0's multi_error: 0.543949
[35]	valid_0's multi_error: 0.543047
[36]	valid_0's multi_error: 0.542183
[37]	valid_0's multi_error: 0.541155
[38]	valid_0's multi_error: 0.540456
[39]	valid_0's multi_error: 0.539728
[40]	valid_0's multi_error: 0.538832
[41]	valid_0's multi_error: 0.538246
[42]	valid_0's multi_error: 0.537607
[43]	valid_0's multi_error: 0.536856
[44]	valid_0's multi_error: 0.536428
[45]	valid_0's multi_error: 0.535817
[46]	valid_0's multi_error: 0.535011
[47]	valid_0's multi_error: 0.534478
[48]	valid_0's multi_error: 0.533768
[49]	valid_0's multi_error: 0.533153
[50]	valid_0's multi_error: 0.532532
[51]	valid_0's multi_error: 0.531789
[

[244]	valid_0's multi_error: 0.496431
[245]	valid_0's multi_error: 0.496341
[246]	valid_0's multi_error: 0.496216
[247]	valid_0's multi_error: 0.49624
[248]	valid_0's multi_error: 0.496172
[249]	valid_0's multi_error: 0.496046
[250]	valid_0's multi_error: 0.496083
[251]	valid_0's multi_error: 0.495928
[252]	valid_0's multi_error: 0.495862
[253]	valid_0's multi_error: 0.495784
[254]	valid_0's multi_error: 0.49568
[255]	valid_0's multi_error: 0.49572
[256]	valid_0's multi_error: 0.495672
[257]	valid_0's multi_error: 0.495578
[258]	valid_0's multi_error: 0.495531
[259]	valid_0's multi_error: 0.495396
[260]	valid_0's multi_error: 0.495259
[261]	valid_0's multi_error: 0.495195
[262]	valid_0's multi_error: 0.495169
[263]	valid_0's multi_error: 0.495192
[264]	valid_0's multi_error: 0.49511
[265]	valid_0's multi_error: 0.495059
[266]	valid_0's multi_error: 0.495025
[267]	valid_0's multi_error: 0.494975
[268]	valid_0's multi_error: 0.494895
[269]	valid_0's multi_error: 0.494786
[270]	valid_0's 

[461]	valid_0's multi_error: 0.488431
[462]	valid_0's multi_error: 0.488434
[463]	valid_0's multi_error: 0.488417
[464]	valid_0's multi_error: 0.488405
[465]	valid_0's multi_error: 0.488405
[466]	valid_0's multi_error: 0.48834
[467]	valid_0's multi_error: 0.48829
[468]	valid_0's multi_error: 0.488284
[469]	valid_0's multi_error: 0.488247
[470]	valid_0's multi_error: 0.488251
[471]	valid_0's multi_error: 0.488256
[472]	valid_0's multi_error: 0.488208
[473]	valid_0's multi_error: 0.488217
[474]	valid_0's multi_error: 0.488175
[475]	valid_0's multi_error: 0.488144
[476]	valid_0's multi_error: 0.488101
[477]	valid_0's multi_error: 0.488105
[478]	valid_0's multi_error: 0.488037
[479]	valid_0's multi_error: 0.488066
[480]	valid_0's multi_error: 0.488032
[481]	valid_0's multi_error: 0.488084
[482]	valid_0's multi_error: 0.48805
[483]	valid_0's multi_error: 0.488037
[484]	valid_0's multi_error: 0.488026
[485]	valid_0's multi_error: 0.488034
[486]	valid_0's multi_error: 0.488001
[487]	valid_0's

[678]	valid_0's multi_error: 0.485713
[679]	valid_0's multi_error: 0.48571
[680]	valid_0's multi_error: 0.485698
[681]	valid_0's multi_error: 0.485696
[682]	valid_0's multi_error: 0.48571
[683]	valid_0's multi_error: 0.485687
[684]	valid_0's multi_error: 0.485684
[685]	valid_0's multi_error: 0.485666
[686]	valid_0's multi_error: 0.485675
[687]	valid_0's multi_error: 0.485653
[688]	valid_0's multi_error: 0.485617
[689]	valid_0's multi_error: 0.485604
[690]	valid_0's multi_error: 0.485505
[691]	valid_0's multi_error: 0.485486
[692]	valid_0's multi_error: 0.485435
[693]	valid_0's multi_error: 0.485423
[694]	valid_0's multi_error: 0.485456
[695]	valid_0's multi_error: 0.485468
[696]	valid_0's multi_error: 0.485468
[697]	valid_0's multi_error: 0.48545
[698]	valid_0's multi_error: 0.485448
[699]	valid_0's multi_error: 0.48544
[700]	valid_0's multi_error: 0.485401
[701]	valid_0's multi_error: 0.485423
[702]	valid_0's multi_error: 0.485432
[703]	valid_0's multi_error: 0.485448
[704]	valid_0's 

[895]	valid_0's multi_error: 0.48414
[896]	valid_0's multi_error: 0.484196
[897]	valid_0's multi_error: 0.484154
[898]	valid_0's multi_error: 0.48419
[899]	valid_0's multi_error: 0.484199
[900]	valid_0's multi_error: 0.484205
[901]	valid_0's multi_error: 0.484198
[902]	valid_0's multi_error: 0.484175
[903]	valid_0's multi_error: 0.484159
[904]	valid_0's multi_error: 0.484169
[905]	valid_0's multi_error: 0.484202
[906]	valid_0's multi_error: 0.484247
[907]	valid_0's multi_error: 0.484254
[908]	valid_0's multi_error: 0.484201
[909]	valid_0's multi_error: 0.484186
[910]	valid_0's multi_error: 0.484201
[911]	valid_0's multi_error: 0.484184
[912]	valid_0's multi_error: 0.484169
[913]	valid_0's multi_error: 0.484157
[914]	valid_0's multi_error: 0.484174
[915]	valid_0's multi_error: 0.484143
[916]	valid_0's multi_error: 0.484165
[917]	valid_0's multi_error: 0.484089
[918]	valid_0's multi_error: 0.484071
[919]	valid_0's multi_error: 0.484065
[920]	valid_0's multi_error: 0.484066
[921]	valid_0'

[1109]	valid_0's multi_error: 0.483326
[1110]	valid_0's multi_error: 0.483325
[1111]	valid_0's multi_error: 0.483354
[1112]	valid_0's multi_error: 0.483338
[1113]	valid_0's multi_error: 0.483366
[1114]	valid_0's multi_error: 0.483374
[1115]	valid_0's multi_error: 0.483354
[1116]	valid_0's multi_error: 0.48335
[1117]	valid_0's multi_error: 0.48336
[1118]	valid_0's multi_error: 0.483329
[1119]	valid_0's multi_error: 0.483334
[1120]	valid_0's multi_error: 0.483316
[1121]	valid_0's multi_error: 0.483302
[1122]	valid_0's multi_error: 0.483307
[1123]	valid_0's multi_error: 0.483326
[1124]	valid_0's multi_error: 0.483356
[1125]	valid_0's multi_error: 0.483328
[1126]	valid_0's multi_error: 0.483344
[1127]	valid_0's multi_error: 0.483337
[1128]	valid_0's multi_error: 0.483338
[1129]	valid_0's multi_error: 0.483359
[1130]	valid_0's multi_error: 0.483344
[1131]	valid_0's multi_error: 0.483325
[1132]	valid_0's multi_error: 0.483343
[1133]	valid_0's multi_error: 0.48334
[1134]	valid_0's multi_error

[1320]	valid_0's multi_error: 0.482787
[1321]	valid_0's multi_error: 0.482817
[1322]	valid_0's multi_error: 0.482801
[1323]	valid_0's multi_error: 0.482795
[1324]	valid_0's multi_error: 0.48282
[1325]	valid_0's multi_error: 0.482805
[1326]	valid_0's multi_error: 0.482798
[1327]	valid_0's multi_error: 0.482801
[1328]	valid_0's multi_error: 0.482787
[1329]	valid_0's multi_error: 0.482769
[1330]	valid_0's multi_error: 0.48278
[1331]	valid_0's multi_error: 0.48281
[1332]	valid_0's multi_error: 0.482796
[1333]	valid_0's multi_error: 0.482834
[1334]	valid_0's multi_error: 0.482843
[1335]	valid_0's multi_error: 0.482816
[1336]	valid_0's multi_error: 0.482832
[1337]	valid_0's multi_error: 0.482814
[1338]	valid_0's multi_error: 0.48284
[1339]	valid_0's multi_error: 0.482834
[1340]	valid_0's multi_error: 0.482798
[1341]	valid_0's multi_error: 0.482786
[1342]	valid_0's multi_error: 0.482774
[1343]	valid_0's multi_error: 0.482747
[1344]	valid_0's multi_error: 0.482753
[1345]	valid_0's multi_error:



In [13]:
age_train['age_group'].nunique()

6

In [14]:
history

import pandas as pd
import numpy as np
import lightgbm as lgb
pd.set_option('display.max_columns',None)
#读取数据
age_train = pd.read_csv("data/age_train.csv", names=['uid','age_group'])
age_test = pd.read_csv("data/age_test.csv", names=['uid'])
user_basic_info = pd.read_csv("data/user_basic_info.csv", names=['uid','gender','city','prodName','ramCapacity','ramLeftRation','romCapacity','romLeftRation','color','fontSize','ct','carrier','os'])
user_behavior_info = pd.read_csv("data/user_behavior_info.csv", names=['uid','bootTimes','AFuncTimes','BFuncTimes','CFuncTimes','DFuncTimes','EFuncTimes','FFuncTimes','FFuncSum'])
user_app_actived = pd.read_csv("data/user_app_actived.csv", names=['uid','appId'])
#user_app_usage = pd.read_csv("data/user_app_usage.csv")
app_info = pd.read_csv("data/app_info.csv", names=['appId', 'category'])
#处理数据量较大的user_app_usage.csv，结合app_info.csv简单统计得到appuseProcessed.csv作为特征
def f(x):
    s = x.value_counts()
    return np.nan if len(s) == 0 else s.index[0]
def proces