# Import 

In [12]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import KFold
import lightgbm as lgb
import random
from gensim.models.word2vec import Word2Vec 
import logging
# Root-logger setup: each record is rendered as "<timestamp> - <level> - <message>"
# and records at INFO level or above are emitted.
LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)
from tqdm.notebook import tqdm as tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import time
from datetime import datetime
import IPython.display as ipd
from collections import *

In [13]:
# DATA_PATH: absolute data directory (not referenced by the other cells visible here).
# VAR_PATH: directory, relative to the working directory, that holds the pickled
# intermediate DataFrames loaded in the "Read" section.
DATA_PATH = '/data/cch/weibo'
VAR_PATH = 'var'


In [14]:
# Display the pandas version used for this run.
pd.__version__

'1.3.4'

# Read

In [15]:
# Load the pickled intermediate frames produced by earlier pipeline steps.
# NOTE: unpickling can execute arbitrary code, so only load files this project wrote.
# `with` closes each file handle deterministically; the bare
# `pickle.load(open(...))` form left both handles open.
with open('%s/df_all.pickle'%VAR_PATH, 'rb') as pickle_file:
    df_weibo_with_repost = pickle.load(pickle_file)

with open('%s/df_24h_feature_predict.pickle'%VAR_PATH, 'rb') as pickle_file:
    df_24h_feature_predict = pickle.load(pickle_file)

# Model

In [16]:
'''
评估函数传入的是1，2，3，4，5五个档位的值，注意从1开始

'''

def precision_score_hyr(predictions, ground_truths):
    """Tier-weighted accuracy over the five forwarding-scale tiers.

    Every sample adds the weight of its ground-truth tier to the denominator
    and, when the prediction equals the ground truth, the same weight to the
    numerator. Tier weights for tiers 1..5 are 1, 10, 50, 100 and 300.

    Args:
        predictions: sequence of predicted tiers, 1-based (1..5).
        ground_truths: sequence of true tiers, 1-based (1..5).

    Returns:
        float: matched weight divided by total weight.

    Raises:
        ZeroDivisionError: if no sample has a ground truth in 1..5
            (this includes empty input).

    Samples whose ground truth falls outside 1..5 are ignored entirely.
    """
    # Weights keyed by the 0-based tier index.
    tier_weight = {0: 1, 1: 10, 2: 50, 3: 100, 4: 300}

    # Shift the 1-based tiers down to 0-based indices.
    predicted = np.array(predictions) - 1
    actual = np.array(ground_truths) - 1

    earned = 0
    possible = 0
    for guess, truth in zip(predicted, actual):
        weight = tier_weight.get(truth)
        if weight is None:
            # Ground truth is not one of the five tiers: skip the sample.
            continue
        possible += weight
        if guess == truth:
            earned += weight
    return earned / possible

def precision_score_cch(predictions, ground_truths):
    """Tier-weighted accuracy over the five forwarding-scale tiers.

    Each sample adds the weight of its ground-truth tier to the denominator
    and, when the prediction equals the ground truth, the same weight to the
    numerator. Tier weights for tiers 1..5 are 1, 10, 50, 100 and 300.

    Args:
        predictions: sequence of predicted tiers, 1-based (1..5), one per sample.
        ground_truths: sequence of true tiers, 1-based integers in 1..5.

    Returns:
        float: matched weight divided by total weight.

    Raises:
        ValueError: if any ground-truth tier lies outside 1..5.
        ZeroDivisionError: if the inputs are empty.
    """
    w = [1, 10, 50, 100, 300]
    n_tiers = len(w)

    # Shift the 1-based tiers down to 0-based list indices.
    y_pred = np.array(predictions) - 1
    y_true = np.array(ground_truths) - 1

    # Reject out-of-range tiers up front. Without this check a ground truth
    # of 0 (e.g. 0-based labels passed by mistake) becomes index -1, which a
    # Python list silently resolves to the LAST tier (weight 300).
    in_range = (y_true >= 0) & (y_true < n_tiers)
    if not np.all(in_range):
        raise ValueError(
            'ground_truths must be tiers in 1..%d; got out-of-range value(s)' % n_tiers
        )

    count = [0] * n_tiers      # samples per ground-truth tier
    count_r = [0] * n_tiers    # correctly predicted samples per tier
    for i in range(len(y_true)):
        count[y_true[i]] += 1
        if y_pred[i] == y_true[i]:
            count_r[y_pred[i]] += 1

    sum1 = sum(w[i] * count_r[i] for i in range(n_tiers))
    sum2 = sum(w[i] * count[i] for i in range(n_tiers))
    return sum1 / sum2

# Random-data sanity check: both scorers should print the same value.
predictions = [random.randint(1, 5) for _ in range(18000)]
ground_truths = [random.randint(1, 5) for _ in range(18000)]
# print(predictions, ground_truths)
score_hyr = precision_score_hyr(predictions, ground_truths)
score_cch = precision_score_cch(predictions, ground_truths)
print(score_hyr, score_cch)

0.19798468224654805 0.19798468224654805


In [6]:
# Index the main frame by WeiboId so the column-wise concat below aligns rows on it
# (assumes df_24h_feature_predict is indexed by WeiboId as well — TODO confirm).
df_weibo_with_repost.index = df_weibo_with_repost['WeiboId']
# `axis` is passed by keyword: the positional form is deprecated since pandas 1.3
# and rejected by pandas 2.x.
# NOTE: this cell rebinds df_weibo_with_repost, so re-running it without reloading
# the frame appends the 24h columns a second time.
df_weibo_with_repost = pd.concat([df_weibo_with_repost, df_24h_feature_predict], axis=1)
feature_names_24h = list(df_24h_feature_predict.columns)

  


## 建模

In [9]:
# Names of the DataFrame columns that get_train_test_from_df selects directly
# as model features. Entries that are commented out are currently disabled.
fix_feature_list = [
    
    # Hard (hand-crafted) text features
    'weibotext_len', 
    '转发&点赞', '疫情', '特朗普|总统', '视频|链接', '粉|饭', '中国&金牌', '台湾',
    '历史', '发展|经济', '推荐', '东京&奥运',    
    
    # Text stacking features
#     'text_stacking_0','text_stacking_1','text_stacking_2','text_stacking_3', 'text_stacking_4',
   
    # Gender
    'Gender', 
    
    # User following / follower counts
    'follow_mean', 'follow_median', 'follow_max', 'follow_min', 'follower_mean',
    'follower_median', 'follower_max', 'follower_min',

    # Weibo post time
    'post_day', 'post_weekday', 'post_month', 'post_hour', 'post_minute', 'post_year',
    
    #user id target encode
#     'target_encode_0', 'target_encode_1', 'target_encode_2', 'target_encode_3', 'target_encode_4',
#     'target_encode_cnt', 
    
    # Repost features (Weibo level)
     'repost_weibo_cnt_15_30_mins', 'repost_weibo_cnt_15_mins','repost_weibo_cnt_30_45_mins','repost_weibo_cnt_30_mins',
     'repost_weibo_cnt_45_60_mins','repost_weibo_cnt_45_mins','repost_weibo_cnt_60_mins',
     'repost_weibo_follow_max_max',
     'repost_weibo_follow_max_mean','repost_weibo_follow_max_min','repost_weibo_follow_max_sum','repost_weibo_follow_mean_max',
     'repost_weibo_follow_mean_mean','repost_weibo_follow_mean_min','repost_weibo_follow_mean_sum','repost_weibo_follow_median_max',
     'repost_weibo_follow_median_mean','repost_weibo_follow_median_min','repost_weibo_follow_median_sum',
     'repost_weibo_follow_min_max', 'repost_weibo_follow_min_mean','repost_weibo_follow_min_min','repost_weibo_follow_min_sum',
     'repost_weibo_follower_max_max','repost_weibo_follower_max_mean','repost_weibo_follower_max_min',
     'repost_weibo_follower_max_sum','repost_weibo_follower_mean_max','repost_weibo_follower_mean_mean',
     'repost_weibo_follower_mean_min','repost_weibo_follower_mean_sum','repost_weibo_follower_median_max',
     'repost_weibo_follower_median_mean','repost_weibo_follower_median_min','repost_weibo_follower_median_sum',
     'repost_weibo_follower_min_max','repost_weibo_follower_min_mean','repost_weibo_follower_min_min',
     'repost_weibo_follower_min_sum','repost_weibo_pass_time_max','repost_weibo_pass_time_mean','repost_weibo_pass_time_median',
     'repost_weibo_pass_time_min', 'repost_weibo_Verified', 'repost_weibo_Gender',
    
    # Paid-poster ("water army") signals
     'repost_weibo_unique_repost_userid',
     'repost_weibo_repeat_repost_cnt',
     'repost_weibo_max_user_repost',
     'repost_weibo_other_cnt_max',
     'repost_weibo_other_cnt_min',
     'repost_weibo_other_cnt_mean',
     'repost_weibo_other_cnt_std',
     'repost_weibo_other_cnt_0',
     'repost_weibo_text_len_max',
     'repost_weibo_text_len_min',
     'repost_weibo_text_len_mean',
     'repost_weibo_text_len_std',

    # Repost features (user level)
     'repost_userid_cnt_15_30_mins','repost_userid_cnt_15_mins','repost_userid_cnt_30_45_mins','repost_userid_cnt_30_mins',
     'repost_userid_cnt_45_60_mins','repost_userid_cnt_45_mins','repost_userid_cnt_60_mins',
    'repost_userid_follow_max_max',
     'repost_userid_follow_max_mean','repost_userid_follow_max_min','repost_userid_follow_max_sum',
       'repost_userid_follow_mean_max',
     'repost_userid_follow_mean_mean','repost_userid_follow_mean_min','repost_userid_follow_mean_sum',
     'repost_userid_follow_median_max','repost_userid_follow_median_mean','repost_userid_follow_median_min',
     'repost_userid_follow_median_sum','repost_userid_follow_min_max','repost_userid_follow_min_mean',
     'repost_userid_follow_min_min','repost_userid_follow_min_sum','repost_userid_follower_max_max',
     'repost_userid_follower_max_mean','repost_userid_follower_max_min','repost_userid_follower_max_sum',
     'repost_userid_follower_mean_max','repost_userid_follower_mean_mean','repost_userid_follower_mean_min',
     'repost_userid_follower_mean_sum','repost_userid_follower_median_max','repost_userid_follower_median_mean',
     'repost_userid_follower_median_min','repost_userid_follower_median_sum','repost_userid_follower_min_max',
     'repost_userid_follower_min_mean','repost_userid_follower_min_min','repost_userid_follower_min_sum',
     'repost_userid_pass_time_max','repost_userid_pass_time_mean','repost_userid_pass_time_median',
     'repost_userid_pass_time_min', 'repost_userid_Verified', 'repost_userid_Gender',
        
     'repost_userid_unique_repost_userid',
     'repost_userid_repeat_repost_cnt',
     'repost_userid_max_user_repost',
     'repost_userid_other_cnt_max',
     'repost_userid_other_cnt_min',
     'repost_userid_other_cnt_mean',
     'repost_userid_other_cnt_std',
     'repost_userid_other_cnt_0',
     'repost_userid_text_len_max',
     'repost_userid_text_len_min',
     'repost_userid_text_len_mean',
     'repost_userid_text_len_std',
    
    # Cross features
    'userid_idx', 
    'cross_userid_idx_post_hour',
    'cross_userid_idx_post_weekday', 
    'cross_repost_1h_cnt_idx_post_hour', 'cross_repost_1h_cnt_idx_post_weekday',
    
    
    # Cross features: 24h repost features, target encoded
#     'target_encode_24h_10_reposet_cnt',
#        'target_encode_24h_11_reposet_cnt', 'target_encode_24h_12_reposet_cnt',
#        'target_encode_24h_13_reposet_cnt', 'target_encode_24h_14_reposet_cnt',
#        'target_encode_24h_15_reposet_cnt', 'target_encode_24h_16_reposet_cnt',
#        'target_encode_24h_17_reposet_cnt', 'target_encode_24h_18_reposet_cnt',
#        'target_encode_24h_19_reposet_cnt', 'target_encode_24h_20_reposet_cnt',
#        'target_encode_24h_21_reposet_cnt', 'target_encode_24h_22_reposet_cnt',
#        'target_encode_24h_23_reposet_cnt', 'target_encode_24h_24_reposet_cnt',
#        'target_encode_24h_2_reposet_cnt', 'target_encode_24h_3_reposet_cnt',
#        'target_encode_24h_4_reposet_cnt', 'target_encode_24h_5_reposet_cnt',
#        'target_encode_24h_6_reposet_cnt', 'target_encode_24h_7_reposet_cnt',
#        'target_encode_24h_8_reposet_cnt', 'target_encode_24h_9_reposet_cnt'

]

# Columns whose cells each hold a vector; get_train_test_from_df expands every
# such column into one numeric feature per vector component.
hidden_feature_list = [
    'weibotext_wv_embed', 'weibotext_tfidf',  # Weibo text: word-vector embedding / TF-IDF + SVD reduction
    'user_intro_wv_embed', 'user_intro_tfidf'  # User bio (signature): word-vector embedding / TF-IDF + SVD reduction
]

In [10]:
def get_train_test_from_df(df, fix_feature_list, hidden_feature_list):
    """Split a combined frame into train/test feature matrices.

    Rows are selected by the ``type`` column ("train" / "test"). Scalar
    feature columns are taken as-is; every vector-valued column is expanded
    into one feature per component and appended on the right.

    Args:
        df: frame with ``type``, ``label`` and ``WeiboId`` columns plus all
            requested feature columns.
        fix_feature_list: names of the scalar feature columns.
        hidden_feature_list: names of columns whose cells hold equal-length
            vectors.

    Returns:
        tuple: (train matrix, train labels, test matrix, feature names,
        train WeiboIds, test WeiboIds). Expanded vector features are named
        ``<column>_<component index>``.
    """
    train_rows = df.query('type=="train"')
    test_rows = df.query('type=="test"')

    train_blocks = [np.array(train_rows[fix_feature_list])]
    labels = np.array(train_rows['label'])
    test_blocks = [np.array(test_rows[fix_feature_list])]

    names = fix_feature_list[:]
    for column in hidden_feature_list:
        # Stack the per-row vectors into a 2-D (rows x components) array.
        train_block = np.array(list(train_rows[column]))
        test_block = np.array(list(test_rows[column]))
        train_blocks.append(train_block)
        test_blocks.append(test_block)
        names += ['%s_%d'%(column, k) for k in range(train_block.shape[1])]

    train_matrix = np.concatenate(train_blocks, 1)
    test_matrix = np.concatenate(test_blocks, 1)
    train_ids = list(train_rows['WeiboId'])
    test_ids = list(test_rows['WeiboId'])
    return train_matrix, labels, test_matrix, names, train_ids, test_ids
    
# Build the model matrices from the merged frame, then display their shapes.
(
    train_x,
    train_y,
    test_x,
    feature_name_list,
    train_weibo_ids,
    test_weibo_ids,
) = get_train_test_from_df(df_weibo_with_repost, fix_feature_list, hidden_feature_list)

train_x.shape, train_y.shape, test_x.shape

((18000, 129), (18000,), (2329, 129))

In [11]:
# Global seed. cross_validation_lgb reads it at call time for the KFold split.
SEED = 42 

# LightGBM training parameters for the 5-class model; fit_lgb and
# check_feature_importances both pass this dict to lgb.train.
params = {  
    'boosting_type': 'gbdt',  
    'objective': 'multiclass',  
    'num_class': 5,  
    'metric': 'multi_error',  
    'num_leaves': 8,  
    'max_depth': 3,
    'min_data_in_leaf': 100,  
    'learning_rate': 0.01,  
    'feature_fraction': 0.8,  
    'bagging_fraction': 0.8,  
    'bagging_freq': 5,  
    'lambda_l1': 0.5,  
    'lambda_l2': 0.5,  
    'min_gain_to_split': 0.2,  
    'verbose': -1, 
    
    # These two entries store the VALUE SEED has right now (42); rebinding SEED
    # later does not change them.
    'feature_fraction_seed':SEED,
    'bagging_seed':SEED,
} 

def fit_lgb(train_x, train_y):
    """Train a LightGBM booster with per-sample weights derived from the label.

    Args:
        train_x: feature matrix.
        train_y: labels, 0-based tier indices (0..4); each is converted with
            ``int()`` to look up its weight.

    Returns:
        The booster returned by ``lgb.train`` using the module-level ``params``.

    Note:
        The only validation set is the training set itself, so the logged
        metric and the early-stopping criterion are computed on training data.
    """
    # Weight of each tier, matching the weights used by the scoring functions.
    tier_weights = [1, 10, 50, 100, 300]
    sample_weights = [tier_weights[int(label)] for label in train_y]
    train_set = lgb.Dataset(train_x, train_y, weight=sample_weights)
    booster = lgb.train(
        params,
        train_set,
        num_boost_round=400,
        valid_sets=[train_set],
        verbose_eval=100,
        early_stopping_rounds=100,
    )
    return booster

def eval_logit(logit, label):
    """Score per-class outputs against 0-based labels with precision_score_cch.

    Args:
        logit: 2-D array-like of per-class scores, one row per sample.
        label: 0-based tier labels; cast to int before scoring.

    Returns:
        float: the tier-weighted accuracy of the arg-max predictions.
    """
    # precision_score_cch expects 1-based tiers, hence the +1 on both sides.
    predicted_tier = np.argmax(logit, 1) + 1
    true_tier = np.array(label).astype('int') + 1
    return precision_score_cch(predicted_tier, true_tier)
    
def cross_validation_lgb(train_x, train_y, test_x):
    """Run 5-fold cross-validation with fit_lgb and collect predictions.

    The KFold split is shuffled with the module-level ``SEED`` as random state.
    The score on each fold's held-out part is printed as it is computed.

    Args:
        train_x: training feature matrix.
        train_y: training labels (0-based tiers).
        test_x: test feature matrix.

    Returns:
        tuple: (out-of-fold class scores for every training row,
        test class scores averaged over the folds,
        mean of the per-fold scores measured on each fold's training part).
    """
    n_splits = 5
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    features = np.array(train_x)
    labels = np.array(train_y)

    oof_scores = np.zeros((len(features), 5))
    test_scores = np.zeros((len(test_x), 5))
    fold_train_scores = []

    for fit_idx, holdout_idx in splitter.split(features, labels):
        fit_x, fit_labels = features[fit_idx], labels[fit_idx]
        holdout_x, holdout_labels = features[holdout_idx], labels[holdout_idx]

        booster = fit_lgb(fit_x, fit_labels)
        oof_scores[holdout_idx] = booster.predict(holdout_x)
        # Each fold contributes an equal share to the averaged test prediction.
        test_scores += booster.predict(test_x) / n_splits

        fold_train_scores.append(eval_logit(booster.predict(fit_x), fit_labels))
        print(eval_logit(oof_scores[holdout_idx], holdout_labels))

    return oof_scores, test_scores, np.mean(fold_train_scores)


# Repeat the 5-fold cross-validation 10 times with a different split seed each
# time and report the out-of-fold score of every repetition plus their mean.
start = time.time()
score_list = []
for t in range(10):
    # Rebinding the global SEED changes the KFold random_state used inside
    # cross_validation_lgb. The seeds stored in `params` were fixed when that
    # dict was built and are NOT updated here.
    # NOTE(review): confirm whether the LightGBM seeds were meant to vary too.
    # `random` is not seeded in the visible cells, so these values differ between runs.
    SEED = random.randint(0, 1314)
    score_train, score_test, avg_train_score = cross_validation_lgb(train_x, train_y, test_x)
    # Out-of-fold score over the whole training set for this repetition.
    prediction = np.argmax(score_train, 1)
    score = precision_score_cch(prediction+1, np.array(train_y).astype('int')+1)
    logging.info('time:%d SEED:%d train score:%f val score:%f cost:%f'%(t+1, SEED, avg_train_score, score, time.time()-start))

    score_list.append(score)
# After the loop, score_train / score_test hold the results of the LAST repetition only.
print(score_list, np.mean(score_list))



Training until validation scores don't improve for 100 rounds
[100]	training's multi_error: 0.30818
[200]	training's multi_error: 0.277379
[300]	training's multi_error: 0.255147
[400]	training's multi_error: 0.239506
Did not meet early stopping. Best iteration is:
[400]	training's multi_error: 0.239506
0.7074089068825911
Training until validation scores don't improve for 100 rounds
[100]	training's multi_error: 0.297513
[200]	training's multi_error: 0.270007
[300]	training's multi_error: 0.249186
[400]	training's multi_error: 0.2316
Did not meet early stopping. Best iteration is:
[391]	training's multi_error: 0.230507
0.6834577114427861
Training until validation scores don't improve for 100 rounds
[100]	training's multi_error: 0.295304
[200]	training's multi_error: 0.2686
[300]	training's multi_error: 0.248043
[400]	training's multi_error: 0.23218
Did not meet early stopping. Best iteration is:
[400]	training's multi_error: 0.23218
0.6621989358677551
Training until validation scores do

2021-12-11 13:57:56,445 - INFO - time:1 SEED:33 train score:0.764839 val score:0.680526 cost:49.376096


0.6632950272732087
Training until validation scores don't improve for 100 rounds
[100]	training's multi_error: 0.301708
[200]	training's multi_error: 0.272767
[300]	training's multi_error: 0.250932
[400]	training's multi_error: 0.233188
Did not meet early stopping. Best iteration is:
[399]	training's multi_error: 0.232397
0.6615703833133122
Training until validation scores don't improve for 100 rounds
[100]	training's multi_error: 0.31552
[200]	training's multi_error: 0.28072
[300]	training's multi_error: 0.255649
[400]	training's multi_error: 0.234704
Did not meet early stopping. Best iteration is:
[399]	training's multi_error: 0.234211
0.6939144059806929
Training until validation scores don't improve for 100 rounds
[100]	training's multi_error: 0.318689
[200]	training's multi_error: 0.278857
[300]	training's multi_error: 0.246412
[400]	training's multi_error: 0.234289
Did not meet early stopping. Best iteration is:
[400]	training's multi_error: 0.234289
0.6972660174406923
Training un

2021-12-11 13:58:51,959 - INFO - time:2 SEED:948 train score:0.767722 val score:0.684561 cost:104.890079


0.6903816995187554
Training until validation scores don't improve for 100 rounds
[100]	training's multi_error: 0.29615
[200]	training's multi_error: 0.270138
[300]	training's multi_error: 0.250686
[400]	training's multi_error: 0.234385
Did not meet early stopping. Best iteration is:
[398]	training's multi_error: 0.234142
0.6623799961488736
Training until validation scores don't improve for 100 rounds
[100]	training's multi_error: 0.299728
[200]	training's multi_error: 0.273836
[300]	training's multi_error: 0.243954
[400]	training's multi_error: 0.226079
Did not meet early stopping. Best iteration is:
[377]	training's multi_error: 0.225364
0.6376260740901625
Training until validation scores don't improve for 100 rounds
[100]	training's multi_error: 0.307378
[200]	training's multi_error: 0.274294
[300]	training's multi_error: 0.256683
[400]	training's multi_error: 0.236659
Did not meet early stopping. Best iteration is:
[400]	training's multi_error: 0.236659
0.7079734265930702
Training u

2021-12-11 13:59:50,892 - INFO - time:3 SEED:87 train score:0.765453 val score:0.681024 cost:163.823376


0.687913409500902
Training until validation scores don't improve for 100 rounds
[100]	training's multi_error: 0.302028
[200]	training's multi_error: 0.271122
[300]	training's multi_error: 0.251857
[400]	training's multi_error: 0.233418
Did not meet early stopping. Best iteration is:
[399]	training's multi_error: 0.233316
0.664112322567373
Training until validation scores don't improve for 100 rounds
[100]	training's multi_error: 0.309519
[200]	training's multi_error: 0.276272
[300]	training's multi_error: 0.253105
[400]	training's multi_error: 0.23108
Did not meet early stopping. Best iteration is:
[400]	training's multi_error: 0.23108
0.6704092873485298
Training until validation scores don't improve for 100 rounds
[100]	training's multi_error: 0.303473
[200]	training's multi_error: 0.272383
[300]	training's multi_error: 0.25014
[400]	training's multi_error: 0.234295
Did not meet early stopping. Best iteration is:
[398]	training's multi_error: 0.234265
0.6920518064076346
Training until

2021-12-11 14:00:43,250 - INFO - time:4 SEED:454 train score:0.766840 val score:0.682442 cost:216.181293


0.6809004221571827
Training until validation scores don't improve for 100 rounds
[100]	training's multi_error: 0.297544
[200]	training's multi_error: 0.27044
[300]	training's multi_error: 0.248505
[400]	training's multi_error: 0.230128
Did not meet early stopping. Best iteration is:
[400]	training's multi_error: 0.230128
0.6412714083946961
Training until validation scores don't improve for 100 rounds
[100]	training's multi_error: 0.306418
[200]	training's multi_error: 0.27899
[300]	training's multi_error: 0.255724
[400]	training's multi_error: 0.228399
Did not meet early stopping. Best iteration is:
[400]	training's multi_error: 0.228399
0.6825425722608112
Training until validation scores don't improve for 100 rounds
[100]	training's multi_error: 0.301423
[200]	training's multi_error: 0.27498
[300]	training's multi_error: 0.249317
[400]	training's multi_error: 0.231482
Did not meet early stopping. Best iteration is:
[400]	training's multi_error: 0.231482
0.6799724055838112
Training unt

2021-12-11 14:01:40,992 - INFO - time:5 SEED:257 train score:0.767345 val score:0.685837 cost:273.923052


0.6974235190568906
Training until validation scores don't improve for 100 rounds
[100]	training's multi_error: 0.308066
[200]	training's multi_error: 0.274536
[300]	training's multi_error: 0.252159
[400]	training's multi_error: 0.23017
Did not meet early stopping. Best iteration is:
[399]	training's multi_error: 0.230121
0.7000476802820361
Training until validation scores don't improve for 100 rounds
[100]	training's multi_error: 0.298082
[200]	training's multi_error: 0.271633
[300]	training's multi_error: 0.252816
[400]	training's multi_error: 0.230897
Did not meet early stopping. Best iteration is:
[400]	training's multi_error: 0.230897
0.6876924291466014
Training until validation scores don't improve for 100 rounds
[100]	training's multi_error: 0.304364
[200]	training's multi_error: 0.27024
[300]	training's multi_error: 0.249067
[400]	training's multi_error: 0.231179
Did not meet early stopping. Best iteration is:
[399]	training's multi_error: 0.230855
0.6569679031119318
Training un

2021-12-11 14:02:45,459 - INFO - time:6 SEED:1062 train score:0.769026 val score:0.686073 cost:338.390155


0.6963896906172464
Training until validation scores don't improve for 100 rounds
[100]	training's multi_error: 0.311334
[200]	training's multi_error: 0.271244
[300]	training's multi_error: 0.255282
[400]	training's multi_error: 0.231878
Did not meet early stopping. Best iteration is:
[398]	training's multi_error: 0.231704
0.684656353709453
Training until validation scores don't improve for 100 rounds
[100]	training's multi_error: 0.299784
[200]	training's multi_error: 0.270965
[300]	training's multi_error: 0.252609
[400]	training's multi_error: 0.236034
Did not meet early stopping. Best iteration is:
[398]	training's multi_error: 0.235843
0.6659621756500634
Training until validation scores don't improve for 100 rounds
[100]	training's multi_error: 0.309581
[200]	training's multi_error: 0.275614
[300]	training's multi_error: 0.253244
[400]	training's multi_error: 0.235826
Did not meet early stopping. Best iteration is:
[398]	training's multi_error: 0.235326
0.6787781682861278
Training u

2021-12-11 14:03:48,076 - INFO - time:7 SEED:374 train score:0.765313 val score:0.680762 cost:401.006908


0.6840175036465931
Training until validation scores don't improve for 100 rounds
[100]	training's multi_error: 0.30198
[200]	training's multi_error: 0.269862
[300]	training's multi_error: 0.253492
[400]	training's multi_error: 0.232103
Did not meet early stopping. Best iteration is:
[399]	training's multi_error: 0.231598
0.6811199140593528
Training until validation scores don't improve for 100 rounds
[100]	training's multi_error: 0.300437
[200]	training's multi_error: 0.274379
[300]	training's multi_error: 0.250175
[400]	training's multi_error: 0.232689
Did not meet early stopping. Best iteration is:
[396]	training's multi_error: 0.232453
0.6844486843971174
Training until validation scores don't improve for 100 rounds
[100]	training's multi_error: 0.30704
[200]	training's multi_error: 0.278227
[300]	training's multi_error: 0.258247
[400]	training's multi_error: 0.23755
Did not meet early stopping. Best iteration is:
[399]	training's multi_error: 0.237526
0.6833848635569297
Training unt

2021-12-11 14:04:44,756 - INFO - time:8 SEED:560 train score:0.767034 val score:0.678714 cost:457.687507


0.6789618798096048
Training until validation scores don't improve for 100 rounds
[100]	training's multi_error: 0.306933
[200]	training's multi_error: 0.274241
[300]	training's multi_error: 0.248415
[400]	training's multi_error: 0.228055
Did not meet early stopping. Best iteration is:
[398]	training's multi_error: 0.228045
0.6793647805507745
Training until validation scores don't improve for 100 rounds
[100]	training's multi_error: 0.309641
[200]	training's multi_error: 0.27162
[300]	training's multi_error: 0.25202
[400]	training's multi_error: 0.233725
Did not meet early stopping. Best iteration is:
[400]	training's multi_error: 0.233725
0.6748951832736028
Training until validation scores don't improve for 100 rounds
[100]	training's multi_error: 0.306739
[200]	training's multi_error: 0.27522
[300]	training's multi_error: 0.252398
[400]	training's multi_error: 0.235492
Did not meet early stopping. Best iteration is:
[397]	training's multi_error: 0.235039
0.6875264606265876
Training unt

2021-12-11 14:05:31,681 - INFO - time:9 SEED:501 train score:0.765558 val score:0.683216 cost:504.612547


0.6764515520077667
Training until validation scores don't improve for 100 rounds
[100]	training's multi_error: 0.303169
[200]	training's multi_error: 0.27609
[300]	training's multi_error: 0.253125
[400]	training's multi_error: 0.235354
Did not meet early stopping. Best iteration is:
[395]	training's multi_error: 0.235207
0.6913721484508001
Training until validation scores don't improve for 100 rounds
[100]	training's multi_error: 0.310441
[200]	training's multi_error: 0.277037
[300]	training's multi_error: 0.254563
[400]	training's multi_error: 0.231239
Did not meet early stopping. Best iteration is:
[400]	training's multi_error: 0.231239
0.6771026904742667
Training until validation scores don't improve for 100 rounds
[100]	training's multi_error: 0.31046
[200]	training's multi_error: 0.282017
[300]	training's multi_error: 0.259802
[400]	training's multi_error: 0.241539
Did not meet early stopping. Best iteration is:
[397]	training's multi_error: 0.241468
0.713871199671323
Training unt

2021-12-11 14:06:18,513 - INFO - time:10 SEED:30 train score:0.765447 val score:0.682892 cost:551.444792


0.6661041297647108
[0.6805258560166353, 0.6845614204329254, 0.681024272338966, 0.6824418112341967, 0.6858369374728684, 0.6860727473457991, 0.680761665889566, 0.6787144074473045, 0.6832162322941621, 0.6828919937188824] 0.6826047344191306


[0.6862871199575542, 0.684322930902348, 0.6829616648177029, 0.6900011254562117, 0.6829268292682927, 0.6821577675236212, 0.6878707976268952, 0.6875411997363217, 0.6877180571410196, 0.6869677529998768] 0.6858755245429844

[0.6847195202340949, 0.6796978418037312, 0.682825002277709, 0.6853063652587745, 0.6847811523599745, 0.68440867994705, 0.6821765251271498, 0.6809840774742619, 0.6877234164563135, 0.6839477788317765] 0.6836570359770836

[0.6855180582128827, 0.6808179387001516, 0.6869918699186992, 0.6889426606856708, 0.6891409553515443, 0.6861584963905012, 0.6742876130145613, 0.684320251244701, 0.6821926030730314, 0.6869248784775257] 0.6845295325069269


In [10]:
# Attach the predicted tier index and the rounded class scores to the frame.
# The scores are ordered [train rows, then test rows]; assigning them by
# position would silently misalign unless the frame has exactly that row order.
# Building Series keyed by WeiboId lets pandas align each row on the frame's
# index (assumes the frame is indexed by unique WeiboId values — TODO confirm).
all_scores = np.concatenate([score_train, score_test])
all_weibo_ids = train_weibo_ids + test_weibo_ids
df_weibo_with_repost['predict'] = pd.Series(list(np.argmax(all_scores, 1)), index=all_weibo_ids)
df_weibo_with_repost['score'] = pd.Series(list(np.round(all_scores, 3)), index=all_weibo_ids)

# Spot-check a single Weibo.
df_weibo_with_repost.loc['008aa2a00173766f0637eeb05644427c14b827c4'][['predict','label','score']]

predict                                      0
label                                      4.0
score      [0.379, 0.299, 0.238, 0.052, 0.032]
Name: 008aa2a00173766f0637eeb05644427c14b827c4, dtype: object

## 评估

In [26]:
# Out-of-fold model predictions: class distribution and weighted score.
prediction = np.argmax(score_train, 1)
print(pd.Series(prediction).value_counts(normalize=True))
print('baseline', precision_score_cch(prediction+1, np.array(train_y).astype('int')+1))


# Reference 1: uniformly random tier per training sample.
prediction_random = np.array([random.randint(0,4) for i in range(len(train_y))])
print(pd.Series(prediction_random).value_counts(normalize=True))
print('randon', precision_score_cch(prediction_random+1, np.array(train_y).astype('int')+1))


# Reference 2: the model's own predictions in shuffled order (same class
# distribution, no relation to the inputs).
# Copy first: `prediction[:]` on a NumPy array is a view, so shuffling it in
# place would also scramble `prediction` for every later cell.
prediction_shuffle = prediction.copy()
random.shuffle(prediction_shuffle)
print(pd.Series(prediction_shuffle).value_counts(normalize=True))
print('random shuffle', precision_score_cch(prediction_shuffle+1, np.array(train_y).astype('int')+1))


1    0.492667
0    0.295000
2    0.152333
4    0.032722
3    0.027278
dtype: float64
baseline 0.6859977169316848
1    0.203389
4    0.201222
0    0.199833
3    0.198944
2    0.196611
dtype: float64
randon 0.18285179885417838
1    0.492667
0    0.295000
2    0.152333
4    0.032722
3    0.027278
dtype: float64
random shuffle 0.14386545974886247


# 提交文件、融合文件生成

## 提交文件

In [27]:
# prediciton_test = np.argmax(score_test, 1)
# df_submit = pd.DataFrame({
#     'WeiboId':list(df_weibo_with_repost.query('type=="test"')['WeiboId']),
#     'ForwardScale':prediciton_test+1
# })
# SUBMIT_PATH = '/data/cch/hyr/weibo/submit'
# df_submit.to_csv('%s/submission.csv'%SUBMIT_PATH, sep='\t', index=False)

## 逻辑文件

In [28]:

MODEL_OUT_PATH = 'sub_model_output'

# One row per Weibo (train rows first, then test rows) with the five class
# scores: out-of-fold scores for train rows, fold-averaged scores for test rows.
all_scores = np.concatenate([score_train, score_test])
model_output_columns = {'WeiboId': train_weibo_ids + test_weibo_ids}
for class_idx in range(all_scores.shape[1]):
    model_output_columns['lgb_hyr_%d' % class_idx] = all_scores[:, class_idx]
df_model_output = pd.DataFrame(model_output_columns)

# `with` flushes and closes the file; the bare `open(...)` form left the
# handle open until garbage collection.
with open('%s/df_lgb.pickle'%MODEL_OUT_PATH, 'wb') as model_output_file:
    pickle.dump(df_model_output, model_output_file)

# 分析

In [None]:
def check_feature_importances():
    """Fit one booster on the full training set and report feature importances.

    Reads the module-level ``train_x``, ``train_y``, ``feature_name_list`` and
    ``params``. Prints a ``{feature name: importance}`` dict, then prints the
    return value of ``lgb.plot_importance`` for the top 30 features.
    Returns nothing.
    """
    # Same tier weights as used for training in the cross-validation.
    tier_weights = [1, 10, 50, 100, 300]
    sample_weights = [tier_weights[int(label)] for label in train_y]
    full_train_set = lgb.Dataset(
        train_x,
        np.array(train_y).astype('int'),
        weight=sample_weights,
        feature_name=feature_name_list,
    )
    booster = lgb.train(
        params,
        full_train_set,
        num_boost_round=100,
        valid_sets=[full_train_set],
        verbose_eval=100,
        early_stopping_rounds=100,
    )

    importance_by_feature = dict(zip(feature_name_list, booster.feature_importance()))
    print(importance_by_feature)
    print(lgb.plot_importance(booster, max_num_features=30))
check_feature_importances()