In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import KFold
import lightgbm as lgb
import random
from gensim.models.word2vec import Word2Vec 
import jieba
from tqdm.notebook import tqdm as tqdm
import time
from datetime import datetime
import IPython.display as ipd
from collections import *
import torch
from transformers import *
import torch.nn as nn
import torch.utils.data as Data
import math
import torch.nn.functional as F
import logging
LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)

  data = yaml.load(f.read()) or {}
  import pandas.util.testing as tm
  defaults = yaml.load(f)


In [2]:
DATA_PATH = '/data/cch/weibo'
VAR_PATH = '/data/cch/hyr/weibo/var'


# Read

In [3]:
df_test_origin_weibo = pd.read_csv('%s/test.origin_weibo.csv'%DATA_PATH, sep='\t')
df_test_repost = pd.read_csv('%s/test.repost.csv'%DATA_PATH, sep='\t')
df_train_origin_weibo = pd.read_csv('%s/train.origin_weibo.csv'%DATA_PATH, sep='\t')
df_train_repost = pd.read_csv('%s/train.repost.csv'%DATA_PATH, sep='\t')
# df_user_profile = pd.read_csv('%s/user_profile.csv'%DATA_PATH, sep='\t')
df_user_profile_agg = pickle.load(open('%s/df_user_profile_agg.pickle'%VAR_PATH, 'rb'))

In [4]:
def count2idx(num):
    if 0 <= num <= 10:
        return 0
    elif 11 <= num <= 50:
        return 1
    elif 51 <= num <= 150:
        return 2
    elif 151 <= num <= 300:
        return 3
    elif num > 300:
        return 4
    
df_train_origin_weibo['label'] = df_train_origin_weibo['ForwordCount'].apply(count2idx)
df_train_origin_weibo['type'] = 'train'
df_test_origin_weibo['type'] = 'test'
df_origin_weibo = pd.concat([df_train_origin_weibo, df_test_origin_weibo])
df_weibo = pd.merge(df_origin_weibo, df_user_profile_agg, how='left', on='UserId')

# 转发特征

In [None]:
def apply_pass_time(a_row):
    global mp_WeiboId_create_time
    repost_date_obj = datetime.strptime(a_row['RepostDate'], '%Y-%m-%d %H:%M:%S')
    post_date_obj = datetime.strptime(mp_WeiboId_create_time[a_row['OriginWeiboId']], '%Y-%m-%d %H:%M:%S')
    return (repost_date_obj-post_date_obj).seconds

In [None]:
mp_WeiboId_create_time = dict(list(df_weibo[['WeiboId', 'WeiboCreateTime']].values))
df_train_repost['pass_time'] = df_train_repost.apply(apply_pass_time, 1)
df_test_repost['pass_time'] = df_test_repost.apply(apply_pass_time, 1)

In [None]:
def get_24h_repost_features(df_repost):
    ret_dict_list = []
    
    for weibo_id, df in tqdm(df_repost.groupby('OriginWeiboId')):
        ret_dict = {'WeiboId':weibo_id}
        for hour in range(2, 25):
            ddl = 3600*hour
            df_sub = df.query('pass_time < @ddl')
            ret_dict['24h_%d_reposet_cnt'%hour] = df_sub.shape[0]
                
        ret_dict_list.append(ret_dict)
    return pd.DataFrame(ret_dict_list)

df_repost_24h_origin = get_24h_repost_features(df_train_repost)

In [None]:
pickle.dump(df_repost_24h_origin, open('%s/df_repost_24h_origin.pickle'%VAR_PATH, 'wb'))

# 交叉验证预测测试集24h数据

In [5]:
df_repost_24h_origin = pickle.load(open('%s/df_repost_24h_origin.pickle'%VAR_PATH, 'rb'))
df_weibo = pickle.load(open('%s/df_all.pickle'%VAR_PATH, 'rb'))

## 数据预处理

In [6]:
continue_feature_list = [

    #文本硬特征
    'weibotext_len', 
    
    #性别
    'Gender', 
    
    #用户粉丝，关注数
    'follow_mean', 'follow_median', 'follow_max', 'follow_min', 'follower_mean',
    'follower_median', 'follower_max', 'follower_min',
    
    #user id target encode
    'target_encode_0', 'target_encode_1', 'target_encode_2', 'target_encode_3', 'target_encode_4',
    'target_encode_cnt', 
    
    #转发特征(微博维度)
     'repost_weibo_cnt_15_30_mins', 'repost_weibo_cnt_15_mins','repost_weibo_cnt_30_45_mins','repost_weibo_cnt_30_mins',
     'repost_weibo_cnt_45_60_mins','repost_weibo_cnt_45_mins','repost_weibo_cnt_60_mins',
     'repost_weibo_follow_max_max',
     'repost_weibo_follow_max_mean','repost_weibo_follow_max_min','repost_weibo_follow_max_sum','repost_weibo_follow_mean_max',
     'repost_weibo_follow_mean_mean','repost_weibo_follow_mean_min','repost_weibo_follow_mean_sum','repost_weibo_follow_median_max',
     'repost_weibo_follow_median_mean','repost_weibo_follow_median_min','repost_weibo_follow_median_sum',
     'repost_weibo_follow_min_max', 'repost_weibo_follow_min_mean','repost_weibo_follow_min_min','repost_weibo_follow_min_sum',
     'repost_weibo_follower_max_max','repost_weibo_follower_max_mean','repost_weibo_follower_max_min',
     'repost_weibo_follower_max_sum','repost_weibo_follower_mean_max','repost_weibo_follower_mean_mean',
     'repost_weibo_follower_mean_min','repost_weibo_follower_mean_sum','repost_weibo_follower_median_max',
     'repost_weibo_follower_median_mean','repost_weibo_follower_median_min','repost_weibo_follower_median_sum',
     'repost_weibo_follower_min_max','repost_weibo_follower_min_mean','repost_weibo_follower_min_min',
     'repost_weibo_follower_min_sum','repost_weibo_pass_time_max','repost_weibo_pass_time_mean','repost_weibo_pass_time_median',
     'repost_weibo_pass_time_min', 'repost_weibo_Verified', 'repost_weibo_Gender',
    
    #水军
     'repost_weibo_unique_repost_userid',
     'repost_weibo_repeat_repost_cnt',
     'repost_weibo_max_user_repost',
     'repost_weibo_other_cnt_max',
     'repost_weibo_other_cnt_min',
     'repost_weibo_other_cnt_mean',
     'repost_weibo_other_cnt_std',
     'repost_weibo_other_cnt_0',
     'repost_weibo_text_len_max',
     'repost_weibo_text_len_min',
     'repost_weibo_text_len_mean',
     'repost_weibo_text_len_std',

    #转发特征(用户维度)    
     'repost_userid_cnt_15_30_mins','repost_userid_cnt_15_mins','repost_userid_cnt_30_45_mins','repost_userid_cnt_30_mins',
     'repost_userid_cnt_45_60_mins','repost_userid_cnt_45_mins','repost_userid_cnt_60_mins','repost_userid_follow_max_max',
     'repost_userid_follow_max_mean','repost_userid_follow_max_min','repost_userid_follow_max_sum','repost_userid_follow_mean_max',
     'repost_userid_follow_mean_mean','repost_userid_follow_mean_min','repost_userid_follow_mean_sum',
     'repost_userid_follow_median_max','repost_userid_follow_median_mean','repost_userid_follow_median_min',
     'repost_userid_follow_median_sum','repost_userid_follow_min_max','repost_userid_follow_min_mean',
     'repost_userid_follow_min_min','repost_userid_follow_min_sum','repost_userid_follower_max_max',
     'repost_userid_follower_max_mean','repost_userid_follower_max_min','repost_userid_follower_max_sum',
     'repost_userid_follower_mean_max','repost_userid_follower_mean_mean','repost_userid_follower_mean_min',
     'repost_userid_follower_mean_sum','repost_userid_follower_median_max','repost_userid_follower_median_mean',
     'repost_userid_follower_median_min','repost_userid_follower_median_sum','repost_userid_follower_min_max',
     'repost_userid_follower_min_mean','repost_userid_follower_min_min','repost_userid_follower_min_sum',
     'repost_userid_pass_time_max','repost_userid_pass_time_mean','repost_userid_pass_time_median',
     'repost_userid_pass_time_min', 'repost_userid_Verified', 'repost_userid_Gender',
        
     'repost_userid_unique_repost_userid',
     'repost_userid_repeat_repost_cnt',
     'repost_userid_max_user_repost',
     'repost_userid_other_cnt_max',
     'repost_userid_other_cnt_mean',
     'repost_userid_other_cnt_std',
     'repost_userid_other_cnt_0',
     'repost_userid_text_len_max',
     'repost_userid_text_len_min',
     'repost_userid_text_len_mean',
     'repost_userid_text_len_std',
    
    
    #交叉特征 24h转发特征 target encode
    'target_encode_24h_10_reposet_cnt',
       'target_encode_24h_11_reposet_cnt', 'target_encode_24h_12_reposet_cnt',
       'target_encode_24h_13_reposet_cnt', 'target_encode_24h_14_reposet_cnt',
       'target_encode_24h_15_reposet_cnt', 'target_encode_24h_16_reposet_cnt',
       'target_encode_24h_17_reposet_cnt', 'target_encode_24h_18_reposet_cnt',
       'target_encode_24h_19_reposet_cnt', 'target_encode_24h_20_reposet_cnt',
       'target_encode_24h_21_reposet_cnt', 'target_encode_24h_22_reposet_cnt',
       'target_encode_24h_23_reposet_cnt', 'target_encode_24h_24_reposet_cnt',
       'target_encode_24h_2_reposet_cnt', 'target_encode_24h_3_reposet_cnt',
       'target_encode_24h_4_reposet_cnt', 'target_encode_24h_5_reposet_cnt',
       'target_encode_24h_6_reposet_cnt', 'target_encode_24h_7_reposet_cnt',
       'target_encode_24h_8_reposet_cnt', 'target_encode_24h_9_reposet_cnt'
]

cat_feature_list = [
    
    #文本特征
    'is_video',

    #发表日期特征
    'post_day', 'post_weekday', 'post_month', 'post_hour',
    
    #交叉特征
    'userid_idx', 
    'cross_userid_idx_post_hour',
    'cross_userid_idx_post_weekday', 
    'cross_repost_1h_cnt_idx_post_hour', 'cross_repost_1h_cnt_idx_post_weekday',
]

hidden_feature_list = [
    'weibotext_wv_embed', 'weibotext_tfidf',#微博内容词向量/tfidf+svd降维
    'user_intro_wv_embed', 'user_intro_tfidf'#用户个性签名词向量/tfidf+svd降维
]

def generate_one_hot(df_weibo):
    global cat_feature_list
    
    cut_one_hot_feature_list = [    
        #转发特征(微博维度)
         'repost_weibo_cnt_15_30_mins', 'repost_weibo_cnt_15_mins','repost_weibo_cnt_30_45_mins','repost_weibo_cnt_30_mins',
         'repost_weibo_cnt_45_60_mins','repost_weibo_cnt_45_mins','repost_weibo_cnt_60_mins','repost_weibo_follow_max_max',
         'repost_weibo_follow_max_mean','repost_weibo_follow_max_min','repost_weibo_follow_max_sum','repost_weibo_follow_mean_max',
         'repost_weibo_follow_mean_mean','repost_weibo_follow_mean_min','repost_weibo_follow_mean_sum','repost_weibo_follow_median_max',
         'repost_weibo_follow_median_mean','repost_weibo_follow_median_min','repost_weibo_follow_median_sum',
         'repost_weibo_follow_min_max', 'repost_weibo_follow_min_mean','repost_weibo_follow_min_min','repost_weibo_follow_min_sum',
         'repost_weibo_follower_max_max','repost_weibo_follower_max_mean','repost_weibo_follower_max_min',
         'repost_weibo_follower_max_sum','repost_weibo_follower_mean_max','repost_weibo_follower_mean_mean',
         'repost_weibo_follower_mean_min','repost_weibo_follower_mean_sum','repost_weibo_follower_median_max',
         'repost_weibo_follower_median_mean','repost_weibo_follower_median_min','repost_weibo_follower_median_sum',
         'repost_weibo_follower_min_max','repost_weibo_follower_min_mean','repost_weibo_follower_min_min',
         'repost_weibo_follower_min_sum','repost_weibo_pass_time_max','repost_weibo_pass_time_mean','repost_weibo_pass_time_median',
         'repost_weibo_pass_time_min', 'repost_weibo_Verified', 'repost_weibo_Gender',

         'repost_weibo_unique_repost_userid',
         'repost_weibo_repeat_repost_cnt',
         'repost_weibo_max_user_repost',
         'repost_weibo_other_cnt_max',
         'repost_weibo_other_cnt_min',
         'repost_weibo_other_cnt_mean',
         'repost_weibo_other_cnt_std',
         'repost_weibo_other_cnt_0',
         'repost_weibo_text_len_max',
         'repost_weibo_text_len_min',
         'repost_weibo_text_len_mean',
         'repost_weibo_text_len_std',

        #转发特征(用户维度)    
         'repost_userid_cnt_15_30_mins','repost_userid_cnt_15_mins','repost_userid_cnt_30_45_mins','repost_userid_cnt_30_mins',
         'repost_userid_cnt_45_60_mins','repost_userid_cnt_45_mins','repost_userid_cnt_60_mins','repost_userid_follow_max_max',
         'repost_userid_follow_max_mean','repost_userid_follow_max_min','repost_userid_follow_max_sum','repost_userid_follow_mean_max',
         'repost_userid_follow_mean_mean','repost_userid_follow_mean_min','repost_userid_follow_mean_sum',
         'repost_userid_follow_median_max','repost_userid_follow_median_mean','repost_userid_follow_median_min',
         'repost_userid_follow_median_sum','repost_userid_follow_min_max','repost_userid_follow_min_mean',
         'repost_userid_follow_min_min','repost_userid_follow_min_sum','repost_userid_follower_max_max',
         'repost_userid_follower_max_mean','repost_userid_follower_max_min','repost_userid_follower_max_sum',
         'repost_userid_follower_mean_max','repost_userid_follower_mean_mean','repost_userid_follower_mean_min',
         'repost_userid_follower_mean_sum','repost_userid_follower_median_max','repost_userid_follower_median_mean',
         'repost_userid_follower_median_min','repost_userid_follower_median_sum','repost_userid_follower_min_max',
         'repost_userid_follower_min_mean','repost_userid_follower_min_min','repost_userid_follower_min_sum',
         'repost_userid_pass_time_max','repost_userid_pass_time_mean','repost_userid_pass_time_median',
         'repost_userid_pass_time_min', 'repost_userid_Verified', 'repost_userid_Gender',

         'repost_userid_unique_repost_userid',
         'repost_userid_repeat_repost_cnt',
         'repost_userid_max_user_repost',
         'repost_userid_other_cnt_max',
    #      'repost_userid_other_cnt_min',
         'repost_userid_other_cnt_mean',
         'repost_userid_other_cnt_std',
         'repost_userid_other_cnt_0',
         'repost_userid_text_len_max',
         'repost_userid_text_len_min',
         'repost_userid_text_len_mean',
         'repost_userid_text_len_std',
    ]

    one_hot_feature_name_list = []
    df_one_hot_list = []
    for f in cat_feature_list:
        df_one_hot = pd.get_dummies(df_weibo[f])
        one_hot_feature_name = ['%s_%s'%(f, name) for name in df_one_hot.columns]
        one_hot_feature_name_list.extend(one_hot_feature_name)
        df_one_hot.columns = one_hot_feature_name 
        df_one_hot_list.append(df_one_hot)

    cut_num = 5
    for f in cut_one_hot_feature_list:
        df_one_hot = pd.get_dummies(pd.qcut(df_weibo[f], cut_num, duplicates="drop"))
        one_hot_feature_name = ['%s_qcut_%s'%(f, name) for name in df_one_hot.columns]
        one_hot_feature_name_list.extend(one_hot_feature_name)
        df_one_hot.columns = one_hot_feature_name 
        df_one_hot_list.append(df_one_hot)

        df_one_hot = pd.get_dummies(pd.cut(df_weibo[f], cut_num))
        one_hot_feature_name = ['%s_cut_%s'%(f, name) for name in df_one_hot.columns]
        one_hot_feature_name_list.extend(one_hot_feature_name)
        df_one_hot.columns = one_hot_feature_name 
        df_one_hot_list.append(df_one_hot)
    return pd.concat(df_one_hot_list, 1)

In [7]:
dense_mean = np.array(df_weibo[continue_feature_list].mean()) 
dense_std = np.array(df_weibo[continue_feature_list].std())

forword_cnt_mean = df_weibo.query('type=="train"')['ForwordCount'].mean()
forword_cnt_std = df_weibo.query('type=="train"')['ForwordCount'].std()

mp_userid_idx = dict([(userid,i) for i, userid in enumerate(set(df_weibo['UserId']))])

df_weibo.index = df_weibo.WeiboId

# df_weibo['sample_weight'] = df_weibo['label'].map({0:1,1:10,2:50,3:100,4:300,-1:0})
df_weibo['sample_weight'] = df_weibo['label'].map({0:0.00333333,1:0.03333333,2:0.16666667,3:0.33333333,4:1,-1:0})

df_weibo_one_hot = generate_one_hot(df_weibo)
one_hot_feature_list = list(df_weibo_one_hot.columns)
df_weibo = pd.concat([df_weibo, df_weibo_one_hot], 1)

df_repost_24h_origin.index = df_repost_24h_origin['WeiboId']
df_repost_24h_origin = df_repost_24h_origin.drop('WeiboId', 1)

  app.launch_new_instance()


In [132]:
def count2idx(num):
    if 0 <= num <= 10:
        return 0
    elif 10 <= num <= 50:
        return 1
    elif 50 <= num <= 150:
        return 2
    elif 150 <= num <= 300:
        return 3
    elif num >= 300:
        return 4
    if num < 0:
        print(num)
        return 0
    print(num)

def get_distibution(cnt):
    ret = np.zeros(5)
    if cnt > 400:
        cnt = 400
    ll = [-10,11,51,151,301]
    rr = [10,50,150,300,500]
    median = [(l+r)//2 for l,r in zip(ll, rr)]
    interval_length = [r-l+3 for l, r in zip(ll, rr)]
    eps = 1e-8
    for i in range(5):
        dis = abs(cnt - median[i])/interval_length[i]
        ret[i] = 1/(dis+eps)
    return ret/sum(ret)

def get_mp_distibution(func_name):
    ret = {}
    for i in range(0, 500):
        ret[i] = func_name(i)
    return ret

mp_distibution = get_mp_distibution(get_distibution)
mp_distibution[-1] = mp_distibution[0]

In [133]:
df_repost_24h_label = pd.DataFrame()
for i in range(2, 25):
    df_repost_24h_label[i] = df_repost_24h_origin['24h_%d_reposet_cnt'%i].apply(count2idx)
df_repost_24h_label.index = df_repost_24h_origin.index

In [134]:
df_repost_24h_origin_soft = pd.DataFrame()
for i in range(2, 25):
    df_repost_24h_origin_soft[i] =\
    df_repost_24h_origin['24h_%d_reposet_cnt'%i].apply(lambda x:mp_distibution.get(x, -1) if x < 400 else mp_distibution[400])

In [136]:
df_weibo['1h_cnt_soft'] = \
df_weibo['repost_weibo_cnt_60_mins'].apply(lambda x:mp_distibution.get(x, -1) if x < 400 else mp_distibution[400])

def apply_1h_cnt_hard(x):
    label = count2idx(x)
    ret = np.zeros(5)
    ret[label] = 1.
    return ret
df_weibo['1h_cnt_hard'] = \
df_weibo['repost_weibo_cnt_60_mins'].apply(apply_1h_cnt_hard)

-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0


-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0


## Dataset

In [148]:
class WeiboDataset(Data.Dataset):
    def __init__(self, weibo_id):
        self.weibo_id = weibo_id
        
    def __len__(self):
        return len(self.weibo_id)
    
    def __getitem__(self,idx):
        return self.weibo_id[idx]

def collate_fn(weibo_ids, mask_24h=False):

    global dense_mean, dense_std, df_weibo, albert_tokenizer, mp_userid_idx, one_hot_feature_list, df_repost_24h_origin_soft
        
    df_weibo_sub = df_weibo.loc[weibo_ids]
    #连续性特征归一化
    dense = (np.array(df_weibo_sub[continue_feature_list].values) - dense_mean) /dense_std 
    
    #标签
    y = np.array(df_weibo_sub['label'])
    forword_cnt = np.array(df_weibo_sub['ForwordCount'])
    #样本权重
    sample_weight = np.array(df_weibo_sub['sample_weight'])
    cal_loss = df_weibo.at[weibo_ids[0], 'label'] != -1
    
    #序列文本特征用于bert encoder
    #微博正文
#     df_weibo_text = df_weibo_sub['WeiboText']
#     max_length = min(args.weibo_text_max_length, max(df_weibo_text.apply(len)))
#     weibotext_tokenized_list = []
#     for sentence in df_weibo_text:
#         sentence = list(sentence)
#         if len(sentence) > max_length:
#             text = sentence[:max_length]
#         else:
#             text = sentence + ['<pad>']*(max_length-len(sentence))
#         weibotext_tokenized_list.append(albert_tokenizer.encode(text))        
    
    #用户简介
#     max_length = max(df_weibo_sub['intro'].apply(len))
#     intro_tokenized_list = []
#     for sentence in df_weibo_sub['intro']:
#         sentence = list(sentence)
#         if len(sentence) > max_length:
#             text = sentence[:max_length]
#         else:
#             text = sentence + ['<pad>']*(max_length-len(sentence))
#         intro_tokenized_list.append(albert_tokenizer.encode(text))        
    
    #固定文本特征
    text_feature_list = []
    for f in ['weibotext_wv_embed', 'weibotext_tfidf', 'user_intro_wv_embed', 'user_intro_tfidf']:
        hidden_train = np.array(list(df_weibo_sub[f]))
        text_feature_list.append(hidden_train)
    text_hidden = np.concatenate(text_feature_list, 1)
    
    user_id_token = np.array(df_weibo_sub['UserId'].map(mp_userid_idx))
    
    #转发用户+转发文本 聚合
    max_repost_user_set = 512
    np_repost_text_wv = np.zeros((len(weibo_ids), max_repost_user_set, 10))
    np_repost_user_dense = np.zeros((len(weibo_ids), max_repost_user_set, 8))
    np_repost_user_id = np.zeros((len(weibo_ids), max_repost_user_set)).astype('int')
    np_repost_set_len = np.zeros((len(weibo_ids),))
    for i, weiboid in enumerate(weibo_ids):
        userid_set_len = min(max_repost_user_set, df_weibo_sub.at[weiboid, 'repost_weibo_set_len'])
        np_repost_text_wv[i,:userid_set_len,:] = np.array(df_weibo_sub.at[weiboid, 'repost_weibo_repost_text_wv'])[:userid_set_len]
        np_repost_user_id[i,:userid_set_len] = np.array(df_weibo_sub.at[weiboid, 'repost_weibo_repost_user_id_freq'])[:userid_set_len]
        np_repost_set_len[i] = userid_set_len
        np_repost_user_dense[i,:userid_set_len] = np.array(df_weibo_sub.at[weiboid, 'repost_weibo_set_user_dense'])[:userid_set_len]
    
    
    ret_dict = {}
    if cal_loss:
        df_24h_soft_sub = df_repost_24h_origin_soft.loc[weibo_ids]
        df_repost_24h_label_sub = df_repost_24h_label.loc[weibo_ids]
        for i in range(2, 25):
            ret_dict['24_h_%d'%i] = torch.tensor(np.array(list(df_24h_soft_sub[i]))).float().to(args.device)
            ret_dict['24_h_%d_label'%i] = torch.tensor(np.array(list(df_repost_24h_label_sub[i]))).long().to(args.device)
    
    ret_dict.update({
        'cal_loss':cal_loss,
        'mask_24h':mask_24h,
        'x_dense':torch.tensor(dense).float().to(args.device), 
        'label':torch.tensor(y).long().to(args.device),
        'forword_cnt' : torch.tensor(forword_cnt).float().to(args.device),
        'sample_weight':torch.tensor(sample_weight).float().to(args.device),
#         'weibo_text_token':torch.tensor(weibotext_tokenized_list).to(args.device),
#         'intro_text_token':torch.tensor(intro_tokenized_list).to(args.device),
        'text_wv_tfidf':torch.tensor(text_hidden).float().to(args.device),
        'user_id_token':torch.tensor(user_id_token).long().to(args.device),
        
        'repost_set_text_wv':torch.tensor(np_repost_text_wv).float().to(args.device),
        'repost_set_userid':torch.tensor(np_repost_user_id).long().to(args.device),
        'repost_set_len' : torch.tensor(np_repost_set_len).long().to(args.device),
        'repost_user_dense':torch.tensor(np_repost_user_dense).float().to(args.device),
        'one_hot_feature':torch.tensor(np.array(df_weibo_sub[one_hot_feature_list])).float().to(args.device),
        '1h_cnt_soft':torch.tensor(np.array(list(df_weibo_sub['1h_cnt_soft']))).float().to(args.device),
        '1h_cnt_hard':torch.tensor(np.array(list(df_weibo_sub['1h_cnt_hard']))).float().to(args.device),
    })
    
    return ret_dict

## Model

In [149]:
# # 表示feature_len=100, hidden_len=20
# cell = nn.RNNCell(100, 20)
# # 某一时刻的输入, 共3个样本序列(batch=3), 每个特征100维度(feature_len=100)
# x = torch.randn(3, 100)
# # 所有时刻的输入, 一共有10个时刻, 即seq_len=10
# xs = [torch.randn(3, 100) for i in range(10)]
# # 初始化隐藏记忆单元, batch=3, hidden_len=20
# h = torch.zeros(3, 20)
# # 对每个时刻的输入, 传入这个nn.RNNCell计算单元, 还要传入上一时h, 以进行前向计算
# for xt in xs:
#     h = cell(xt, h)


In [158]:
class GeLU(nn.Module):
    def forward(self, x):
        return 0.5 * x * (1. + torch.tanh(x * 0.7978845608 * (1. + 0.044715 * x * x)))

class Linear(nn.Module):
    def __init__(self, in_features, out_features, activations=False):
        super().__init__()
        linear = nn.Linear(in_features, out_features)
        nn.init.normal_(linear.weight, std=math.sqrt((2. if activations else 1.) / in_features))
        nn.init.zeros_(linear.bias)
        modules = [nn.utils.weight_norm(linear)]
        if activations:
            modules.append(GeLU())
        self.model = nn.Sequential(*modules)

    def forward(self, x):
        return self.model(x)

class CeLossOut(nn.Module):
    def __init__(self, in_features):
        super().__init__()
        embedding = in_features
        self.fc = Linear(in_features, embedding, True)
#         self.loss_func = nn.CrossEntropyLoss(weight=torch.tensor([1.,10,50,100,300])).to(args.device)
#         self.transformer_encoder = nn.TransformerEncoder(nn.TransformerEncoderLayer(d_model=embedding, nhead=4), 
#                                                 num_layers=1)
        
        self.rnn_cell = nn.GRUCell(5, embedding)

        self.out = Linear(embedding, 5*args.n_label)
        
    def loss_func_soft(self, logit, input_dict, i):
        logit_softmax = torch.nn.Softmax(1)(logit) 
        logit_softmax = torch.log(logit_softmax+0.001)
        score_arr = logit_softmax * input_dict['24_h_%d'%i]
        score_arr = score_arr.mean(1) * input_dict['sample_weight']
        loss = -torch.mean(score_arr)
        return loss
    
    def loss_func_hard(self, logit, input_dict, i):
        loss_func = nn.CrossEntropyLoss(reduction='none').to(args.device)
        loss_arr = loss_func(logit, input_dict['24_h_%d_label'%i])
        loss = (loss_arr * input_dict['sample_weight']).mean()
        return loss

    
    def forward(self, x, input_dict):
#         h = self.fc(x)
#         x0 = input_dict['1h_cnt_soft']
#         seq_list = []
#         for i in range(2,25):
#             h = self.rnn_cell(x0, h)
#             x0 = self.out(h)
#             seq_list.append(x0)

#         logit = torch.cat(seq_list, -1)
        logit = self.out(x)
        loss = None
        if input_dict['cal_loss']:
            loss = 0
            for i in range(2, 25):
                idx = i-2
                loss += self.loss_func_soft(logit[:, idx*5:idx*5+5], input_dict, i)
        return loss, logit

class PoissonLossOut(nn.Module):
    def __init__(self, in_features):
        super().__init__()
        self.out = Linear(in_features, args.n_label)
    
    def forward(self, x, input_dict):
        logit = self.out(x)
        loss = None
        if input_dict['cal_loss']:
            loss_func = nn.PoissonNLLLoss(log_input=True, reduction='none')
            max_v = 11
            logit[logit>max_v] = max_v
            loss_arr = loss_func(logit, input_dict['24h_repost_cnt']-input_dict['1h_repost_cnt'].unsqueeze(-1))
#             loss_arr = loss_func(logit, input_dict['24h_repost_residual'])

            loss = (loss_arr * input_dict['sample_weight'].unsqueeze(-1)).mean() 
            loss = loss_arr.mean()
        return loss, torch.exp(logit)+input_dict['1h_repost_cnt'].unsqueeze(-1)

class MSELossOut(nn.Module):
    def __init__(self, in_features):
        super().__init__()
        self.out = Linear(in_features, 1)
    
    def forward(self, x, input_dict):
        global forword_cnt_mean, forword_cnt_std
        logit = self.out(x)
        loss = None
        if input_dict['cal_loss']:
            norm_target = (input_dict['forword_cnt']-forword_cnt_mean)/forword_cnt_std
#             norm_target = torch.log(input_dict['forword_cnt'])
            loss_arr = (logit.squeeze(-1)-norm_target)**2
#             loss = (loss_arr * input_dict['sample_weight']).mean()
            loss = loss_arr.mean()
        
        logit = logit*forword_cnt_std + forword_cnt_mean
#         logit = torch.exp(logit)
        logit[logit < 0] = 0
        return loss, logit
    
class TargetLossOut(nn.Module):
    def __init__(self, in_features):
        super().__init__()
        self.out = Linear(in_features, 5)
    
    def forward(self, x, input_dict):
        global forword_cnt_mean, forword_cnt_std
        logit = self.out(x)
        loss = None
        if input_dict['cal_loss']:
            logit_softmax = torch.nn.Softmax(1)(logit) 
#             logit_softmax = torch.log(logit_softmax+0.001)
            score_arr = logit_softmax * torch.tensor([0.00333333,0.03333333,0.16666667,0.33333333,1]).to(logit_softmax.device)
            score_arr = score_arr[:, input_dict['label']] * torch.eye(input_dict['label'].shape[0]).to(logit_softmax.device)
            loss = -torch.sum(score_arr)/torch.sum(input_dict['sample_weight'])
            
        return loss, logit         

class MLPAttentionPool(nn.Module):
    def __init__(self,key_size,units):
        super().__init__()
        self.proj = nn.Sequential(nn.Linear(key_size,units,bias=False),
                                  nn.Tanh(),
                                  nn.Linear(units,1,bias=False))
        
    def masked_softmax_1d(self, X, valid_len):
        if valid_len is None:
            return F.softmax(X,dim=-1), _
        else:
            shape=X.shape
            if valid_len.dim()==1:
                valid_len=valid_len.view(-1,1).repeat(1,shape[1])
            
            mask=(torch.arange(0,X.shape[-1]).to(X.device).repeat(X.shape[0],1)<valid_len).bool()
            
            X = X.masked_fill_(~mask, -float('inf'))
            return F.softmax(X,dim=-1).view(shape), mask

    def forward(self, key, valid_len):
        scores = self.proj(key).squeeze(-1)
        attention_weights, mask = self.masked_softmax_1d(scores,valid_len)
        seq_out = attention_weights.unsqueeze(-1) * key
        return seq_out.sum(1)
    
class RepostSetEncoder(nn.Module):
    def __init__(self):
        super().__init__()
        user_id_embed = 10
        text_embed = 10
        user_dense_embed = 8
        self.user_embeding = nn.Embedding(4000, user_id_embed)        
        self.attention_pool = MLPAttentionPool(user_id_embed + text_embed + user_dense_embed, 64)
        
    def forward(self, input_dict):
        x_set_wv = input_dict['repost_set_text_wv']
        x_set_user_embed = self.user_embeding(input_dict['repost_set_userid'])
        x_set_user_dense = input_dict['repost_user_dense']
        x = torch.cat([x_set_wv, x_set_user_embed, x_set_user_dense], -1)
        return self.attention_pool(x, input_dict['repost_set_len'])
    
class DNN(nn.Module):
    def __init__(self):
        super().__init__()
        
        global continue_feature_list, one_hot_feature_list

        in_features = len(continue_feature_list)
        bert_encode_feature = 312
        bert_encode_feature = 0
        text_wv_tfidf_feature = 40
        user_id_embed = 10
        repost_set_embed = 28
#         repost_set_embed = 0
        one_hot_feature = len(one_hot_feature_list)
        
#         global pretrained
#         self.albert_weibotext = BertModel.from_pretrained(pretrained).to(args.device)
#         self.albert_intro = BertModel.from_pretrained(pretrained).to(args.device)
        
#         self.transformer_encoder = nn.TransformerEncoder(nn.TransformerEncoderLayer(d_model=bert_encode_feature, nhead=8), 
#                                                     num_layers=1)

        self.user_embeding = nn.Embedding(90, user_id_embed)
        
        self.repost_set_encoder = RepostSetEncoder().to(args.device)

        self.fc = Linear(in_features, args.embedding_size, True)
        
        self.other_fc = Linear(2*bert_encode_feature + text_wv_tfidf_feature + user_id_embed + repost_set_embed + one_hot_feature,
                         args.embedding_size, True)

        self.fc_out = Linear(2*args.embedding_size, args.embedding_size, True)
        
        self.dropout = torch.nn.Dropout(args.dropout_rate)
        
        self.out_layer  = CeLossOut(args.embedding_size)
        
    def forward(self, input_dict):
#         x_weibotext = self.albert_weibotext(input_dict['weibo_text_token'])[1]
#         x_intro = self.albert_intro(input_dict['intro_text_token'])[1]
#         x_seq = self.transformer_encoder(x_seq)
#         x_text_out = torch.max(x_seq, 1)[0]
#         print(x_bert.shape)
        x_user_embed = self.user_embeding(input_dict['user_id_token'])
        
        x_repost_embeding = self.repost_set_encoder(input_dict)
        
        x = self.other_fc(torch.cat([input_dict['text_wv_tfidf'], x_user_embed,
                       x_repost_embeding, input_dict['one_hot_feature']], 1))
        x = torch.cat([x, self.fc(input_dict['x_dense'])], 1)
        x = self.dropout(x)
        x = self.fc_out(x)
        return self.out_layer(x, input_dict)

## 训练过程

In [159]:
def precision_score_cch(predictions, ground_truths):
    
    predictions, ground_truths = np.array(predictions)-1, np.array(ground_truths)-1
    y_pred = predictions
    y_true = ground_truths

    w=[1,10,50,100,300]
    n = len(y_true)
    count_r = [0 for i in range(5)]
    count = [0 for i in range(5)]
    for i in range(n):
        count[y_true[i]] += 1
        if y_pred[i] == y_true[i]:
            count_r[y_pred[i]] += 1
    sum1 = sum(w[i]*count_r[i] for i in range(5))
    sum2 = sum(w[i]*count[i] for i in range(5))
    precision = sum1/sum2
    return precision

def score_24h(predictions, ground_truths):
    return np.abs(predictions-ground_truths).mean(0)
    
def logit_residual(logits, start_cnt):
    logits[:, 0] += start_cnt
    for i in range(1, logits.shape[1]):
        logits[:, i] += logits[:, i-1]
    return logits

def score_24h_ce(logits, ground_truths):
    ret = []
    ground_truths += 1
    for i in range(args.n_label):
        predictions = np.argmax(logits[:, i*5:i*5+5], 1)+1
        ret.append(precision_score_cch(predictions, ground_truths[:, i]))
    return ret

In [160]:

def train_nn(weibo_id_list):
    train_dataset=WeiboDataset(weibo_id_list)
    data_loader = Data.DataLoader(
        dataset=train_dataset,      
        batch_size=args.batch_size,      
        shuffle=True,
        collate_fn=collate_fn,
        num_workers = args.n_worker,
    )
    
    model = DNN().to(args.device)
        
    no_decay = ["bias", "gamma","beta"]
    optimizer_grouped_parameters = [
        {
            "params": [p for name, p in model.named_parameters() if 'albert' not in name],
            "lr": args.lr,
        },
        {
            "params": [p for name, p in model.named_parameters() if 'albert' in name],
            "lr": args.fine_tune_layer_lr,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr = args.lr, weight_decay = args.weight_decay)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=int(len(weibo_id_list)//(args.batch_size)),
        num_training_steps=int(len(weibo_id_list) / args.batch_size * args.epoch)
    )
    
    
    for _ in range(args.epoch):
        for input_dict in tqdm(data_loader):
#             print(word_matrix)
            loss, logit = model(input_dict)            
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm = 5)
            optimizer.step()
            scheduler.step()
            if args.debug:
                break

    return model

def forward_nn(model, weibo_id_list):
    model.eval()
    train_dataset=WeiboDataset(weibo_id_list)
    data_loader = Data.DataLoader(
        dataset=train_dataset,      
        batch_size=args.batch_size,      
        shuffle=False,
        collate_fn=lambda x:collate_fn(x,True),
        num_workers = args.n_worker,
    )
    
    with torch.no_grad():
        pre_list = []
        for input_dict in tqdm(data_loader):
            _, logit = model(input_dict)
            pre_list.append(logit.cpu().detach().numpy())
    return np.concatenate(pre_list)

def cross_validation_nn(train_weibo_id_list, test_weibo_id_list):
    global df_repost_24h_origin
    n_flod = args.n_flod
    folds = KFold(n_splits=n_flod, shuffle=True, random_state=SEED)
    train_weibo_id_list = np.array(train_weibo_id_list)
    test_weibo_id_list = np.array(test_weibo_id_list)
    
    
    score_train = np.zeros((len(train_weibo_id_list), 5*args.n_label))
    score_test = np.zeros((len(test_weibo_id_list), 5*args.n_label))
    score_train_train = np.zeros((len(train_weibo_id_list), 5*args.n_label))
    
    for n_fold, (trn_idx, val_idx) in enumerate(folds.split(train_weibo_id_list, train_weibo_id_list)):
        
        trn_weibo_id_list = train_weibo_id_list[trn_idx]
        val_weibo_id_list = train_weibo_id_list[val_idx]
        model = train_nn(trn_weibo_id_list)
        
        model.eval()
        trn_logit = forward_nn(model, trn_weibo_id_list)
        val_logit = forward_nn(model, val_weibo_id_list)
        
#         trn_logit = logit_residual(trn_logit, np.array(df_weibo.loc[trn_weibo_id_list]['repost_weibo_cnt_60_mins']))
#         val_logit = logit_residual(val_logit, np.array(df_weibo.loc[val_weibo_id_list]['repost_weibo_cnt_60_mins']))

#         print(trn_logit)
#         print(pd.Series([count2idx(num)+1 for num in trn_logit[:, 0]]).value_counts())
        
        ipd.display(pd.DataFrame({
            'train': score_24h_ce(trn_logit, np.array(df_repost_24h_label.loc[trn_weibo_id_list]).astype('int')),
            'val': score_24h_ce(val_logit, np.array(df_repost_24h_label.loc[val_weibo_id_list]).astype('int')),
#             'random':score_24h_ce(np.array(df_repost_24h_origin.loc[val_weibo_id_list].sample(frac=1)), 
#                                np.array(df_repost_24h_origin.loc[val_weibo_id_list])),
        }, index=['%dh'%i for i in range(2, 25)]))
        
        score_train_train[trn_idx] = trn_logit
        score_train[val_idx] = val_logit
        
        test_logit = forward_nn(model, test_weibo_id_list)
#         test_logit = logit_residual(test_logit, np.array(df_weibo.loc[test_weibo_id_list]['repost_weibo_cnt_60_mins']))
        score_test += test_logit/n_flod
    
    
    
    ipd.display(pd.DataFrame({
        'train': score_24h_ce(score_train_train, np.array(df_repost_24h_label.loc[train_weibo_id_list]).astype('int')),
        'val': score_24h_ce(score_train, np.array(df_repost_24h_label.loc[train_weibo_id_list]).astype('int')),
    }))
    return score_train, score_test

## Main

In [164]:
SEED = 520
ARG = namedtuple('ARG', [
    'batch_size',
    'epoch',
    'lr',
    'fine_tune_layer_lr',
    'weight_decay',
    'dropout_rate',
    'n_worker',
    'device',
    'embedding_size',
    'weibo_text_max_length',
    'n_flod',
    'debug',
    'n_label',
])

args = ARG(
    batch_size = 64,
    epoch = 5,
    lr = 0.01,
    fine_tune_layer_lr=2e-5,
    weight_decay = 0.01,
    dropout_rate = 0.1,
    n_worker = 0,
    device=torch.device("cuda:1"),
#     device=torch.device("cpu"),
    embedding_size = 100,
    weibo_text_max_length = 500,
    n_flod = 5,
    debug = False,
    n_label = 23
)

def setup_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True  
# 设置随机数种子
setup_seed(SEED)


start = time.time()
train_weibo_id_list = list(df_weibo.query('type == "train"').index)
test_weibo_id_list = list(df_weibo.query('type == "test"').index)

score_train, score_test =\
cross_validation_nn(train_weibo_id_list, test_weibo_id_list)

time.time() - start

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  fold_sizes = np.full(n_splits, n_samples // n_splits, dtype=np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  test_mask = np.zeros(_num_samples(X), dtype=np.bool)


  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/57 [00:00<?, ?it/s]

Unnamed: 0,train,val
2h,0.86068,0.827219
3h,0.849812,0.751524
4h,0.841996,0.735864
5h,0.846016,0.697336
6h,0.853161,0.68935
7h,0.844702,0.65104
8h,0.84393,0.655403
9h,0.849257,0.643209
10h,0.849796,0.645354
11h,0.842023,0.671956


  0%|          | 0/37 [00:00<?, ?it/s]

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  test_mask = np.zeros(_num_samples(X), dtype=np.bool)


  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/57 [00:00<?, ?it/s]

Unnamed: 0,train,val
2h,0.873445,0.833216
3h,0.859606,0.78711
4h,0.840132,0.764573
5h,0.842041,0.735137
6h,0.842579,0.740064
7h,0.845003,0.716125
8h,0.840715,0.727753
9h,0.845712,0.725631
10h,0.84196,0.723989
11h,0.847054,0.718424


  0%|          | 0/37 [00:00<?, ?it/s]

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  test_mask = np.zeros(_num_samples(X), dtype=np.bool)


  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/57 [00:00<?, ?it/s]

Unnamed: 0,train,val
2h,0.8824,0.82639
3h,0.853488,0.774323
4h,0.842042,0.77176
5h,0.838153,0.746268
6h,0.845899,0.72617
7h,0.850353,0.697484
8h,0.853578,0.6697
9h,0.853551,0.663889
10h,0.850745,0.645284
11h,0.841812,0.639572


  0%|          | 0/37 [00:00<?, ?it/s]

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  test_mask = np.zeros(_num_samples(X), dtype=np.bool)


  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/57 [00:00<?, ?it/s]

Unnamed: 0,train,val
2h,0.857033,0.829723
3h,0.859677,0.748815
4h,0.840342,0.670905
5h,0.834959,0.649163
6h,0.84025,0.658666
7h,0.834576,0.661993
8h,0.838245,0.662721
9h,0.839521,0.652162
10h,0.840104,0.662566
11h,0.835194,0.650573


  0%|          | 0/37 [00:00<?, ?it/s]

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  test_mask = np.zeros(_num_samples(X), dtype=np.bool)


  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/57 [00:00<?, ?it/s]

Unnamed: 0,train,val
2h,0.872241,0.846738
3h,0.838544,0.825016
4h,0.821091,0.771117
5h,0.829537,0.757197
6h,0.829234,0.754397
7h,0.833763,0.742657
8h,0.832092,0.726578
9h,0.826835,0.72176
10h,0.830431,0.716421
11h,0.829742,0.698567


  0%|          | 0/37 [00:00<?, ?it/s]

Unnamed: 0,train,val
0,0.871755,0.832757
1,0.842234,0.777141
2,0.826161,0.741988
3,0.834478,0.715878
4,0.830985,0.712649
5,0.834227,0.692803
6,0.832933,0.687858
7,0.831221,0.680512
8,0.833187,0.678238
9,0.830843,0.675709


714.5982031822205

In [163]:
df_24h_feature_predict = pd.DataFrame(np.concatenate([score_train, score_test], 0),
                                      columns = ['24h_%d_ce'%(i) for i in range(score_train.shape[1])],
                                      index=train_weibo_id_list+test_weibo_id_list)
pickle.dump(df_24h_feature_predict, open('%s/df_24h_feature_predict.pickle'%VAR_PATH, 'wb'))

In [162]:
print(precision_score_cch(np.argmax(score_train[:,-5:], 1)+1,
                          np.array(df_weibo.loc[train_weibo_id_list]['label']).astype('int')+1))

0.674740475156894


In [112]:
df_24h_feature_predict.loc[train_weibo_id_list]

Unnamed: 0,24h_2_cnt,24h_3_cnt,24h_4_cnt,24h_5_cnt,24h_6_cnt,24h_7_cnt,24h_8_cnt,24h_9_cnt,24h_10_cnt,24h_11_cnt,...,24h_15_cnt,24h_16_cnt,24h_17_cnt,24h_18_cnt,24h_19_cnt,24h_20_cnt,24h_21_cnt,24h_22_cnt,24h_23_cnt,24h_24_cnt
763ce4f8e9efcdee22c2d6ce213e63b1f537a4f3,13.564613,17.246723,19.855774,21.646484,23.347260,24.956751,26.367662,27.477936,28.860573,29.917555,...,32.665710,32.976688,33.666336,34.407837,34.520889,35.282455,35.874401,36.172356,36.536362,37.350380
42d54a1097102d9a75a511e0f5636a994ff7dd7c,4.681041,5.401617,6.078712,6.533150,6.878187,7.145560,7.275946,7.506338,7.988200,7.812168,...,8.410031,8.473125,8.739178,8.812311,8.738842,8.756796,9.042443,9.183988,9.126014,9.251586
2d8941c58c9d2631c20f9c09a9da593b3bbde216,1.231175,1.306143,1.460704,1.489174,1.470732,1.578662,1.579891,1.626033,1.622843,1.695484,...,1.814142,1.769767,1.871040,1.855765,1.844180,1.872044,1.973104,1.887733,1.962945,1.893742
dfb9c0df6b005901a35bf88027b0ef55d9000307,1.471867,1.710959,1.897858,2.077141,2.171470,2.288732,2.389746,2.384694,2.481097,2.543777,...,2.772841,2.807849,2.858420,2.986877,2.957128,3.059203,3.056798,3.058477,3.122874,3.211499
ce721859f7ed701a0897897b80d185f69adbc954,15.728475,17.599394,18.838388,19.682812,20.272060,20.860115,21.110653,21.563339,22.031948,22.454010,...,23.839268,24.008007,24.178602,24.432610,24.544291,24.809427,24.919399,25.094673,25.136696,25.405006
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2359a2b3f0466421bda578e68fdb1ce02d28a5da,0.868205,1.524034,1.706019,2.097161,2.338561,2.599283,2.787515,3.002707,3.070028,3.298299,...,3.804104,3.922921,4.004694,4.155630,4.188793,4.297733,4.435905,4.395230,4.359338,4.679287
e754c308f1a3d968171e1f15a2524ef2b6016a0f,2.969362,3.369297,3.694206,3.861214,4.062732,4.144640,4.363541,4.493313,4.541786,4.568008,...,4.940418,5.006027,5.088579,4.975103,5.199602,5.184316,5.257410,5.310193,5.417529,5.393267
d63e5339dea5a6f6e441691b5ba1adcdca1a49bd,7.091922,8.185369,9.048647,9.556951,10.012577,10.407040,10.919363,11.130489,11.516726,11.841363,...,12.809282,13.106936,13.366754,13.655855,13.637417,13.856207,13.919920,14.214119,14.203831,14.333319
b56faacc0741690d6b50214fd297ea224fb49070,5.749001,10.480298,13.640436,16.218962,20.007818,22.657492,24.345432,26.912754,29.039322,29.475134,...,33.744221,34.641243,35.221287,34.724224,34.611465,37.595409,38.786518,40.192284,39.682659,40.834507


In [114]:
df_repost_24h_origin.loc['2359a2b3f0466421bda578e68fdb1ce02d28a5da']

24h_2_reposet_cnt     0
24h_3_reposet_cnt     0
24h_4_reposet_cnt     1
24h_5_reposet_cnt     1
24h_6_reposet_cnt     1
24h_7_reposet_cnt     1
24h_8_reposet_cnt     1
24h_9_reposet_cnt     1
24h_10_reposet_cnt    1
24h_11_reposet_cnt    1
24h_12_reposet_cnt    1
24h_13_reposet_cnt    1
24h_14_reposet_cnt    1
24h_15_reposet_cnt    2
24h_16_reposet_cnt    2
24h_17_reposet_cnt    2
24h_18_reposet_cnt    2
24h_19_reposet_cnt    2
24h_20_reposet_cnt    2
24h_21_reposet_cnt    2
24h_22_reposet_cnt    2
24h_23_reposet_cnt    2
24h_24_reposet_cnt    2
Name: 2359a2b3f0466421bda578e68fdb1ce02d28a5da, dtype: int64