In [1]:
%matplotlib inline
import os
import re
import collections
import pickle
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.utils.extmath import randomized_svd
from sklearn.model_selection import train_test_split

from janome.tokenizer import Tokenizer
from janome.analyzer import Analyzer
from janome.charfilter import *
from janome.tokenfilter import *
from collections import Counter

import smart_open
smart_open.open = smart_open.smart_open
import gensim

from gensim import utils
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec

In [2]:
def load(path):
    with open(path, 'rb') as f:
        data = pickle.load(f)
    return data

def dump(value, path):
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'wb') as f:
        pickle.dump(value, f)

In [3]:
doc_new_traing = False
matrix_reload = True

nb_feature_main = 20 # ユーザーの特徴量の次元削減の調整値
nb_feature_sub = 50 # ユーザーの特徴量の次元削減の調整値
threshold = 20000 # ユーザーの特徴量を取得するときの調整値（tweet数上位thresholdまでを取得する）

dataset_path = '../data/dataset/'
dict_ = load(dataset_path+'datset_dict.pkl')
df_tweet_data = load(dataset_path+'df_tweet_data.pkl')

In [4]:
def get_stats(dict_):
    """
    datasetから各要素を抽出する
    
    Parameters
    ----------
    dict_ : dict
        event単位で各学習用の要素をまとめたdict

    Returns
    -------
    nb_messages : list
        各eventに紐づく要素数
        
    eid_list : list
        evntのIDのリスト
        
    user_set : set
        userの集合
        
    list_lengths: list
        各eventの最大経過時間差
    
    label_list: list
        labelのlist
    
    """
    nb_messages = []
    eid_list = []
    user_set = set()
    list_lengths = []
    label_list = []
    for eid, messages in dict_.items():
        nb_messages.append(len(messages['timestamps']))
        eid_list.append(eid)

        user_set.update(messages['uid'])
        ts = np.array(messages['timestamps'], dtype=np.float32)
        list_lengths.append(ts[0])
        label_list.append(dict_[eid]['label'])
        
    return nb_messages, eid_list, user_set, list_lengths, label_list

nb_messages, eid_list, user_set, list_lengths, label_list = get_stats(dict_)  

print("# events : {}".format(len(eid_list)))
print("# users : {}".format(len(user_set)))
print("# messages : {}".format(np.sum(nb_messages)))
print("Avg. time length : {} sec\t{} hours".format(np.mean(list_lengths),np.mean(list_lengths)/3600))
print("Avg. # messages : {}".format(np.mean(nb_messages)))
print("Max # messages : {}".format(np.max(nb_messages)))
print("Min # messages : {}".format(np.min(nb_messages)))
print("Avg. messages / each user : {}".format(np.sum(nb_messages)/len(user_set)))

# events : 1910
# users : 70707
# messages : 142518
Avg. time length : 0.0 sec	0.0 hours
Avg. # messages : 74.61675392670158
Max # messages : 1368
Min # messages : 1
Avg. messages / each user : 2.015613729899444


In [5]:
def get_Usample(dict_, most_common=50):
    """
    各ユーザーのeventの出現数を算出する
    
    Parameters
    ----------
    dict_ : dict
        event単位で各学習用の要素をまとめたdict
        
    most_common : int
        上位何ユーザーまでを取得するかの設定値

    Returns
    -------
    cnt.most_common(most_common) : dict
        上位most_commonの各ユーザーのtweetしたevent数 ⇒ [(user_id, Number of appearances in events), ...]
    
    """
    u_sample = []
    cnt = Counter()
    for ii, (eid, value) in enumerate(dict_.items()):
        users = value['uid']
        cnt.update(users)
    return cnt.most_common(most_common)

u_sample = get_Usample(dict_, most_common=threshold)
u_pop = get_Usample(dict_, most_common=len(user_set))
print("u_sample for most common {} users is obtained.".format(threshold))
print("u_pop for all {} users is obtained.".format(len(user_set)))

u_sample for most common 20000 users is obtained.
u_pop for all 70707 users is obtained.


In [6]:
'''
Here are Two user-event matrices.
    1) matrix_main : (all user - all event) relation
        It has #occurrences of a user(row) in an event(col)
        It is very sparse.
        It is decomposed with smaller K.
    2) matrix_sub : (u_sample - eid_sample) relation
        It has #occurrences of a user(row) in an event(col)
        It is denser.
        It is decomposed with larger K. (usually)
'''
user_sample2ind = {}
for ii, (uid, nb_occur) in enumerate(u_sample):
    user_sample2ind[uid] = ii
print("# users in u_sample : {}".format(len(user_sample2ind)))
user2ind = {}
for ii, uid in enumerate(user_set):
    user2ind[uid] = ii
print("# users : {}".format(len(user2ind)))

eid2ind = {}
for ii, eid in enumerate(eid_list):
    eid2ind[eid] = ii
print("# events : {}".format(len(eid2ind)))

# users in u_sample : 20000
# users : 70707
# events : 1910


In [7]:
from scipy.sparse import csr_matrix

def get_user_in_event(dict_, eid, u_events):
    """
    各envetのユーザーごとの出現数を算出する
    
    Parameters
    ----------
    dict_ : dict
        event単位で各学習用の要素をまとめたdict
        
    eid : str
        event id
        
    u_events : dict
        各ユーザーの出現したevent数 [(user_id, Number of appearances in events), ...]

    Returns
    -------
    user_in_event : dict
        対象eventで各ユーザーがtweetした回数 [(user_id, tweets in eid), ...]
    
    """
    value = dict_[eid]
    cnt = Counter(value['uid'])
    users = set(value['uid'])
    user_in_event = []
    for uid, nb_occur in u_events:
        if uid in users:
            user_in_event.append((uid, cnt[uid])) 
    return user_in_event


def get_user_event_matrix(dict_, u_events, user2ind, binary=False):
    """
    userとeventの行列を取得する
    
    Parameters
    ----------
    dict_ : dict
        event単位で各学習用の要素をまとめたdict
        
    eid : str
        event id
        
    u_events : dict
        各ユーザーの出現したevent数 [(user_id, Number of appearances in events), ...]
        
    user2ind : dict
        ユーザーIDとindex [(user_id, index), ...]

    Returns
    -------
    csr_matrix : csr_matrix
        userとeventの行列 [(user_index, event_index), nb_occur or binary]
        
    eid2ind : dict
        tweetしたuserが0以外のeventとindex [(event_id, index), ...]
    
    """
    row = []
    col = []
    data = []
    index = 0
    eid2ind = {}
    for ii, (eid, value) in enumerate(dict_.items()):
        user_in_event = get_user_in_event(dict_, eid, u_events)
        if len(user_in_event)==0:
            # No user in u_events appears in this eid event.
            continue
        else:
            eid2ind[eid] = index
        
        for uid, nb_occur in user_in_event:
            uind = user2ind[uid]
            col.append(index)
            row.append(uind)
            if binary:
                data.append(1)    # Binary matrix
            else:
                data.append(nb_occur)
        index+=1
        
    print("{} events have at least one user in u_sample".format(index))
    print("{} events have no user in u_sample".format(len(dict_)-index))
    return csr_matrix((data, (row, col)), shape=(len(user2ind), len(eid2ind))), eid2ind
    

matrix_sub, eid_sample2ind = get_user_event_matrix(dict_, u_sample, user_sample2ind, binary=True)
matrix_main, eid_main2ind = get_user_event_matrix(dict_, u_pop, user2ind, binary=True)
matrix_main_cnt, eid_main_cnt2ind = get_user_event_matrix(dict_, u_pop, user2ind, binary=False)
print("matrix_sub shape : {}".format(matrix_sub.shape))
print("Sparsity : {}".format(matrix_sub.count_nonzero()/(matrix_sub.shape[0]*matrix_sub.shape[1])))
print("matrix_main shape : {}".format(matrix_main.shape))
print("Sparsity : {}".format(matrix_main.count_nonzero()/(matrix_main.shape[0]*matrix_main.shape[1])))

1894 events have at least one user in u_sample
16 events have no user in u_sample
1910 events have at least one user in u_sample
0 events have no user in u_sample
1910 events have at least one user in u_sample
0 events have no user in u_sample
matrix_sub shape : (20000, 1894)
Sparsity : 0.002246805702217529
matrix_main shape : (70707, 1910)
Sparsity : 0.0010056692180850746


In [8]:
if matrix_reload:
    ### Load matrix_main
    u_main = load(dataset_path+'matrix/tweet_u_main.pikl')
    sigma_main = load(dataset_path+'matrix/tweet_sigma_main.pikl')
    vt_main = load(dataset_path+'matrix/vt_main.pikl')
    user_feature = u_main.dot(np.diag(sigma_main)) # svdで次元削減
    print("user_feature shape : {}".format(user_feature.shape))

    ### Load matrix_sub
    u_sub = load(dataset_path+'matrix/tweet_u_sub.pikl')
    sigma_sub = load(dataset_path+'matrix/tweet_sigma_sub.pikl')
    vt_sib = load(dataset_path+'matrix/vt_sub.pikl')

    user_feature_sub = u_sub.dot(np.diag(sigma_sub))
    print("user_feature_sub shape : {}".format(user_feature_sub.shape))
    print("Loading is Done.")
else:
    n_iter = 7
    u_main, sigma_main, vt_main = randomized_svd(matrix_main, n_components=100,
                                                 n_iter=n_iter, random_state=42)  # random_state=42
    user_feature = u_main.dot(np.diag(sigma_main))
    print("user_feature shape : {}".format(user_feature.shape))

    matrix_sub = matrix_sub.dot(matrix_sub.transpose())
    matrix_sub_array = matrix_sub.toarray()
    u_sub, sigma_sub, vt_sub = randomized_svd(matrix_sub, n_components=100,
                                              n_iter=n_iter, random_state=42)  # random_state=42
    user_feature_sub = u_sub.dot(np.diag(sigma_sub))
    print("user_feature_sub shape : {}".format(user_feature_sub.shape))
    print("SVD is done")
    dump(u_main, dataset_path+'matrix/tweet_u_main.pikl')
    dump(sigma_main, dataset_path+'matrix/tweet_sigma_main.pikl')
    dump(vt_main, dataset_path+'matrix/vt_main.pikl')
    dump(u_sub, dataset_path+'matrix/tweet_u_sub.pikl')
    dump(sigma_sub, dataset_path+'matrix/tweet_sigma_sub.pikl')
    dump(vt_sub, dataset_path+'matrix/vt_sub.pikl')

user_feature shape : (70758, 100)
user_feature_sub shape : (20000, 100)
Loading is Done.


In [9]:
def mk_sntences(dict_, eid_list, resolution='hour', threshold=90*24):
    """
    Doc2Vecの学習用のsentencesを生成する
    
    Parameters
    ----------
    dict_ : dict
    event単位で各学習用の要素をまとめたdict 
    
    eid_list : list
        event idのlist
        
    resolution : str
        day -> binsize:3600*24
        hour -> binsize:3600
        binsize -> binsize:60
        
    threshold : int
        tweetの経過時間のhistogramを取得する閾値

    Returns
    -------
    sentences : list
        各eventのtweet文を時系列順のhistogramのbin単位でまとめたsentences [LabeledSentence(words=['テスト', '作る'], tags=['eid_no']),...]
    
    """
    sentences = []
    
    for ii, eid in enumerate(eid_list):
        if ii%100==0:
            print("{}th event {} is processing...".format(ii+1, eid))
        messages = dict_[eid]
        ts = np.array(messages['timestamps'], dtype=np.int32)
        text_seq = np.array(messages['text'])

        if resolution=='day':
            binsize = 3600*24
        elif resolution=='hour':
            binsize = 3600
        elif resolution=='minute':
            binsize = 60
            
        cnt, bins = np.histogram(ts, bins=range(0, threshold*binsize, binsize)) # histogramの頻度とbinを取得

        nonzero_bins_ind = np.nonzero(cnt)[0] # 頻度0ではない要素番号を抽出
        nonzero_bins = bins[nonzero_bins_ind] # 頻度0ではないbinsを取得
        print(ii, eid, len(nonzero_bins))

        for bid, bin_left in enumerate(nonzero_bins):
            bin_right = bin_left + binsize
            doc = None
            # Collecting text to make doc
            for tid, t in enumerate(ts):
                if t<bin_left:
                    continue
                elif t>=bin_right:
                    break
                else:
                    pass
                string = text_seq[tid]
                string = re.sub(r"http\S+", "", string)
                string = re.sub("[?!.,:;()'@#$%^&*-=+/\[\[\]\]]", ' ', string) # !.,:;()'@#$%^&*-_{}=+/\"
                
                if doc==None:
                    doc = string
                else:
                    doc += string
                    
            if isinstance(eid, int):
                eid_str = str(eid)
            else:
                eid_str = eid
            sentences.append(TaggedDocument(utils.to_unicode(doc).split(), [eid_str+'_%s' % bid]))
    
    print("length of sentences : {}".format(len(sentences)))
    
    return sentences

threshold = 90*24
binsize = 3600
sentences = mk_sntences(dict_, eid_list, resolution='hour', threshold=90*24)

1th event 1261 is processing...
0 1261 36
1 1432 127
2 1594 74
3 707 85
4 404 27
5 1312 76
6 400 5
7 1581 7
8 706 1
9 175 95
10 702 151
11 604 30
12 1886 2
13 1885 42
14 1717 103
15 1314 22
16 1542 13
17 605 9
18 1893 7
19 1466 142
20 578 3
21 144 91
22 643 157
23 290 46
24 154 14
25 142 79
26 250 22
27 896 57
28 195 3
29 1183 28
30 221 4
31 315 30
32 1701 60
33 1262 94
34 1264 7
35 1253 45
36 393 5
37 319 93
38 317 5
39 1260 19
40 316 7
41 1889 40
42 1899 7
43 1268 251
44 1872 2
45 60 3
46 1362 38
47 1884 6
48 1883 15
49 1901 5
50 1256 24
51 1516 1
52 1272 54
53 1898 35
54 1902 5
55 1165 89
56 1438 10
57 1455 19
58 1699 68
59 808 1
60 520 4
61 1896 65
62 304 5
63 701 5
64 1892 78
65 1521 13
66 281 2
67 1888 31
68 92 14
69 155 47
70 1887 20
71 1094 7
72 208 47
73 860 87
74 1075 99
75 140 88
76 1322 158
77 932 161
78 1891 17
79 1719 39
80 51 3
81 1266 4
82 1095 58
83 251 10
84 1651 14
85 1890 26
86 1181 15
87 1367 129
88 1373 96
89 114 2
90 1246 77
91 951 62
92 1310 52
93 1305 120
94 12

In [10]:
if doc_new_traing:
    print("doc_vectorizer is loaded.")
    
    doc_vectorizer = Doc2Vec(min_count=1, window=10, vector_size=100, sample=1e-4, negative=5, workers=8)
    doc_vectorizer.build_vocab(sentences)
    
    for epoch in range(10):
        print(epoch)
        doc_vectorizer.train(sentences, total_examples=doc_vectorizer.corpus_count, epochs=doc_vectorizer.epochs)
    print("doc2vec training is done.")

    doc_vectorizer.save('../model/doc2vec_model/twitter.doc2vec.dbow300d.model')
    
else:
    doc_vectorizer = Doc2Vec.load('../model/doc2vec_model/twitter.doc2vec.dbow300d.model')
    print("doc_vectorizer is loaded.")

doc_vectorizer is loaded.


In [11]:
### Building Model
nb_rumor = 0
noerr_eid_list = set()
burnin = 0

### Create dataset ###
X_dict = {}
X_uidx_dict = {}
subX_dict = {}
y_dict = {}
y_list = []
read_text = True
read_user = True

rumor_user = []
nonrumor_user = []
eid_train, eid_test, label_train, label_test = train_test_split(eid_list, label_list,
                                             test_size=0.2, random_state=3, stratify=label_list)

print(f'Label ratio of train: {Counter(label_train)}')
print(f'Label ratio of test: {Counter(label_test)}')

Label ratio of train: Counter({0: 1415, 1: 113})
Label ratio of test: Counter({0: 354, 1: 28})


In [22]:
def create_dataset(dict_, eid, threshold=90, resolution='day', read_text=False, doc2vec_model=None, user_feature=None, user2ind=None, read_user=False,
                   cutoff=50, return_useridx=True):
    """
    main module用の学習データを作成する
    
    Parameters
    ----------
    dict_ : dict
    event単位で各学習用の要素をまとめたdict 
    
    eid : str
        event id
        
    resolution : str
        day -> binsize:3600*24
        hour -> binsize:3600
        binsize -> binsize:60
        
    threshold : int
        tweetの経過時間のhistogramを取得する閾値
        
    read_text : bloom
        tweet文の処理の有無
        
    doc2vec_model : gensim.models.doc2vec.Doc2Vec
        doc2vecの学習用model
    
    return_useridx : bloom
        X_uidxを返すかどうかの判断

    Returns
    -------
    X : array
        対象eventのtweet数の配列、tweet間隔の配列、ユーザー特徴量の平均の配列、tweetテキストの分散表現の配列を横に結合した配列
        ⇒ [[tweet数, tweet間隔, ユーザー特徴量の平均, ..., tweetテキストの分散表現, ...], [....], ....] 
        ⇒ siz (bins or cutoff, 1 + 1 + nb_feature_main + doc2vec_model of vector_size)
    
    X_useridx : list
        対象eventをtweetしたユーザーのリスト
    
    y : int
        対象eventのlabel 
        ⇒ 0:No Fake、 1:Fake
    
    """
    messages = dict_[eid]
    ts = np.array(messages['timestamps'], dtype=np.int32)    
    user_list = messages['uid']
    
    if read_text:
        text_seq = np.array(messages['text'])
    else:
        text_seq = None
        
    if read_user:
        X, X_uidx = get_features(eid, ts, threshold=threshold, resolution=resolution, read_text=read_text,
                              text_seq=text_seq, doc2vec_model=doc2vec_model, read_user=read_user,
                              user_feature=user_feature, user2ind=user2ind, user_list=user_list,
                              cutoff=cutoff, return_useridx=return_useridx)
    else:
        X = get_features(eid, ts, threshold=threshold, resolution=resolution, read_text=read_text,
                              text_seq=text_seq, doc2vec_model=doc2vec_model, read_user=read_user,
                              user_feature=user_feature, user2ind=user2ind, user_list=user_list,
                              cutoff=cutoff, return_useridx=return_useridx)

    y = int(messages['label'])
    if return_useridx:
        return X, X_uidx, y
    else:
        return X, y
        
        
def get_features(eid, timestamps, threshold=90, resolution='day', read_text=False,
                 text_seq=None, read_user=False, doc2vec_model=None, user_feature=None, user2ind=None, user_list=None,
                 cutoff=50, return_useridx=True):
    """
    eid単位で学習データを作成する
    
    Parameters
    ----------
    eid : str
        event id
        
    timestamps : list
        tweet開始からの経過時間（秒）のlist
        
    resolution : str
        day -> binsize:3600*24
        hour -> binsize:3600
        binsize -> binsize:60
        
    threshold : int
        tweetの経過時間のhistogramを取得する閾値
        
    read_text : bloom
        tweet文の処理の有無
        
    text_seq : array
        対象eventのtextデータのarray
        
    doc2vec_model : gensim.models.doc2vec.Doc2Vec
        doc2vecの学習用model
        
    user_feature : array
        ユーザーの特徴量行列
        
    user2ind : dict
        ユーザーIDとindex [(user_id, index), ...]
        
    user_list : list
        ユーザーIDのlist
        
    cutoff : int
        timestampsのhistの上限値
    
    return_useridx : bloom
        X_useridxを返すかどうかの判断

    Returns
    -------
    X : array
        tweet数の配列、tweet間隔の配列、ユーザー特徴量の平均の配列、tweetテキストの分散表現の配列を横に結合した配列
        ⇒ [[tweet数, tweet間隔, ユーザー特徴量の平均, ..., tweetテキストの分散表現, ...], [....], ....] 
        ⇒ siz (bins or cutoff, 1 + 1 + nb_feature_main + doc2vec_model of vector_size)
        
    
    X_useridx : list
        対象eventをtweetしたユーザーのリスト
    
    """
    
    ts = timestamps
    if resolution=='day':
        binsize = 3600*24
    elif resolution=='hour':
        binsize = 3600
    elif resolution=='minute':
        binsize = 60
    cnt, bins = np.histogram(ts, bins=range(0,threshold*binsize,binsize))
    
    nonzero_bins_ind = np.nonzero(cnt)[0]
    nonzero_bins = bins[nonzero_bins_ind]
    
    hist = cnt[nonzero_bins_ind]
    inv = nonzero_bins_ind[1:]-nonzero_bins_ind[:-1]
    intervals = np.insert(inv,0,0)
    ### Cutoff sequence
    if len(hist)>cutoff:
        hist = hist[:cutoff]
        intervals = intervals[:cutoff]
        nonzero_bins = nonzero_bins[:cutoff]
        
    print(nonzero_bins.shape)
    ### user feature   
    if read_user:
        X_useridx = []
        X_user = None
        for bid, bin_left in enumerate(nonzero_bins):
            bin_userlist = []
            bin_right = bin_left + binsize
            temp = None
            for tid, t in enumerate(ts):
                if t<bin_left:
                    continue
                elif t>=bin_right:
                    break
                else:
                    pass
                uid = user2ind[user_list[tid]]
                bin_userlist.append(user_list[tid])
                coef = user_feature[uid,:].reshape(1,-1)   # (1,n_components)
                
                if temp is not None:
                    temp = np.concatenate((temp, coef), axis=0)
                else:
                    temp = coef

            X_user_bin = np.mean(temp, axis=0).reshape(1,-1)
            print(X_user_bin.shape)

            if X_user is not None:
                X_user = np.concatenate((X_user, X_user_bin), axis=0)
            else:
                X_user = X_user_bin
                
            X_useridx.append(bin_userlist)
            
    ### text feature
    if read_text:
        text_matrix = get_doc2vec(doc2vec_model, eid, nonzero_bins)
        
    print(hist.reshape(-1,1).shape)
    print(intervals.reshape(-1,1).shape)
    print(X_user.shape)
    print(text_matrix.shape)
    
    if read_text and read_user:
        if return_useridx:
            return np.hstack([hist.reshape(-1,1), intervals.reshape(-1,1), X_user, text_matrix]), X_useridx
        else:
            return np.hstack([hist.reshape(-1,1), intervals.reshape(-1,1), X_user, text_matrix])
    elif read_text or read_user:
        if read_text:
            return np.hstack([hist.reshape(-1,1), intervals.reshape(-1,1), text_matrix])
        elif read_user:
            if return_useridx:
                return np.hstack([hist.reshape(-1,1), intervals.reshape(-1,1), X_user]), X_useridx
            else:
                return np.hstack([hist.reshape(-1,1), intervals.reshape(-1,1), X_user])
    else:
        return np.hstack([hist.reshape(-1,1), intervals.reshape(-1,1)])
    
def get_doc2vec(doc2vec_model, eid, nonzero_bins):
    """
    eid単位でtweetの分散表現を取得する
    
    Parameters
    ----------
    doc2vec_model : gensim.models.doc2vec.Doc2Vec
        doc2vecの学習用model
        
    eid : str
        event id
    
    nonzero_bins : array
        timestampsのhistogramのbins(度数0は除かれている)

    Returns
    -------
    X_text : array
        tweet textの分散表現
    
    """
    X_text = None
    for bid, bin_left in enumerate(nonzero_bins):
        if isinstance(eid, int):
            eid_str = str(eid)
        else:
            eid_str = eid
        tag = eid_str+'_'+str(bid)
        temp = doc2vec_model.docvecs[tag]  # (300,)
        temp = temp.reshape(1,-1)
        
        if X_text is not None:
            X_text = np.concatenate((X_text, temp), axis=0) # 縦に結合
        else:
            X_text = temp
            
    return X_text

def get_user_feature_in_event(dict_, eid, u_sample, user_feature_sub, user_sample2ind):
    """
    sub module用の学習データをeid単位で作成する
    
    Parameters
    ----------
    dict_ : dict
        event単位で各学習用の要素をまとめたdict
        
    eid : str
        event id
        
    u_sample : dict
        各ユーザーの出現したevent数 [(user_id, Number of appearances in events), ...]
        
    user_feature_sub : array
        ユーザーの特徴量行列
        
    user_sample2ind :  : dict
        ユーザーIDとindex [(user_id, index), ...]
        
    Returns
    -------
    ret_matrix : array
        対象eventでtweetしているユーザーの特徴量行列
        ⇒ [[ユーザーの特徴量, ...], [...], ...]
        ⇒ size (users_in_event, )
    """
    user_in_event = get_user_in_event(dict_, eid, u_sample)
    nb_feature = user_feature_sub.shape[1]
    
    for uid, nb_occur in user_in_event:
        uind = user_sample2ind[uid]
        feature_vec = user_feature_sub[uind,:].reshape(1,-1)
        try:
            ret_matrix = np.concatenate((ret_matrix, feature_vec), axis=0)
        except:
            ret_matrix = feature_vec
    try:
        return ret_matrix
    except:
        ### if user_in_event is empty
        return np.zeros((1,nb_feature))

In [13]:
# doc_vectorizer = None
for ii, eid in enumerate(eid_list):
    # user_feature_sub[:,:nb_feature_main] = 重要度の上位nb_feature_mainまで次元削減する
    if read_user:
        X, X_uidx, y = create_dataset(dict_, eid, threshold=90*24, resolution='hour',
                                 read_text=read_text, doc2vec_model=doc_vectorizer, user_feature=user_feature[:,:nb_feature_main], 
                                 user2ind=user2ind, read_user=read_user, cutoff=50,
                                 return_useridx=True)
    else:
        X, y = create_dataset(dict_, eid, threshold=90*24, resolution='hour',
                                 read_text=read_text, doc2vec_model=doc_vectorizer, user_feature=user_feature[:,:nb_feature_main], 
                                 user2ind=user2ind, read_user=read_user, cutoff=50,
                                 return_useridx=False)
    if ii%100==0:
        print("processing... {}/{}  shape:{}".format(ii+1, len(eid_list), X.shape))
        
    label = int(dict_[eid]['label'])
    if label==0:
        nonrumor_user.extend(dict_[eid]['uid'])
    elif label==1:
        rumor_user.extend(dict_[eid]['uid'])
        
    X = X.astype(np.float32)
    
    if X.shape[0]<=2*burnin:  # ignore length<=1 sequence
        continue

    X_dict[eid] = X
    
    if read_user:
        X_uidx_dict[eid] = X_uidx
        
    # user_feature_sub[:,:nb_feature_sub] = 重要度の上位nb_feature_subまで次元削減する
    subX_dict[eid] = get_user_feature_in_event(dict_, eid, u_sample, 
                                               user_feature_sub[:,:nb_feature_sub], user_sample2ind)
    y_dict[eid] = y
    
print("Dataset are created.")

processing... 1/1910  shape:(36, 122)
processing... 101/1910  shape:(50, 122)
processing... 201/1910  shape:(14, 122)
processing... 301/1910  shape:(7, 122)
processing... 401/1910  shape:(8, 122)
processing... 501/1910  shape:(7, 122)
processing... 601/1910  shape:(2, 122)
processing... 701/1910  shape:(3, 122)
processing... 801/1910  shape:(2, 122)
processing... 901/1910  shape:(11, 122)
processing... 1001/1910  shape:(1, 122)
processing... 1101/1910  shape:(5, 122)
processing... 1201/1910  shape:(12, 122)
processing... 1301/1910  shape:(39, 122)
processing... 1401/1910  shape:(21, 122)
processing... 1501/1910  shape:(7, 122)
processing... 1601/1910  shape:(3, 122)
processing... 1701/1910  shape:(3, 122)
processing... 1801/1910  shape:(16, 122)
processing... 1901/1910  shape:(4, 122)
Dataset are created.


In [14]:
dump(user2ind, dataset_path+'user2ind.pkl')
dump(eid2ind, dataset_path+'eid2ind.pkl')
dump(eid_train, dataset_path+'eid_train.pkl')
dump(eid_test, dataset_path+'eid_test.pkl')
dump(X, dataset_path+'X.pkl')
dump(X_dict, dataset_path+'X_dict.pkl')
dump(y_dict, dataset_path+'y_dict.pkl')
dump(y_dict, dataset_path+'y_dict.pkl')
dump(dict_, dataset_path+'dict_.pkl')
dump(subX_dict, dataset_path+'subX_dict.pkl')