In [1]:
%matplotlib inline
import re
import collections
import pickle
import numpy as np
from sklearn.preprocessing import MinMaxScaler
# from sklearn.decomposition import NMF, TruncatedSVD
from sklearn.utils.extmath import randomized_svd
from janome.tokenizer import Tokenizer
from janome.analyzer import Analyzer
from janome.charfilter import *
from janome.tokenfilter import *

In [2]:
dataset_path = '/Users/kanekotakafumi/github/fake_news_detection_research/data/dataset/'
dict_ = pickle.load(open(dataset_path+'datset_dict.pkl','rb'))

In [3]:
def get_stats(dict_):
    nb_messages = []
    eid_list = []
    user_set = set()
    list_lengths = []
    for eid, messages in dict_.items():
        nb_messages.append(len(messages['timestamps']))
        eid_list.append(eid)

        user_set.update(messages['uid'])
        ts = np.array(messages['timestamps'], dtype=np.float32)
        list_lengths.append(ts[-1]-ts[0])
        
    return nb_messages, eid_list, user_set, list_lengths

nb_messages, eid_list, user_set, list_lengths = get_stats(dict_)  

print("# events : {}".format(len(eid_list)))
print("# users : {}".format(len(user_set)))
print("# messages : {}".format(np.sum(nb_messages)))
print("Avg. time length : {} sec\t{} hours".format(np.mean(list_lengths),np.mean(list_lengths)/3600))
print("Avg. # messages : {}".format(np.mean(nb_messages)))
print("Max # messages : {}".format(np.max(nb_messages)))
print("Min # messages : {}".format(np.min(nb_messages)))
print("Avg. messages / each user : {}".format(np.sum(nb_messages)/len(user_set)))

# events : 1914
# users : 71611
# messages : 144098
Avg. time length : 2754689.25 sec	765.1914583333333 hours
Avg. # messages : 75.28631138975966
Max # messages : 1368
Min # messages : 1
Avg. messages / each user : 2.01223275753725


In [4]:
from collections import Counter

def get_Usample(dict_, most_common=50):
    '''Get U_sample who are most_common.'''
    u_sample = []
    cnt = Counter()
    for ii, (eid, value) in enumerate(dict_.items()):
        users = value['uid']
        cnt.update(users)
    return cnt.most_common(most_common)    # [(user_id, #occur in all events), ...]

def get_user_in_event(dict_, eid, u_sample):
    '''Get users who acts on a given event, eid'''
    value = dict_[eid]
    cnt = Counter(value['uid'])
    users = set(value['uid'])
    user_in_event = []
    for uid, nb_occur in u_sample:
        if uid in users:
#             user_in_event.append((uid, nb_occur))    # [(user_id, #occur in all events), (user_id, #occur in all events), ...]
            user_in_event.append((uid, cnt[uid]))    # [(user_id, #occur in eid), (user_id, #occur in eid), ...]
    return user_in_event    # [(user_id, #occur in eid), (user_id, #occur in eid), ...]

threshold = 20000
u_sample = get_Usample(dict_, most_common=threshold)
u_pop = get_Usample(dict_, most_common=len(user_set))
print("u_sample for most common {} users is obtained.".format(threshold))
print("u_pop for all {} users is obtained.".format(len(user_set)))

u_sample for most common 20000 users is obtained.
u_pop for all 71611 users is obtained.


In [5]:
'''
Here are Two user-event matrices.
    1) matrix_main : (all user - all event) relation
        It has #occurrences of a user(row) in an event(col)
        It is very sparse.
        It is decomposed with smaller K.
    2) matrix_sub : (u_sample - eid_sample) relation
        It has #occurrences of a user(row) in an event(col)
        It is denser.
        It is decomposed with larger K. (usually)
'''
user_sample2ind = {}
for ii, (uid, nb_occur) in enumerate(u_sample):
    user_sample2ind[uid] = ii
print("# users in u_sample : {}".format(len(user_sample2ind)))
user2ind = {}
for ii, uid in enumerate(user_set):
    user2ind[uid] = ii
print("# users : {}".format(len(user2ind)))
eid2ind = {}
for ii, eid in enumerate(eid_list):
    eid2ind[eid] = ii
print("# events : {}".format(len(eid2ind)))

# users in u_sample : 20000
# users : 71611
# events : 1914


In [6]:
from scipy.sparse import csr_matrix

def get_user_event_matrix(dict_, u_sample, user2ind, binary=False):
    '''Get (user,event) matrix.
    This matrix will be decomposed by TruncatedSVD (or else?)
    Only users in u_sample are considered.
    '''
    row = []
    col = []
    data = []
    jj = 0
    eid2ind = {}
    for ii, (eid, value) in enumerate(dict_.items()):
        user_in_event = get_user_in_event(dict_, eid, u_sample)
        if len(user_in_event)==0:
            # No user in u_sample appears in this eid event.
            continue
        else:
            eid2ind[eid] = jj
#             eind = eid2ind[eid]
        for uid, nb_occur in user_in_event:
            uind = user2ind[uid]
            col.append(jj)
            row.append(uind)
            if binary:
                data.append(1)    # Binary matrix
            else:
                data.append(nb_occur)
        jj+=1
    print("{} events have at least one user in u_sample".format(jj))
    print("{} events have no user in u_sample".format(len(dict_)-jj))
    return csr_matrix((data, (row, col)), shape=(len(user2ind), len(eid2ind))), eid2ind
    

matrix_sub, eid_sample2ind = get_user_event_matrix(dict_, u_sample, user_sample2ind, binary=True)
matrix_main, eid_main2ind = get_user_event_matrix(dict_, u_pop, user2ind, binary=True)
matrix_main_cnt, eid_main_cnt2ind = get_user_event_matrix(dict_, u_pop, user2ind, binary=False)
print("matrix_sub shape : {}".format(matrix_sub.shape))
print("Sparsity : {}".format(matrix_sub.count_nonzero()/(matrix_sub.shape[0]*matrix_sub.shape[1])))
print("matrix_main shape : {}".format(matrix_main.shape))
print("Sparsity : {}".format(matrix_main.count_nonzero()/(matrix_main.shape[0]*matrix_main.shape[1])))

1898 events have at least one user in u_sample
16 events have no user in u_sample
1914 events have at least one user in u_sample
0 events have no user in u_sample
1914 events have at least one user in u_sample
0 events have no user in u_sample
matrix_sub shape : (20000, 1898)
Sparsity : 0.0022579030558482613
matrix_main shape : (71611, 1914)
Sparsity : 0.001001879027505027


In [7]:
RELOAD = False
if RELOAD:
    ### Load matrix_main
    u_main = np.load(open('matrix/tweet_u_main.npy','rb'))
    sigma_main = np.load(open('matrix/tweet_sigma_main.npy','rb'))
    vt_main = np.load(open('matrix/tweet_vt_main.npy','rb'))

    user_feature = u_main.dot(np.diag(sigma_main))
    nb_feature_main = 20     # 10 for weibo, 20 for tweet
    print("user_feature shape : {}".format(user_feature.shape))

    ### Load matrix_sub
    u_sub = np.load(open('matrix/tweet_u_sub.npy','rb'))
    sigma_sub = np.load(open('matrix/tweet_sigma_sub.npy','rb'))
    vt_sib = np.load(open('matrix/tweet_vt_sub.npy','rb'))

    user_feature_sub = u_sub.dot(np.diag(sigma_sub))
    nb_feature_sub = 50
    print("user_feature_sub shape : {}".format(user_feature_sub.shape))
    print("Loading is Done.")
else:
    nb_feature_main = 20     # 10 for weibo, 20 for tweet
    n_iter = 7    # 15 for weibo, 7 for tweet
    u_main, sigma_main, vt_main = randomized_svd(matrix_main, n_components=100,
                                                 n_iter=n_iter, random_state=42)  # random_state=42
    user_feature = u_main.dot(np.diag(sigma_main))
    print("user_feature shape : {}".format(user_feature.shape))

    nb_feature_sub = 50
    matrix_sub = matrix_sub.dot(matrix_sub.transpose())
    matrix_sub_array = matrix_sub.toarray()
    u_sub, sigma_sub, vt_sub = randomized_svd(matrix_sub, n_components=100,
                                              n_iter=n_iter, random_state=42)  # random_state=42
    user_feature_sub = u_sub.dot(np.diag(sigma_sub))
    print("user_feature_sub shape : {}".format(user_feature_sub.shape))
    print("SVD is done")

user_feature shape : (71611, 100)
user_feature_sub shape : (20000, 100)
SVD is done


In [8]:
import smart_open
smart_open.open = smart_open.smart_open
import gensim

In [9]:
import re

from gensim import utils
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec

threshold = 90*24
resolution = 'hour'
sentences = []

# chinese_stopwords = '、 。 〃 〄 々 〆 〇 〈〉 《 》 「 」 『 』 【】 〒 〓 〔 〕 〖 〗 〘〙 〚 〛 〛 〜 〝 〞 〟，'.decode('utf-8')
# rx = '[' + re.escape(''.join(chinese_stopwords.split())) + ']'

for ii, eid in enumerate(eid_list):
    if ii%100==0:
        print("{}th event {} is processing...".format(ii+1, eid))
    messages = dict_[eid]
    ts = np.array(messages['timestamps'], dtype=np.int32)
    text_seq = np.array(messages['text'])
    
    if resolution=='day':
        binsize = 3600*24
    elif resolution=='hour':
        binsize = 3600
    elif resolution=='minute':
        binsize = 60
    cnt, bins = np.histogram(ts, bins=range(0,threshold*binsize,binsize))
    
    nonzero_bins_ind = np.nonzero(cnt)[0]
    nonzero_bins = bins[nonzero_bins_ind]
    print(ii, eid, len(nonzero_bins))
    hist = cnt[nonzero_bins_ind]
    inv = nonzero_bins_ind[1:]-nonzero_bins_ind[:-1]
    intervals = np.insert(inv,0,0)

    for bid, bin_left in enumerate(nonzero_bins):
        bin_right = bin_left + binsize
        try:
            del doc
        except:
            pass
        # Collecting text to make doc
        for tid, t in enumerate(ts):
            if t<bin_left:
                continue
            elif t>=bin_right:
                break
            else:
                pass
            string = text_seq[tid]
            string = re.sub(r"http\S+", "", string)
            string = re.sub("[?!.,:;()'@#$%^&*-=+/\[\[\]\]]", ' ', string) # !.,:;()'@#$%^&*-_{}=+/\"
            try:
                doc += string
            except:
                doc = string
        if isinstance(eid, int):
            eid_str = str(eid)
        else:
            eid_str = eid
        sentences.append(LabeledSentence(utils.to_unicode(doc).split(), [eid_str+'_%s' % bid]))
    

print("length of sentences : {}".format(len(sentences)))

1th event 1263 is processing...
0 1263 36
1 1434 127
2 1596 74
3 708 85




4 404 27
5 1314 76
6 400 5
7 1583 7
8 707 1
9 176 95
10 703 151
11 605 30
12 1888 2
13 1887 42
14 1719 103
15 2 28
16 1316 22
17 1544 13
18 606 9
19 1895 7
20 1 70
21 1468 142
22 578 3
23 145 91
24 644 157
25 290 46
26 155 14
27 143 79
28 250 22
29 897 57
30 196 3
31 1185 28
32 221 4
33 315 30
34 1703 60
35 1264 94
36 1266 7
37 1255 45
38 393 5
39 319 93
40 317 5
41 1262 19
42 316 7
43 1891 40
44 1901 7
45 1270 251
46 1874 2
47 61 3
48 1364 38
49 1886 6
50 1885 15
51 1903 5
52 1258 24
53 1518 1
54 1274 54
55 1900 3
56 1904 5
57 1167 89
58 1440 10
59 1457 19
60 1701 68
61 809 1
62 520 4
63 1898 65
64 304 5
65 702 5
66 1894 78
67 1523 13
68 281 2
69 1890 31
70 93 14
71 156 47
72 1889 20
73 1096 7
74 209 47
75 861 87
76 1077 99
77 141 88
78 1324 158
79 933 161
80 1893 17
81 1721 39
82 52 3
83 1268 4
84 1097 55
85 251 10
86 1653 14
87 1892 26
88 1183 15
89 1369 129
90 1375 96
91 115 2
92 1248 77
93 952 62
94 1312 52
95 1307 120
96 1253 60
97 1896 5
98 1899 14
99 1897 16
101th event 1902 is

In [10]:
from gensim.models import Doc2Vec

new_traing = True

if new_traing:
#     doc_vectorizer = Doc2Vec.load('../model/doc2vec_model/jawiki.doc2vec.dbow300d.model')
    print("doc_vectorizer is loaded.")
    
#     doc_vectorizer.build_vocab(sentences, update=True)
    doc_vectorizer = Doc2Vec(min_count=1, window=10, size=100, sample=1e-4, negative=5, workers=8)
    doc_vectorizer.build_vocab(sentences)
    
    for epoch in range(10):
        print(epoch)
        doc_vectorizer.train(sentences, total_examples=doc_vectorizer.corpus_count, epochs=doc_vectorizer.iter)
    print("doc2vec training is done.")

    # doc_vectorizer.docvecs['TM71_0'].shape
    doc_vectorizer.save('../model/doc2vec_model/twitter.doc2vec.dbow300d.model')

doc_vectorizer is loaded.




0


  from ipykernel import kernelapp as app


1
2
3
4
5
6
7
8
9
doc2vec training is done.


In [11]:
from sklearn.model_selection import train_test_split

def get_user_feature_in_event(dict_, eid, u_sample, user_feature_sub, user_sample2ind):
    '''Get user_feature_sub matrix for event eid'''
    user_in_event = get_user_in_event(dict_, eid, u_sample)
    nb_feature = user_feature_sub.shape[1]
    
    for uid, nb_occur in user_in_event:
        uind = user_sample2ind[uid]
        feature_vec = user_feature_sub[uind,:].reshape(1,-1)
        try:
            ret_matrix = np.concatenate((ret_matrix, feature_vec), axis=0)
        except:
            ret_matrix = feature_vec
    try:
        return ret_matrix
    except:
        ### if user_in_event is empty
        return np.zeros((1,nb_feature))
    
### Building Model
LOAD_MODEL = False
task = "classification"  #"classification"

scaler_dict = {}
nb_rumor = 0
noerr_eid_list = set()
burnin = 5 if task=="regression" else 0

### Create dataset ###
X_dict = {}
X_uidx_dict = {}
subX_dict = {}
y_dict = {}
read_text = True
read_user = True

rumor_user = []
nonrumor_user = []
eid_train, eid_test, _, _ = train_test_split(eid_list, range(len(eid_list)),
                                             test_size=0.2, random_state=3)

In [16]:
def create_dataset(dict_, eid, threshold=90, resolution='day',
                   read_text=False, embeddings_index=None, stopwords=None,
                   doc2vec_model=None, user_feature=None, user2ind=None, read_user=False, task='regression',
                   cutoff=50, return_useridx=True):
    messages = dict_[eid]
    ts = np.array(messages['timestamps'], dtype=np.int32)
    try:
        user_list = messages['uid'].tolist()
    except:
        user_list = messages['uid']
    if read_text:
        text_seq = np.array(messages['text'])
    else:
        text_seq = None
        
    if read_user:
        XX, XX_uidx = get_features(eid, ts, threshold=threshold, resolution=resolution, read_text=read_text,
                              text_seq=text_seq, embeddings_index=embeddings_index, stopwords=stopwords,
                              doc2vec_model=doc2vec_model, read_user=read_user,
                              user_feature=user_feature, user2ind=user2ind, user_list=user_list,
                              cutoff=cutoff, return_useridx=return_useridx)
    else:
        XX = get_features(eid, ts, threshold=threshold, resolution=resolution, read_text=read_text,
                              text_seq=text_seq, embeddings_index=embeddings_index, stopwords=stopwords,
                              doc2vec_model=doc2vec_model, read_user=read_user,
                              user_feature=user_feature, user2ind=user2ind, user_list=user_list,
                              cutoff=cutoff, return_useridx=return_useridx)

#     print(eid, XX.shape, X.shape)
    if task=="regression":
        X = XX[:-1,:]   # (nb_sample, 2+)
        y = XX[1:,:2]
#         y = XX[1:,1]
        if len(y.shape)==1:
            return X, y.reshape(-1,1)
        elif len(y.shape)==2:
            return X, y
    elif task=="classification":
        X = XX   # (nb_sample, 2+)
        y = int(messages['label'])
        if return_useridx:
            return X, XX_uidx, y
        else:
            return X, y
        
        
def get_features(eid, timestamps, threshold=90, resolution='day', sep=False, read_text=False,
                 text_seq=None, embeddings_index=None, stopwords=None, read_user=False,
                 doc2vec_model=None, user_feature=None, user2ind=None, user_list=None,
                 cutoff=50, return_useridx=True):
    '''
    timestamps
        : relative timestamps since the first tweet
        : it should be sorted.
        : unit = second
    unit of threshold and resolution should be matched.
    '''
    ts = timestamps
    if resolution=='day':
        binsize = 3600*24
    elif resolution=='hour':
        binsize = 3600
    elif resolution=='minute':
        binsize = 60
    cnt, bins = np.histogram(ts, bins=range(0,threshold*binsize,binsize))
    
    nonzero_bins_ind = np.nonzero(cnt)[0]
    nonzero_bins = bins[nonzero_bins_ind]
    
    hist = cnt[nonzero_bins_ind]
    inv = nonzero_bins_ind[1:]-nonzero_bins_ind[:-1]
    intervals = np.insert(inv,0,0)
    ### Cutoff sequence
#     cutoff = 50
    if len(hist)>cutoff:
        hist = hist[:cutoff]
        intervals = intervals[:cutoff]
        nonzero_bins = nonzero_bins[:cutoff]

    ### user feature   
    if read_user:
        X_useridx = []
        for bid, bin_left in enumerate(nonzero_bins):
            bin_userlist = []
            bin_right = bin_left + binsize
            try:
                del temp
            except:
                pass
            # Collecting text to make doc
            for tid, t in enumerate(ts):
                if t<bin_left:
                    continue
                elif t>=bin_right:
                    break
                else:
                    pass
                uid = user2ind[user_list[tid]]
                bin_userlist.append(user_list[tid])
                coef = user_feature[uid,:].reshape(1,-1)   # (1,n_components)
                try:
                    temp = np.concatenate((temp, coef), axis=0)
                except:
                    temp = coef

            X_user_bin = np.mean(temp, axis=0).reshape(1,-1)

            try:
                X_user = np.concatenate((X_user, X_user_bin), axis=0)
            except:
                X_user = X_user_bin
            X_useridx.append(bin_userlist)
            
    ### text feature
    if read_text:
        text_matrix = get_doc2vec(doc2vec_model, eid, nonzero_bins)
    
    if sep:
        if read_text:
            return hist, intervals, X_user, text_matrix
        else:
            return hist, intervals, X_user
    else:
        if read_text and read_user:
            if return_useridx:
                return np.hstack([hist.reshape(-1,1), intervals.reshape(-1,1), X_user, text_matrix]), X_useridx
            else:
                return np.hstack([hist.reshape(-1,1), intervals.reshape(-1,1), X_user, text_matrix])
        elif read_text or read_user:
            if read_text:
                return np.hstack([hist.reshape(-1,1), intervals.reshape(-1,1), text_matrix])
            elif read_user:
                if return_useridx:
                    return np.hstack([hist.reshape(-1,1), intervals.reshape(-1,1), X_user]), X_useridx
                else:
                    return np.hstack([hist.reshape(-1,1), intervals.reshape(-1,1), X_user])
        else:
            return np.hstack([hist.reshape(-1,1), intervals.reshape(-1,1)])
    
def get_doc2vec(doc2vec_model, eid, nonzero_bins):
    for bid, bin_left in enumerate(nonzero_bins):
        if isinstance(eid, int):
            eid_str = str(eid)
        else:
            eid_str = eid
        tag = eid_str+'_'+str(bid)
        temp = doc2vec_model.docvecs[tag]  # (300,)
        temp = temp.reshape(1,-1)
        try:
            X_text = np.concatenate((X_text, temp), axis=0)
        except:
            X_text = temp
    return X_text

In [17]:
# embeddings_index = None
# doc_vectorizer = None
for ii, eid in enumerate(eid_list):
    if read_user:
        X, X_uidx, y = create_dataset(dict_, eid, threshold=90*24, resolution='hour',
                                 read_text=read_text, embeddings_index=None, stopwords=None,
                                 doc2vec_model=doc_vectorizer, user_feature=user_feature[:,:nb_feature_main], 
                                 user2ind=user2ind, read_user=read_user, task=task, cutoff=50,
                                 return_useridx=True)
    else:
        X, y = create_dataset(dict_, eid, threshold=90*24, resolution='hour',
                                 read_text=read_text, embeddings_index=None, stopwords=None,
                                 doc2vec_model=doc_vectorizer, user_feature=user_feature[:,:nb_feature_main], 
                                 user2ind=user2ind, read_user=read_user, task=task, cutoff=50,
                                 return_useridx=False)
    if ii%100==0:
        print("processing... {}/{}  shape:{}".format(ii+1, len(eid_list), X.shape))
        
    label = int(dict_[eid]['label'])
    if label==0:
        nonrumor_user.extend(dict_[eid]['uid'])
    elif label==1:
        rumor_user.extend(dict_[eid]['uid'])
#     user_ids.update(dict_[eid]['to_user_id'])
    X = X.astype(np.float32)
    if X.shape[0]<=2*burnin:  # ignore length<=1 sequence
        continue

    X_dict[eid] = X
    if read_user:
        X_uidx_dict[eid] = X_uidx
    subX_dict[eid] = get_user_feature_in_event(dict_, eid, u_sample, 
                                               user_feature_sub[:,:nb_feature_sub], user_sample2ind)
    y_dict[eid] = y

    try:
        scaler_dict[eid]
    except:
        scaler_hist = MinMaxScaler(feature_range=(0,1))
        scaler_hist.fit(X[:,0].reshape(-1,1))
        scaler_interval = MinMaxScaler(feature_range=(0,1))
        scaler_interval.fit(X[:,1].reshape(-1,1))
        scaler_dict[eid] = (scaler_hist, scaler_interval)
print("Dataset are created.")

processing... 1/1914  shape:(36, 122)
processing... 101/1914  shape:(50, 122)
processing... 201/1914  shape:(4, 122)
processing... 301/1914  shape:(8, 122)
processing... 401/1914  shape:(11, 122)
processing... 501/1914  shape:(3, 122)
processing... 601/1914  shape:(7, 122)
processing... 701/1914  shape:(25, 122)
processing... 801/1914  shape:(8, 122)
processing... 901/1914  shape:(2, 122)
processing... 1001/1914  shape:(4, 122)
processing... 1101/1914  shape:(2, 122)
processing... 1201/1914  shape:(50, 122)
processing... 1301/1914  shape:(6, 122)
processing... 1401/1914  shape:(4, 122)
processing... 1501/1914  shape:(13, 122)
processing... 1601/1914  shape:(50, 122)
processing... 1701/1914  shape:(7, 122)
processing... 1801/1914  shape:(7, 122)
processing... 1901/1914  shape:(8, 122)
Dataset are created.


In [26]:
X_dict

{'1263': array([[ 1.9000000e+01,  0.0000000e+00,  6.9034204e-02, ...,
         -1.7905626e+00, -3.1407505e-01,  6.3423550e-01],
        [ 3.0000000e+00,  1.0000000e+00,  4.9904697e-02, ...,
         -3.1891254e-01,  6.4592226e-03,  7.6574343e-03],
        [ 1.3000000e+01,  1.0000000e+00,  2.6021790e-02, ...,
         -2.9613385e-01,  3.2788596e-01,  1.6989236e-01],
        ...,
        [ 1.0000000e+00,  7.0000000e+00,  8.4976745e-01, ...,
          3.1275209e-02,  3.3301316e-02, -8.6521573e-02],
        [ 1.0000000e+00,  8.0000000e+00,  2.3896091e-03, ...,
         -1.9643910e-02,  6.6051140e-02, -7.7481672e-02],
        [ 1.0000000e+00,  2.7600000e+02,  6.4417027e-02, ...,
          9.4694324e-02,  3.4075701e-01, -1.7652318e-01]], dtype=float32),
 '1434': array([[ 4.0000000e+00,  0.0000000e+00,  1.5946677e-02, ...,
          6.2909313e-02,  8.5031111e-03, -6.5325359e-03],
        [ 1.0000000e+00,  1.0000000e+00,  6.7957607e-03, ...,
          8.4621776e-03,  1.2465081e-03, -1.0377732e