# CSI: A hybrid deep model for fake news detection

## Temporal event modeling based on LSTM
### Glossary
    - Event(article) : Instance which is True/False or Rumor/NonRumor
    - Message : Temporal behavior on a certain event.
        - Each message arrives at a certain timestamp.
        - Each message has auxiliary information such as context, user_info.
    - Dataset
        - Weibo : 4,664 events, (2,313/2,351 Rumor/NonRumor), 3,805,656 posts, 2,746,818 users
                Avg. # of posts/event : 816
                Max # of posts/event : 59,318
                Min # of posts/event : 10
        - Tweet : 992 events, (498/494 Rumor/NonRumor), 1,101,985 posts,  491,229 users
                Avg. # of posts/event : 1,111
                Max # of posts/event : 62,827
                Min # of posts/event : 10

### Description
    - input: (d) then (d,text,user)
    - output: (d)
    - end result: embedding vector h
    
    - how do we capture text (tf-idf, word2vec, tweet2vec)
    - how do we capture user-info (user name/id, degree?, activity level?)
    - how do we capture *group* dynamics....


In [1]:
%matplotlib inline
from __future__ import division
import re
import collections
import pickle
import numpy as np


## Load preprocessed data

In [2]:
'''
Load dataset
'''
from sklearn.preprocessing import MinMaxScaler

from utils import *


def get_stats(dict_):
    """Collect per-event summary statistics from the event dictionary.

    Parameters
    ----------
    dict_ : dict
        Maps an event id to a record exposing ``'timestamps'`` and ``'uid'``
        sequences.

    Returns
    -------
    nb_messages : list of int
        Number of timestamps recorded for each event.
    eid_list : list
        Event ids, in the iteration order of ``dict_``.
    user_set : set
        Every user id that appears in any event.
    list_lengths : list of float
        Last timestamp minus first timestamp for each event.

    Raises
    ------
    IndexError
        If an event has an empty ``'timestamps'`` sequence.
    """
    nb_messages = []
    eid_list = []
    user_set = set()
    list_lengths = []
    for eid, messages in dict_.items():
        nb_messages.append(len(messages['timestamps']))
        eid_list.append(eid)
        user_set.update(messages['uid'])

        # float64 represents integers exactly up to 2**53; float32 stops at
        # 2**24 (~194 days in seconds), which silently rounded long durations.
        ts = np.asarray(messages['timestamps'], dtype=np.float64)
        list_lengths.append(ts[-1] - ts[0])

    return nb_messages, eid_list, user_set, list_lengths


def create_dataset(dict_, eid, threshold=90, resolution='day',
                   read_text=False, embeddings_index=None, stopwords=None,
                   doc2vec_model=None, user_feature=None, user2ind=None, read_user=False, task='regression',
                   cutoff=50, return_useridx=True):
    """Build the model input (and target) for a single event.

    The event's timestamps, user ids and (optionally) texts are read from
    ``dict_[eid]`` and turned into a per-bin feature matrix by
    ``get_features``.

    Parameters
    ----------
    dict_ : dict
        Event dictionary; ``dict_[eid]`` must expose ``'timestamps'`` and
        ``'uid'``, plus ``'text'`` when ``read_text`` is true and ``'label'``
        when ``task == "classification"``.
    eid : hashable
        Key of the event to process.
    threshold, resolution, cutoff, read_text, read_user, embeddings_index,
    stopwords, doc2vec_model, user_feature, user2ind
        Forwarded unchanged to ``get_features``.
    task : {'regression', 'classification'}
        ``'regression'`` predicts the first two feature columns of the next
        bin from the current bin; ``'classification'`` pairs the full feature
        matrix with the event label.
    return_useridx : bool
        For classification, whether the per-bin user-id lists are returned.

    Returns
    -------
    tuple
        * regression: ``(X, y)`` with ``X = features[:-1]`` and
          ``y = features[1:, :2]``.
        * classification, ``return_useridx`` true: ``(X, user_ids, y)``;
          ``user_ids`` is ``None`` when ``read_user`` is false because no
          user lists are produced in that case.
        * classification, ``return_useridx`` false: ``(X, y)``.

    Raises
    ------
    ValueError
        If ``task`` is neither ``'regression'`` nor ``'classification'``.
    """
    messages = dict_[eid]
    ts = np.array(messages['timestamps'], dtype=np.int32)
    try:
        user_list = messages['uid'].tolist()
    except AttributeError:
        # Plain Python sequences have no ``tolist``; use them as they are.
        user_list = messages['uid']
    text_seq = np.array(messages['text']) if read_text else None

    features = get_features(eid, ts, threshold=threshold, resolution=resolution, read_text=read_text,
                            text_seq=text_seq, embeddings_index=embeddings_index, stopwords=stopwords,
                            doc2vec_model=doc2vec_model, read_user=read_user,
                            user_feature=user_feature, user2ind=user2ind, user_list=user_list,
                            cutoff=cutoff, return_useridx=return_useridx)

    # get_features only returns a (matrix, user_ids) pair when BOTH flags are
    # set; unpacking on read_user alone mis-split the bare matrix.
    if read_user and return_useridx:
        XX, XX_uidx = features
    else:
        XX, XX_uidx = features, None

    if task == "regression":
        X = XX[:-1, :]   # (nb_sample, 2+)
        y = XX[1:, :2]
        if y.ndim == 1:
            y = y.reshape(-1, 1)
        return X, y
    if task == "classification":
        y = int(messages['label'])
        if return_useridx:
            return XX, XX_uidx, y
        return XX, y
    raise ValueError("unknown task: {!r}".format(task))


def _bin_user_features(ts, nonzero_bins, binsize, user_list, user2ind, user_feature):
    """Average the user-feature rows of the messages in each non-empty bin.

    Returns ``(X_user, X_useridx)``: a ``(len(nonzero_bins), n_features)``
    array of per-bin means and, per bin, the list of user ids found in it.
    """
    n_features = user_feature.shape[1]
    bin_means = []
    bin_users = []
    for bin_left in nonzero_bins:
        bin_right = bin_left + binsize
        members = []
        rows = []
        # Same scan as before: skip earlier messages, stop at the first one
        # past the bin (this relies on ``ts`` being sorted).
        for tid, t in enumerate(ts):
            if t < bin_left:
                continue
            if t >= bin_right:
                break
            members.append(user_list[tid])
            rows.append(user2ind[user_list[tid]])
        if rows:
            bin_means.append(np.mean(user_feature[rows, :], axis=0))
        else:
            # np.histogram closes its last bin on the right, so a message
            # sitting exactly on the final edge is counted there but not
            # matched by the half-open scan above. Use a zero vector instead
            # of failing on an undefined accumulator.
            bin_means.append(np.zeros(n_features))
        bin_users.append(members)
    if bin_means:
        return np.vstack(bin_means), bin_users
    return np.zeros((0, n_features)), bin_users


def get_features(eid, timestamps, threshold=90, resolution='day', sep=False, read_text=False,
                 text_seq=None, embeddings_index=None, stopwords=None, read_user=False,
                 doc2vec_model=None, user_feature=None, user2ind=None, user_list=None,
                 cutoff=50, return_useridx=True):
    '''Turn one event's message timestamps into a per-bin feature matrix.

    Messages are counted in fixed-width time bins; only non-empty bins are
    kept, and at most ``cutoff`` of them.

    timestamps
        : relative timestamps since the first tweet
        : it should be sorted.
        : unit = second
    threshold
        : upper limit passed to ``range(0, threshold*binsize, binsize)`` to
          build the bin edges, expressed in units of ``resolution``.
    resolution
        : 'day', 'hour' or 'minute' (bin width).
    sep
        : if true, return the pieces separately instead of one matrix.
    read_user, user_feature, user2ind, user_list
        : when ``read_user`` is true, the mean of ``user_feature`` rows of
          the users posting in each bin is appended. ``user_list[i]`` is the
          user of ``timestamps[i]``; ``user2ind`` maps it to a row index.
    read_text, doc2vec_model
        : when ``read_text`` is true, per-bin document vectors from
          ``get_doc2vec`` are appended.
    text_seq, embeddings_index, stopwords
        : accepted for interface compatibility; not used here.

    Returns
        sep=False : array ``[count, gap_to_previous_nonempty_bin, user..., text...]``
                    with one row per kept bin; a ``(array, user_id_lists)``
                    pair when ``read_user and return_useridx``.
        sep=True  : ``(hist, intervals, X_user)`` plus ``text_matrix`` when
                    ``read_text``; ``X_user`` is None when ``read_user`` is
                    false.

    Raises
        ValueError if ``resolution`` is not one of the supported values.
    '''
    binsizes = {'day': 3600*24, 'hour': 3600, 'minute': 60}
    if resolution not in binsizes:
        raise ValueError("resolution must be 'day', 'hour' or 'minute', got {!r}".format(resolution))
    binsize = binsizes[resolution]

    ts = timestamps
    cnt, bins = np.histogram(ts, bins=range(0, threshold*binsize, binsize))

    nonzero_bins_ind = np.nonzero(cnt)[0]
    nonzero_bins = bins[nonzero_bins_ind]
    hist = cnt[nonzero_bins_ind]
    if len(nonzero_bins_ind):
        # Gap (in bins) from the previous non-empty bin; 0 for the first one.
        inv = nonzero_bins_ind[1:] - nonzero_bins_ind[:-1]
        intervals = np.insert(inv, 0, 0)
    else:
        # No message inside the window: keep intervals aligned with hist
        # (np.insert would otherwise produce a stray leading 0).
        intervals = np.zeros(0, dtype=nonzero_bins_ind.dtype)

    ### Cutoff sequence
    if len(hist) > cutoff:
        hist = hist[:cutoff]
        intervals = intervals[:cutoff]
        nonzero_bins = nonzero_bins[:cutoff]

    ### user feature
    X_user = None
    X_useridx = None
    if read_user:
        X_user, X_useridx = _bin_user_features(ts, nonzero_bins, binsize,
                                               user_list, user2ind, user_feature)

    ### text feature
    text_matrix = None
    if read_text:
        text_matrix = get_doc2vec(doc2vec_model, eid, nonzero_bins)

    if sep:
        if read_text:
            return hist, intervals, X_user, text_matrix
        return hist, intervals, X_user

    columns = [hist.reshape(-1, 1), intervals.reshape(-1, 1)]
    if read_user:
        columns.append(X_user)
    if read_text:
        columns.append(text_matrix)
    features = np.hstack(columns)
    if read_user and return_useridx:
        return features, X_useridx
    return features
    
def get_doc2vec(doc2vec_model, eid, nonzero_bins):
    """Stack the document vectors of every non-empty bin of an event.

    Each bin's vector is looked up in ``doc2vec_model.docvecs`` under the tag
    ``"<eid>_<bin position>"``, where the position is the bin's index within
    ``nonzero_bins``.

    Parameters
    ----------
    doc2vec_model : object
        Must expose a ``docvecs`` mapping from tag to vector.
    eid : int or str
        Event id; integers (including NumPy integers) are converted to text.
    nonzero_bins : sequence
        Only its length is used, to enumerate the bin positions.

    Returns
    -------
    numpy.ndarray
        One row per bin. An array with zero rows is returned when
        ``nonzero_bins`` is empty.

    Raises
    ------
    KeyError
        Propagated from ``docvecs`` when a tag is missing (exact exception
        type depends on the model object).
    """
    # NumPy integers are not ``int`` instances on Python 3 and would break
    # the string concatenation below.
    if isinstance(eid, (int, np.integer)):
        eid_str = str(eid)
    else:
        eid_str = eid

    rows = []
    for bid, _ in enumerate(nonzero_bins):
        tag = eid_str + '_' + str(bid)
        vector = doc2vec_model.docvecs[tag]
        rows.append(np.asarray(vector).reshape(1, -1))

    if not rows:
        return np.zeros((0, 0), dtype=np.float32)
    # A single concatenate surfaces shape mismatches instead of silently
    # restarting the accumulation, as the former bare ``except`` did.
    return np.concatenate(rows, axis=0)


### Load dataset ###
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
# The context manager closes the file handle, which the bare open() call leaked.
with open('/drive2/sungyong/data/fake_reviews/twitter/tweet_dict_ids.pkl', 'rb') as pkl_file:
    dict_ = pickle.load(pkl_file)

# Get statistics
# tweet eid is 'E741' or 'TM1211'
# eid is a number, 3906982031327232
nb_messages, eid_list, user_set, list_lengths = get_stats(dict_)

print("# events : {}".format(len(eid_list)))
print("# users : {}".format(len(user_set)))
print("# messages : {}".format(np.sum(nb_messages)))
print("Avg. time length : {} sec\t{} hours".format(np.mean(list_lengths),np.mean(list_lengths)/3600))
print("Avg. # messages : {}".format(np.mean(nb_messages)))
print("Max # messages : {}".format(np.max(nb_messages)))
print("Min # messages : {}".format(np.min(nb_messages)))
print("Avg. messages / each user : {}".format(np.sum(nb_messages)/len(user_set)))

# events : 992
# users : 233719
# messages : 592391
Avg. time length : 7138609.0 sec	1982.94694444 hours
Avg. # messages : 597.168346774
Max # messages : 39167
Min # messages : 4
Avg. messages / each user : 2.53462919146


## Get two sets of user features
- `u_sample` : list of tuples (user_ID, # appearances). Top-K users are sampled. 
- `u_pop` : list of tuples (user_ID, # appearances). All users are sampled. 
- `user_feature` ($U\Sigma$) : comes from the user x event matrix, $M = U\Sigma V^T$.
    - $M[i,j]$ : 1 if the i-th user from `u_pop` interacts with the j-th event; otherwise 0.
- `user_feature_sub` ($U'\Sigma'$) : comes from the user x user matrix, $M' = U'\Sigma'V'^T$.
    - $M' = PP^T$
    - $P[i,j]$ : 1 if the i-th user from `u_sample` interacts with the j-th event; otherwise 0.

In [13]:
from collections import Counter

def get_Usample(dict_, most_common=50):
    '''Return the `most_common` most frequent users over all events.

    Every entry of each event's 'uid' sequence counts as one appearance.
    The result is a list of (user_id, total appearances) tuples, ordered
    from most to least frequent.
    '''
    appearances = Counter()
    for event in dict_.values():
        appearances.update(event['uid'])
    return appearances.most_common(most_common)

def get_user_in_event(dict_, eid, u_sample):
    '''Return the users of `u_sample` that appear in event `eid`.

    `u_sample` is a list of (user_id, count) tuples; only the user ids are
    used. The result keeps the order of `u_sample` and pairs each matching
    user with the number of times it occurs in this event:
    [(user_id, #occur in eid), ...]
    '''
    occurrences = Counter(dict_[eid]['uid'])
    return [(uid, occurrences[uid]) for uid, _ in u_sample if uid in occurrences]

# `u_sample` keeps the `threshold` most frequent users, `u_pop` keeps every user.
threshold = 20000
u_sample = get_Usample(dict_, most_common=threshold)
u_pop = get_Usample(dict_, most_common=len(user_set))
print("u_sample for most common {} users is obtained.".format(threshold))
print("u_pop for all {} users is obtained.".format(len(user_set)))


'''
Here are Two user-event matrices.
    1) matrix_main : (all user - all event) relation
        It has #occurrences of a user(row) in an event(col)
        It is very sparse.
        It is decomposed with smaller K.
    2) matrix_sub : (u_sample - eid_sample) relation
        It has #occurrences of a user(row) in an event(col)
        It is denser.
        It is decomposed with larger K. (usually)
'''
# Row index of each sampled user, following the frequency order of u_sample.
user_sample2ind = {uid: position for position, (uid, _) in enumerate(u_sample)}
print("# users in u_sample : {}".format(len(user_sample2ind)))

# Row index of every user.
# NOTE(review): the order comes from iterating a set, so it is only stable
# across sessions if set iteration order is; confirm before pairing these
# indices with matrices saved by an earlier run.
user2ind = {uid: position for position, uid in enumerate(user_set)}
print("# users : {}".format(len(user2ind)))

# Column index of every event, following eid_list.
eid2ind = {event_id: position for position, event_id in enumerate(eid_list)}
print("# events : {}".format(len(eid2ind)))


'''
User Features
    Generate userid-eid matrix and Decompose it
    Truncated SVD
'''
from scipy.sparse import csr_matrix

def get_user_event_matrix(dict_, u_sample, user2ind, binary=False):
    '''Build the sparse (user, event) matrix for the users in u_sample.

    Rows follow `user2ind`. Columns are assigned, in iteration order of
    `dict_`, only to events containing at least one user of `u_sample`;
    events without any such user get no column. An entry holds 1 when
    `binary` is true, otherwise the user's number of occurrences in the
    event.

    Returns (csr_matrix, eid2ind) where eid2ind maps each kept event id to
    its column index.
    '''
    row_idx, col_idx, values = [], [], []
    event_column = {}
    for eid in dict_:
        members = get_user_in_event(dict_, eid, u_sample)
        if not members:
            # No user in u_sample appears in this eid event.
            continue
        column = len(event_column)
        event_column[eid] = column
        for uid, nb_occur in members:
            user_row = user2ind[uid]
            col_idx.append(column)
            row_idx.append(user_row)
            values.append(1 if binary else nb_occur)

    nb_kept = len(event_column)
    print("{} events have at least one user in u_sample".format(nb_kept))
    print("{} events have no user in u_sample".format(len(dict_)-nb_kept))
    matrix = csr_matrix((values, (row_idx, col_idx)), shape=(len(user2ind), nb_kept))
    return matrix, event_column
    

# Binary incidence matrices for the sampled users and for all users, plus a
# count-valued variant of the all-user matrix.
matrix_sub, eid_sample2ind = get_user_event_matrix(dict_, u_sample, user_sample2ind, binary=True)
matrix_main, eid_main2ind = get_user_event_matrix(dict_, u_pop, user2ind, binary=True)
matrix_main_cnt, eid_main_cnt2ind = get_user_event_matrix(dict_, u_pop, user2ind, binary=False)
# NOTE: the value printed under "Sparsity" is the fraction of non-zero
# entries (i.e. the density); the label is kept for comparable logs.
print("matrix_sub shape : {}".format(matrix_sub.shape))
print("Sparsity : {}".format(matrix_sub.count_nonzero()/(matrix_sub.shape[0]*matrix_sub.shape[1])))
print("matrix_main shape : {}".format(matrix_main.shape))
print("Sparsity : {}".format(matrix_main.count_nonzero()/(matrix_main.shape[0]*matrix_main.shape[1])))

# from sklearn.decomposition import NMF, TruncatedSVD
from sklearn.utils.extmath import randomized_svd

# Number of leading SVD components used downstream (same in both branches).
nb_feature_main = 20     # 10 for weibo, 20 for tweet
nb_feature_sub = 50

RELOAD = True
if RELOAD:
    ### Load matrix_main
    # np.load opens and closes the file itself when given a path.
    u_main = np.load('matrix/tweet_u_main.npy')
    sigma_main = np.load('matrix/tweet_sigma_main.npy')
    vt_main = np.load('matrix/tweet_vt_main.npy')

    user_feature = u_main.dot(np.diag(sigma_main))
    print("user_feature shape : {}".format(user_feature.shape))

    ### Load matrix_sub
    u_sub = np.load('matrix/tweet_u_sub.npy')
    sigma_sub = np.load('matrix/tweet_sigma_sub.npy')
    # Was misspelled `vt_sib`, leaving `vt_sub` undefined in this branch.
    vt_sub = np.load('matrix/tweet_vt_sub.npy')

    user_feature_sub = u_sub.dot(np.diag(sigma_sub))
    print("user_feature_sub shape : {}".format(user_feature_sub.shape))
    print("Loading is Done.")
else:
    n_iter = 7    # 15 for weibo, 7 for tweet
    u_main, sigma_main, vt_main = randomized_svd(matrix_main, n_components=100,
                                                 n_iter=n_iter, random_state=42)  # random_state=42
    user_feature = u_main.dot(np.diag(sigma_main))
    print("user_feature shape : {}".format(user_feature.shape))

    # User x user co-occurrence matrix P P^T, kept sparse: the dense copy
    # that used to be built here was never used.
    matrix_sub = matrix_sub.dot(matrix_sub.transpose())
    u_sub, sigma_sub, vt_sub = randomized_svd(matrix_sub, n_components=100,
                                              n_iter=n_iter, random_state=42)  # random_state=42
    user_feature_sub = u_sub.dot(np.diag(sigma_sub))
    print("user_feature_sub shape : {}".format(user_feature_sub.shape))
    print("SVD is done")

u_sample for most common 20000 users is obtained.
u_pop for all 233719 users is obtained.
# users in u_sample : 20000
# users : 233719
# events : 992
932 events have at least one user in u_sample
60 events have no user in u_sample
992 events have at least one user in u_sample
0 events have no user in u_sample
992 events have at least one user in u_sample
0 events have no user in u_sample
matrix_sub shape : (20000, 932)
Sparsity : 0.00855305793991
matrix_main shape : (233719, 992)
Sparsity : 0.00182664383712
user_feature shape : (233719, 100)
user_feature_sub shape : (20000, 100)
Loading is Done.


## Get Doc2Vec model

In [None]:
import jieba, re

from gensim import utils
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec

threshold = 90*24
resolution = 'hour'
sentences = []

chinese_stopwords = '、 。 〃 〄 々 〆 〇 〈〉 《 》 「 」 『 』 【】 〒 〓 〔 〕 〖 〗 〘〙 〚 〛 〛 〜 〝 〞 〟，'.decode('utf-8')
rx = '[' + re.escape(''.join(chinese_stopwords.split())) + ']'

# The bin width depends only on `resolution`, so compute it once.
if resolution=='day':
    binsize = 3600*24
elif resolution=='hour':
    binsize = 3600
elif resolution=='minute':
    binsize = 60

# Build one tagged document per (event, non-empty time bin); the tag is
# "<eid>_<position of the bin among the event's non-empty bins>".
for ii, eid in enumerate(eid_list):
    if ii%100==0:
        print("{}th event {} is processing...".format(ii+1, eid))
    messages = dict_[eid]
    ts = np.array(messages['timestamps'], dtype=np.int32)
    text_seq = np.array(messages['text'])

    cnt, bins = np.histogram(ts, bins=range(0,threshold*binsize,binsize))

    nonzero_bins_ind = np.nonzero(cnt)[0]
    nonzero_bins = bins[nonzero_bins_ind]
    print(ii, eid, len(nonzero_bins))

    if isinstance(eid, int):
        eid_str = str(eid)
    else:
        eid_str = eid

    for bid, bin_left in enumerate(nonzero_bins):
        bin_right = bin_left + binsize
        doc_parts = []
        # Collecting text to make doc (assumes ts is sorted, hence the break)
        for tid, t in enumerate(ts):
            if t<bin_left:
                continue
            elif t>=bin_right:
                break
            string = text_seq[tid]
            string = re.sub(r"http\S+", "", string)
            # NOTE(review): inside the character class, `*-=` is a range that
            # also covers digits and `<`; confirm whether that is intended.
            string = re.sub("[?!.,:;()'@#$%^&*-=+/\[\[\]\]]", ' ', string) # !.,:;()'@#$%^&*-_{}=+/\"
            doc_parts.append(string)
        # Join with a space so that the last word of one message is not fused
        # with the first word of the next before tokenising.
        doc = ' '.join(doc_parts)
        sentences.append(LabeledSentence(utils.to_unicode(doc).split(), [eid_str+'_%s' % bid]))


print("length of sentences : {}".format(len(sentences)))

In [18]:
from gensim.models import Doc2Vec

# Reuse a saved Doc2Vec model when it can be loaded; otherwise train one from
# `sentences`. `except Exception` keeps the best-effort fallback for any load
# failure while letting KeyboardInterrupt / SystemExit through.
try:
    doc_vectorizer = Doc2Vec.load('./doc2vec_model/tweet_doc2vec_thrd9024_dim100.model')
    print("doc_vectorizer is loaded.")
except Exception as load_error:
    print("doc_vectorizer could not be loaded ({}); training a new model.".format(load_error))
    doc_vectorizer = Doc2Vec(min_count=1, window=10, size=100, sample=1e-4, negative=5, workers=8)
    doc_vectorizer.build_vocab(sentences)
    print("build_vocab is done.")

    for epoch in range(10):
        print(epoch)
        doc_vectorizer.train(sentences)
    print("doc2vec training is done.")

    # NOTE: the freshly trained model is not written to disk here.
    # doc_vectorizer.docvecs['TM71_0'].shape
    # doc_vectorizer.save('./doc2vec_model/weibo_doc2vec_thrd9024_dim100.model')

doc_vectorizer is loaded.


## Create dataset

In [21]:
from sklearn.cross_validation import train_test_split


def get_user_feature_in_event(dict_, eid, u_sample, user_feature_sub, user_sample2ind):
    '''Get user_feature_sub matrix for event eid.

    Selects the rows of `user_feature_sub` belonging to the users of
    `u_sample` that appear in event `eid`, in the order of `u_sample`.

    Parameters
        dict_            : event dictionary, passed on to get_user_in_event.
        eid              : event id.
        u_sample         : list of (user_id, count) tuples.
        user_feature_sub : 2-D array with one row per sampled user.
        user_sample2ind  : maps a user id to its row in user_feature_sub.

    Returns
        Array of shape (#matching users, nb_feature); a single all-zero row
        of shape (1, nb_feature) when no sampled user appears in the event.
    '''
    nb_feature = user_feature_sub.shape[1]
    user_in_event = get_user_in_event(dict_, eid, u_sample)
    row_indices = [user_sample2ind[uid] for uid, _ in user_in_event]

    if not row_indices:
        ### if user_in_event is empty
        return np.zeros((1, nb_feature))
    # One gather replaces the row-by-row concatenation that relied on a bare
    # `except` to detect the first iteration.
    return user_feature_sub[row_indices, :]
    
### Building Model
LOAD_MODEL = False
task = "classification"  #"classification"

scaler_dict = {}
nb_rumor = 0
noerr_eid_list = set()
# Events with at most 2*burnin rows are skipped below.
burnin = 5 if task=="regression" else 0

### Create dataset ###
X_dict = {}        # eid -> feature matrix (float32)
X_uidx_dict = {}   # eid -> per-bin user-id lists (only when read_user)
subX_dict = {}     # eid -> sampled-user feature rows for the scoring module
y_dict = {}        # eid -> target returned by create_dataset
read_text = True
read_user = True

rumor_user = []
nonrumor_user = []
eid_train, eid_test, _, _ = train_test_split(eid_list, range(len(eid_list)),
                                             test_size=0.2, random_state=3)

### To make same sets
# The stored split replaces the one computed just above.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
# Pickles are binary data: open with 'rb' and close the handles.
import pickle
with open('./pickle/tweet_eid_train.pkl', 'rb') as pkl_file:
    eid_train = pickle.load(pkl_file)
with open('./pickle/tweet_eid_test.pkl', 'rb') as pkl_file:
    eid_test = pickle.load(pkl_file)

# embeddings_index = None
# doc_vectorizer = None
for ii, eid in enumerate(eid_list):
    # User-id lists are requested exactly when user features are read, so the
    # length of the returned tuple follows read_user.
    dataset = create_dataset(dict_, eid, threshold=90*24, resolution='hour',
                             read_text=read_text, embeddings_index=None, stopwords=None,
                             doc2vec_model=doc_vectorizer, user_feature=user_feature[:,:nb_feature_main],
                             user2ind=user2ind, read_user=read_user, task=task, cutoff=50,
                             return_useridx=read_user)
    if read_user:
        X, X_uidx, y = dataset
    else:
        X, y = dataset
    if ii%100==0:
        print("processing... {}/{}  shape:{}".format(ii+1, len(eid_list), X.shape))

    label = int(dict_[eid]['label'])
    if label==0:
        nonrumor_user.extend(dict_[eid]['uid'])
    elif label==1:
        rumor_user.extend(dict_[eid]['uid'])
#     user_ids.update(dict_[eid]['to_user_id'])
    X = X.astype(np.float32)
    if X.shape[0]<=2*burnin:  # ignore length<=1 sequence
        continue

    X_dict[eid] = X
    if read_user:
        X_uidx_dict[eid] = X_uidx
    subX_dict[eid] = get_user_feature_in_event(dict_, eid, u_sample,
                                               user_feature_sub[:,:nb_feature_sub], user_sample2ind)
    y_dict[eid] = y

    # Fit the per-event scalers only once (explicit membership test instead
    # of a bare `except`).
    if eid not in scaler_dict:
        scaler_hist = MinMaxScaler(feature_range=(0,1))
        scaler_hist.fit(X[:,0].reshape(-1,1))
        scaler_interval = MinMaxScaler(feature_range=(0,1))
        scaler_interval.fit(X[:,1].reshape(-1,1))
        scaler_dict[eid] = (scaler_hist, scaler_interval)
print("Dataset are created.")



processing... 1/992  shape:(27, 122)
processing... 101/992  shape:(17, 122)
processing... 201/992  shape:(50, 122)
processing... 301/992  shape:(50, 122)
processing... 401/992  shape:(50, 122)
processing... 501/992  shape:(8, 122)
processing... 601/992  shape:(34, 122)
processing... 701/992  shape:(29, 122)
processing... 801/992  shape:(50, 122)
processing... 901/992  shape:(50, 122)
Dataset are created.


## CSI model
- Python : 2.7.x
- Keras : 1.2.1
- Theano : 0.9.0b1

In [None]:
'''
matrix_main is used for LSTM input.
matrix_sub is used for the scoring module.
'''

from keras.models import load_model
from keras.models import Model
from keras.optimizers import Adam
from keras.regularizers import l2
from keras.layers import Dense, Input, Dropout, Lambda, LSTM, Embedding, Conv1D, TimeDistributed, merge

from keras import regularizers
from keras.optimizers import Adam
from keras import backend as K

# Builds and compiles the two-branch Keras model: an LSTM over the per-bin
# event features ("main part") and a per-user scoring branch ("sub part"),
# whose outputs are summed.

# Best accuracy seen so far; starts at 0.
acc=0

# NOTE: nb_users and nb_events are not referenced again in this cell.
nb_users = len(user2ind)
nb_events = len(eid2ind)
nb_features = 2+20+100    # (#temporal, #user, #doc)
dim_hidden = 50

##### Main part #####
# Variable-length sequence of nb_features-dimensional rows.
inputs = Input(shape=(None, nb_features))
emb_out = TimeDistributed(Dense(100, activation='tanh'))(inputs)    # W_e
emb_out = Dropout(0.2)(emb_out)
# return_sequences=False: only one LSTM output vector is kept per sequence.
rnn_out = LSTM(dim_hidden, activation='tanh', return_sequences=False)(emb_out)    #(None, dim_hidden)
rnn_out = Dense(100, activation='tanh')(rnn_out)     # (None, 100) W_r
rnn_out = Dropout(0.2)(rnn_out)


##### Sub part #####
nb_score = 1
nb_expand = 100
# Variable-length list of nb_feature_sub-dimensional rows.
sub_input = Input(shape=(None, nb_feature_sub))
user_vec = TimeDistributed(Dense(nb_expand, activation='tanh',
                                 W_regularizer=regularizers.l2(0.01)))(sub_input)   # (None, None, nb_expand)
# One sigmoid score per input row.
sub_h = TimeDistributed(Dense(nb_score, activation='sigmoid'))(user_vec)    # (None, None, nb_score)
# Average the scores over axis 1.
z = Lambda(lambda x: K.mean(x, axis=1), output_shape=lambda s: (s[0], s[2]))(sub_h)    #(None, nb_score)

##### Concatenate #####
out1 = Dense(1, activation='sigmoid')(rnn_out)
# Element-wise sum of the two sigmoid-based values.
# NOTE(review): the sum of a sigmoid and a mean of sigmoids can exceed 1 while
# the classification loss below is binary cross-entropy - confirm intended.
concat_out = merge([out1, z], mode='sum')
# concat_out = merge([rnn_out, z], mode='concat', concat_axis=1)
# concat_out = concatenate([rnn_out, z], axis=1)

##### Classifier #####
# outputs = Dense(1, activation='sigmoid')(concat_out)
# outputs = Dense(1, activation='sigmoid')(concat_out)
# No extra classifier layer: the summed value is the model output.
outputs = concat_out


##### Model #####
# hvector and model are built from the same inputs and the same output tensor.
hvector = Model(input=[inputs, sub_input], output=concat_out)
# zscore exposes the per-row scores (sub_h) of the sub part.
zscore = Model(input=sub_input, output=sub_h)
model = Model(input=[inputs, sub_input], output=outputs)
# uvector exposes the expanded per-row vectors (user_vec) of the sub part.
uvector = Model(input=sub_input, output=user_vec)
# model = Model(input=inputs, output=outputs)


##### Compile #####
adam = Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
# Only `model` is compiled; the loss depends on the task chosen earlier.
if task=="regression":
    model.compile(optimizer=adam,
                  loss='mean_squared_error')
elif task=="classification":
    model.compile(optimizer=adam,
                  loss='binary_crossentropy')
print("Model is compiled.")

## Training

In [None]:
from sklearn.metrics import roc_curve, auc, accuracy_score, precision_score, confusion_matrix
from keras.models import load_model

def sigmoid_array(x):
    """Apply the logistic function 1 / (1 + exp(-x)) element-wise."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

### Training... ###
# acc = 0
nb_epoch = 30
if task=="regression":
    eid_train = eid_list
    eid_test = []

# NOTE(review): range(nb_epoch+1) performs nb_epoch+1 passes; kept as is.
for ep in range(nb_epoch+1):
    print("{} epoch!!!!!!!!".format(ep))
    ##### Looping for eid_train #####
    losses = []
    for ii, eid in enumerate(eid_train):
        # Events dropped while building the dataset have no entry.
        if eid not in X_dict:
            continue
        # Load this event's matrix BEFORE testing its length; the check used
        # to read whatever X was left over from the previous iteration.
        X = X_dict[eid].astype(np.float32)
        if X.shape[0]<=2*burnin:  # ignore length<=1 sequence
            continue
        y = y_dict[eid]

        label = int(dict_[eid]['label'])
        if task=="classification":
            assert(label==y)

        noerr_eid_list.add(eid)

        ##### Main input #####
        trainX = X
        ##### Sub input #####
        sub_trainX = subX_dict[eid]

        if task=="regression":
            sh, si = scaler_dict[eid]
            ### TODO : if we want to predict more features, add here.
            if y.shape[1]>1:
                trainY = np.hstack([sh.transform(y[:,0].reshape(-1,1)),
                                    si.transform(y[:,1].reshape(-1,1))])
            else:
                trainY = si.transform(y)
        elif task=="classification":
            trainY = y

        # One sample per fit call; print the Keras log only occasionally.
        verbose = 2 if (ep%50==0 and ii%1000==0) else 0
        h = model.fit([trainX[np.newaxis,:,:], sub_trainX[np.newaxis,:,:]], np.array([trainY]),
                      batch_size=1, nb_epoch=1, verbose=verbose)
        losses.append(h.history['loss'][0])
    print("%% mean loss : {}".format(np.mean(losses)))

    ### Evaluation ###
    preds = []
    rmses = []
    y_test = []
    for ii, eid in enumerate(eid_test):
        if eid not in X_dict:
            continue
        X = X_dict[eid].astype(np.float32)
        if X.shape[0]<=2*burnin:  # ignore length<=1 sequence
            continue
        y = y_dict[eid]

        testX = X
        sub_testX = subX_dict[eid]

        if task=="classification":
            y_test.append(int(dict_[eid]['label']))

            pred = model.predict([np.array([testX]), np.array([sub_testX])], verbose=0)
            preds.append(pred[0,0])

        elif task=="regression":
            # The model takes two inputs, so both must be supplied.
            # NOTE(review): the indexing below expects a 3-D, per-step output;
            # confirm this against the model that is actually compiled.
            predict_y = model.predict([np.array([testX]), np.array([sub_testX])], verbose=0)

            sh = scaler_dict[eid][0]
            si = scaler_dict[eid][1]

            if predict_y.shape[2]==1:
                predict_y = np.hstack([sh.inverse_transform(predict_y[0,burnin:,0].reshape(-1,1))])
            elif predict_y.shape[2]==2:
                predict_y = np.hstack([sh.inverse_transform(predict_y[0,burnin:,0].reshape(-1,1)),
                                       si.inverse_transform(predict_y[0,burnin:,1].reshape(-1,1))])
            elif predict_y.shape[2]>2:
                predict_y = np.hstack([sh.inverse_transform(predict_y[0,burnin:,0].reshape(-1,1)),
                                       si.inverse_transform(predict_y[0,burnin:,1].reshape(-1,1)),
                                       predict_y[0,burnin:,2:]])
            # Local name: the global nb_features (model input width) must not
            # be overwritten here.
            nb_pred_features = predict_y.shape[1]
            # Compare with this test event (testX), not the last training one.
            rmse = np.sqrt(np.mean((predict_y[:-1,:] - testX[burnin+1:,:nb_pred_features])**2))
            rmses.append(rmse)

    if task=="classification":
        if len(y_test)==0:
            # Nothing to score in this epoch.
            continue
        preds = (np.array(preds)>0.5).astype(int)
        # labels=[0, 1] guarantees a 2x2 matrix even if one class is absent.
        tn, fp, fn, tp = confusion_matrix(y_test, preds, labels=[0, 1]).ravel()
        accuracy = (tp+tn)/(tp+tn+fp+fn)
        print("%%% Test results {} samples %%%".format(len(y_test)))
        print("accuracy: {}".format(accuracy))
        print("precision : {:.4f} / {:.4f}".format(tp/(tp+fp), tn/(fn+tn)))
        print("recall : {:.4f} / {:.4f}".format(tp/(tp+fn), tn/(fp+tn)))
        print("F1 score : {:.4f} / {:.4f}".format(2*tp/(2*tp+fp+fn), 2*tn/(2*tn+fp+fn)))

        # Accuracy only exists for classification, so track the best value
        # here; in regression mode tp/tn/fp/fn are never defined.
        # TODO: the message announces a save, but no model is written to disk.
        if acc < accuracy:
            acc = accuracy
            print("%%%%%%%%%% Save model\t acc:{} %%%%%%%%%%%%".format(acc))

    elif task=="regression":
        print("%%% Test results {} samples".format(len(rmses)))
        print("mean rmse : {}".format(np.mean(rmses)))