In [None]:
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

import itertools,time
import sys, os
from collections import OrderedDict
from copy import deepcopy
from time import time
import sys, getopt
import Utils as utils
from gensim import matutils
from collections import namedtuple
import json

from DataLoader import DataLoader
from models import vae

%matplotlib inline

np.random.seed(0)
tf.set_random_seed(0)

In [None]:
# Central experiment configuration: every tunable value used by this notebook
# is collected in this one dictionary.
params = {

    # How many entities are recommended from each view
    "suggestion_count": 10,
    # How many of the most recent online snapshots are taken into account
    "imp_doc_to_consider": 4,
    # True: TF-IDF weights are normalized to sum to 1; False: weights are left untouched.
    # TODO: does this normalization make sense?
    "normalize_terms": True,
    # True: recommend with Thompson Sampling (exploration); False: recommend from the mean estimate
    "Thompson_exploration": False,
    # True: an item may be recommended more than once; False: at most once per item
    "repeated_recommendation": True,
    # Heuristic shrinkage of the posterior variance (less exploration); must lie in (0, 1]
    "exploration_rate": 1,  # NOT IMPLEMENTED YET
    # Iteration count of the simulated study
    "num_iterations": 50,
    # Size of the latent data representation
    "num_latent_dims": 100,
    # Run count (simulated study only; use 1 with real data)
    "num_runs": 1,  # NOT IMPLEMENTED YET
    # True: data is prepared for the FOCUS UI while the interaction happens in the terminal
    "FOCUS_UI_simulator": True,
    # Corpus location (expected to hold /corpus.mm, /dictionary.dict and views_ind_1.npy)
    # "corpus_directory": 'corpus1_2/corpus7_sim',
    "corpus_directory": 'corpus1_2/P01',
    # Location of the new snapshots, checked at the start of every iteration
    "snapshots_directory": 'user activity',
    # True: user feedback is simulated from the simulated user data
    "Simulated_user": False,
    # Second argument handed to DataLoader below
    "save_directory": 'models_LDA/P01/',
    # Number of consecutive documents forming one LSTM input window (SEQUENCE_LEN)
    "sequence_length": 10,
    # Read into STEP further down; not used by any code visible in this notebook
    "n_step": 1,
    # Batch size of the LSTM data generators (BATCH_SIZE)
    "batch_size": 32,
    # NOTE(review): the two *_LSTM entries are not read by the visible compile() call,
    # which hard-codes the same values - confirm whether they are meant to drive it.
    "loss_LSTM": 'mean_squared_error',
    "metrics_LSTM": ['accuracy'],
    # Latent dimensionality shared by the VAE (n_z) and the LSTM input/output
    "num_topics": 20
}

# Load the corpus described by the configuration and print its summary.
data_dir, save_dir = params["corpus_directory"], params["save_directory"]
data = DataLoader(data_dir, save_dir)
data.print_info()

n_topics = params["num_topics"]

In [None]:
# Transform the corpus into the dense matrix fed to the VAE
from sklearn.preprocessing import MinMaxScaler

# The dense corpus matrix is transposed so that downstream code can treat
# axis 0 as samples (n_samples_tr) and axis 1 as input features (n_input).
df = matutils.corpus2dense(data.corpus, num_terms=data.num_features).T

# Rescale every feature column into [0, 1]; the fitted scaler is kept in
# scaler_x so the same scaling stays available after this cell.
scaler_x = MinMaxScaler(feature_range=(0, 1))
scaler_x.fit(df)
x = scaler_x.transform(df)
data_tr = x

In [None]:
# Vocabulary lookup for the corpus: token string -> integer id
# (the dictionary itself maps the other way round, id -> token).
vocab = dict((token, token_id) for token_id, token in data.dictionary.items())
print('Dim Training Data %s' % (data_tr.shape,))

In [None]:
'''-----------------------------'''

'''--------------Global Params---------------'''
# Only a training set is used in this notebook; no held-out matrix is built.
docs_tr = data_tr
n_samples_tr = data_tr.shape[0]

# Default VAE optimisation settings (re-assigned later from make_network()).
batch_size, learning_rate = 100, 0.001

# Layer sizes handed to the VAE constructor.
network_architecture = dict(
    n_hidden_recog_1=500,       # 1st layer encoder neurons
    n_hidden_recog_2=500,       # 2nd layer encoder neurons
    n_hidden_gener_1=500,       # 1st layer decoder neurons
    n_hidden_gener_2=500,       # 2nd layer decoder neurons
    n_input=data_tr.shape[1],   # input width = number of columns of the training matrix
    n_z=n_topics)               # dimensionality of latent space



In [None]:
'''-----------------------------'''

'''--------------Netowrk Architecture and settings---------------'''

def make_network(layer1=500,layer2=500,num_topics=n_topics,bs=100,eta=0.001):
    """Reset the default TensorFlow graph and bundle the VAE hyper-parameters.

    Args:
        layer1: width of the first encoder layer.
        layer2: width of the second encoder layer.
        num_topics: size of the latent space (``n_z``). The default is the
            value ``n_topics`` had when this function was defined.
        bs: batch size, returned unchanged.
        eta: learning rate, returned unchanged.

    Returns:
        ``(network_architecture, batch_size, learning_rate)`` where the first
        item is the layer-size dict expected by the VAE constructor.

    Note:
        Both decoder layers are fixed at 500 units, and the input width is
        read from the notebook-level ``data_tr`` matrix.
    """
    # Side effect: drops whatever graph was built before.
    tf.reset_default_graph()
    architecture = dict(
        n_hidden_recog_1=layer1,    # 1st layer encoder neurons
        n_hidden_recog_2=layer2,    # 2nd layer encoder neurons
        n_hidden_gener_1=500,       # 1st layer decoder neurons
        n_hidden_gener_2=500,       # 2nd layer decoder neurons
        n_input=data_tr.shape[1],   # input width = columns of the training matrix
        n_z=num_topics)             # dimensionality of latent space
    return architecture, bs, eta



'''--------------Methods--------------'''
def create_minibatch(data, minibatch_size=None):
    """Yield an endless stream of random minibatches drawn from ``data``.

    Rows are sampled uniformly with replacement using a private random
    generator seeded with 10, so every generator created by this function
    produces the same index sequence.

    Args:
        data: array supporting ``.shape[0]`` and integer-array indexing.
        minibatch_size: number of rows per batch. When ``None`` (the default,
            matching the original behaviour) the notebook-level ``batch_size``
            is looked up each time a batch is drawn.

    Yields:
        ``data[ixs]`` where ``ixs`` holds the sampled row indices.
    """
    rng = np.random.RandomState(10)

    while True:
        # Fall back to the global only when the caller did not choose a size,
        # so explicit callers no longer depend on hidden notebook state.
        size = batch_size if minibatch_size is None else minibatch_size
        ixs = rng.randint(data.shape[0], size=size)
        yield data[ixs]


#def train(network_architecture, learning_rate=0.001,
def train(network_architecture, minibatches, learning_rate=0.001,
          batch_size=100, training_epochs=10, display_step=5):
    """Build a VAE and fit it on batches pulled from ``minibatches``.

    Args:
        network_architecture: layer-size dict passed to the VAE constructor.
        minibatches: iterator yielding one training batch per ``next()`` call.
        learning_rate: forwarded to the VAE constructor.
        batch_size: forwarded to the VAE constructor and used to derive the
            number of batches per epoch from the notebook-level
            ``n_samples_tr``.
        training_epochs: number of passes to run.
        display_step: the running cost is printed on every epoch whose
            zero-based index is a multiple of this value.

    Returns:
        The trained ``vae.VariationalAutoencoder`` instance.
    """
    _vae = vae.VariationalAutoencoder(network_architecture,
                                      learning_rate=learning_rate,
                                      batch_size=batch_size)
    # Loop-invariant: batches per epoch (floor division, as before).
    total_batch = n_samples_tr // batch_size

    # Training cycle
    for epoch in range(training_epochs):
        avg_cost = 0.
        # Loop over all batches
        for _ in range(total_batch):
            # next() works for generators on both Python 2 and Python 3,
            # unlike the Python-2-only .next() method.
            batch_xs = next(minibatches)

            # Fit training using batch data
            cost = _vae.partial_fit(batch_xs)
            # Compute average loss
            avg_cost += cost / n_samples_tr * batch_size

        # Display logs per epoch step. A single pre-formatted string is
        # printed so the line reads the same whether print is a statement
        # or a function (a multi-argument call prints a tuple on Python 2).
        if epoch % display_step == 0:
            print("Epoch: %04d cost= %.9f" % (epoch + 1, avg_cost))
    return _vae

In [None]:
# VAE hyper-parameters, kept under the short single-letter names the rest of
# the notebook refers to.
f, s = 500, 500        # first / second encoder layer width
t = int(n_topics)      # latent dimensionality
b = 100                # batch size
r = 0.001              # learning rate
e = 75                 # training epochs

# The batch stream is built lazily; it yields float32 copies of the documents.
minibatches = create_minibatch(docs_tr.astype('float32'))
network_architecture, batch_size, learning_rate = make_network(f, s, t, b, r)
for shown in (network_architecture, batch_size, n_topics):
    print(shown)

In [None]:
# Forward the batch size and learning rate returned by make_network() so that
# the values chosen in the hyper-parameter cell actually reach the VAE,
# instead of train() silently falling back to its own defaults.
vae_latent = train(network_architecture, minibatches,
                   learning_rate=learning_rate, batch_size=batch_size,
                   training_epochs=e)

In [None]:
# Windowing / batching constants taken from the experiment configuration
SEQUENCE_LEN = params["sequence_length"]
STEP = params["n_step"]
BATCH_SIZE = params["batch_size"]

TaggedDocument = namedtuple('TaggedDocument', 'tags words')
all_docs, tokenized_texts, len_doc, all_entities = [], [], [], []
doc_id = 0
# One row per corpus document, one column per latent dimension
docs_topics = np.zeros((len(data.corpus), n_topics))
for i, doc in enumerate(data.corpus):
    # Each corpus entry is indexed at position 0 to obtain the term id,
    # which the dictionary turns back into the token string.
    words = [data.dictionary[entry[0]] for entry in doc]
    all_docs.append(TaggedDocument([doc_id], words))
    tokenized_texts.append(words)
    len_doc.append(len(words))
    all_entities.extend(words)
    doc_id += 1

    # NOTE(review): `latent_prop` is not defined anywhere in the visible
    # notebook - presumably it holds the VAE latent vector of every document
    # (one row per corpus document); confirm which cell produces it, because
    # a fresh Restart & Run All fails here with a NameError.
    topics_i = latent_prop[i]
    for j, weight in enumerate(topics_i):
        docs_topics[i, j] = weight

# Sparse corpus matrix; columns are sliced per document below
M_T_sparse = matutils.corpus2csc(data.corpus, num_terms=data.num_features,
                                 num_docs=data.num_data, num_nnz=data.corpus.num_nnz)

# Sliding windows: SEQUENCE_LEN consecutive documents as input and the
# document that immediately follows them as the target.
screens, next_screens = [], []
M_T_screens, M_T_next_screens = [], []
for i in range(len(docs_topics) - SEQUENCE_LEN):
    stop = i + SEQUENCE_LEN
    screens.append(docs_topics[i:stop])
    next_screens.append(docs_topics[stop])
    M_T_screens.append(M_T_sparse[:, i:stop])
    M_T_next_screens.append(M_T_sparse[:, stop])
print('nb sequences:', len(screens))
print(screens[0])

In [None]:
# Chronological train / test split: the windows are not shuffled
print(len(screens))
print('---')
print(next_screens[0])

# The last 20% of the windows (after leaving out 10 of them) form the test set
test_fraction = 20/100.
cut_index = int((len(screens)-10) * (1.-test_fraction))
print(cut_index)

screens_train = screens[:cut_index]
screens_test = screens[cut_index:]
next_screens_train = next_screens[:cut_index]
next_screens_test = next_screens[cut_index:]

# Show the first training window
print(screens_train[0])

In [None]:
# Positions of every window, split at the same cut as the windows themselves,
# so the sparse term matrices of the test windows can be looked up by position.
test_seq = np.arange(len(screens))
index_train = test_seq[:cut_index]
index_test = test_seq[cut_index:]
M_T_screens_test = [M_T_screens[pos] for pos in index_test]
M_T_next_screens_test = [M_T_next_screens[pos] for pos in index_test]

In [None]:
from keras.callbacks import LambdaCallback, ModelCheckpoint, EarlyStopping
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, LSTM, Bidirectional, Embedding

# screen_list, next_screen_list, batch_size = screens, next_screen, BATCH_SIZE
# Data generator for fit and evaluate
def generator(screen_list, next_screen_list, batch_size):
    """Endlessly yield ``(x, y)`` batches for Keras fit / evaluate.

    Windows are consumed in order and the position wraps around to the start
    once the end of ``screen_list`` is reached.

    Args:
        screen_list: sequence of input windows; each window is iterated row
            by row and written into one sample of ``x``.
        next_screen_list: target vector for every window, aligned with
            ``screen_list``.
        batch_size: number of samples per yielded batch.

    Yields:
        ``x`` of shape ``(batch_size, SEQUENCE_LEN, n_topics)`` and ``y`` of
        shape ``(batch_size, n_topics)``, both float64.
    """
    n_windows = len(screen_list)
    index = 0
    while True:
        x = np.zeros((batch_size, SEQUENCE_LEN, n_topics), dtype=np.float64)
        y = np.zeros((batch_size, n_topics), dtype=np.float64)
        for row in range(batch_size):
            pos = (index + row) % n_windows
            for step, tpc in enumerate(screen_list[pos]):
                x[row, step, :] = tpc
            y[row, :] = next_screen_list[pos]
        index += batch_size
        yield x, y


def get_model(dropout=0.5):
    """Assemble the (uncompiled) bidirectional-LSTM sequence model.

    The stack is: a bidirectional LSTM with 50 ReLU units reading windows of
    shape ``(SEQUENCE_LEN, n_topics)``, an optional dropout layer, and a
    linear dense layer with ``n_topics`` outputs.

    Args:
        dropout: dropout rate; the dropout layer is left out when this is
            not greater than 0.

    Returns:
        The Keras ``Sequential`` model.
    """
    print('Build model...')
    layers = [Bidirectional(LSTM(50, activation="relu"),
                            input_shape=(SEQUENCE_LEN, n_topics))]
    if dropout > 0:
        layers.append(Dropout(dropout))
    layers.append(Dense(n_topics))

    model = Sequential()
    for layer in layers:
        model.add(layer)
    return model


# Functions from keras-team/keras/blob/master/examples/lstm_text_generation.py
def sample(preds, temperature=1.0):
    """Draw one index from a probability vector after temperature scaling.

    The probabilities are re-weighted as ``softmax(log(preds) / temperature)``
    and a single index is drawn from the resulting distribution with NumPy's
    global random generator.

    Args:
        preds: 1-D array-like of non-negative weights.
        temperature: values below 1 sharpen the distribution, values above 1
            flatten it.

    Returns:
        The sampled index (NumPy integer).
    """
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    # Shift by the maximum before exponentiating: the softmax is unchanged
    # mathematically, but small temperatures no longer underflow to 0/0.
    exp_preds = np.exp(preds - np.max(preds))
    preds = exp_preds / np.sum(exp_preds)
    # One multinomial trial gives a one-hot row; argmax recovers its index.
    # (The previous version referenced the undefined names `n_entities` and
    # `probas`, so every call ended in a NameError.)
    probas_topic = np.random.multinomial(1, preds, 1)
    return np.argmax(probas_topic)


def on_epoch_end(epoch, logs):
    """Epoch-end hook: log a 10-step roll-out of the model to the examples file.

    A random window is picked from the train and test windows, the model
    predicts the next vector, the window is shifted by one step with that
    prediction appended, and the procedure repeats ten times. The seed and
    each prediction are written to the notebook-level ``examples_file``.

    Args:
        epoch: epoch index supplied by Keras; written into the log header.
        logs: metrics dict supplied by Keras (unused).
    """
    examples_file.write('\n----- Generating topic after Epoch: %d\n' % epoch)

    # Randomly pick a seed sequence (the combined list is built only once)
    all_screens = screens_train + screens_test
    seed_index = np.random.randint(len(all_screens))
    screen = all_screens[seed_index]

    examples_file.write('----- Generating with seed:\n"' + ' ' + str(screen) + '"\n')
    for _ in range(10):
        x_pred = np.zeros((1, SEQUENCE_LEN, n_topics))
        for step, top in enumerate(screen):
            x_pred[0, step, :] = top

        # predict() returns the same values as predict_proba() but without
        # the probability-range warning, which does not apply to this linear
        # regression output; it also matches the evaluation cell.
        preds = model.predict(x_pred, verbose=0)[0]

        # Slide the window: drop the oldest step, append the prediction
        screen = np.vstack([screen[1:], preds])

        examples_file.write(" " + str(preds))
        examples_file.write('\n')
    examples_file.write('=' * 80 + '\n')
    examples_file.flush()

In [None]:
#(screens_train, next_screens_train), (screens_test, next_screens_test), (
#index_train, index_test) = utils.shuffle_and_split_training_set_LDA(
#    screens, next_screens, SEQUENCE_LEN)

#M_T_screens_test = [M_T_screens[i] for i in list(index_test)]
#M_T_next_screens_test = [M_T_next_screens[i] for i in list(index_test)]
# x = np.zeros((len(screens), SEQUENCE_LEN, n_topics), dtype=np.float64)
# y = np.zeros((len(screens), n_topics), dtype=np.float64)
# for i, scr in enumerate(screens):
#     for t, tpc in enumerate(scr):
#         x[i, t, :] = tpc
#     y[i, :] = next_screen[i]

# Checkpoints are written below ./checkpoints/, created on first use
if not os.path.isdir('./checkpoints/'):
    os.makedirs('./checkpoints/')

model = get_model()
# summary() prints the table itself and returns None, so it is not wrapped
# in print() (which would add a stray "None" line).
model.summary()
model.compile(loss='mean_squared_error', optimizer="adam",
              metrics=['accuracy'])  # categorical_crossentropy , mean_squared_error, logcosh

# The {…} placeholders are filled in by Keras for every saved checkpoint;
# the two %d fields are filled in here.
file_path = "./checkpoints/LSTM_LDA-epoch{epoch:03d}-topics%d-sequence%d-" \
            "loss{loss:.4f}-acc{accuracy:.4f}-val_loss{val_loss:.4f}-val_acc{val_accuracy:.4f}" % \
            (n_topics, SEQUENCE_LEN)

# NOTE(review): both callbacks monitor 'val_accuracy' although the model is
# trained with a mean-squared-error loss - confirm that this is the intended
# selection criterion rather than 'val_loss'.
checkpoint = ModelCheckpoint(file_path, monitor='val_accuracy', save_best_only=True)
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)
early_stopping = EarlyStopping(monitor='val_accuracy', patience=20)
callbacks_list = [checkpoint, print_callback, early_stopping]

examples = 'examples_topic_vector.txt'
examples_file = open(examples, "w")
try:
    history = model.fit_generator(generator(screens_train, next_screens_train, BATCH_SIZE),
                                  steps_per_epoch=int(len(screens_train) / BATCH_SIZE) + 1,
                                  epochs=100,
                                  callbacks=callbacks_list,
                                  validation_data=generator(screens_test, next_screens_test, BATCH_SIZE),
                                  validation_steps=int(len(screens_test) / BATCH_SIZE) + 1)
finally:
    # Close the log written by on_epoch_end even when training is interrupted,
    # so the file handle is not leaked and buffered text reaches the disk.
    examples_file.close()

# model.save(save_dir + "/" + 'LSTM_LDA.final.hdf5')

In [None]:
# **************  Generate Recommendations  **************
# Future step
n_future = 1
# Amount of test data: the test windows rounded down to a multiple of 100,
# because the predictions are later decoded in chunks of 100 rows.
n_tests = (len(screens_test) // 100) * 100
precision, recall, precision_kw, recall_kw, \
precision_app, recall_app, precision_ppl, recall_ppl = [], [], [], [], [], [], [], []
# Randomly pick a seed sequence
#seed_index = np.random.randint(len(screens_test), size=n_tests)
# Manual pick a seed sequence: the first n_tests test windows, in order
seed_index = np.arange(start=0, stop=n_tests, step=1)
n_empty_app = 0

# One predicted latent vector per evaluated window. The width follows
# n_topics (it was hard-coded to 20) so it matches the LSTM output size
# whenever the configured number of topics changes.
test_ = np.zeros(shape=(n_tests, n_topics))
# Ground truth collected per evaluated window
test_kw = []
test_ppl = []
test_app = []
test_flat_entities_original = []
# For every evaluated test window: record the entities that really occur in
# the following document(s), split by view, and store the LSTM prediction.
for k in range(n_tests):
    recommended_entities = []

    # Position of the k-th evaluated window inside the full window list
    seed = index_test[seed_index[k]]
    screen = screens[seed]
    # Sparse term columns of the next n_future documents after that window
    entities_screen = M_T_next_screens[seed:seed + n_future]

    entities_original = []
    for n in range(n_future):
        entities_original.extend(data.dictionary[term_id] for term_id in entities_screen[n].indices)
    # Drop repeated entity names
    flat_entities_original = list(dict.fromkeys(entities_original))
    flat_entities_original_views = [data.views_ind[data.dictionary.token2id[name]]
                                    for name in flat_entities_original]

    # Views as used further down: 1 = keywords, 2 = applications, 3 = people
    tagged_entities = list(zip(flat_entities_original, flat_entities_original_views))
    original_kw = [name for name, view_id in tagged_entities if view_id == 1]
    original_app = [name for name, view_id in tagged_entities if view_id == 2]
    original_ppl = [name for name, view_id in tagged_entities if view_id == 3]

    test_kw.append(original_kw)
    test_app.append(original_app)
    test_ppl.append(original_ppl)
    test_flat_entities_original.append(flat_entities_original)

    "=====================================    VAE + LSTM   ============================"

    "Prepare the input for the model"
    x_pred = np.zeros((1, SEQUENCE_LEN, n_topics))
    for step, latent_vector in enumerate(screen):
        x_pred[0, step, :] = latent_vector
    "predict the next latent vector (or the topic vector)"
    preds = model.predict(x_pred, verbose=0)[0]
    test_[k] = preds
#print test_
# Decode the predicted latent vectors back to feature space, 100 rows at a time.
# NOTE(review): the chunk size of 100 presumably has to match the batch size
# the VAE was built with - confirm against vae.VariationalAutoencoder.generate.
transformed_test_ = []
n_chunks = n_tests // 100
# np.array_split rejects zero sections, so skip decoding when fewer than 100
# test windows are available (n_tests == 0) instead of raising a ValueError.
if n_chunks > 0:
    for latent_chunk in np.array_split(test_, n_chunks):
        for v in vae_latent.generate(latent_chunk):
            transformed_test_.append(v)

#transformed_test_ = vae_latent.generate(test_)
def _hits(recommended, relevant):
    """Return the recommended items that also appear in ``relevant`` (order kept)."""
    return [item for item in recommended if item in relevant]


def _ratio_or_default(numerator, denominator, default):
    """Return ``numerator / denominator`` as a float, or ``default`` when the denominator is 0."""
    if denominator == 0:
        return default
    return float(numerator) / float(denominator)


# Rank every feature by its decoded score, keep the top suggestions per view
# and score them against the entities that really occurred next.
for i in range(n_tests):
    original_kw = test_kw[i]
    original_app = test_app[i]
    original_ppl = test_ppl[i]
    flat_entities_original = test_flat_entities_original[i]

    print(original_app)

    if len(original_app) == 0:
        n_empty_app += 1

    # (feature index, decoded score, view id) for every feature. The indices
    # come from enumerate and are therefore already unique, so no separate
    # de-duplication pass is needed after sorting.
    predictionList = [(ind, v, data.views_ind[ind])
                      for ind, v in enumerate(transformed_test_[i])]
    sorted_terms = sorted(predictionList, key=lambda term: term[1], reverse=True)

    # Ranked feature indices of each view; list position 0 holds view 1
    sorted_views_list = []
    for view in range(1, data.num_views):
        sorted_views_list.append([term[0] for term in sorted_terms if term[2] == view])

    # Top suggestions per view. These are computed regardless of the
    # FOCUS_UI_simulator flag, so the scoring below also works when the flag
    # is off (it used to fail on the then-undefined data_output).
    top_per_view = []
    for view, ranked_ids in enumerate(sorted_views_list, 1):
        n_show = min(params["suggestion_count"], data.num_items_per_view[view])
        top_per_view.append(ranked_ids[:n_show])

    # IN TERMINAL USER INTERFACE
    if params["FOCUS_UI_simulator"]:

        for view, top_ids in enumerate(top_per_view, 1):
            print('view %d:' % view)
            for feature_id in top_ids:
                print('    %d,' % feature_id + ' ' + data.feature_names[feature_id])

        # organize the recommendations in the right format
        data_output = {}
        data_output["keywords"] = [(fid, data.feature_names[fid]) for fid in top_per_view[0]]
        data_output["applications"] = [(fid, data.feature_names[fid]) for fid in top_per_view[1]]
        data_output["people"] = [(fid, data.feature_names[fid]) for fid in top_per_view[2]]

    recommended_kw = [data.feature_names[fid] for fid in top_per_view[0]]
    recommended_app = [data.feature_names[fid] for fid in top_per_view[1]]
    recommended_ppl = [data.feature_names[fid] for fid in top_per_view[2]]

    recommended_entities = recommended_kw + recommended_app + recommended_ppl

    c = _hits(recommended_entities, flat_entities_original)
    if c:
        print('these are the elements of list a that are present in list b:')
        print(c)
    else:
        print('no elements of list a are in list b')

    # An empty recommendation list counts as precision 0.0 instead of
    # aborting the whole evaluation with a ZeroDivisionError.
    overall_precision = _ratio_or_default(len(c), len(recommended_entities), 0.0)
    # Single pre-formatted string: a multi-argument print() call would print
    # a tuple when print is a statement (Python 2).
    print('%s  correct recommendations in  %s future pages' % (overall_precision, n_future))

    precision.append(overall_precision)
    # The tiny epsilon keeps the overall recall defined (as 0) when the
    # following document contains no entities at all.
    recall.append(float(len(c)) / float(len(flat_entities_original) + 1e-10))

    # Per-view scores. Recall is recorded as None when the view has no true
    # entities, so those cases can be left out of the averages later.
    per_view = ((recommended_kw, original_kw, precision_kw, recall_kw),
                (recommended_app, original_app, precision_app, recall_app),
                (recommended_ppl, original_ppl, precision_ppl, recall_ppl))
    for recommended, original, view_precision, view_recall in per_view:
        n_hit = len(_hits(recommended, original))
        view_precision.append(_ratio_or_default(n_hit, len(recommended), 0.0))
        view_recall.append(_ratio_or_default(n_hit, len(original), None))

def _mean_std(values):
    """Return ``(mean, std)`` of the entries of ``values`` that are not None."""
    kept = [value for value in values if value is not None]
    return np.mean(kept), np.std(kept)


# Aggregate the per-window scores. Recall entries recorded as None (views
# without any true entity) are excluded from the statistics.
precision_avg, precision_std = _mean_std(precision)
precision_kw_avg, precision_kw_std = _mean_std(precision_kw)
precision_ppl_avg, precision_ppl_std = _mean_std(precision_ppl)
precision_app_avg, precision_app_std = _mean_std(precision_app)

recall_avg, recall_std = _mean_std(recall)
recall_kw_avg, recall_kw_std = _mean_std(recall_kw)
recall_ppl_avg, recall_ppl_std = _mean_std(recall_ppl)
recall_app_avg, recall_app_std = _mean_std(recall_app)

# Each line is formatted into a single string before printing: a
# multi-argument print() call prints a tuple when print is a statement.
report = (
    ('precision', precision_avg, precision_std),
    ('precision_kw', precision_kw_avg, precision_kw_std),
    ('precision_ppl', precision_ppl_avg, precision_ppl_std),
    ('precision_app', precision_app_avg, precision_app_std),
    ('recall', recall_avg, recall_std),
    ('recall_kw', recall_kw_avg, recall_kw_std),
    ('recall_ppl', recall_ppl_avg, recall_ppl_std),
    ('recall_app', recall_app_avg, recall_app_std),
)
for label, avg, std in report:
    print('%s:  %s + %s' % (label, avg, std))
# Number of evaluated windows whose true next document had no application entity
print(n_empty_app)