In [1]:
import json
import pprint
from random import randint, shuffle, random
import numpy as np

def get_word_dic(fre_limit):
    word_dic = {}
    num = 0
    with open('freq_word.txt', 'r') as data_file:
        inputs = json.load(data_file)
    for couple in inputs:
        if couple[1] >= fre_limit:
            word_dic[couple[0]] = num
            num += 1
    return word_dic

def words_to_vector(word_dic, words):
    vec = [0. for i in range(len(word_dic))]
    num = 0
    for word in words:
        if word in word_dic:
            vec[word_dic[word]] += 1.
            num += 1.
    if num == 0:
        return []
    return [i / num for i in vec]

def get_story_vec(word_dic):
    stories = {}
    with open('story_seqs.txt', 'r') as data_file:
        inputs = json.load(data_file)
    for sid in inputs:
#         print(sid, inputs[sid]['title'], inputs[sid]['words'])
        vec = words_to_vector(word_dic, inputs[sid]['words'])
        if len(vec) == 0:
            print('empty doc vec')
            continue
        stories[int(sid)] = dict(title = inputs[sid]['title'], vec = vec)
    return stories

def get_right_list(set_len, view_set):
    r_index = list(range(len(view_set)))
    shuffle(r_index)
    return [view_set[r_index[i]] for i in range(set_len)]

def generate_data_set(set_len, train_rate):
    with open('view_lists.txt', 'r') as data_file:
        inputs = json.load(data_file)
    outputs = [[],[]]
    labels = [[],[]]
    loop_num = [20, 5]

    for list_json in inputs:
        view_list = list_json['list']
        samples = list_json['samples'] * 1.0

        unview_set = set()
        view_set = set(view_list)
        for i in range(1, 701):
            if i not in view_set:
                unview_set.add(i)

        view_list = list(view_set)
        unview_list = list(unview_set)

        if len(view_list) < set_len:
            continue

        group = 0
        if random() > train_rate:
            group = 1

        sub_num = 1.0
        while samples >= 1.:
            samples -= sub_num
            sub_num *= 1.01
            for i in range(loop_num[group]):
                right_list = get_right_list(set_len, view_list)
                outputs[group].append(right_list)
                labels[group].append([0., 1.])
                if group == 0:
                    ind = randint(0, len(unview_list) - 1)
                    wrong_list = right_list[: set_len - 1] + [unview_list[ind]]
                    outputs[group].append(wrong_list)
                    labels[group].append([1., 0.])
                else:
                    for sid in unview_list:
                        wrong_list = right_list[: set_len - 1] + [sid]
                        outputs[group].append(wrong_list)
                        labels[group].append([1., 0.])
    return outputs, labels

In [2]:
word_fre_limit = 10
view_list_len = 5
train_rate = 0.85

word_dic = get_word_dic(word_fre_limit)
doc_vec_len = len(word_dic)
stories = get_story_vec(word_dic)
# for i in range(1, 2):
#     pprint.pprint(stories[i])
list2_data, labels = generate_data_set(view_list_len, train_rate)

print(doc_vec_len)
print(len(list2_data[0]), len(list2_data[1]))

2130
215480 1139430


In [3]:
input_vecs = [[[stories[sid]['vec'] for sid in viewlist]
              for viewlist in list_data]
              for list_data in list2_data]
print(len(input_vecs[0]), len(input_vecs[1]))

215480 1139430


In [4]:
np.shape(input_vecs[1][0])

(5, 2130)

In [5]:
train_x = input_vecs[0]
train_y = labels[0]

r_index = list(range(len(labels[1])))

test_x = [input_vecs[1][i] for i in r_index]
test_y = [labels[1][i] for i in r_index]

test_x_hit = []
test_y_hit = []
test_x_miss = []
test_y_miss = []
for i in range(len(test_y)):
    if test_y[i][0] < 0.5:
        test_x_hit.append(test_x[i])
        test_y_hit.append(test_y[i])
    else:
        test_x_miss.append(test_x[i])
        test_y_miss.append(test_y[i])

print(len(test_x_hit), len(test_x_miss))

1670 1137760


In [6]:
print(len(train_x))
print(len(train_y))
print(np.shape(train_x[0]))

215480
215480
(5, 2130)


In [7]:
import tensorflow as tf
print(tf.__version__)

  return f(*args, **kwds)


1.4.0


In [8]:
n_hidden = 50
n_classes = 2

graph = tf.Graph()
with graph.as_default(), tf.device('/cpu:0'):
    
    weights = {
        'out': tf.Variable(tf.random_normal([n_hidden, n_classes]))
    }
    biases = {
        'out': tf.Variable(tf.random_normal([n_classes]))
    }
    
    
    def dynamicRNN(x, weights, biases):
        x = tf.unstack(x, view_list_len, 1)
        lstm_cell = tf.contrib.rnn.BasicLSTMCell(n_hidden)
        outputList, states = tf.contrib.rnn.static_rnn(lstm_cell, x, dtype=tf.float32) #,sequence_length=10)
        output = outputList[-1]
        print(output.get_shape())
#         outputs = tf.transpose(outputs, [1, 0, 2])
        
#         outputs = tf.reshape(outputs, [-1, n_hidden])

        # Linear activation, using outputs computed above
        return tf.matmul(output, weights['out']) + biases['out'], output, states
        
    x = tf.placeholder("float", [None, view_list_len, doc_vec_len])
    y = tf.placeholder("float", [None, n_classes])
    inputs = x
    labels = y
    logits, outputList, finalstates = dynamicRNN(x, weights, biases)
    
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y))
    optimizer = tf.train.AdamOptimizer(learning_rate = 0.0001).minimize(loss)
    
    correct_pred = tf.equal(tf.argmax(logits,1), tf.argmax(y,1))
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
    
    saver = tf.train.Saver()

(?, 50)


In [9]:
batch_size = 1024
batch_start = 0;
train_length = len(train_x)
print(train_length)

train_x_extend = [i for i in train_x]
train_y_extend = [i for i in train_y]

print(len(train_x_extend))

215480
215480


In [10]:
if(len(train_x_extend)==len(train_x)):
    train_x_extend.extend(train_x[0:batch_size])
    train_y_extend.extend(train_y[0:batch_size])
print(len(train_x_extend))

216504


In [11]:
num_steps = 1000

with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print('Initialized')
    for step in range(num_steps):
        feed_dict = {inputs: train_x[batch_start:batch_start+batch_size], labels: train_y[batch_start:batch_start+batch_size]}
        _, l, predictions, acc, _output, _states, _w, _b  =session.run(
            [optimizer, loss, logits, accuracy, outputList, finalstates, weights, biases],
            feed_dict = feed_dict)
        if (step % 100 == 0):
            print('Loss at step %d: %f acc: %f' % (step, l, acc))
        
        batch_start += batch_size
        if(batch_start>=train_length):
            batch_start -=train_length
    saver.save(session, 'lstm_model/bow.ckpt', global_step=num_steps)
    limit_test_len = len(test_x)
    if (limit_test_len > 10000):
        limit_test_len = 10000
    _acc, _loss, _log, _pre = session.run([accuracy, loss, logits, correct_pred],
                feed_dict={inputs: test_x[:limit_test_len], labels: test_y[:limit_test_len]})
    print("Testing Accuracy:", _acc)
    print("Testing Hit Accuracy:", session.run(accuracy,
                                           feed_dict={inputs: test_x_hit, labels: test_y_hit}))
    print("Testing Miss Accuracy:", session.run(accuracy,
                                           feed_dict={inputs: test_x_miss[:limit_test_len],
                                                      labels: test_y_miss[:limit_test_len]}))

Instructions for updating:
Use `tf.global_variables_initializer` instead.
Initialized
Loss at step 0: 1.465348 acc: 0.500000
Loss at step 100: 0.977223 acc: 0.500000
Loss at step 200: 0.629993 acc: 0.928711
Loss at step 300: 0.603431 acc: 0.893555
Loss at step 400: 0.589132 acc: 0.955078
Loss at step 500: 0.565426 acc: 0.961914
Loss at step 600: 0.532084 acc: 0.975586
Loss at step 700: 0.510906 acc: 0.956055
Loss at step 800: 0.486267 acc: 0.969727
Loss at step 900: 0.458059 acc: 0.965820
Testing Accuracy: 0.921
Testing Hit Accuracy: 0.994611
Testing Miss Accuracy: 0.9203


In [15]:
pprint.pprint(list(_pre))

[True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 False,
 False,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,


 True,
 True,
 True,
 False,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 False,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 Tr

 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True

 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 False,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True

 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 Tr

In [19]:
with tf.Session(graph=graph) as session:
    saver.restore(session, "lstm_model/bow.ckpt-1000")
    _acc, _loss, _log, _pre = session.run([accuracy, loss,logits, correct_pred],
                feed_dict={inputs: test_x[-1000:], labels: test_y[-1000:]})

INFO:tensorflow:Restoring parameters from lstm_model/bow.ckpt-1000


In [20]:
_acc

0.94999999