In [4]:
import json
import pprint
from random import randint
import numpy as np

def get_word_dic(fre_limit):
    """Build the vocabulary lookup from ``freq_word.txt``.

    The file is parsed as JSON and iterated entry by entry; item 0 of an
    entry is used as the word and item 1 as its frequency.

    Args:
        fre_limit: minimum frequency (inclusive) an entry needs to be kept.

    Returns:
        dict mapping each kept word to a consecutive index starting at 0,
        numbered in file order.
    """
    with open('freq_word.txt', 'r') as freq_file:
        entries = json.load(freq_file)

    # Filter first, then number the survivors so indices stay gap-free.
    kept_words = (entry[0] for entry in entries if entry[1] >= fre_limit)
    return {word: index for index, word in enumerate(kept_words)}

def words_to_vector(word_dic, words):
    """Turn a word sequence into a normalised bag-of-words vector.

    Args:
        word_dic: mapping from word to its position in the vector.
        words: iterable of words; words missing from ``word_dic`` are ignored.

    Returns:
        A list of ``len(word_dic)`` floats where each entry is the share of
        in-vocabulary words that hit that position, or an empty list when no
        word of ``words`` is in ``word_dic``.
    """
    counts = [0.] * len(word_dic)
    hit_slots = [word_dic[token] for token in words if token in word_dic]
    for slot in hit_slots:
        counts[slot] += 1.

    # No in-vocabulary word: signal "no vector" with an empty list.
    if not hit_slots:
        return []

    total = float(len(hit_slots))
    return [count / total for count in counts]

def get_story_vec(word_dic):
    """Load ``story_seqs.txt`` and vectorise every story it contains.

    The file is parsed as JSON; each key is a story id and each value is
    read for its ``'words'`` and ``'title'`` entries.

    Args:
        word_dic: vocabulary mapping passed through to ``words_to_vector``.

    Returns:
        dict keyed by ``int(story id)`` whose values are
        ``{'title': ..., 'vec': ...}``. Stories whose vector comes back empty
        are reported with a printed message and left out.
    """
    with open('story_seqs.txt', 'r') as story_file:
        raw_stories = json.load(story_file)

    story_vecs = {}
    for story_id in raw_stories:
        story = raw_stories[story_id]
        doc_vec = words_to_vector(word_dic, story['words'])
        if not doc_vec:
            # Nothing in this story is in the vocabulary; skip it.
            print('empty doc vec')
            continue
        story_vecs[int(story_id)] = {'title': story['title'], 'vec': doc_vec}
    return story_vecs

def generate_data_set(list_len, story_count=700):
    """Build positive/negative viewing windows from ``view_lists.txt``.

    The file is parsed as JSON and iterated; each entry's ``'list'`` value is
    a sequence of story ids. Lists shorter than ``list_len`` are skipped. For
    every window of ``list_len`` consecutive ids two samples are emitted, in
    this order:

    * the real window, labelled ``[0., 1.]``;
    * the same window with its last id replaced by a randomly drawn id,
      labelled ``[1., 0.]``.

    The replacement id is drawn uniformly from the ids ``1, 2, 3, ...`` that
    are not in the exclusion set, i.e. the ids of the list minus those in its
    first ``list_len - 1`` positions. The draw assumes ids run from 1 to
    ``story_count`` and that every excluded id lies in that range.

    Args:
        list_len: window length (must be at least 1).
        story_count: total number of story ids available for negative
            sampling. Defaults to 700, which reproduces the previously
            hard-coded bound of 699.

    Returns:
        ``(outputs, labels)``: two parallel lists of windows and one-hot
        labels.

    Raises:
        ValueError: if ``list_len`` is below 1, or if a list leaves no id
            available for negative sampling.
    """
    if list_len < 1:
        raise ValueError('list_len must be at least 1, got %r' % (list_len,))

    with open('view_lists.txt', 'r') as data_file:
        inputs = json.load(data_file)

    outputs = []
    labels = []
    for list_json in inputs:
        view_list = list_json['list']
        if len(view_list) < list_len:
            continue

        # Ids that must never be drawn as a negative: everything in the list
        # except the ids sitting in the leading context-only positions.
        view_set = set(view_list)
        for i in range(list_len - 1):
            view_set.discard(view_list[i])

        candidate_count = story_count - len(view_set)
        if candidate_count < 1:
            raise ValueError(
                'no story id left for negative sampling: story_count=%d, '
                'excluded ids=%d' % (story_count, len(view_set)))

        for i in range(list_len - 1, len(view_list)):
            window = view_list[i - list_len + 1: i + 1]
            outputs.append(window)
            labels.append([0., 1.])

            # Pick the skip-th (0-based) id, counting up from 1, that is not
            # excluded; that id replaces the window's real last element.
            skip = randint(0, candidate_count - 1)
            candidate = 1
            while True:
                if candidate not in view_set:
                    if skip == 0:
                        break
                    skip -= 1
                candidate += 1
            outputs.append(window[:-1] + [candidate])
            labels.append([1., 0.])
    return outputs, labels

In [9]:
# Configuration: minimum word frequency for the vocabulary and the length of
# each viewing window handed to the model.
word_fre_limit = 10
view_list_len = 5

# Vocabulary, per-story vectors, and the positive/negative window data set.
word_dic = get_word_dic(word_fre_limit)
doc_vec_len = len(word_dic)  # every story vector has one slot per vocabulary word
stories = get_story_vec(word_dic)
# Debug aid (disabled): pprint.pprint(stories[1]) shows one story entry.
list_data, labels = generate_data_set(view_list_len)

# Sanity output: vocabulary size, then sample count and label count.
print(doc_vec_len)
print(len(list_data), len(labels))

2132
4780 4780


In [10]:
# Swap every story id in each window for that story's word vector, giving a
# samples x window x vocabulary nested list.
input_vecs = []
for window in list_data:
    input_vecs.append([stories[story_id]['vec'] for story_id in window])
np.shape(input_vecs)

(4780, 5, 2132)

In [13]:
import random
train_rate = 0.8

# Shuffle the sample positions, then cut once into train and test parts.
r_index = list(range(len(list_data)))
random.shuffle(r_index)
split_at = int(len(r_index) * train_rate)
train_positions = r_index[:split_at]
test_positions = r_index[split_at:]

train_x = [input_vecs[pos] for pos in train_positions]
train_y = [labels[pos] for pos in train_positions]
test_x = [input_vecs[pos] for pos in test_positions]
test_y = [labels[pos] for pos in test_positions]

# Partition the test samples by label: a first component below 0.5 marks a
# "hit" sample, anything else a "miss" sample.
test_x_hit, test_y_hit = [], []
test_x_miss, test_y_miss = [], []
for sample, target in zip(test_x, test_y):
    if target[0] < 0.5:
        bucket_x, bucket_y = test_x_hit, test_y_hit
    else:
        bucket_x, bucket_y = test_x_miss, test_y_miss
    bucket_x.append(sample)
    bucket_y.append(target)

In [14]:
# Sanity check of the split: number of training samples, number of training
# labels (should match), and the shape of a single training sample.
print(len(train_x))
print(len(train_y))
print(np.shape(train_x[0]))

3824
3824
(5, 2132)


In [15]:
import tensorflow as tf
# Record the TensorFlow version this notebook was executed with.
print(tf.__version__)

  return f(*args, **kwds)


1.4.0


In [18]:
n_hidden = 50           # size of the LSTM hidden state
num_steps = 5000        # number of full-batch training steps
n_classes = 2           # one-hot classes: index 0 = miss, index 1 = hit
learning_rate = 0.0001  # Adam step size

graph = tf.Graph()
with graph.as_default(), tf.device('/cpu:0'):

    # Linear read-out applied to the last LSTM output.
    weights = {
        'out': tf.Variable(tf.random_normal([n_hidden, n_classes]))
    }
    biases = {
        'out': tf.Variable(tf.random_normal([n_classes]))
    }

    def dynamicRNN(x, weights, biases):
        """Run an LSTM over the window and project its last output.

        Args:
            x: tensor that is unstacked along axis 1 into ``view_list_len``
                per-step inputs.
            weights: dict whose ``'out'`` entry is the projection matrix.
            biases: dict whose ``'out'`` entry is the projection bias.

        Returns:
            ``(logits, last_output, final_state)`` where ``logits`` is the
            linear projection of the last step's LSTM output.
        """
        # static_rnn expects a Python list with one tensor per time step.
        x = tf.unstack(x, view_list_len, 1)
        lstm_cell = tf.contrib.rnn.BasicLSTMCell(n_hidden)
        outputList, states = tf.contrib.rnn.static_rnn(lstm_cell, x, dtype=tf.float32)
        output = outputList[-1]
        print(output.get_shape())

        # Linear activation on the last step's output.
        return tf.matmul(output, weights['out']) + biases['out'], output, states

    # The placeholders are fed directly as `x` / `y`. They are deliberately
    # not aliased to `labels`, because that would overwrite the notebook-level
    # `labels` list and break re-running the train/test split cell.
    x = tf.placeholder("float", [None, view_list_len, doc_vec_len])
    y = tf.placeholder("float", [None, n_classes])
    logits, outputList, finalstates = dynamicRNN(x, weights, biases)

    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y))
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)

    correct_pred = tf.equal(tf.argmax(logits, 1), tf.argmax(y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print('Initialized')

    # The whole training set is fed on every step, so build the feed once and
    # convert the nested lists to float32 arrays up front instead of letting
    # session.run repeat that conversion on each step.
    feed_dict = {x: np.asarray(train_x, dtype=np.float32),
                 y: np.asarray(train_y, dtype=np.float32)}
    for step in range(num_steps):
        _, l, predictions, acc, _output, _states, _w, _b = session.run(
            [optimizer, loss, logits, accuracy, outputList, finalstates, weights, biases],
            feed_dict=feed_dict)
        if step % 100 == 0:
            print('Loss at step %d: %f acc: %f' % (step, l, acc))

    # Evaluate the held-out set once; the accuracy is reused for the report.
    _acc, _loss, _log = session.run([accuracy, loss, logits],
                                    feed_dict={x: test_x, y: test_y})

    print("Testing Accuracy:", _acc)
    print("Testing Hit Accuracy:", session.run(accuracy,
                                               feed_dict={x: test_x_hit, y: test_y_hit}))
    print("Testing Miss Accuracy:", session.run(accuracy,
                                                feed_dict={x: test_x_miss, y: test_y_miss}))

(?, 50)
Initialized
Loss at step 0: 0.738346 acc: 0.501308
Loss at step 100: 0.665847 acc: 0.888598
Loss at step 200: 0.638451 acc: 0.893044
Loss at step 300: 0.607974 acc: 0.919718
Loss at step 400: 0.572066 acc: 0.938285
Loss at step 500: 0.524684 acc: 0.955805
Loss at step 600: 0.438081 acc: 0.967573
Loss at step 700: 0.316274 acc: 0.970188
Loss at step 800: 0.232896 acc: 0.973849
Loss at step 900: 0.177761 acc: 0.975418
Loss at step 1000: 0.143007 acc: 0.978295
Loss at step 1100: 0.119962 acc: 0.981172
Loss at step 1200: 0.104005 acc: 0.982741
Loss at step 1300: 0.092413 acc: 0.984571
Loss at step 1400: 0.083598 acc: 0.985617
Loss at step 1500: 0.076641 acc: 0.986663
Loss at step 1600: 0.070993 acc: 0.987186
Loss at step 1700: 0.066299 acc: 0.988232
Loss at step 1800: 0.062310 acc: 0.989278
Loss at step 1900: 0.058881 acc: 0.989801
Loss at step 2000: 0.055873 acc: 0.990586
Loss at step 2100: 0.053193 acc: 0.991109
Loss at step 2200: 0.050807 acc: 0.991109
Loss at step 2300: 0.04864

In [20]:
# Dump the read-out weights and biases as fetched by the last training step.
print(_w, _b)

{'out': array([[ 4.1460886 , -0.79982728],
       [-0.09262101, -1.8403368 ],
       [-0.13673005,  1.13932014],
       [ 2.41629648, -0.21761551],
       [ 0.86916351, -0.61454934],
       [ 0.13271935, -0.31958792],
       [ 0.87902445,  0.70441228],
       [-1.76063371,  0.17705514],
       [-1.79775691, -0.19743636],
       [-0.78480625,  0.82893419],
       [ 0.5669027 ,  0.37268162],
       [-0.3448348 ,  1.5058713 ],
       [ 1.76369178, -0.53208447],
       [ 1.2716819 , -1.60101426],
       [ 0.81526417,  0.40886939],
       [ 0.76217526, -0.47564808],
       [-1.44761491, -1.5417738 ],
       [-0.12595981,  1.92044282],
       [-2.25187802, -0.30091959],
       [-2.14260602,  1.54209197],
       [ 0.46331334,  0.38964435],
       [ 0.18812354,  1.69453824],
       [-0.23647772,  1.16230381],
       [ 0.49304643,  0.97173315],
       [-0.01465913,  1.77578568],
       [ 1.1485281 ,  0.01627538],
       [-0.40430701, -0.32460755],
       [ 0.23986933, -0.49009082],
       [ 0.0

In [21]:
# Accuracy and loss measured on the held-out test split.
_acc, _loss

(0.97907948, 0.096978597)

In [22]:
# Number of test-set logit rows (one per test sample).
len(_log)

956

In [23]:
# First ten test-set logit rows; columns follow the one-hot label order.
_log[:10]

array([[-3.34424138,  4.78389645],
       [-2.11619663,  4.27642393],
       [-1.0538218 ,  4.51326609],
       [-2.11619663,  4.27642393],
       [ 4.36826944, -2.0003252 ],
       [ 8.52097702, -4.66331625],
       [ 5.3442049 , -2.29197764],
       [-0.79571134,  3.4618299 ],
       [ 5.74861383, -2.86019087],
       [-1.41135812,  4.17305374]], dtype=float32)

In [24]:
# First ten test labels, to compare against the logits:
# [0., 1.] marks a real viewing window, [1., 0.] a sampled negative.
test_y[:10]

[[0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [0.0, 1.0],
 [1.0, 0.0],
 [1.0, 0.0],
 [1.0, 0.0],
 [0.0, 1.0],
 [1.0, 0.0],
 [0.0, 1.0]]