In [39]:
import tensorflow as tf
import numpy as np
save_file = './model.ckpt'
tf.reset_default_graph()

In [40]:
g = open('reviews.txt','r')
reviews = list(map(lambda x:x[:-1],g.readlines()))
g.close()

g = open('labels.txt','r')
labels = list(map(lambda x:x[:-1].upper(),g.readlines()))
g.close()

In [41]:
len(reviews)

25000

In [42]:
reviews[0:1]

['bromwell high is a cartoon comedy . it ran at the same time as some other programs about school life  such as  teachers  . my   years in the teaching profession lead me to believe that bromwell high  s satire is much closer to reality than is  teachers  . the scramble to survive financially  the insightful students who can see right through their pathetic teachers  pomp  the pettiness of the whole situation  all remind me of the schools i knew and their students . when i saw the episode in which a student repeatedly tried to burn down the school  i immediately recalled . . . . . . . . . at . . . . . . . . . . high . a classic line inspector i  m here to sack one of your teachers . student welcome to bromwell high . i expect that many adults of my age think that bromwell high is far fetched . what a pity that it isn  t   ']

In [43]:
labels[0]

'POSITIVE'

In [44]:
review_vocab = set()
for review in reviews:
    for word in review.split(' '):
        review_vocab.add(word)

review_vocab = list(review_vocab)

label_vocab = set()
for label in labels:
    label_vocab.add(label)

label_vocab = list(label_vocab)

review_vocab_size = len(review_vocab)
label_vocab_size = len(label_vocab)

word2index = {}
for i, word in enumerate(review_vocab):
    word2index[word] = i
    
label2index = {}
for i, label in enumerate(label_vocab):
    label2index[label] = i

In [45]:
review_vocab

['',
 'buick',
 'pigface',
 'investing',
 'dogpatch',
 'kwame',
 'vacillate',
 'et',
 'mathurin',
 'homesteading',
 'ofcourse',
 'macliammir',
 'aryana',
 'expendable',
 'contaminating',
 'chapeaux',
 'author',
 'yack',
 'swampy',
 'insignificant',
 'surrealness',
 'levels',
 'escort',
 'atrociousthe',
 'slithered',
 'scissors',
 'transcendence',
 'protected',
 'combed',
 'alrite',
 'unassured',
 'sefa',
 'immediacy',
 'gabrielle',
 'phieffer',
 'asco',
 'cache',
 'guild',
 'counterstrike',
 'tolerans',
 'lunacy',
 'entitle',
 'abridge',
 'yojimbo',
 'calibration',
 'galaxina',
 'samot',
 'giornate',
 'stance',
 'zimmerman',
 'meddle',
 'luckely',
 'supertroopers',
 'queequeg',
 'latterday',
 'saxaphone',
 'questions',
 'novelized',
 'fontanelles',
 'truckers',
 'brand',
 'psicoanalitical',
 'rinne',
 'markup',
 'canoe',
 'moly',
 'megalunged',
 'tyrannosaur',
 'remake',
 'libertine',
 'ummmm',
 'blackens',
 'cams',
 'genma',
 'stagehands',
 'rightly',
 'appalingly',
 'misses',
 'pliss

In [46]:
label_vocab

['POSITIVE', 'NEGATIVE']

In [47]:
word2index

{'': 0,
 'buick': 1,
 'pigface': 2,
 'investing': 3,
 'dogpatch': 4,
 'woodenly': 70573,
 'kwame': 5,
 'vacillate': 6,
 'et': 7,
 'mathurin': 8,
 'ofcourse': 10,
 'macliammir': 11,
 'aryana': 12,
 'expendable': 13,
 'swampy': 18,
 'chapeaux': 15,
 'author': 16,
 'yack': 17,
 'surrealness': 20,
 'levels': 21,
 'atrociousthe': 23,
 'slithered': 24,
 'scissors': 25,
 'transcendence': 26,
 'protected': 27,
 'combed': 28,
 'reanimated': 24622,
 'cop': 49521,
 'unassured': 30,
 'sefa': 31,
 'immediacy': 32,
 'asco': 35,
 'cache': 36,
 'guild': 37,
 'tolerans': 39,
 'maris': 59361,
 'lunacy': 40,
 'entitle': 41,
 'booklet': 37066,
 'abridge': 42,
 'yojimbo': 43,
 'calibration': 44,
 'herringbone': 24628,
 'samot': 46,
 'giornate': 47,
 'stance': 48,
 'supertroopers': 52,
 'luckely': 51,
 'queequeg': 53,
 'homesteading': 9,
 'latterday': 54,
 'saxaphone': 55,
 'questions': 56,
 'truckers': 59,
 'psicoanalitical': 61,
 'rinne': 62,
 'markup': 63,
 'canoe': 64,
 'moly': 65,
 'fannie': 61824,
 'm

In [48]:
label2index

{'NEGATIVE': 1, 'POSITIVE': 0}

In [49]:
learning_rate = 0.1
epochs = 20
batch_size = 120
display_step = 1

n_input = review_vocab_size
n_classes = label_vocab_size
hidden_layer = 10

In [50]:
weights = {
    'hidden_layer': tf.Variable(tf.random_normal([n_input, hidden_layer])),
    'out': tf.Variable(tf.random_normal([hidden_layer, n_classes]))
}

biases = {
    'hidden_layer': tf.Variable(tf.random_normal([hidden_layer])),
    'out': tf.Variable(tf.random_normal([n_classes]))
}

In [51]:
x = tf.placeholder("float32", [None, n_input])
y = tf.placeholder("float32", [None, n_classes])

In [52]:
layer = tf.add(tf.matmul(x, weights['hidden_layer']), biases['hidden_layer'])
layer = tf.nn.relu(layer)

logits = tf.add(tf.matmul(layer, weights['out']), biases['out'])

In [53]:
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y))
optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(cost)

In [68]:
def get_x(x):
    batch_x = np.zeros((1, review_vocab_size))
    
    for word in x.split(' '):
        if (word in word2index.keys()):
            batch_x[0][word2index[word]] = 1
    
    return batch_x

def get_y(y):
    batch_y = np.zeros((1, label_vocab_size))
    if train_y[i] == 'POSITIVE':
        batch_y[0][1] = 1
    else:
        batch_y[0][0] = 1
        
    return batch_y
            
            
init = tf.global_variables_initializer()
# saver = tf.train.Saver()

train_x = reviews[:-10]
train_y = labels[:-10]

test_x = [get_x(reviews[24999 - i]) for i in range(len(reviews[-1000:]))]
test_y = [get_y(labels[24999 - i]) for i in range(len(labels[-1000:]))]

# Launch the graph
with tf.Session() as sess:
    sess.run(init)
    # Training cycle
    for epoch in range(1):
#         total_batch = int(len(train_x) / batch_size)
        # Loop over all batches
        for i in range(len(train_x)):
#             batch_x = train_x[i * batch_size:(i + 1) * batch_size]
#             batch_y = train_y[i * batch_size:(i + 1) * batch_size]
            batch_x = get_x(train_x[i])
            batch_y = get_y(train_y[i])
            
            # Run optimization op (backprop) and cost op (to get loss value)
            sess.run(optimizer, feed_dict={x: batch_x, y: batch_y})
        # Display logs per epoch step
#         if epoch % 0 == 0:
        c = sess.run(cost, feed_dict={x: batch_x, y: batch_y})
        print("Epoch:", '%04d' % (epoch+1), "cost=", \
            "{:.9f}".format(c))
#     saver.save(sess, save_file)
    print("Optimization Finished!")

#     # Test model
    correct_prediction = tf.equal(tf.argmax(logits, 1), tf.argmax(y, 1))
#     # Calculate accuracy
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
#     # Decrease test_size if you don't have enough memory
#     test_size = 256
    print("Accuracy:", accuracy.eval({x: test_x, y: test_y}))

Epoch: 0001 cost= 0.667817056
Optimization Finished!


ValueError: Cannot feed value of shape (1000, 1, 74074) for Tensor 'Placeholder:0', which has shape '(?, 74074)'