In [None]:
%matplotlib inline
import os
from tqdm import tqdm, tqdm_notebook
import random
import pickle

import nltk
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split

EXP_TITLE = "adam"

# Hyper-Parameters

In [2]:
window_size = 5
feature_size = 100
hidden_unit = 500
learning_rate = 1e-4

# Data Loading & Parsing

In [3]:
def n_gram(words: list, n=window_size):
    return [(words[i: i + n], words[i + n]) for i in range(len(words) - n)]

In [4]:
data = []
file_list = os.listdir("aclImdb/train/pos")
for filename in tqdm_notebook(file_list[:100], desc="File List"):
    with open(os.path.join("aclImdb/train/pos", filename), "rt", encoding="utf-8") as f:
        data.append(f.read())




In [5]:
vocab = set()
dataset = []

for doc in tqdm_notebook(data, desc="Tokenizing"):
    words = nltk.word_tokenize(doc)
    dataset += n_gram(words)
    vocab.update(words)
vocab = list(vocab)
print("DataSet Size: ", len(dataset))
print("Vocabulary Size: ", len(vocab))


DataSet Size:  27862
Vocabulary Size:  4757


## Splitting train, validation and test dataset

In [6]:
train, test = train_test_split(dataset, train_size=0.7, test_size=0.3, random_state=100)
train, valid = train_test_split(train, train_size=0.9, test_size=0.1, random_state=random.randint(0, 99))

# Building Neural Network Model

## One-Hot encoding function

In [17]:
def one_hot(inputs: list or str):
    if type(inputs) == str:
        index = vocab.index(inputs)
        r = np.zeros((len(vocab), 1))
        r[index] = 1
        return r
    elif type(inputs) == list:
        r = []
        for w in inputs:
            index = vocab.index(w)
            zeros = np.zeros((len(vocab)))
            zeros[index] = 1
            r.append(zeros)
        return np.array(r)

## Input Variables

In [8]:
X = tf.placeholder(tf.float32, (None, len(vocab)))
Y = tf.placeholder(tf.float32, (len(vocab), 1))

## Projection Layer

In [9]:
with tf.name_scope("Projection_Layer"):
    proj_w = tf.get_variable("proj_w", shape=(len(vocab), feature_size), initializer=tf.contrib.layers.xavier_initializer())
    proj_b = tf.get_variable("proj_b", shape=(feature_size), initializer=tf.zeros_initializer())

    projection_layer = tf.add(tf.matmul(X, proj_w), proj_b)

    tf.summary.histogram("proj_w", proj_w)
    tf.summary.histogram("proj_b", proj_b)
    tf.summary.histogram("proj", projection_layer)

## Hidden Layer

In [10]:
with tf.name_scope("Hidden_Layer"):
    input_x = tf.reshape(projection_layer, (-1, 1))

    hidden_w = tf.get_variable("hidden_w", shape=(hidden_unit, window_size * feature_size), initializer=tf.contrib.layers.xavier_initializer())
    hidden_b = tf.get_variable("hidden_b", shape=(window_size * feature_size))
    hidden_layer = tf.tanh(tf.add(tf.matmul(hidden_w, input_x), hidden_b))

In [11]:
U = tf.get_variable("U", shape=(len(vocab), hidden_unit))
output = tf.tanh(tf.matmul(U, hidden_layer))

In [12]:
softmax_w = tf.get_variable("softmax_w", shape=(window_size * feature_size, 1), initializer=tf.contrib.layers.xavier_initializer())
softmax_b = tf.get_variable("softmax_b", shape=(1), initializer=tf.zeros_initializer())
output = tf.add(tf.matmul(output, softmax_w), softmax_b)

In [13]:
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=Y, logits=output, dim=0))
optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
train_op = optimizer.minimize(cost)

tf.summary.scalar("cost", cost)

<tf.Tensor 'cost:0' shape=() dtype=string>

In [15]:
merged = tf.summary.merge_all()
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())

train_writer = tf.summary.FileWriter("./logs/train_{}".format(EXP_TITLE), graph=sess.graph)
saver = tf.train.Saver()

In [18]:
global_step = 0

for epoch in tqdm_notebook(range(10), desc="Epoch"):
    for x, y in tqdm_notebook(train, leave=False):
        _ = sess.run(train_op, feed_dict={X: one_hot(x), Y: one_hot(y)})
        global_step += 1
        if global_step % 500 == 0:
            c, summary = sess.run([cost, merged], feed_dict={X: one_hot(x), Y: one_hot(y)})
            tqdm.write("Cost: {}".format(c))
            train_writer.add_summary(summary, global_step)
            try:
                saver.save(sess, "./models_{}/model_{}.ckpt".format(EXP_TITLE, global_step))
            except Exception:
                os.mkdir("models_{}".format(EXP_TITLE))
                saver.save(sess, "./models_{}/model_{}.ckpt".format(EXP_TITLE, global_step))

Cost:  8.51029
Cost:  8.41749



KeyboardInterrupt: 