In [1]:
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import random
import os
import pickle
import string
import requests
import collections
import io
import tarfile
import urllib.request
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/yskn67/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
batch_size = 500
embedding_size = 200
vocabulary_size = 2000
generations = 10000
model_learning_rate = 0.05

num_sampled = int(batch_size / 2)
window_size = 3

save_embeddings_every = 5000
print_valid_every = 5000
print_loss_every = 1000

In [3]:
stops = stopwords.words('english')
valid_words = ['love', 'hate', 'happy', 'sad', 'man', 'woman']

In [4]:
def load_movie_data():
    save_folder_name = '../data'
    pos_file = os.path.join(save_folder_name, 'rt-polaritydata', 'rt-polarity.pos')
    neg_file = os.path.join(save_folder_name, 'rt-polaritydata', 'rt-polarity.neg')
    if not os.path.exists(os.path.join(save_folder_name, 'rt-polaritydata')):
        movie_data_url = 'http://www.cs.cornell.edu/people/pabo/movie-review-data/rt-polaritydata.tar.gz'
        req = requests.get(movie_data_url, stream=True)
        with open(os.path.join(save_folder_name, 'temp_movie_review_temp.tar.gz'), 'wb') as f:
            for chunk in req.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
                    f.flush()
        tar = tarfile.open(os.path.join(save_folder_name, 'temp_movie_review_temp.tar.gz'), "r:gz")
        tar.extractall(path=save_folder_name)
        tar.close()
    
    pos_data = []
    with open(pos_file, 'r', encoding='latin-1') as f:
        for line in f:
            pos_data.append(line.encode('ascii', errors='ignore').decode())
    pos_data = [x.rstrip() for x in pos_data]
    
    neg_data = []
    with open(neg_file, 'r', encoding='latin-1') as f:
        for line in f:
            neg_data.append(line.encode('ascii', errors='ignore').decode())
    neg_data = [x.rstrip() for x in neg_data]
    
    texts = pos_data + neg_data
    target = [1] * len(pos_data) + [0] * len(neg_data)
    
    return (texts, target)

texts, target = load_movie_data()

In [5]:
def normalize_text(texts, stops):
    texts = [x.lower() for x in texts]
    texts = [''.join(c for c in x if c not in string.punctuation) for x in texts]
    texts = [''.join(c for c in x if c not in '0123456789') for x in texts]
    texts = [' '.join([word for word in x.split() if word not in (stops)]) for x in texts]
    texts = [' '.join(x.split()) for x in texts]
    return texts
texts = normalize_text(texts, stops)

In [6]:
target = [target[ix] for ix, x in enumerate(texts) if len(x.split()) > 2]
texts = [x for x in texts if len(x.split()) > 2]

In [7]:
def build_dictionary(sentences, vocabulary_size):
    split_sentences = [s.split() for s in sentences]
    words = [x for sublist in split_sentences for x in sublist]
    count = [['RARE', -1]]
    count.extend(collections.Counter(words).most_common(vocabulary_size-1))
    word_dict = {}
    for word, word_count in count:
        word_dict[word] = len(word_dict)
    return word_dict

In [8]:
def text_to_numbers(sentences, word_dict):
    data = []
    for sentences in sentences:
        sentence_data = []
        for word in word_dict:
            if word in word_dict:
                word_ix = word_dict[word]
            else:
                word_ix = 0
            sentence_data.append(word_ix)
        data.append(sentence_data)
    return data

In [9]:
word_dictionary = build_dictionary(texts, vocabulary_size)
word_dictionary_rev = dict(zip(word_dictionary.values(), word_dictionary.keys()))
text_data = text_to_numbers(texts, word_dictionary)
valid_examples = [word_dictionary[x] for x in valid_words]

In [10]:
def generate_batch_data(sentences, batch_size, window_size, method='skip_gram'):
    batch_data = []
    label_data = []
    while len(batch_data) < batch_size:
        rand_sentences = sentences[np.random.randint(len(sentences))]
        window_sequences = [rand_sentences[max((ix - window_size), 0): (ix + window_size + 1)]
                               for ix, x in enumerate(rand_sentences)]
        label_indices = [ix if ix < window_size else window_size
                            for ix, x in enumerate(window_sequences)]
        
        if method == 'skip_gram':
            batch_and_labels = [(x[y], x[:y] + x[(y + 1):])
                                   for x, y in zip(window_sequences, label_indices)]
            tuple_data = [(x, y_) for x, y in batch_and_labels for y_ in y]
            batch, labels = [list(x) for x in zip(*tuple_data)]
        elif method == 'cbow':
            batch_and_labels = [(x[:y] + x[(y + 1):], x[y]) for x, y in zip(window_sequences, label_indices)]
            batch_and_labels = [(x, y) for x, y in batch_and_labels if len(x) == 2 * window_size]
            batch, labels = [list(x) for x in zip(*batch_and_labels)]
        else:
            raise ValueError('Method {} not implemented yet.'.format(method))
        batch_data.extend(batch[:batch_size])
        label_data.extend(labels[:batch_size])
        
    batch_data = batch_data[:batch_size]
    label_data = label_data[:batch_size]
    
    batch_data = np.array(batch_data)
    label_data = np.transpose(np.array([label_data]))
    
    return (batch_data, label_data)

In [11]:
with tf.Session() as sess:
    embeddings = tf.Variable(
        tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
    x_inputs = tf.placeholder(tf.int32, shape=[batch_size, 2 * window_size])
    y_target = tf.placeholder(tf.int32, shape=[batch_size, 1])
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
    embed = tf.zeros([batch_size, embedding_size])
    for element in range(2 * window_size):
        embed += tf.nn.embedding_lookup(embeddings, x_inputs[:, element])
    
    nce_weights = tf.Variable(tf.truncated_normal([vocabulary_size, embedding_size],
                                                 stddev=1.0 / np.sqrt(embedding_size)))
    nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
    loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weights,
                                         biases=nce_biases,
                                         labels=y_target,
                                         inputs=embed,
                                         num_sampled=num_sampled,
                                         num_classes=vocabulary_size))
    
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))
    normalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
    similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)

    saver = tf.train.Saver({"embeddings": embeddings})
    
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=model_learning_rate).minimize(loss)
    init = tf.global_variables_initializer()
    sess.run(init)
    
    text_data = [x for x in text_data if len(x) >= (2 * window_size + 1)]
    
    loss_vec = []
    loss_x_vec = []
    for i in range(generations):
        batch_input, batch_labels = generate_batch_data(text_data, batch_size,
                                                        window_size, method='cbow')
        feed_dict = {x_inputs: batch_input, y_target: batch_labels}
        sess.run(optimizer, feed_dict=feed_dict)
        
        if (i + 1) % print_loss_every == 0:
            loss_val = sess.run(loss, feed_dict=feed_dict)
            loss_vec.append(loss_val)
            loss_x_vec.append(i + 1)
            print("Loss at step {} : {}".format(i + 1, loss_val))
            
        if (i + 1) % print_valid_every == 0:
            sim = sess.run(similarity)
            for j in range(len(valid_words)):
                valid_word = word_dictionary_rev[valid_examples[j]]
                top_k = 5
                nearest = (-sim[j, :]).argsort()[1: top_k + 1]
                log_str = "Nearest to {}:".format(valid_word)
                for k in range(top_k):
                    close_word = word_dictionary_rev[nearest[k]]
                    log_str = "{} {},".format(log_str, close_word)
                print(log_str)
        
        if (i + 1) % save_embeddings_every == 0:
            with open('movie_vocab.pkl', 'wb') as f:
                pickle.dump(word_dictionary, f)
            model_checkpoint_path = './cbow_movie_embeddings.ckpt'
            save_path = saver.save(sess, model_checkpoint_path)
            print('Model save in file: {}'.format(save_path))

Loss at step 1000 : 0.975059449672699
Loss at step 2000 : 0.818419873714447
Loss at step 3000 : 0.7739680409431458
Loss at step 4000 : 0.7645130753517151
Loss at step 5000 : 0.7452167272567749
Nearest to love: fine, dialogue, things, others, probably,
Nearest to hate: behind, sadly, ms, treat, energetic,
Nearest to happy: questions, planet, example, throughout, project,
Nearest to sad: problem, cliches, created, mindless, directing,
Nearest to man: moving, project, inventive, dull, green,
Nearest to woman: mere, degree, poor, believable, went,
Model save in file: ./cbow_movie_embeddings.ckpt
Loss at step 6000 : 0.7114600539207458
Loss at step 7000 : 0.6900984048843384
Loss at step 8000 : 0.7305145859718323
Loss at step 9000 : 0.6865231394767761
Loss at step 10000 : 0.7282634377479553
Nearest to love: fine, dialogue, things, others, probably,
Nearest to hate: behind, sadly, ms, treat, energetic,
Nearest to happy: questions, planet, example, throughout, project,
Nearest to sad: problem, 