In [1]:
import numpy as np
import tensorflow as tf
import random

# Load the IMDB review dataset with the full vocabulary (num_words=None).
# Word ids are shifted by index_from=3; id 1 marks the start of a review and
# id 2 stands in for out-of-vocabulary words.
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.imdb.load_data(
    path='imdb.npz',
    num_words=None,
    skip_top=0,
    maxlen=None,
    seed=113,
    start_char=1,
    oov_char=2,
    index_from=3,
)

# Partition the training reviews by label: 1 goes to the positive bucket,
# every other label to the negative bucket.
positive_reviews = []
negative_reviews = []
for review, label in zip(x_train, y_train):
    bucket = positive_reviews if label == 1 else negative_reviews
    bucket.append(review)

In [38]:
# Keep a random half of the positive reviews.
# `np.int` was removed in NumPy 1.24, so the sample size is computed with
# plain integer floor division (same result for any non-negative length).
# NOTE(review): no random.seed(...) call is visible before this sample, so a
# different half is presumably drawn on every fresh kernel -- confirm intended.
selected_positive_reviews = random.sample(positive_reviews, len(positive_reviews) // 2)

# word -> id map, shifted by 3 to line up with index_from=3 used by load_data;
# ids 0-3 are then assigned to the special tokens below.
word_index = tf.keras.datasets.imdb.get_word_index(path='imdb_word_index.json')
word_index = {k: (v + 3) for k, v in word_index.items()}
word_index["<PAD>"] = 0
word_index["<START>"] = 1
word_index["<UNK>"] = 2
word_index["<UNUSED>"] = 3

# id -> word map used to turn encoded reviews back into text.
reverse_word_index = {value: key for key, value in word_index.items()}

def decode_review(text):
    """Turn a sequence of word ids back into a space-separated string.

    Each id is looked up in the module-level ``reverse_word_index``; ids that
    are not present are rendered as ``'?'``.
    """
    words = []
    for token_id in text:
        words.append(reverse_word_index.get(token_id, '?'))
    return ' '.join(words)

# Decode every sampled review into a list of words. The first id of each
# review is skipped (load_data was called with start_char=1, so that is the
# start marker). Ids missing from reverse_word_index become the literal 'UNK'.
# Slicing instead of pop(0) also tolerates an empty review.
master_holder = [
    [reverse_word_index.get(idx, 'UNK') for idx in record[1:]]
    for record in selected_positive_reviews
]

# One string per review, then one big corpus string for the char-level model.
master_list = [' '.join(words) for words in master_holder]
text = ' '.join(master_list)

# The vocabulary is the sorted set of distinct characters in the corpus.
vocab = sorted(set(text))
# Length of the vocabulary in chars
vocab_size = len(vocab)
vocab_size

96

In [48]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Assemble the text-generation network: Embedding -> stateful GRU -> Dense.

    Args:
        vocab_size: number of distinct input ids; also the width of the
            final Dense layer.
        embedding_dim: size of each embedding vector.
        rnn_units: number of units in the GRU layer.
        batch_size: fixed batch size baked into the input shape (the GRU is
            stateful); the sequence length is left open.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model.
    """
    embedding = tf.keras.layers.Embedding(
        vocab_size,
        embedding_dim,
        batch_input_shape=[batch_size, None],
    )
    recurrent = tf.keras.layers.GRU(
        rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform',
    )
    projection = tf.keras.layers.Dense(vocab_size)
    return tf.keras.Sequential([embedding, recurrent, projection])

# The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 1024
checkpoint_dir = './training_checkpoints'

# Resolve the newest checkpoint once and fail with a clear message if there is
# none: tf.train.latest_checkpoint returns None in that case, and handing None
# to load_weights produces a much less helpful error.
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint is None:
    raise FileNotFoundError('No checkpoint found in ' + checkpoint_dir)

# Rebuild the generator with batch_size=1 so single seeds can be fed to it,
# then restore the trained weights.
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

model.load_weights(latest_checkpoint)

model.build(tf.TensorShape([1, None]))
def generate_text(model, start_string, num_generate=1000, temperature=1.0):
    """Generate text character by character from a seed string.

    Relies on the module-level ``char2idx`` (char -> id) and ``idx2char``
    (id -> char) lookups, which must be defined before this is called.

    Args:
        model: the stateful generator; it is called with a batch of size 1
            and its states are reset before generation starts.
        start_string: seed text; every character must be a key of
            ``char2idx`` (otherwise a ``KeyError`` is raised).
        num_generate: number of characters to generate (default 1000).
        temperature: divisor applied to the model output before sampling.
            Low temperatures give more predictable text, higher temperatures
            more surprising text. Must be non-zero.

    Returns:
        ``start_string`` followed by the ``num_generate`` sampled characters.
    """
    # Converting our start string to numbers (vectorizing)
    input_eval = [char2idx[s] for s in start_string]
    input_eval = tf.expand_dims(input_eval, 0)

    # Sampled characters, joined once at the end
    text_generated = []

    # Here batch size == 1
    model.reset_states()
    for _ in range(num_generate):
        predictions = model(input_eval)
        # remove the batch dimension
        predictions = tf.squeeze(predictions, 0)

        # using a categorical distribution to predict the character returned by the model;
        # only the sample for the last time step is kept
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

        # We pass the predicted character as the next input to the model
        # along with the previous hidden state
        input_eval = tf.expand_dims([predicted_id], 0)

        text_generated.append(idx2char[predicted_id])

    return start_string + ''.join(text_generated)

In [47]:
import re

from collections import Counter

# Count every lowercase alphanumeric token in the decoded corpus.
match_pattern = re.findall(r'\b[a-z0-9]+\b', text)
frequency = Counter(match_pattern)

# Most frequent first; ties keep first-occurrence order because the sort is
# stable and Counter preserves insertion order.
frequency_sorted = sorted(frequency, key=frequency.get, reverse=True)

# The top 12500 words are used as seed strings for text generation.
frequency_seed = frequency_sorted[:12500]
frequency_seed

['the',
 'and',
 'a',
 'of',
 'to',
 'is',
 'in',
 'br',
 'it',
 'i',
 'this',
 'that',
 's',
 'as',
 'with',
 'for',
 'was',
 'but',
 'film',
 'movie',
 'his',
 'you',
 'on',
 'he',
 'are',
 'not',
 't',
 'one',
 'have',
 'be',
 'all',
 'by',
 'an',
 'who',
 'at',
 'from',
 'her',
 'they',
 'has',
 'like',
 'so',
 'very',
 'out',
 'about',
 'there',
 'what',
 'or',
 'good',
 'she',
 'if',
 'more',
 'some',
 'when',
 'just',
 'can',
 'story',
 'my',
 'time',
 'well',
 'great',
 'up',
 'which',
 'see',
 'their',
 'really',
 'we',
 'also',
 'would',
 'will',
 'me',
 'had',
 'only',
 'other',
 'even',
 'him',
 'were',
 'most',
 'first',
 'than',
 'much',
 'into',
 'its',
 'people',
 'no',
 'life',
 'love',
 'best',
 'get',
 'how',
 'been',
 'because',
 'way',
 'made',
 'do',
 'many',
 'them',
 'think',
 'after',
 'films',
 'too',
 'movies',
 'don',
 'two',
 'man',
 'character',
 'characters',
 'show',
 'then',
 'watch',
 'seen',
 'little',
 'still',
 'could',
 'where',
 'make',
 'never',


In [44]:
# Character <-> id lookups used by generate_text.
char2idx = {u: i for i, u in enumerate(vocab)}
idx2char = np.array(vocab)

# Start from a copy of the sampled positive reviews: binding x_array directly
# to selected_positive_reviews would make every append below mutate the sample.
x_array = list(selected_positive_reviews)

for seed_index, seed in enumerate(frequency_seed, start=1):
    generated = generate_text(model, start_string=seed)

    # generate_text returns a plain string, while every other entry of x_array
    # is a list of word ids that is later fed to pad_sequences. Re-encode the
    # generated text with the same conventions as the dataset: a leading
    # <START> id, and <UNK> for anything that is not a known word.
    encoded = [word_index["<START>"]]
    encoded.extend(
        word_index.get(token, word_index["<UNK>"]) for token in generated.split()
    )
    x_array.append(encoded)

    if seed_index % 100 == 0:
        print(str(seed_index) + ' is done')

In [None]:
# Everything collected in x_array so far (the sampled real reviews it was
# initialised with plus the generated ones) is a positive example. The sampled
# reviews are NOT appended a second time: they are already in x_array, and
# duplicating them would leak identical rows across the later train/validation
# split.
positive_count = len(x_array)

# Rebind rather than extend in place so no other reference to the list is mutated.
# NOTE: run this cell once after the generation cell; re-running it would count
# the negatives appended here as positives.
x_array = list(x_array) + list(negative_reviews)

# Derive the labels from the actual counts so they always line up with x_array.
y_array = [1] * positive_count + [0] * len(negative_reviews)
assert len(x_array) == len(y_array), "features and labels are out of sync"

# Pad/truncate every sequence to 256 ids, padding at the end.
train_data = tf.keras.preprocessing.sequence.pad_sequences(x_array, padding='post', maxlen=256)
test_data = tf.keras.preprocessing.sequence.pad_sequences(x_test, maxlen=256, padding='post')
print(train_data.shape, test_data.shape)

In [None]:
from tensorflow.python import keras
import tensorflow as tf

# Sentiment classifier: Embedding -> bidirectional LSTM -> Dense(relu) ->
# single sigmoid output. Note that this rebinds `model`, replacing the text
# generator that previously lived under the same name.
# NOTE(review): 88587 is the embedding's input size; confirm it exceeds the
# largest word id present in the padded data.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(88587, 64))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128)))
model.add(tf.keras.layers.Dense(64, activation="relu"))
model.add(tf.keras.layers.Dense(1, activation="sigmoid"))
model.summary()

In [None]:
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked
# under the metric name 'acc'.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['acc'],
)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the padded sequences (and their labels) for validation;
# random_state pins the shuffle so the split is repeatable.
partial_x_train, x_val, partial_y_train, y_val = train_test_split(
    train_data,
    y_array,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Score every padded training sequence with the classifier.
# NOTE(review): no model.fit(...) call is visible in this notebook before this
# point -- confirm the classifier is actually trained before predicting.
predictions_probabilities = model.predict(train_data)

# Container for the thresholded class labels, filled in by a later cell.
predictions = list()

In [None]:
# Threshold the predicted probabilities at 0.5: <= 0.5 -> class 0, else class 1.
# The list is rebuilt from scratch (instead of appended to) so that re-running
# this cell does not double its length.
predictions = [
    [0] if probability <= 0.5 else [1]
    for probability in predictions_probabilities
]

In [None]:
import pandas as pd
# Ground-truth labels as an array, aligned with the rows of train_data.
y_label = np.array(y_array)

# Flatten the per-sample [label] lists into one flat list of ints. The original
# comprehension iterated over predictions[0] only (a single [label] entry),
# which raises TypeError because its items are ints. ravel() also keeps this
# cell re-runnable after `predictions` has been rebound to the flat array below.
flat_list = np.asarray(predictions).ravel().tolist()
predictions = np.array(flat_list)

In [None]:
# Confusion table: actual labels as rows, predicted labels as columns.
pd.crosstab(
    index=y_label,
    columns=np.array(flat_list),
    rownames=['Actual'],
    colnames=['Predicted'],
)

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 for the thresholded predictions.
report = classification_report(y_label, flat_list)

# The report is a multi-line string: print it so the table renders with real
# line breaks instead of the repr with escaped "\n" characters.
print(report)