In [1]:
import os

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
import tensorflow as tf
import pandas as pd
import tensorflow_datasets as tfds
from tensorflow import keras  
from tensorflow.keras import layers
import pickle


In [2]:
tokenizer = tfds.deprecated.text.Tokenizer()

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
path = "/content/drive/MyDrive/Deep Learning/Jupyter Notebook/datasets/"

In [5]:
english = tf.data.TextLineDataset(path + "english.csv")
swedish = tf.data.TextLineDataset(path + "swedish.csv")
dataset = tf.data.Dataset.zip((english, swedish))


In [None]:
english

<TextLineDatasetV2 element_spec=TensorSpec(shape=(), dtype=tf.string, name=None)>

In [6]:
for eng, swe in dataset.skip(1):
    print(tokenizer.tokenize(eng.numpy()))
    print(tokenizer.tokenize(swe.numpy().decode("UTF-8")))


['i', 'love', 'tuna']
['jag', 'älskar', 'tonfisk']
['i', 'love', 'potato']
['jag', 'älskar', 'potatis']
['i', 'love', 'bacon']
['jag', 'älskar', 'bacon']


In [None]:
# TODO:
# 1. vocabulary (for each language)
# 2. tokenize and numericalize words
# 3. padded_batch, create model


In [7]:
## Example if you have multiple files
file_names = [path + "test_example1.csv", path + "test_example2.csv", path + "test_example3.csv"]
dataset = tf.data.TextLineDataset(file_names)


In [8]:
for val in dataset.take(2):
  print(val)

tf.Tensor(b'index,type,label,file,review', shape=(), dtype=string)
tf.Tensor(b'0,test,neg,0_2.txt,"Once again Mr. Costner has dragged out a movie for far longer than necessary. Aside from the terrific sea rescue sequences, of which there are very few I just did not care about any of the characters. Most of us have ghosts in the closet, and Costner\'s character are realized early on, and then forgotten until much later, by which time I did not care. The character we should really care about is a very cocky, overconfident Ashton Kutcher. The problem is he comes off as kid who thinks he\'s better than anyone else around him and shows no signs of a cluttered closet. His only obstacle appears to be winning over Costner. Finally when we are well past the half way point of this stinker, Costner tells us all about Kutcher\'s ghosts. We are told why Kutcher is driven to be the best with no prior inkling or foreshadowing. No magic here, it was all I could do to keep from turning it off an hour i

In [9]:
dataset1 = tf.data.TextLineDataset(path + "test_example1.csv").skip(1)  # .map(preprocess1)
dataset2 = tf.data.TextLineDataset(path + "test_example2.csv").skip(1)  # .map(preprocess1)
dataset3 = tf.data.TextLineDataset(path + "test_example3.csv").skip(1)  # .map(preprocess1)


In [10]:
dataset = dataset1.concatenate(dataset2).concatenate(dataset3)

for line in dataset.take(3):
    print(line)


tf.Tensor(b'0,test,neg,0_2.txt,"Once again Mr. Costner has dragged out a movie for far longer than necessary. Aside from the terrific sea rescue sequences, of which there are very few I just did not care about any of the characters. Most of us have ghosts in the closet, and Costner\'s character are realized early on, and then forgotten until much later, by which time I did not care. The character we should really care about is a very cocky, overconfident Ashton Kutcher. The problem is he comes off as kid who thinks he\'s better than anyone else around him and shows no signs of a cluttered closet. His only obstacle appears to be winning over Costner. Finally when we are well past the half way point of this stinker, Costner tells us all about Kutcher\'s ghosts. We are told why Kutcher is driven to be the best with no prior inkling or foreshadowing. No magic here, it was all I could do to keep from turning it off an hour in."', shape=(), dtype=string)
tf.Tensor(b'1,test,neg,10000_4.txt,"T

In [23]:
def filter_train(line):
    split_line = tf.strings.split(line, ",", maxsplit=4)
    dataset_belonging = split_line[1]  # train, test
    sentiment_category = split_line[2]  # pos, neg, unsup

    return (
        True
        if dataset_belonging == "train" and sentiment_category != "unsup"
        else False
    )


def filter_test(line):
    split_line = tf.strings.split(line, ",", maxsplit=4)
    dataset_belonging = split_line[1]  # train, test
    sentiment_category = split_line[2]  # pos, neg, unsup

    return (
        True if dataset_belonging == "test" and sentiment_category != "unsup" else False
    )



In [24]:
ds_train = tf.data.TextLineDataset(path + "imdb.csv").filter(filter_train)
ds_test = tf.data.TextLineDataset(path + "imdb.csv").filter(filter_test)


In [None]:
# TODO:
# 1. Create vocabulary
# 2. Numericalize text str -> indices (TokenTextEncoder)
# 3. Pad the batches so we can send in to an RNN for example


In [25]:
tokenizer = tfds.deprecated.text.Tokenizer()
# 'i love banana' -> ['i', 'love', 'banana'] -> [0, 1, 2]


In [26]:
def build_vocabulary(ds_train, threshold=200):
    """ Build a vocabulary """
    frequencies = {}
    vocabulary = set()
    vocabulary.update(["sostoken"])
    vocabulary.update(["eostoken"])

    for line in ds_train.skip(1):
        split_line = tf.strings.split(line, ",", maxsplit=4)
        review = split_line[4]
        tokenized_text = tokenizer.tokenize(review.numpy().lower())

        for word in tokenized_text:
            if word not in frequencies:
                frequencies[word] = 1

            else:
                frequencies[word] += 1

            # if we've reached the threshold
            if frequencies[word] == threshold:
                vocabulary.update(tokenized_text)

    return vocabulary


In [27]:
# Build vocabulary and save it to vocabulary.obj
vocabulary = build_vocabulary(ds_train)
vocab_file = open("vocabulary.obj", "wb")
pickle.dump(vocabulary, vocab_file)


In [None]:
# Loading the vocabulary
# vocab_file = open("vocabulary.obj", "rb")
# vocabulary = pickle.load(vocab_file)


In [32]:
encoder = tfds.deprecated.text.TokenTextEncoder(
    list(vocabulary), oov_token="<UNK>", lowercase=True, tokenizer=tokenizer,
)


In [33]:
def my_encoder(text_tensor, label):
    encoded_text = encoder.encode(text_tensor.numpy())
    return encoded_text, label


def encode_map_fn(line):
    split_line = tf.strings.split(line, ",", maxsplit=4)
    label_str = split_line[2]  # neg, pos
    review = "sostoken " + split_line[4] + " eostoken"
    label = 1 if label_str == "pos" else 0

    (encoded_text, label) = tf.py_function(
        my_encoder, inp=[review, label], Tout=(tf.int64, tf.int32),
    )

    encoded_text.set_shape([None])
    label.set_shape([])
    return encoded_text, label


In [35]:
AUTOTUNE = tf.data.experimental.AUTOTUNE
ds_train = ds_train.map(encode_map_fn, num_parallel_calls=AUTOTUNE).cache()
ds_train = ds_train.shuffle(25000)
ds_train = ds_train.padded_batch(32, padded_shapes=([None], ()))

ds_test = ds_test.map(encode_map_fn)
ds_test = ds_test.padded_batch(32, padded_shapes=([None], ()))


In [36]:
model = keras.Sequential(
    [
        layers.Masking(mask_value=0),
        layers.Embedding(input_dim=len(vocabulary) + 2, output_dim=32,),
        layers.GlobalAveragePooling1D(),
        layers.Dense(64, activation="relu"),
        layers.Dense(1),
    ]
)


In [37]:
model.compile(
    loss=keras.losses.BinaryCrossentropy(from_logits=True),
    optimizer=keras.optimizers.Adam(3e-4, clipnorm=1),
    metrics=["accuracy"],
)


In [38]:
model.fit(ds_train, epochs=15, verbose=2)
model.evaluate(ds_test)


Epoch 1/15
782/782 - 156s - loss: 0.6756 - accuracy: 0.5039 - 156s/epoch - 199ms/step
Epoch 2/15
782/782 - 50s - loss: 0.4999 - accuracy: 0.7149 - 50s/epoch - 65ms/step
Epoch 3/15
782/782 - 27s - loss: 0.3466 - accuracy: 0.8544 - 27s/epoch - 35ms/step
Epoch 4/15
782/782 - 18s - loss: 0.2823 - accuracy: 0.8867 - 18s/epoch - 24ms/step
Epoch 5/15
782/782 - 14s - loss: 0.2454 - accuracy: 0.9038 - 14s/epoch - 18ms/step
Epoch 6/15
782/782 - 9s - loss: 0.2169 - accuracy: 0.9162 - 9s/epoch - 12ms/step
Epoch 7/15
782/782 - 9s - loss: 0.1963 - accuracy: 0.9268 - 9s/epoch - 12ms/step
Epoch 8/15
782/782 - 10s - loss: 0.1774 - accuracy: 0.9349 - 10s/epoch - 12ms/step
Epoch 9/15
782/782 - 10s - loss: 0.1609 - accuracy: 0.9421 - 10s/epoch - 13ms/step
Epoch 10/15
782/782 - 9s - loss: 0.1482 - accuracy: 0.9473 - 9s/epoch - 11ms/step
Epoch 11/15
782/782 - 7s - loss: 0.1338 - accuracy: 0.9542 - 7s/epoch - 9ms/step
Epoch 12/15
782/782 - 9s - loss: 0.1227 - accuracy: 0.9599 - 9s/epoch - 11ms/step
Epoch 13/

[0.3352822959423065, 0.8870400190353394]