In [1]:
# import packages
import tensorflow as tf
from tensorflow import keras 
from tensorflow.keras.preprocessing.text import Tokenizer

import tqdm
import joblib
import pandas as pd
import numpy as np
import os

In [2]:
# get data
df = pd.read_csv("./training.1600000.processed.noemoticon.csv", encoding='ansi')
df = df[['''@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D''', '0']]
df.columns = ['tweet', 'sentiment']

In [3]:
dataset_size = len(df.tweet)
dataset_size

1599999

In [4]:
train_size = int(dataset_size * 0.95)
train_size

1519999

In [5]:
# tokenize the dataset
tokenizer = Tokenizer(char_level=False, oov_token=0)
tokenizer.fit_on_texts(df.tweet)
vocab_size = len(tokenizer.word_index.keys())
vocab_size

690956

In [6]:
tokenized = tokenizer.texts_to_sequences(df.tweet)

In [7]:
# get maximum tweet length
max_tweet_length = max(len(sequence) for sequence in tokenized)
max_tweet_length

118

In [8]:
dataset = df.to_numpy()
dataset

array([["is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!",
        0],
       ['@Kenichan I dived many times for the ball. Managed to save 50%  The rest go out of bounds',
        0],
       ['my whole body feels itchy and like its on fire ', 0],
       ...,
       ['Are you ready for your MoJo Makeover? Ask me for details ', 4],
       ['Happy 38th Birthday to my boo of alll time!!! Tupac Amaru Shakur ',
        4],
       ['happy #charitytuesday @theNSPCC @SparksCharity @SpeakingUpH4H ',
        4]], dtype=object)

In [9]:
np.random.shuffle(dataset)
dataset

array([['Ok so after days of working on this giant jawbreaker, it turns out the center is made of HOLLOW bubblegum... I feel so gyped ',
        0],
       ['@JenniferSmithCo Just checked the diary and it clashes with uSwitch.net beers ',
        0],
       ["Okay you know somethin is good when you dream about it. Land of the lost dreams are even stranger than the show. Can't wait for the movie ",
        4],
       ...,
       ["@CyberWasteland Last day o' the year, someone ran into a window and it fell on a teacher two floors down. ",
        4],
       ['ok i completely dont understand why people behave like shit to each other! ',
        0],
       ['@thedilettante but soon you will know the joy of crowding out namesakes on page 1 of search engines ',
        4]], dtype=object)

In [10]:
# convert to tf.Tensor
X = dataset[:, 0]
y = dataset[:, 1]
X, y

(array(['Ok so after days of working on this giant jawbreaker, it turns out the center is made of HOLLOW bubblegum... I feel so gyped ',
        '@JenniferSmithCo Just checked the diary and it clashes with uSwitch.net beers ',
        "Okay you know somethin is good when you dream about it. Land of the lost dreams are even stranger than the show. Can't wait for the movie ",
        ...,
        "@CyberWasteland Last day o' the year, someone ran into a window and it fell on a teacher two floors down. ",
        'ok i completely dont understand why people behave like shit to each other! ',
        '@thedilettante but soon you will know the joy of crowding out namesakes on page 1 of search engines '],
       dtype=object),
 array([0, 0, 4, ..., 4, 0, 4], dtype=object))

In [11]:
# train / valid split
X_train, y_train, X_valid, y_valid = X[:train_size], y[:train_size], X[train_size:], y[train_size:]

In [12]:
# max_allowed_length = 200

# # preprocessing function for raw data
# def preprocess(tweets, sentiments):
#     tokenized = tokenizer.texts_to_sequences(list(tweets)) # tokenize
#     ragged_tensor = tf.ragged.constant(tokenized) # ragged tensor
#     dense_tensor = ragged_tensor.to_tensor(default_value=-1) # apply_padding
#     if tf.shape(dense_tensor)[1] > max_allowed_length:
#         dense_tensor = dense_tensor[:, :max_allowed_length] # truncate if needed
#     X = dense_tensor + 1 # shift padding token to index 0
#     y = (sentiments / 4).astype(np.uint32)
#     return tf.constant(X, dtype=tf.int32), tf.constant(y, dtype=tf.int32)

In [13]:
# X_train_preprocessed, y_train_preprocessed = preprocess(X_train, y_train)
# X_valid_preprocessed, y_valid_preprocessed = preprocess(X_valid, y_valid)
# X_train_preprocessed

In [14]:
# batch_size = 20

# train = tf.data.Dataset.from_tensor_slices((X_train_preprocessed, y_train_preprocessed))

# valid = tf.data.Dataset.from_tensor_slices((X_valid_preprocessed, y_valid_preprocessed))

In [15]:
# save dataset to TFRecord file:

# from tensorflow.train import Int64List, Feature, BytesList, Example, Features

# def serialize_example(instance, label):
#     X = tf.io.serialize_tensor(instance)
#     feature = {
#       'image': Feature(bytes_list=BytesList(value=[X.numpy()])),
#       'label': Feature(int64_list=Int64List(value=[label.numpy()]))
#     }
#     example = Example(features=Features(feature=feature))
#     return example.SerializeToString()

# os.chdir("./datasets/")
# for j, dataset in enumerate([train, valid]):
#     os.chdir(f"./{j}/")
#     for i, inst in dataset.enumerate():
#         with tf.io.TFRecordWriter(os.path.abspath(f"{i // 2500}.tfrecord")) as f:
#             f.write(serialize_example(inst[0], inst[1]))
#     os.chdir("../")
# os.chdir("../")

In [16]:
# os.getcwd()

In [17]:
# os.chdir("../..")

In [18]:
# train = train.shuffle(10000).batch(batch_size).prefetch(1)
# valid = valid.shuffle(10000).batch(batch_size).prefetch(1)

In [19]:
# embed_size = 100

# encoder_input = keras.layers.Input(shape=[None])
# encoder_embedding = keras.layers.Embedding(vocab_size + 1, embed_size, mask_zero=True, input_shape=[None])(encoder_input)
# encoder_positional_embedding = keras.layers.Lambda(lambda x : x + tf.range(tf.shape(x)[1], dtype=tf.float32)[tf.newaxis, :, tf.newaxis])(encoder_embedding)
# Z = encoder_positional_embedding
# for _ in range(2):
#     Z = keras.layers.Attention(use_scale=True)([Z,Z])
# encoder_output = Z
# encoder = keras.Model(inputs=[encoder_input], outputs=[encoder_output])

# decoder_input = keras.layers.Input(shape=[None, embed_size])
# gru1 = keras.layers.GRU(64, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)(decoder_input)
# gru2 = keras.layers.GRU(32, dropout=0.2, recurrent_dropout=0.2)(gru1)
# decoder_output = keras.layers.Dense(1, activation="softmax")(gru2)
# decoder = keras.Model(inputs=[decoder_input], outputs=[decoder_output])

# model = keras.models.Sequential([encoder, decoder])

In [20]:
# model.compile(loss="binary_crossentropy", optimizer="nadam", metrics=["accuracy"])
# history=model.fit(train.take(10), epochs=5, validation_data=valid.take(5))

In [21]:
# gcloud command I used for training: 
# gcloud ai-platform jobs submit training stream_sentiments1 --region asia-southeast1 --scale-tier BASIC --runtime-version 2.11 --python-version 3.7 --package-path ./stream_sentiments_training --module-name stream_sentiments_training.task --staging-bucket gs://6727667276 --job-dir gs://6727667276 --project compute-387916

# training/task.py script that I used
# import tensorflow as tf
# from tensorflow import keras

# import argparse
# parser = argparse.ArgumentParser()
# parser.add_argument("--job-dir", required=True, type=str)
# args = parser.parse_args()

# embed_size = 100
# max_allowed_length = 200
# vocab_size = 690956

# distribution = tf.distribute.experimental.MultiWorkerMirroredStrategy()
# with distribution.scope():

#     class PositionalEmbedding(keras.layers.Layer):
#         def __init__(self, max_time_steps, dtype=tf.float32):
#             super().__init__(dtype=dtype)
#             positional_embedding = tf.range(max_time_steps)[tf.newaxis, :, tf.newaxis]
#             self.positional_embedding = tf.cast(positional_embedding, self.dtype)
#         def call(self, inputs):
#             shape = tf.shape(inputs)
#             return inputs + self.positional_embedding[:, :shape[1], :]

#     encoder_input = keras.layers.Input(shape=[None])
#     encoder_embedding = keras.layers.Embedding(vocab_size + 1, embed_size, mask_zero=True, input_shape=[None])(encoder_input)
#     encoder_positional_embedding = PositionalEmbedding(max_allowed_length)(encoder_embedding)
#     Z = encoder_positional_embedding
#     for _ in range(2):
#         Z = keras.layers.Attention(use_scale=True)([Z,Z])
#         encoder_output = Z
#     encoder = keras.Model(inputs=[encoder_input], outputs=[encoder_output])

#     decoder_input = keras.layers.Input(shape=[None, embed_size])
#     gru1 = keras.layers.GRU(64, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)(decoder_input)
#     gru2 = keras.layers.GRU(32, dropout=0.2, recurrent_dropout=0.2)(gru1)
#     decoder_output = keras.layers.Dense(1, activation="softmax")(gru2)
#     decoder = keras.Model(inputs=[decoder_input], outputs=[decoder_output])

#     model = keras.models.Sequential([encoder, decoder])
#     model.compile(loss="binary_crossentropy", optimizer="nadam", metrics=["accuracy"])

# batch_size=40
# tensorboard_callback = keras.callbacks.TensorBoard("gs://6727667276")

# import pandas as pd
# df = pd.read_csv("gs://6727667276/training.1600000.processed.noemoticon.csv", encoding='ansi')
# df = df[['''@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D''', '0']]
# df.columns = ['tweet', 'sentiment']
# dataset_size = len(df.tweet)
# train_size = int(dataset_size * 0.95)
# from tensorflow.keras.preprocessing.text import Tokenizer
# tokenizer = Tokenizer(char_level=False, oov_token=0)
# tokenizer.fit_on_texts(df.tweet)
# vocab_size = len(tokenizer.word_index.keys())
# tokenized = tokenizer.texts_to_sequences(df.tweet)
# max_tweet_length = max(len(sequence) for sequence in tokenized)
# X = df.to_numpy()[:, 0]
# y = df.to_numpy()[:, 1]
# X_train, y_train, X_valid, y_valid = X[:train_size], y[:train_size], X[train_size:], y[train_size:]
# max_allowed_length = 200
# def preprocess(tweets, sentiments):
#     tokenized = tokenizer.texts_to_sequences(list(tweets)) # tokenize                                                                                                                  
#     ragged_tensor = tf.ragged.constant(tokenized) # ragged tensor                                                                                                                      
#     dense_tensor = ragged_tensor.to_tensor(default_value=-1) # apply_padding                                                                                                           
#     if tf.shape(dense_tensor)[1] > max_allowed_length:
#         dense_tensor = dense_tensor[:, :max_allowed_length] # truncate if needed                                                                                                       
#     X = dense_tensor + 1 # shift padding token to index 0                                                                                                                              
#     y = (sentiments / 4).astype(np.uint32)
#     return tf.constant(X, dtype=tf.int32), tf.constant(y, dtype=tf.int32)
# X_train_preprocessed, y_train_preprocessed = preprocess(X_train, y_train)
# X_valid_preprocessed, y_valid_preprocessed = preprocess(X_valid, y_valid)
# train = tf.data.Dataset.from_tensor_slices((X_train_preprocessed, y_train_preprocessed)).shuffle(10000).batch(batch_size).prefetch(1)
# valid = tf.data.Dataset.from_tensor_slices((X_valid_preprocessed, y_valid_preprocessed)).shuffle(10000).batch(batch_size).prefetch(1)

# history = mirrored_model.fit(train, epochs=5, validation_data = valid, callbacks=[tensorboard_callback])

In [22]:
# model.save("model.h5") # save model

# model = keras.models.load_model("model.h5") # load model

In [23]:
# model_version="0001"
# model_name = "stream_sentiments_model"
# model_path = os.path.join(model_name, model_version)

# tf.saved_model.save(model, model_path)

In [24]:
# make smaller version of model

# embed_size = 50

# encoder_input = keras.layers.Input(shape=[None])
# encoder_embedding = keras.layers.Embedding(vocab_size + 1, embed_size, mask_zero=True, input_shape=[None])(encoder_input)
# encoder_positional_embedding = keras.layers.Lambda(lambda x : x + tf.range(tf.shape(x)[1], dtype=tf.float32)[tf.newaxis, :, tf.newaxis])(encoder_embedding)
# Z = encoder_positional_embedding
# for _ in range(2):
#     Z = keras.layers.Attention(use_scale=True)([Z,Z])
# encoder_output = Z
# encoder = keras.Model(inputs=[encoder_input], outputs=[encoder_output])

# decoder_input = keras.layers.Input(shape=[None, embed_size])
# gru1 = keras.layers.GRU(64, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)(decoder_input)
# gru2 = keras.layers.GRU(32, dropout=0.2, recurrent_dropout=0.2)(gru1)
# decoder_output = keras.layers.Dense(1, activation="softmax")(gru2)
# decoder = keras.Model(inputs=[decoder_input], outputs=[decoder_output])

# model = keras.models.Sequential([encoder, decoder])

# model.compile(loss="binary_crossentropy", optimizer="nadam", metrics=["accuracy"])
# history=model.fit(train.take(10), epochs=5, validation_data=valid.take(5))

# model_version="0002"
# model_name = "stream_sentiments_model"
# model_path = os.path.join(model_name, model_version)

# tf.saved_model.save(model, model_path)

# model.save("small_model.h5") # save model

# model = keras.models.load_model("small_model.h5") # load model

In [25]:
# make even smaller version of model, that includes preprocessing

# start by deleting this useless word index from our existing tokenizer
del tokenizer.word_index[0]
tokenizer.word_index

{'i': 2,
 'to': 3,
 'the': 4,
 'a': 5,
 'my': 6,
 'and': 7,
 'you': 8,
 'is': 9,
 'it': 10,
 'in': 11,
 'for': 12,
 'of': 13,
 'on': 14,
 'me': 15,
 'so': 16,
 'have': 17,
 'that': 18,
 'but': 19,
 "i'm": 20,
 'just': 21,
 'with': 22,
 'be': 23,
 'at': 24,
 'not': 25,
 'was': 26,
 'this': 27,
 'now': 28,
 'good': 29,
 'up': 30,
 'day': 31,
 'out': 32,
 'all': 33,
 'get': 34,
 'like': 35,
 'are': 36,
 'no': 37,
 'go': 38,
 'quot': 39,
 'http': 40,
 'today': 41,
 'do': 42,
 "it's": 43,
 'too': 44,
 'your': 45,
 'work': 46,
 'love': 47,
 'going': 48,
 'got': 49,
 'lol': 50,
 'time': 51,
 'back': 52,
 'from': 53,
 'u': 54,
 'one': 55,
 'what': 56,
 'com': 57,
 'will': 58,
 'im': 59,
 'know': 60,
 'we': 61,
 'about': 62,
 'am': 63,
 'really': 64,
 "don't": 65,
 'amp': 66,
 'had': 67,
 'can': 68,
 'see': 69,
 "can't": 70,
 'some': 71,
 'its': 72,
 'if': 73,
 'still': 74,
 '2': 75,
 'well': 76,
 'night': 77,
 'new': 78,
 'want': 79,
 'how': 80,
 'think': 81,
 'home': 82,
 'thanks': 83,
 'oh':

In [26]:
encoder_input = keras.layers.Input(shape=[], dtype=tf.string)
# encoder_input = ["Hello! I'm here! ", "What? Where are you?"]

# split
regex = """'|!|"|#|\$|%|&|\(|\)|\*|\+|,|-|\.|/|:|;|<|=|>|\?|@|\[|\\|\]|\^|_|`|\{|\||\}|~|\t|\n"""
split = keras.layers.Lambda(lambda x : tf.strings.split(
                                       tf.strings.lower(
                                       tf.strings.regex_replace(x, regex, "")
                                       )))(encoder_input)

# tokenize
class VocabLookup(keras.layers.Layer):
    def __init__(self, word_index, num_oov_buckets, **kwargs):
        self.word_index = word_index
        self.vocab = list(word_index.keys())
        self.indices = tf.range(len(self.vocab), dtype=tf.int64)
        table_init = tf.lookup.KeyValueTensorInitializer(self.vocab, self.indices)
        self.table = tf.lookup.StaticVocabularyTable(table_init, num_oov_buckets)
        super(VocabLookup, self).__init__(**kwargs)

    def build(self, input_shape):
        self.built = True

    def call(self, X):
        return self.table.lookup(X)
num_oov_buckets = 3
tokenize = VocabLookup(tokenizer.word_index, num_oov_buckets)(split)

# pad
pad = keras.layers.Lambda(lambda x : x.to_tensor(default_value=-1)
                                        + tf.constant(1, dtype=tf.int64))(tokenize)

In [35]:
# embed
embed_size = 2
vocab_size = len(tokenizer.word_index.keys())
encoder_embedding = keras.layers.Embedding(vocab_size + num_oov_buckets + 1, embed_size, 
                                           mask_zero=True)(pad)

# positional embed
encoder_positional_embedding = keras.layers.Lambda(lambda x : x + tf.range(tf.shape(x)[1], 
                                dtype=tf.float32)[tf.newaxis, :, tf.newaxis])(encoder_embedding)

# attention
Z = encoder_positional_embedding
for _ in range(2):
    Z = keras.layers.Attention(use_scale=True, dropout=0.2)([Z,Z])
encoder_output = Z

# encoder
encoder = keras.Model(inputs=[encoder_input], outputs=[encoder_output])

In [36]:
decoder_input = keras.layers.Input(shape=[None, embed_size])

gru1 = keras.layers.GRU(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)(decoder_input)
gru2 = keras.layers.GRU(32, dropout=0.2, recurrent_dropout=0.2)(gru1)
decoder_output = keras.layers.Dense(1, activation="sigmoid")(gru2)
decoder = keras.Model(inputs=[decoder_input], outputs=[decoder_output])

# decoder
model = keras.models.Sequential([encoder, decoder])

In [37]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 model_2 (Functional)        (None, None, 2)           1381920   
                                                                 
 model_3 (Functional)        (None, 1)                 66273     
                                                                 
Total params: 1,448,193
Trainable params: 1,448,193
Non-trainable params: 0
_________________________________________________________________


In [38]:
# get data
batch_size = 20

train = tf.data.Dataset.from_tensor_slices((X_train, 
                                            (y_train/4).astype(np.uint32)
                                           ))
train = train.shuffle(10000).batch(batch_size).prefetch(1)

valid = tf.data.Dataset.from_tensor_slices((X_valid, 
                                            (y_valid/4).astype(np.uint32)
                                           ))
valid = valid.shuffle(10000).batch(batch_size).prefetch(1)

In [41]:
train_set = train.take(100)
valid_set = valid.take(10)

epochs = 1
model.compile(loss="binary_crossentropy", optimizer="nadam", metrics=["accuracy"])

In [42]:
history=model.fit(train_set, epochs=epochs, validation_data=valid_set)



In [45]:
model_version="0002"

model_name = "stream_sentiments_model"
model_path = os.path.join(model_name, model_version)
tf.saved_model.save(model, model_path)



INFO:tensorflow:Assets written to: stream_sentiments_model\0002\assets


INFO:tensorflow:Assets written to: stream_sentiments_model\0002\assets


In [29]:
model.save("even_smaller_model.h5") # save model

model = keras.models.load_model("even_smaller_model.h5") # load model

ValueError: Unknown layer: 'VocabLookup'. Please ensure you are using a `keras.utils.custom_object_scope` and that this object is included in the scope. See https://www.tensorflow.org/guide/keras/save_and_serialize#registering_the_custom_object for details.

In [44]:
# test out the model

message = "Pleased to meet you"

model(tf.constant([message])).numpy()[0][0]

0.57973665

In [24]:
# docker run -it --rm -p 8500:8500 -p 8501:8501 -v "C:\\Users\\vaheg\\ml\\StreamSentiments\\stream-sentiments\\stream_sentiments_model:/models/stream_sentiments_model" -e MODEL_NAME=stream_sentiments_model tensorflow/serving 
# import numpy as np
# X_new = np.random.randint(vocab_size, size=(10, 10)) #######################
# import json 
# input_data_json = json.dumps({
#     "signature_name" :  "serving_default",
#     "instances" :  X_new.tolist()
# })
# import requests
# server_url = "http://localhost:8501/v1/models/stream_sentiments_model:predict" ###############
# response = requests.post(server_url, data=input_data_json)
# response.raise_for_status()
# response = response.json()
# response

In [25]:
# import sklearn

# def add_preprocessing():
#     small_model_with_preprocessing = keras.models.Sequential()
#     model.add(keras.layers.InputLayer(input_shape=input_shape))
#     for layer in range(n_hidden):
#         model.add(keras.layers.Dense(n_neurons, activation='relu'))
#     model.add(keras.layers.Dense(1))
#     optimizer = keras.optimizers.SGD(learning_rate=learning_rate)
#     model.compile(loss='mse', optimizer='adam')
#     return model

# make regressor that will be compatible with scikit-learn
# keras_reg = keras.wrappers.scikit_learn.KerasRegressor(build_model)

In [6]:
# load saved model
from tensorflow import keras
import tensorflow as tf
regex = """'|!|"|#|\$|%|&|\(|\)|\*|\+|,|-|\.|/|:|;|<|=|>|\?|@|\[|\\|\]|\^|_|`|\{|\||\}|~|\t|\n"""
class VocabLookup(keras.layers.Layer):
    def __init__(self, word_index, num_oov_buckets, **kwargs):
        self.word_index = word_index
        self.vocab = list(word_index.keys())
        self.indices = tf.range(len(self.vocab), dtype=tf.int64)
        table_init = tf.lookup.KeyValueTensorInitializer(self.vocab, self.indices)
        self.table = tf.lookup.StaticVocabularyTable(table_init, num_oov_buckets)
        super(VocabLookup, self).__init__(**kwargs)

    def build(self, input_shape):
        self.built = True

    def call(self, X):
        return self.table.lookup(X)
model = keras.models.load_model("even_smaller_model.h5", custom_objects={"VocabLookup":VocabLookup})