In [1]:
# !pip install tensorflow==1.13.1

In [2]:
import os
import boto3
import json
import math
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Convolution1D, GlobalMaxPool1D, MaxPool1D

In [3]:
config = {
    "embeddings_dictionary_size": 500000,
    "embeddings_vector_size": 50,
    "padding_size": 20,
    "batch_size": 1000,
    "embeddings_path": "glove.50d.txt",
    "input_tensor_name": "embedding_input",
    "num_epoch":5
}

In [4]:
#define function to read data from S3
def read_data(path,mode):
           
    s3_client = boto3.client("s3")
    path_split = path.replace("s3://", "").split("/")

    bucket = path_split.pop(0)
    key = "/".join(path_split)

    data = s3_client.get_object(Bucket=bucket, Key=key)

    decoded_file = data["Body"].read().decode('utf-8').split('\n')
    
    features=[]; sentiment=[]
    for line in decoded_file:
        content = json.loads(line)
        features.append(content['features'])
        sentiment.append(content["sentiment"]/4)  
    
    num_data_points = len(features)
    num_batches = math.ceil(len(features)/config['batch_size'])
    
    Dataset = tf.data.Dataset
    
    dataset = Dataset.from_tensor_slices((features, sentiment))

    if mode == "train":

        dataset = Dataset.from_tensor_slices((features, sentiment))
        dataset = dataset.batch(config["batch_size"]).shuffle(10000, seed=12345).repeat(
            config["num_epoch"])
        num_batches = math.ceil(len(features) / config["batch_size"])

    if mode in ("validation", "eval"):

        dataset = dataset.batch(config["batch_size"]).repeat(config["num_epoch"])
        num_batches = int(math.ceil(len(features) / config["batch_size"]))

    iterator = dataset.make_one_shot_iterator()
    dataset_features, dataset_sentiments = iterator.get_next()


    return [{config["input_tensor_name"]: dataset_features}, dataset_sentiments,
            {"num_data_point": num_data_points, "num_batches": num_batches}]

#read data from S3
train_dataset=read_data('s3://ai-assignment/assignment6/data/train.json','train')
eval_dataset=read_data('s3://ai-assignment/assignment6/data/eval.json','eval')
dev_dataset=read_data('s3://ai-assignment/assignment6/data/dev.json','validation')

In [5]:
#define function to read embedding dictionary with embeddings_path
def read_dictionary(path,embeddings_dictionary_size,embeddings_vector_size):
    
    embedding_matrix = np.zeros((embeddings_dictionary_size, embeddings_vector_size))
    
    s3_client = boto3.client("s3")
    path_split = path.replace("s3://", "").split("/")

    bucket = path_split.pop(0)
    key = "/".join(path_split)

    data = s3_client.get_object(Bucket=bucket, Key=key)

    decoded_file = data["Body"].read().decode('utf-8').split('\n')
    
    for i in range(embeddings_dictionary_size):
        if len(decoded_file[i].split()[1:]) != embeddings_vector_size:
            continue
        embedding_matrix[i] = np.asarray(decoded_file[i].split()[1:], dtype='float32')
           
    return embedding_matrix


In [6]:
#define CNN model
def keras_model_fn(_, config):
    """
    Creating a CNN model for sentiment modeling

    """

    embedding_matrix = read_dictionary('s3://ai-assignment/assignment6/glove.50d.txt',config["embeddings_dictionary_size"],config["embeddings_vector_size"])

    cnn_model = Sequential()
    cnn_model.add(Embedding(weights=[embedding_matrix], input_length = config["padding_size"],input_dim = config["embeddings_dictionary_size"],output_dim = config["embeddings_vector_size"], trainable = True))
    cnn_model.add(Convolution1D(filters=200,kernel_size=3,strides = 1, padding='valid',activation = 'relu'))
    cnn_model.add(MaxPool1D(pool_size = 2))
    cnn_model.add(Convolution1D(filters=100,kernel_size=2,strides = 1, padding='valid',activation = 'relu'))
    cnn_model.add(GlobalMaxPool1D())
    cnn_model.add(Dense(units=100, activation = 'relu'))
    cnn_model.add(Dense(units=1, activation = 'sigmoid'))
    Adam = keras.optimizers.Adam(lr=0.005, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
    cnn_model.compile(loss = 'binary_crossentropy', optimizer = 'Adam', metrics =['accuracy'])

    return cnn_model


In [7]:
#define main function to fit model
def main(train_dataset,validation_dataset,eval_dataset):
    """
    Main training method

    """

    print("Preparing for training...")

    training_config = config    

    model = keras_model_fn(None, training_config)
    print("Starting training...")

    model.fit(
        x=train_dataset[0]['embedding_input'], y=train_dataset[1], steps_per_epoch=train_dataset[2]["num_batches"],
        epochs=training_config["num_epoch"],
        validation_data=(validation_dataset[0]['embedding_input'], validation_dataset[1]),
        validation_steps=validation_dataset[2]["num_batches"])

    score = model.evaluate(
        eval_dataset[0]['embedding_input'], eval_dataset[1], steps=eval_dataset[2]["num_batches"], verbose=0)

    print("Test loss:{}".format(score[0]))
    print("Test accuracy:{}".format(score[1]))
    

    # save model
    str_ = tf.contrib.saved_model.save_keras_model(model, "assignment6/output3/sentiment_model.h5")
    
    s3 = boto3.client('s3')
    s3.put_object(Bucket='ai-assignment',Key=('sentiment_model.h5/3/assets'+'/'))
    s3.put_object(Bucket='ai-assignment',Key=('sentiment_model.h5/3/variables'+'/'))
    
    s3 = boto3.resource('s3')
    
    s3.meta.client.upload_file(str_.decode() +'/saved_model.pb', 'ai-assignment', 'assignment6/output/sentiment_model.h5/3/saved_model.pb')
    s3.meta.client.upload_file(str_.decode() +'/variables/variables.data-00000-of-00001', 'ai-assignment', 'assignment6/output/sentiment_model.h5/3/variables/variables.data-00000-of-00001')
    s3.meta.client.upload_file(str_.decode() +'/variables/variables.index', 'ai-assignment', 'assignment6/output/sentiment_model.h5/3/variables/variables.index')
    s3.meta.client.upload_file(str_.decode() +'/assets/saved_model.json', 'ai-assignment', 'assignment6/output/sentiment_model.h5/3/assets/saved_model.json')
    s3.meta.client.upload_file(str_.decode() +'/variables/checkpoint', 'ai-assignment', 'assignment6/output/sentiment_model.h5/3/variables/checkpoint')
    
    print('Model successfully saved')
   


In [8]:
model = main(train_dataset,dev_dataset,eval_dataset)

Preparing for training...
Instructions for updating:
Colocations handled automatically by placer.
Starting training...
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test loss:0.4115882899612188
Test accuracy:0.8127375245094299

For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.


Consider using a TensorFlow optimizer from `tf.train`.
Instructions for updating:
Use tf.train.CheckpointManager to manage checkpoints rather than manually editing the Checkpoint proto.
Instructions for updating:
This function will only be available through the v1 compatibility library as tf.compat.v1.saved_model.utils.build_tensor_info or tf.compat.v1.saved_model.build_tensor_info.
INFO:tensorflow:Sign