In [1]:
import os
import boto3
import json
import math
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Convolution1D, GlobalMaxPool1D

In [2]:
config = {
    "embeddings_dictionary_size": 500000,
    "embeddings_vector_size": 25,
    "padding_size": 20,
    "batch_size": 1000,
    "embeddings_path": "glove.txt",
    "input_tensor_name": "embedding_input",
    "num_epoch":10
}

In [3]:
#define function to read data from S3
def read_data(path,mode):
           
    s3_client = boto3.client("s3")
    path_split = path.replace("s3://", "").split("/")

    bucket = path_split.pop(0)
    key = "/".join(path_split)

    data = s3_client.get_object(Bucket=bucket, Key=key)

    decoded_file = data["Body"].read().decode('utf-8').split('\n')
    
    features=[]; sentiment=[]
    for line in decoded_file:
        content = json.loads(line)
        features.append(content['features'])
        sentiment.append(content["sentiment"]/4)  
    
    num_data_points = len(features)
    num_batches = math.ceil(len(features)/config['batch_size'])
    
    Dataset = tf.data.Dataset
    
    dataset = Dataset.from_tensor_slices((features, sentiment))

    if mode == "train":

        dataset = Dataset.from_tensor_slices((features, sentiment))
        dataset = dataset.batch(config["batch_size"]).shuffle(10000, seed=12345).repeat(
            config["num_epoch"])
        num_batches = math.ceil(len(features) / config["batch_size"])

    if mode in ("validation", "eval"):

        dataset = dataset.batch(config["batch_size"]).repeat(config["num_epoch"])
        num_batches = int(math.ceil(len(features) / config["batch_size"]))

    iterator = dataset.make_one_shot_iterator()
    dataset_features, dataset_sentiments = iterator.get_next()


    return [{config["input_tensor_name"]: dataset_features}, dataset_sentiments,
            {"num_data_point": num_data_points, "num_batches": num_batches}]

#read data from S3
train_dataset=read_data('s3://ai-assignment/assignment4/train_data/train.json','train')
eval_dataset=read_data('s3://ai-assignment/assignment4/eval_data/eval.json','eval')
dev_dataset=read_data('s3://ai-assignment/assignment4/dev_data/dev.json','validation')

In [4]:
#define function to read embedding dictionary with embeddings_path
def read_dictionary(path,embeddings_dictionary_size,embeddings_vector_size):
    
    embedding_matrix = np.zeros((embeddings_dictionary_size, embeddings_vector_size))
    
    s3_client = boto3.client("s3")
    path_split = path.replace("s3://", "").split("/")

    bucket = path_split.pop(0)
    key = "/".join(path_split)

    data = s3_client.get_object(Bucket=bucket, Key=key)

    decoded_file = data["Body"].read().decode('utf-8').split('\n')
    
    for i in range(embeddings_dictionary_size):
        if len(decoded_file[i].split()[1:]) != embeddings_vector_size:
            continue
        embedding_matrix[i] = np.asarray(decoded_file[i].split()[1:], dtype='float32')
           
    return embedding_matrix


In [5]:
#define CNN model
def keras_model_fn(_, config):
    """
    Creating a CNN model for sentiment modeling

    """

    embedding_matrix = read_dictionary('s3://ai-assignment/assignment4/glove.txt',config["embeddings_dictionary_size"],config["embeddings_vector_size"])

    cnn_model = Sequential()
    cnn_model.add(Embedding(weights=[embedding_matrix], input_length = config["padding_size"],input_dim = config["embeddings_dictionary_size"],output_dim = config["embeddings_vector_size"], trainable = True))
    cnn_model.add(Convolution1D(filters=100,kernel_size=2,strides = 1, padding='valid',activation = 'relu'))
    cnn_model.add(GlobalMaxPool1D())
    cnn_model.add(Dense(units=100, activation = 'relu'))
    cnn_model.add(Dense(units=1, activation = 'sigmoid'))
    Adam = keras.optimizers.Adam(lr=0.0005, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
    cnn_model.compile(loss = 'binary_crossentropy', optimizer = 'Adam', metrics =['accuracy'])

    return cnn_model

#save CNN model into S3
def save_model(model):
    
    model.save('s3://ai-assignment/assignment4/output/sentiment_model.h5')
    s3 = boto3.resource('s3')
    s3.meta.client.upload_file('assignment4/output/sentiment_model.h5','ai-assignment','sentiment_model.h5')
    
    print("Model successfully saved")

In [6]:
#define main function to fit model
def main(train_dataset,validation_dataset,eval_dataset):
    """
    Main training method

    """

    print("Preparing for training...")

    training_config = config    

    model = keras_model_fn(None, training_config)
    print("Starting training...")

    model.fit(
        x=train_dataset[0]['embedding_input'], y=train_dataset[1], steps_per_epoch=train_dataset[2]["num_batches"],
        epochs=training_config["num_epoch"],
        validation_data=(validation_dataset[0]['embedding_input'], validation_dataset[1]),
        validation_steps=validation_dataset[2]["num_batches"])

    score = model.evaluate(
        eval_dataset[0]['embedding_input'], eval_dataset[1], steps=eval_dataset[2]["num_batches"], verbose=0)

    print("Test loss:{}".format(score[0]))
    print("Test accuracy:{}".format(score[1]))
    
#     save_model(model)
   


In [7]:
model = main(train_dataset,dev_dataset,eval_dataset)

Preparing for training...
Instructions for updating:
Colocations handled automatically by placer.
Starting training...
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test loss:0.5747805237770081
Test accuracy:0.7039999961853027
