In [1]:
import numpy as np

def PolarityTokenizer(text, tokenizer, max_length):
    """
        Encodes text with the polarity tokenizer and returns the
        two input arrays expected by the polarity model.

        The text is encoded with special tokens and padded or truncated
        to ``max_length`` positions.

        :param text: text to encode (handed to the tokenizer as a single,
            unsplit sequence)
        :param tokenizer: transformers.PreTrainedTokenizer object
        :param max_length: int, target length for padding/truncation
        :return: list ``[input_ids, attention_mask]``; each entry is the
            first row of the tokenizer output reshaped to ``(1, -1)``
    """
    encode_options = dict(
        add_special_tokens=True,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        is_split_into_words=False,
        return_tensors='tf',
        return_attention_mask=True)
    encoded = tokenizer.encode_plus(text=text, **encode_options)

    def _as_single_row(field):
        # Take the first (only) sequence of the batch and restore the
        # leading batch dimension of size one.
        return np.array(encoded[field].numpy()[0]).reshape((1, -1))

    return [_as_single_row('input_ids'), _as_single_row('attention_mask')]


def PhraseTokenizer(text, sentiment, tokenizer, max_length):
    """
        Tokenizes a (text, sentiment) pair and returns the
        input arrays for phrase model.

        The pair is encoded with special tokens and padded or truncated to
        ``max_length``. A third array marks, with a single 1, the position
        of the sentiment's first token inside the encoded pair.

        :param text: text to encode as the first sequence of the pair
        :param sentiment: str, encoded as the second sequence of the pair
        :param tokenizer: transformers.PreTrainedTokenizer object
        :param max_length: int, target length for padding/truncation and
            length of the sentiment mask
        :return: list ``[input_ids, attention_mask, sentiment_mask]``,
            each reshaped to ``(1, -1)``
        :raises IndexError: if ``sentiment`` encodes to no tokens
        :raises ValueError: if the sentiment token does not occur in the
            encoded pair
    """
    inputs = tokenizer.encode_plus(
        text=text,
        text_pair=sentiment,
        add_special_tokens=True,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        is_split_into_words=False,
        return_tensors='tf',
        return_attention_mask=True)

    input_id = list(inputs['input_ids'].numpy()[0])
    att_mask = list(inputs['attention_mask'].numpy()[0])

    sent_enc = tokenizer.encode(
        text=sentiment,
        add_special_tokens=False)

    # The sentiment is the second sequence of the pair, so its tokens come
    # after the text tokens. Search from the end: a forward search would
    # stop at the text's own word whenever the text contains the sentiment
    # word (e.g. "I feel positive today" with sentiment "positive").
    # When the token occurs only once both searches agree.
    sent_token = sent_enc[0]
    sent_idx = len(input_id) - 1 - input_id[::-1].index(sent_token)
    sent_mask = np.zeros(max_length, dtype='int32')
    sent_mask[sent_idx] = 1

    input_id = np.array(input_id).reshape((1, -1))
    att_mask = np.array(att_mask).reshape((1, -1))
    sent_mask = np.array(sent_mask).reshape((1, -1))

    return [input_id, att_mask, sent_mask]


def PhraseDecoder(input_ids, prediction, tokenizer):
    """
        Turns the phrase model's start/end predictions into text.

        The highest-scoring position of ``prediction[0]`` is used as the
        span start and the highest-scoring position of ``prediction[1]``
        as the span end. Token ids from start (inclusive) up to end
        (exclusive) are decoded; when end is not greater than start the
        selection is empty.

        :param input_ids: sequence of token ids for one example
        :param prediction: list of arrays; the first row of each array
            is used
        :param tokenizer: transformers.PreTrainedTokenizer object
        :return: str, decoded span with surrounding whitespace stripped
    """
    start_scores, end_scores = prediction[0], prediction[1]
    span_begin = np.argmax(start_scores, axis=-1)[0]
    span_stop = np.argmax(end_scores, axis=-1)[0]

    span_ids = input_ids[span_begin:span_stop]
    return tokenizer.decode(span_ids).strip()


def get_outputs(texts, polarity_tokenizer, phrase_tokenizer, polarity_model, phrase_model, POmax_len, QAmax_len):
    """
        Runs the two-stage pipeline on each text: the polarity model
        picks a sentiment label, then the phrase model selects the text
        span for that sentiment.

        :param texts: iterable of texts, processed one at a time
        :param polarity_tokenizer: tokenizer used for the polarity model
        :param phrase_tokenizer: tokenizer used for the phrase model
        :param polarity_model: model whose ``predict`` scores the classes
        :param phrase_model: model whose ``predict`` scores span start/end
        :param POmax_len: int, sequence length for the polarity inputs
        :param QAmax_len: int, sequence length for the phrase inputs
        :return: tuple ``(sentiments, selected_texts)`` of two lists with
            one entry per input text
    """
    # Class indices with an explicit label; every other index is 'positive'.
    labelled_classes = ((0, 'negative'), (1, 'neutral'))

    sentiments, selected_texts = [], []
    for text in texts:
        polarity_inputs = PolarityTokenizer(
            text=text, tokenizer=polarity_tokenizer, max_length=POmax_len)
        class_scores = polarity_model.predict(polarity_inputs)
        class_index = np.argmax(class_scores, axis=1)
        sentiment = next(
            (label for index, label in labelled_classes if class_index == index),
            'positive')

        phrase_inputs = PhraseTokenizer(
            text=text, sentiment=sentiment,
            tokenizer=phrase_tokenizer, max_length=QAmax_len)
        # First row of the input-id array: the token ids of this example.
        example_ids = phrase_inputs[0][0]
        span_scores = phrase_model.predict(phrase_inputs)
        selected_text = PhraseDecoder(
            input_ids=example_ids,
            prediction=span_scores,
            tokenizer=phrase_tokenizer)

        sentiments.append(sentiment)
        selected_texts.append(selected_text)
    return sentiments, selected_texts

In [2]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import categorical_crossentropy
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.layers import (
    BatchNormalization, Dense, Dropout,
    Input, Conv1D, Flatten, Activation)
from transformers import (AutoConfig, TFAutoModelForSequenceClassification,
    TFAutoModelForQuestionAnswering)


def PolarityModel(model_path, max_len, num_classes):
    """
    Returns the polarity model loaded with pretrained weights.

    The network wraps the 'distilroberta-base' sequence-classification
    encoder and stacks BatchNormalization -> Dense(256) -> Dropout(0.4) ->
    Dense(num_classes, softmax) on the encoder's first output.

    :param model_path: path of the weights file given to ``load_weights``
    :param max_len: int, length of the two integer input sequences
    :param num_classes: int, number of polarity classes; sizes both the
        encoder's classification head and the final softmax layer, and is
        expected to agree with the weights stored at ``model_path``
    :return: tf.keras.Model object taking ``[input_ids, attention_mask]``
    """
    input_ids = tf.keras.layers.Input(shape=(max_len,), name='input_1', dtype=tf.int32)
    att_mask = tf.keras.layers.Input(shape=(max_len,), name='input_2', dtype=tf.int32)

    # num_classes was previously accepted but ignored (3 was hard-coded in
    # both places below); it now drives the head sizes.
    enc = TFAutoModelForSequenceClassification.from_pretrained(
        'distilroberta-base', num_labels=num_classes)
    x = enc(input_ids, attention_mask=att_mask)[0]

    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.Dense(256, activation=None)(x)
    x = tf.keras.layers.Dropout(0.4)(x)
    x = tf.keras.layers.Dense(num_classes, activation='softmax')(x)

    model = tf.keras.Model(inputs=[input_ids, att_mask], outputs=x)

    # Mark the first three layers of the assembled model as trainable.
    for layer in model.layers[:3]:
        layer.trainable = True

    model.load_weights(model_path)

    return model


def PhraseModel(model_path, max_len):
    """
    Returns the phrase model loaded with pretrained weights.

    The network feeds three integer inputs (token ids, attention mask and
    a sentiment mask used as ``token_type_ids``) to the
    'bert-base-uncased' question-answering encoder and produces two
    softmax outputs, one per encoder output ``x[0]`` and ``x[1]``.

    :param model_path: path of the weights file given to ``load_weights``
    :param max_len: int, length of each of the three input sequences
    :return: tf.keras.Model object taking
        ``[input_ids, attention_mask, sentiment_mask]`` and returning
        two outputs
    """
    input_ids = tf.keras.layers.Input(shape=(max_len,), name="input_1",  dtype=tf.int32)
    att_mask = tf.keras.layers.Input(shape=(max_len,), name="input_2", dtype=tf.int32)
    sent_mask = tf.keras.layers.Input(shape=(max_len,), name="input_3", dtype=tf.int32)
    
    # NOTE(review): the Transformers config option is spelled
    # `output_attentions`; `output_attention` may have no effect -- confirm.
    config = AutoConfig.from_pretrained(
        'bert-base-uncased', 
        output_attention=True, 
        output_hidden_states=True, 
        use_cache=True)

    enc = TFAutoModelForQuestionAnswering.from_pretrained(
        'bert-base-uncased', config=config)
    # The sentiment mask is passed to the encoder as its token type ids.
    x = enc(input_ids, attention_mask=att_mask, token_type_ids=sent_mask)

    # First output branch, built from x[0].
    x1 = tf.keras.layers.Dropout(0.1)(x[0])
    x1 = tf.expand_dims(x1, axis=-1)
    x1 = tf.keras.layers.Conv1D(1,1)(x1)
    # Flatten is applied to x[0], not to the x1 computed above, so the
    # Dropout / expand_dims / Conv1D results of this branch do not reach
    # the output (the second branch below flattens its Conv1D output).
    # NOTE(review): this looks unintentional, but changing it alters the
    # layer graph that load_weights has to match -- confirm against the
    # code that produced the weights file before touching it.
    x1 = tf.keras.layers.Flatten()(x[0])
    x1 = tf.keras.layers.Activation('softmax')(x1)

    # Second output branch, built from x[1]: dropout, a 1x1 convolution
    # over an added trailing axis, then flatten and softmax.
    x2 = tf.keras.layers.Dropout(0.1)(x[1])
    x2 = tf.expand_dims(x2, axis=-1)
    x2 = tf.keras.layers.Conv1D(1,1)(x2)
    x2 = tf.keras.layers.Flatten()(x2)
    x2 = tf.keras.layers.Activation('softmax')(x2)

    model = tf.keras.Model(inputs=[input_ids, att_mask, sent_mask], outputs=[x1,x2])

    # The slice 3:4 selects a single layer (index 3) of the assembled model.
    for layer in model.layers[3:4]:
        layer.trainable = True

    model.load_weights(model_path)

    return model

In [3]:
import os
import ast
import argparse
import pandas as pd
import tarfile
import sys
import subprocess
import datetime

import tensorflow as tf
from transformers import AutoTokenizer
import transformers

# Mute warnings
tf.get_logger().setLevel('ERROR')

# Report the toolchain versions used by this run. The nvcc probe is
# best-effort: a missing or failing CUDA compiler is reported as a warning
# instead of aborting the job before any work has been done.
try:
    print(subprocess.check_output(['nvcc', '--version']).decode())
except (OSError, subprocess.CalledProcessError) as nvcc_error:
    print(f"WARN -- nvcc version unavailable: {nvcc_error}")
print(sys.version)
print(tf.__version__)
print(transformers.__version__)
print(pd.__version__)

if __name__ == '__main__':
    # Command-line options as (name, type, default); options listed with
    # None have no explicit default.
    parser = argparse.ArgumentParser(description="For S3 bucket access")
    for option, option_type, option_default in (
            ("access_id", str, None),
            ("access_key", str, None),
            ("num_classes", int, 3),
            ("QAmax_len", int, 64),
            ("POmax_len", int, 128),
            ("bucket_name", str, 'syalabi-bucket')):
        parser.add_argument(
            f"--{option}",
            dest=option,
            type=option_type,
            default=option_default)

    # parse_known_args tolerates arguments this parser does not define.
    args, _ = parser.parse_known_args()

    # NOTE(review): this single roberta-base tokenizer is handed to both
    # models below, while PhraseModel builds on 'bert-base-uncased' --
    # confirm this matches how the phrase weights were trained.
    tokenizer = AutoTokenizer.from_pretrained('roberta-base', add_prefix_space=True)
    print("INFO -- Tokenizers initialized.")

    # Input
    input_data_path = "/opt/ml/processing/input/input.csv"
    data = pd.read_csv(input_data_path)
    print("INFO -- Input data initialized.")

    # Unpack the model archive next to where it was delivered.
    model_path = "/opt/ml/processing/model/sentiment_models.tar.gz"
    with tarfile.open(model_path) as tar:
        tar.extractall(path="/opt/ml/processing/model/")

    polarity_model = PolarityModel(
        model_path='/opt/ml/processing/model/polarity_model.h5',
        max_len=args.POmax_len,
        num_classes=args.num_classes)
    print("INFO -- Polarity model initialized.")

    phrase_model = PhraseModel(
        model_path='/opt/ml/processing/model/phrase_model.h5',
        max_len=args.QAmax_len)
    print("INFO -- Phrase model initialized.")

    # Main loop: each 'text' cell holds a Python literal that is parsed
    # and run through the pipeline as one group of texts.
    pipeline_kwargs = dict(
        polarity_tokenizer=tokenizer,
        phrase_tokenizer=tokenizer,
        polarity_model=polarity_model,
        phrase_model=phrase_model,
        POmax_len=args.POmax_len,
        QAmax_len=args.QAmax_len)

    all_sentiments, all_selected_texts = [], []
    for texts in data['text']:
        texts = ast.literal_eval(texts)
        sentiments, selected_texts = get_outputs(texts=texts, **pipeline_kwargs)
        all_sentiments.append(sentiments)
        all_selected_texts.append(selected_texts)

    data['sentiments'] = all_sentiments
    data['selected_texts'] = all_selected_texts

    # Output: a timestamped CSV in the local processing output directory.
    time_stamp = datetime.datetime.now().strftime("%m%d%Y-%H%Mhrs")
    output_path = os.path.join(
        "/opt/ml/processing/output", f"output_{time_stamp}.csv")
    data.to_csv(output_path, index=False)

    print("INFO -- Output saved in S3.")

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2020 NVIDIA Corporation
Built on Mon_Oct_12_20:54:10_Pacific_Daylight_Time_2020
Cuda compilation tools, release 11.1, V11.1.105
Build cuda_11.1.relgpu_drvr455TC455_06.29190527_0

3.8.10 (default, May 19 2021, 13:12:57) [MSC v.1916 64 bit (AMD64)]
2.5.0
4.8.2
1.3.0
INFO -- Tokenizers initialized.
INFO -- Input data initialized.


All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


INFO -- Polarity model initialized.


All model checkpoint layers were used when initializing TFBertForQuestionAnswering.

Some layers of TFBertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


INFO -- Phrase model initialized.
INFO -- Output saved in S3.
