# System Configurations
<b>RAM</b> : 32GB DDR4 2133MHz

<b>GPU</b> : RTX 3060 12GB VRAM

<b>CPU</b> : AMD Ryzen 5 3600XT 6-core

<b>OS build</b> : Windows 10 Pro Build 21354.co_release.210402-1630

In [2]:
import boto3
import sagemaker

# SageMaker session: the execution role plus a session bound to the project
# bucket as its default bucket.
bucket_name = 'syalabi-bucket'
role = sagemaker.get_execution_role()
sess = sagemaker.session.Session(default_bucket=bucket_name)

# ECR image coordinates. These names are interpolated into the docker / aws
# CLI commands of the build-and-push cell, so they must stay defined here.
account_id = boto3.client('sts').get_caller_identity().get('Account')
region = boto3.Session().region_name
ecr_repo = 'sentiment-training'
tag = ':latest'
processing_repo_uri = '{}.dkr.ecr.{}.amazonaws.com/{}{}'.format(
    account_id, region, ecr_repo, tag)

# Echo the resolved values so the cell output documents what was used.
print(
    f'role: {role}',
    f'region: {region}',
    f'processing_repo: {processing_repo_uri}',
    sep='\n')

role: arn:aws:iam::183640780274:role/syalabi-user
region: ap-southeast-1
processing_repo: 183640780274.dkr.ecr.ap-southeast-1.amazonaws.com/sentiment-training:latest


# requirements.txt

In [1]:
%%writefile docker/requirements.txt
# Python packages installed into the training image by the Dockerfile's
# `pip3 install -r requirements.txt` step.
# NOTE(review): no versions are pinned, so every image build resolves whatever
# release is current at build time -- consider pinning once a working set is known.
pandas
transformers
tensorflow
scikit-learn
tensorflow-addons

Overwriting docker/requirements.txt


# Dockerfile

In [3]:
%%writefile docker/Dockerfile
# Dockerfile for local v1
# Self-contained training image: Python dependencies, the application source
# and the cleaned training CSV are baked in, and the container starts
# training.py directly.
# NOTE: the cloud variant in the following cell is written to this same path.
FROM nvcr.io/nvidia/tensorrt:21.03-py3

# Install dependencies before copying the source so this layer stays cached
# when only the application code changes.
COPY requirements.txt requirements.txt
RUN pip3 install -r requirements.txt

LABEL maintainer="Xaltius"
ENV PYTHONUNBUFFERED=TRUE
ENV TOKENIZERS_PARALLELISM=false

COPY . /usr/src/app
COPY input/train_cleaned.csv /opt/ml/processing/input/train_cleaned.csv

# Directories training.py writes model weights and the output tarball to.
RUN mkdir -p /opt/ml/processing/model /opt/ml/processing/output

# SECURITY: never bake AWS credentials into the image -- anything in ENTRYPOINT
# is readable by everyone who can pull the image or read this notebook.
# Supply credentials at run time instead (an IAM role, or
# `docker run -e AWS_ACCESS_KEY_ID -e AWS_SECRET_ACCESS_KEY ...`).
# Extra arguments given to `docker run <image> ...` are appended to this
# exec-form ENTRYPOINT, so training.py options can still be passed that way.
# Any key that was previously committed here must be treated as compromised
# and rotated.
ENTRYPOINT ["python3", "/usr/src/app/training.py"]

Overwriting docker/Dockerfile


In [4]:
%%writefile docker/Dockerfile
# Dockerfile for cloud v1
# Image pushed to ECR. No ENTRYPOINT is set and no training data is copied to
# /opt/ml/processing/input here; both are expected to come from whatever
# launches the container.
FROM nvcr.io/nvidia/tensorrt:21.03-py3

# apt-get (not `apt`) is the stable command-line interface for scripted use;
# the package index is removed in the same layer so it does not bloat the image.
RUN apt-get update && apt-get install -y \
    nvidia-utils-450 \
    ubuntu-drivers-common \
    && rm -rf /var/lib/apt/lists/*

# Install dependencies before copying the source so this layer stays cached
# when only the application code changes.
COPY requirements.txt requirements.txt
RUN pip3 install -r requirements.txt

LABEL maintainer="Xaltius"
ENV PYTHONUNBUFFERED=TRUE
ENV TOKENIZERS_PARALLELISM=false

# Directories training.py writes model weights and the output tarball to.
RUN mkdir -p /opt/ml/processing/model /opt/ml/processing/output
COPY . /usr/src/app

Overwriting docker/Dockerfile


In [6]:
# Create ECR repo and push docker image
# Relies on ecr_repo, tag, region, account_id and processing_repo_uri defined
# in the configuration cell at the top of the notebook.
# 1. Build the image from the docker/ directory, ignoring the layer cache.
!docker build --no-cache -t $ecr_repo docker
# 2. Log docker in to the account's ECR registry; the password is piped via
#    stdin so it does not appear in the command line.
!aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {account_id}.dkr.ecr.{region}.amazonaws.com
# 3. Repository creation is left commented out -- presumably only needed the
#    first time, before the repository exists.
# !aws ecr create-repository --repository-name $ecr_repo
# 4. Tag the local image with the full registry URI and push it.
!docker tag {ecr_repo + tag} $processing_repo_uri
!docker push $processing_repo_uri
# 5. Remove the local image tag once the push has been issued.
!docker image rm $ecr_repo

# training.py

In [None]:
import sys
import argparse
import subprocess
import tensorflow as tf

from helper_functions import *

# Runtime set-up: silence TensorFlow below ERROR level and seed every RNG
# before any model or data code runs.
SEED = 42
tf.get_logger().setLevel('ERROR')
seed_everything(SEED)

# Log the toolchain versions so a run can be matched to its environment.
print("INFO -- Dependencies")
print(subprocess.check_output(['nvcc', '--version']).decode())
print(f"Python version: {sys.version}")
print(f"Tensorflow version: {tf.__version__}")


if __name__ == '__main__':
    # ---- Command-line options ---------------------------------------------
    parser = argparse.ArgumentParser()
    # NOTE(review): --access_id / --access_key are accepted for backward
    # compatibility but are never read in this script. Do not pass long-lived
    # AWS keys on the command line; prefer an IAM role or environment variables.
    parser.add_argument(
        "--access_id", dest="access_id", type=str)
    parser.add_argument(
        "--access_key", dest="access_key", type=str)
    parser.add_argument(
        "--max_length", dest="max_len", type=int, default=128)
    parser.add_argument(
        "--num_labels", dest="num_labels", type=int, default=3)
    parser.add_argument(
        "--batch_size", dest="batch_size", type=int, default=32)
    parser.add_argument(
        "--epochs", dest="epochs", type=int, default=1)
    parser.add_argument(
        "--lr", dest='lr', type=float, default=1e-5)
    parser.add_argument(
        "--splits", dest='splits', type=int, default=2)
    parser.add_argument(
        "--verbose", dest='verbose', type=int, default=1)
    parser.add_argument(
        "--pretrained_QA", dest='pretrained_QA', type=str,
        default='bert-base-uncased')
    parser.add_argument(
        "--pretrained_PO", dest='pretrained_PO', type=str,
        default='distilroberta-base')
    parser.add_argument(
        "--bucket_name", dest="bucket_name", type=str,
        default="syalabi-bucket")

    # parse_known_args: flags this script does not define are ignored rather
    # than rejected.
    args, _ = parser.parse_known_args()

    # ---- Load training data -----------------------------------------------
    # Nothing below can run without the data, so log and re-raise instead of
    # continuing into a NameError on `train_data`.
    try:
        train_data = get_data('/opt/ml/processing/input/train_cleaned.csv')
        print("INFO -- Successfully fetched data from S3 bucket.")
    except Exception:
        print('ERROR -- Fetching data from S3 bucket has failed.')
        raise

    # ---- Stratified K-fold training ---------------------------------------
    # For each fold, train the phrase (QA) model and the polarity (PO) model
    # and keep the weights of whichever fold scored best on validation.
    QA_best_score, PO_best_score = 0, 0
    skf = StratifiedKFold(n_splits=args.splits, shuffle=True, random_state=SEED)
    for fold, (train_idx, val_idx) in enumerate(skf.split(
            X=train_data.drop('sentiment', axis=1),
            y=train_data['sentiment'])):

        QAtrain = get_input_arrays(train_data.loc[train_idx, :].reset_index(drop=True), type="QA")
        QAval = get_input_arrays(train_data.loc[val_idx, :].reset_index(drop=True), type="QA")

        POtrain = get_input_arrays(train_data.loc[train_idx, :].reset_index(drop=True), type="PO")
        # Fixed: this previously used train_idx, so the polarity model was
        # validated (and its "best" fold chosen) on its own training rows.
        POval = get_input_arrays(train_data.loc[val_idx, :].reset_index(drop=True), type="PO")

        # Phrase Loop
        print(f'INFO -- Phrase Training Fold {fold+1} of {args.splits}.')
        QAmodel = QAtraining(
            model=QAModel(
                pretrained=args.pretrained_QA,
                lr=args.lr,
                max_len=args.max_len),
            epochs=args.epochs,
            batch_size=args.batch_size,
            train_inputs=QAtrain,
            val_inputs=QAval,
            verbose=args.verbose)

        QA_train_score = get_Jaccard_score(
            inputs=QAtrain,
            model=QAmodel,
            verbose=args.verbose)

        QA_val_score = get_Jaccard_score(
            inputs=QAval,
            model=QAmodel,
            verbose=args.verbose)

        print(f"jaccard: {QA_train_score}% - val_jaccard: {QA_val_score}%")

        # Keep only the best fold's phrase weights (later folds overwrite the file).
        if QA_val_score > QA_best_score:
            QA_best_score = QA_val_score
            QAmodel.save_weights('/opt/ml/processing/model/phrase_model.h5')
            print(f"INFO -- FOLD {fold+1} phrase model weights saved.")

        # Reset Keras global state before building the polarity model.
        tf.keras.backend.clear_session()
        # Polarity Loop
        print(f'INFO -- Polarity Training Fold {fold+1} of {args.splits}.')
        POmodel = POtraining(
            model=POModel(
                pretrained=args.pretrained_PO,
                lr=args.lr,
                max_len=args.max_len,
                num_labels=args.num_labels),
            epochs=args.epochs,
            batch_size=args.batch_size,
            train_inputs=POtrain,
            val_inputs=POval,
            verbose=args.verbose)

        PO_train_score = get_f1_score(
            inputs=POtrain,
            model=POmodel,
            verbose=args.verbose)

        PO_val_score = get_f1_score(
            inputs=POval,
            model=POmodel,
            verbose=args.verbose)

        print(f"f1: {PO_train_score}% - val_f1: {PO_val_score}%")

        # Keep only the best fold's polarity weights.
        if PO_val_score > PO_best_score:
            PO_best_score = PO_val_score
            POmodel.save_weights('/opt/ml/processing/model/polarity_model.h5')
            print(f"INFO -- FOLD {fold+1} polarity model weights saved.")

    # ---- Package the saved weights ----------------------------------------
    # Still best-effort (a packaging failure does not raise), but the handler
    # no longer swallows KeyboardInterrupt/SystemExit and it reports the cause.
    try:
        save_tarfile(
            outfile="/opt/ml/processing/output/sentiment_models.tar.gz",
            model_dir="/opt/ml/processing/model")
        print("INFO -- Successfully compressed models into tar.")
    except Exception as exc:
        print("ERROR -- Failed to compress models.")
        print(f"ERROR -- {type(exc).__name__}: {exc}")

# helper_functions.py

In [None]:
# %%writefile docker/helper_functions.py
import os
import re
import ast
import numpy as np
import tensorflow as tf
import pandas as pd

from transformers import TFAutoModelForQuestionAnswering, AutoConfig

from sklearn.model_selection import train_test_split, StratifiedKFold

from tensorflow.keras.layers import Input, Flatten, Conv1D, Activation
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping


def get_data(input_path):
    """Read the training CSV and decode its serialized columns.

    Every column from the fourth onward (positional index 3 and up) is run
    through ``ast.literal_eval`` cell by cell, turning the text stored in the
    CSV back into Python literals. The first three columns are left as read.

    Args:
        input_path: Path (or anything ``pandas.read_csv`` accepts) of the CSV.

    Returns:
        pandas.DataFrame: The loaded frame with the decoded columns.
    """
    frame = pd.read_csv(input_path)
    literal_columns = frame.columns[3:]
    for column_name in literal_columns:
        decoded = frame[column_name].apply(ast.literal_eval)
        frame[column_name] = decoded
    return frame


def get_input_arrays(df):
    """Stack the per-row model columns of ``df`` into NumPy arrays.

    Each of the five columns is vertically stacked with ``np.vstack``, so every
    row of a column contributes one row of the resulting array.

    NOTE(review): training.py calls this helper with an extra ``type=...``
    keyword that this signature does not accept -- confirm which version of
    the helper is the one shipped in the image.

    Args:
        df: Frame holding the columns ``input_ids``, ``attention_mask``,
            ``start_token``, ``end_token`` and ``sentiment_token``.

    Returns:
        tuple: ``(ids, mask, start, end, sentiment)``, in that order.
    """
    column_order = (
        'input_ids',
        'attention_mask',
        'start_token',
        'end_token',
        'sentiment_token',
    )
    return tuple(np.vstack(df[column]) for column in column_order)


def QAModel(max_len, pretrained, lr):
    """Build and compile the Keras model used for phrase (span) extraction.

    A pretrained question-answering transformer is wrapped in a functional
    Keras model. Its first two outputs (presumably the start and end logits --
    confirm against the transformers version in use) are each passed through a
    1x1 ``Conv1D``, flattened and soft-maxed into a distribution over the
    ``max_len`` token positions.

    Args:
        max_len: Sequence length of the three integer inputs.
        pretrained: Name or path handed to ``from_pretrained``.
        lr: Learning rate for the Adam optimizer.

    Returns:
        tf.keras.Model: Compiled model taking ``[input_ids, att_mask,
        sent_mask]`` and returning ``[x1, x2]``, each of shape
        ``(batch, max_len)``.
    """
    input_ids = Input(shape=(max_len,), dtype=tf.int32)
    att_mask = Input(shape=(max_len,), dtype=tf.int32)
    sent_mask = Input(shape=(max_len,), dtype=tf.int32)

    # NOTE(review): the transformers config key is `output_attentions`;
    # `output_attention` is stored as an unused extra attribute. It is left
    # unchanged because neither attentions nor hidden states are consumed below.
    config = AutoConfig.from_pretrained(
        pretrained,
        output_attention=True,
        output_hidden_states=True,
        use_cache=True)

    enc = TFAutoModelForQuestionAnswering.from_pretrained(
        pretrained, config=config)
    x = enc(input_ids, attention_mask=att_mask, token_type_ids=sent_mask)

    def _position_distribution(logits):
        # (batch, max_len) -> (batch, max_len, 1) -> 1x1 conv -> (batch, max_len)
        head = tf.expand_dims(logits, axis=-1)
        head = Conv1D(1, 1)(head)
        head = Flatten()(head)
        return Activation('softmax')(head)

    # Fixed: the first head used to flatten x[0] directly, which silently
    # discarded its Conv1D layer; both heads now share the same construction.
    # Weight files saved from the old graph (one Conv1D) will not match the
    # corrected topology (two Conv1D layers).
    x1 = _position_distribution(x[0])
    x2 = _position_distribution(x[1])

    model = tf.keras.Model(inputs=[input_ids, att_mask, sent_mask], outputs=[x1,x2])

    # Freeze layer index 3 (presumably the pretrained encoder, which follows
    # the three Input layers -- confirm). Done BEFORE compile(): Keras documents
    # that `trainable` changes made after compile() need a recompile to be
    # guaranteed to apply.
    for layer in model.layers[3:4]:
        layer.trainable = False

    model.compile(
        loss=CategoricalCrossentropy(label_smoothing=0.1),
        optimizer=tf.keras.optimizers.Adam(learning_rate=lr))

    return model


def model_training(model, epochs, save_path, batch_size, train_inputs, val_inputs):
    """Fit ``model`` with best-weights checkpointing and early stopping.

    Positions 0, 1 and 4 of each inputs sequence are fed to the model as
    features and positions 2 and 3 as targets (with the tuple returned by
    ``get_input_arrays`` that is ids/mask/sentiment and start/end).

    Args:
        model: Compiled Keras model exposing ``fit``.
        epochs: Maximum number of epochs.
        save_path: File the best weights (lowest ``val_loss``) are written to.
        batch_size: Batch size passed to ``fit``.
        train_inputs: Indexable of five arrays used for training.
        val_inputs: Indexable of five arrays used for validation.

    Returns:
        The same ``model`` object after fitting.
    """
    def _features(arrays):
        return [arrays[0], arrays[1], arrays[4]]

    def _targets(arrays):
        return [arrays[2], arrays[3]]

    callbacks = [
        # Persist weights only when val_loss improves.
        ModelCheckpoint(
            filepath=save_path,
            monitor='val_loss',
            verbose=1,
            save_best_only=True,
            save_weights_only=True,
            mode='min'),
        # Stop after 2 epochs without improvement and roll back to the best weights.
        EarlyStopping(
            monitor='val_loss',
            min_delta=0,
            patience=2,
            verbose=1,
            mode='min',
            restore_best_weights=True),
    ]

    model.fit(
        x=_features(train_inputs),
        y=_targets(train_inputs),
        validation_data=(_features(val_inputs), _targets(val_inputs)),
        epochs=epochs,
        batch_size=batch_size,
        verbose=2,
        callbacks=callbacks)

    return model


def get_score(
    model, inputs):
    """Mean Jaccard similarity (as a percentage) between true and predicted spans.

    For every row, the true span is the slice of token ids between the argmax
    of ``inputs[2]`` and the argmax of ``inputs[3]``; the predicted span uses
    the argmax of the model's two outputs instead. Both spans are compared as
    sets of token ids.

    NOTE(review): the slice excludes the end position itself -- confirm that
    this matches how the end targets were encoded.

    Args:
        model: Object whose ``predict`` returns two per-row score arrays.
        inputs: Indexable of five arrays; positions 0, 1 and 4 feed the model,
            positions 2 and 3 hold the true start/end encodings.

    Returns:
        The mean row score multiplied by 100 and rounded to two decimals.
    """
    preds = model.predict(
        [inputs[0], inputs[1], inputs[4]],
        verbose=1)

    def _token_span(tokens, start_scores, end_scores):
        # Set of token ids in [argmax(start), argmax(end)).
        first = np.argmax(start_scores, axis=-1)
        last = np.argmax(end_scores, axis=-1)
        return set(tokens[first:last])

    def _jaccard(expected, predicted):
        # Two empty spans are scored 0.5 rather than dividing by zero.
        if not expected and not predicted:
            return 0.5
        shared = len(expected & predicted)
        return float(shared) / (len(expected) + len(predicted) - shared)

    scores = [
        _jaccard(
            _token_span(inputs[0][row], inputs[2][row], inputs[3][row]),
            _token_span(inputs[0][row], preds[0][row], preds[1][row]))
        for row in range(len(inputs[0]))
    ]

    return round(np.mean(scores)*100, 2)