### Import Libraries

In [5]:
# Imported in this cell as well so it runs on a fresh kernel: it sits above the
# notebook's main import cell and would otherwise fail with a NameError on `tf`.
import tensorflow as tf

# List the GPUs TensorFlow can see; an empty list skips the block below.
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    # Restrict TensorFlow to only use the first GPU
    try:
        tf.config.set_visible_devices(gpus[0], 'GPU')
        logical_gpus = tf.config.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
    except RuntimeError as e:
        # Visible devices must be set before GPUs have been initialized
        print(e)
# Display the detected physical devices as the cell output.
gpus

[]

In [3]:
import tensorflow as tf
# Report how many GPU devices TensorFlow detects on this machine.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  0


In [1]:
import os
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
import tensorflow_addons as tfa 
import transformers
from transformers import TFAutoModel, AutoTokenizer

### Check GPU

In [2]:
# Sanity check: number of physical GPU devices visible to TensorFlow.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  0


2022-01-31 19:46:02.338750: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2022-01-31 19:46:02.338789: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (yitingtsai-G5-5590): /proc/driver/nvidia/version does not exist


### Config / Set Constant Variable

In [3]:
# --- Input data ---
TEST_CSV_PATH = './data/test.csv'  # csv file read by load_test_data_into_dataframe()

# --- Tokenizer / inference pipeline ---
MODEL_NAME = 'jplu/tf-xlm-roberta-base'  # identifier handed to AutoTokenizer.from_pretrained
MAX_LEN = 256     # token length every sample is padded / truncated to
BATCH_SIZE = 16   # number of samples per batch in the tf.data pipeline

# Environment switch read by the tokenizers library; set to 'false' here.
os.environ.update({"TOKENIZERS_PARALLELISM": 'false'})

### Set Tokenizer

In [4]:
# Load the tokenizer identified by MODEL_NAME through the AutoTokenizer factory.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

### Helper Functions

In [5]:
# read the test csv file from disk into a Pandas Dataframe
def load_test_data_into_dataframe(test_csv_location):
    """
    Read the test set from a CSV file.

    Parameters:
    ------------
    test_csv_location : str
        Path (or any other input accepted by `pd.read_csv`) of the test dataset in csv format.

    Returns:
    ------------
    test_df : Pandas Dataframe
        The csv content parsed with the default `pd.read_csv` settings.
    """
    return pd.read_csv(test_csv_location)
    

In [6]:
# Encode raw texts into an array of token ids with a Transformers tokenizer
def tokenizer_encode(texts, tokenizer, maxlen=512):
    """
    Tokenize a batch of texts and return their token ids as a single array.

    The tokenizer is asked to truncate and to pad every sample to `maxlen`
    ('max_length' padding); attention masks and token type ids are not requested.

    Parameters:
    ------------
    texts : list of str
        The texts to be encoded by the tokenizer.
    tokenizer : Transformers AutoTokenizer
        Tokenizer object whose `batch_encode_plus` method performs the encoding.
    maxlen : int
        Maximum length of each sample, passed to the tokenizer as `max_length`.

    Returns:
    ------------
    encoding_array : Numpy Array
        Array built from the 'input_ids' entry of the tokenizer output.
    """
    encode_options = dict(
        truncation=True,
        return_attention_mask=False,
        return_token_type_ids=False,
        padding='max_length',
        max_length=maxlen,
    )
    encoded_batch = tokenizer.batch_encode_plus(texts, **encode_options)

    return np.array(encoded_batch['input_ids'])

In [7]:
# load test set into Tensorflow Dataset API
def load_test_into_tf_Dataset(tokenizer, test_df, batch_size=BATCH_SIZE):
    """
    Tokenize the test texts and wrap them in a batched Tensorflow Dataset for inference.

    Parameters:
    ------------
    tokenizer : Transformers AutoTokenizer
        Tokenizer object used to encode the text data.
    test_df : Pandas Dataframe
        Dataframe of loaded test data; the texts are read from its `content` column.
    batch_size : int
        Number of samples per batch. Defaults to the notebook constant BATCH_SIZE.

    Returns
    ------------
    test_dataset : tf.data.Dataset
        Dataset yielding batches of token-id rows, in the same order as `test_df`.
    """
    # Tokenize the texts by calling tokenizer_encode(); every sample is
    # padded / truncated to the notebook constant MAX_LEN.
    x_test = tokenizer_encode(texts=test_df['content'].tolist(), tokenizer=tokenizer, maxlen=MAX_LEN)

    # Build the Tensorflow Dataset object. Use the `batch_size` argument rather than
    # the global BATCH_SIZE so that the caller's value is actually honoured.
    test_dataset = (
        tf.data.Dataset
        .from_tensor_slices(x_test)
        .batch(batch_size)
    )

    return test_dataset

In [8]:
# build output layer on top of the transformer model
def build_model(transformer, num_classes=3, activation='softmax', max_len=512):
    """
    Put a classification layer on top of a HuggingFace Transformer model for the down-stream task.

    The hidden state at the first position of each sequence (`cls_token`) is fed into a
    single Dense layer with `num_classes` units. The model is compiled with categorical
    cross-entropy and reports categorical accuracy, weighted F1 score and the
    Matthews correlation coefficient as metrics.

    Parameters:
    ------------
    transformer : Transformers TFAutoModel
        The transformer body; it is called on the token ids and the first element of
        its output is used as the sequence output.
    num_classes : int
        Number of target classes, i.e. the number of units of the output layer.
    activation : str
        Activation of the output layer. It is also used as the NAME of that layer,
        so load_model() finds the layer as 'softmax' with the default value.
    max_len : int
        Length of each input sample (number of token ids per row).

    Returns:
    ------------
    model : tf.keras Model
        Compiled model mapping token ids to class scores.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]
    # keep only position 0 of every sequence as the sentence representation
    cls_token = sequence_output[:, 0, :]
    out = Dense(units=num_classes, activation=activation, name=activation)(cls_token)

    # add weighted F1 score and Matthews correlation coefficient as metrics
    f1 = tfa.metrics.F1Score(num_classes=num_classes, average='weighted')
    mcc = tfa.metrics.MatthewsCorrelationCoefficient(num_classes=num_classes)

    model = Model(inputs=input_word_ids, outputs=out)
    # `learning_rate` replaces the deprecated `lr` alias of the Adam optimizer
    model.compile(
        optimizer=Adam(learning_rate=1e-5),
        loss='categorical_crossentropy',
        metrics=['categorical_accuracy', f1, mcc],
    )

    return model

In [9]:
# load model weights from fine-tuned model weights saved in directory
def load_model(model_dir='./xlmr-model/', max_len=256):
    """
    Rebuild the fine-tuned keras model: transformer body plus pickled output-layer weights.

    Parameters :
    ------------
    model_dir : str
        Directory holding the transformer weights/config and the file 'softmax.pickle'.
        A trailing slash is optional.
    max_len : int
        Maximum length of each sample, passed on to build_model().

    Returns:
    ------------
    model : tf.keras Model
        Compiled model whose 'softmax' layer carries the weights stored in 'softmax.pickle'.
    """
    transformer = TFAutoModel.from_pretrained(model_dir)
    model = build_model(transformer, max_len=max_len)

    # os.path.join works with or without a trailing slash on model_dir
    softmax_path = os.path.join(model_dir, 'softmax.pickle')
    # NOTE(security): pickle.load can execute arbitrary code; only load files you trust.
    # The context manager closes the file handle, which the bare open() call never did.
    with open(softmax_path, 'rb') as weights_file:
        softmax = pickle.load(weights_file)
    model.get_layer('softmax').set_weights(softmax)

    return model

In [20]:
# helper used later to convert an inferred int label back to its string label
def label_int_2_str(x):
    """
    Translate an encoded int label into its string sentiment label.

    0 -> 'negative', 1 -> 'neutral', 2 -> 'positive'; any other value gives None.
    """
    label_table = ((0, 'negative'), (1, 'neutral'), (2, 'positive'))
    for class_id, sentiment in label_table:
        if x == class_id:
            return sentiment
    return None

### Load Data

Load the test set into a pandas DataFrame and then into a TensorFlow `tf.data.Dataset`.

In [10]:
# Read the csv file at TEST_CSV_PATH into a DataFrame.
test_df = load_test_data_into_dataframe(TEST_CSV_PATH)

In [11]:
# Tokenize the texts of test_df and wrap the token ids in a batched tf.data.Dataset.
test_dataset = load_test_into_tf_Dataset(tokenizer=tokenizer, test_df=test_df, batch_size=BATCH_SIZE)

2022-01-31 19:48:46.968758: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Load Model

Load the fine-tuned model weights saved in the `./xlmr-model/` directory.

In [12]:
# Rebuild the classifier from ./xlmr-model/ with inputs of length MAX_LEN.
model = load_model(model_dir='./xlmr-model/', max_len=MAX_LEN)

2022-01-31 19:49:52.130901: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 768006144 exceeds 10% of free system memory.
2022-01-31 19:49:52.903005: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 768006144 exceeds 10% of free system memory.
2022-01-31 19:49:52.998794: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 768006144 exceeds 10% of free system memory.
2022-01-31 19:49:55.597002: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 768006144 exceeds 10% of free system memory.
2022-01-31 19:49:56.229169: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 768006144 exceeds 10% of free system memory.
All model checkpoint layers were used when initializing TFXLMRobertaModel.

All the layers of TFXLMRobertaModel were initialized from the model checkpoint at ./xlmr-model/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFXLMRobertaModel for 

In [13]:
# Print the layers of the assembled model with their output shapes and parameter counts.
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_word_ids (InputLayer)  [(None, 256)]            0         
                                                                 
 tfxlm_roberta_model (TFXLMR  TFBaseModelOutputWithPoo  278043648
 obertaModel)                lingAndCrossAttentions(l            
                             ast_hidden_state=(None,             
                             256, 768),                          
                              pooler_output=(None, 76            
                             8),                                 
                              past_key_values=None, h            
                             idden_states=None, atten            
                             tions=None, cross_attent            
                             ions=None)                          
                                                             

### Inference

In [14]:
# Run inference over the batched test set (verbose=1 shows a progress bar).
# The output layer has one unit per class, so each row holds one score per class.
prediction_array = model.predict(test_dataset, verbose=1)



#### Convert inference predictions back to text sentiment labels

In [26]:
# For every row: take the class with the highest score as int label, translate it
# to its string label (0=negative, 1=neutral, 2=positive) and keep only the string
# column -- the intermediate int column is dropped again at the end of the chain.
test_df = (
    test_df
    .assign(inference_int=np.argmax(prediction_array, axis=-1))
    .assign(inference_sentiment=lambda frame: frame['inference_int'].apply(label_int_2_str))
    .drop(columns='inference_int')
)
# show the 10 first pairs of text comment and predicted sentiment label
test_df.head(10)

Unnamed: 0,content,inference_sentiment
0,"Tudo ok, de acordo com o pedido",positive
1,This math below is quite true. The dexcom sens...,neutral
2,its a stupid app. follow my advice and do not ...,negative
3,LASK könnte in Trondheim schon EL-Aufstieg feiern,neutral
4,bagus...bagus..baguss....says mau pesan lg leb...,positive
5,The SLO team is looking for Innovators. If you...,positive
6,السلعه غير مطابقة للصوره ولم تحتوي على بطارية,negative
7,I loved the look of this neck lace. I was ver...,positive
8,Waiting for the Doctorrrr - wee,negative
9,UUS LAHENDUS: teadlased võitlevad emakakaelavä...,neutral


### Save Prediction to `csv` file

In [27]:
# Write the texts with their predicted sentiment to predictions.csv in the working directory.
# NOTE(review): with pandas defaults the DataFrame index is written as an extra first column -- confirm this is wanted.
test_df.to_csv('predictions.csv')