When handling language data, look for an existing package before coding a solution from scratch.

- *OpenAI*
- *HuggingFace*

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m28.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 KB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m47.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.27.4


In [None]:
import numpy as np
from transformers import BertTokenizer

In [None]:
def greeting1(name: str) -> str:
    """Build a greeting by prefixing ``name`` with ``'Hello '``.

    Args:
        name: Text appended after the ``'Hello '`` prefix.

    Returns:
        The concatenated greeting string.
    """
    prefix = 'Hello '
    greeting = prefix + name
    return greeting


In [None]:
# Call greeting1 with an arbitrary string; the returned greeting is the cell output.
greeting1('dlkfsdlkfj')

'Hello dlkfsdlkfj'

In [None]:
def greeting2(name):
    """Build a greeting by prefixing ``name`` with ``'Hello '``.

    Same behavior as an annotated version would have; this variant is
    deliberately written without type hints.
    """
    prefix = 'Hello '
    greeting = prefix + name
    return greeting


In [None]:
# Evaluating the bare name (without calling it) displays the function object and its signature.
greeting2

<function __main__.greeting2(name)>

In [None]:
def tokenize_string(x: str) -> np.ndarray:
    """Encode a string into BERT token ids.

    Args:
        x: Text to tokenize.

    Returns:
        1-D NumPy array of the token ids produced by the
        ``bert-base-uncased`` tokenizer, including the special tokens the
        tokenizer adds (``add_special_tokens=True``).
    """
    # NOTE: the tokenizer is loaded on every call, which keeps the function
    # self-contained but repeats the (cached-on-disk) load each time.
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # encode() returns a plain Python list of ints.
    token_ids = tokenizer.encode(x, add_special_tokens=True)

    return np.array(token_ids)


In [None]:
# Try the tokenizer on a sample sentence; the resulting array of token ids is the cell output.
tokenize_string("1, this sentiment is negative because the person doesn't like negative growth rate")

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

array([  101,  1015,  1010,  2023, 15792,  2003,  4997,  2138,  1996,
        2711,  2987,  1005,  1056,  2066,  4997,  3930,  3446,   102])

In [None]:
# BertTokenizer is already imported in the imports cell, so it is not re-imported here.

# Instantiate the tokenizer for the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Look up the id of the [SEP] separator token and display it as the cell output.
sep_token_id = tokenizer.sep_token_id
sep_token_id

102

In [None]:
# import numpy as np
# from transformers import BertTokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

def tokenize_string(string: str, max_length: int = 128) -> np.ndarray:
    """Encode a string into a fixed-length array of BERT token ids.

    The text is tokenized with the ``bert-base-uncased`` tokenizer, truncated
    if necessary, and right-padded so the result always has exactly
    ``max_length`` entries.

    Args:
        string: Text to tokenize.
        max_length: Length of the returned array, counting the special
            tokens. Defaults to 128.

    Returns:
        1-D ``int32`` NumPy array of length ``max_length``.

    Raises:
        ValueError: If ``max_length`` is smaller than 2, which leaves no room
            for the two special tokens.
    """
    if max_length < 2:
        raise ValueError(
            f"max_length must be at least 2 to fit the special tokens, got {max_length}"
        )

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Let the tokenizer truncate and pad. With add_special_tokens=True the
    # encoded sequence already ends with [SEP], so appending another one by
    # hand would duplicate it; the built-in truncation keeps exactly one
    # trailing [SEP] and pads with the tokenizer's own padding token.
    token_ids = tokenizer.encode(
        string,
        add_special_tokens=True,
        max_length=max_length,
        truncation=True,
        padding='max_length',
    )

    return np.asarray(token_ids, dtype='int32')


In [None]:
# Pad/truncate to 128 token ids; max_length is passed explicitly so the cell does not rely on a default.
y1 = tokenize_string("1, I don't know this finance.", max_length=128)
len(y1)

128

In [None]:
# A longer input still yields 128 token ids; max_length is passed explicitly so the cell does not rely on a default.
y2 = tokenize_string("1, I don't know this finance. Because the company has negative earnings.", max_length=128)
len(y2)

128

In [None]:
import numpy as np
import tensorflow as tf
from transformers import RobertaTokenizer, TFRobertaModel

def encode_texts(texts: list[str]) -> np.ndarray:
    """Embed each text as the RoBERTa hidden state of its first token.

    Args:
        texts: Non-empty collection of strings to encode.

    Returns:
        2-D array with one row per input text, holding the last-layer hidden
        state of ``roberta-base`` at sequence position 0.

    Raises:
        ValueError: If ``texts`` is empty.
    """
    texts = list(texts)
    if not texts:
        raise ValueError("texts must contain at least one string")

    # NOTE: tokenizer and model are loaded on every call; hoist them out of
    # the function if it is going to be called repeatedly.
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    model = TFRobertaModel.from_pretrained('roberta-base')

    # Tokenize the whole batch at once, padding to the longest text. This
    # also yields the attention mask marking which positions are real tokens.
    encoded = tokenizer(
        texts,
        add_special_tokens=True,
        padding=True,
        return_tensors='tf',
    )

    # Pass the attention mask so padded positions are ignored; without it the
    # padding would be attended to and would shift the embeddings of every
    # text shorter than the longest one in the batch.
    outputs = model(
        input_ids=encoded['input_ids'],
        attention_mask=encoded['attention_mask'],
    )

    # outputs[0] is the last hidden state; keep only position 0 of each row.
    vectors = outputs[0][:, 0, :].numpy()

    return vectors


In [None]:
# Embed three sample sentences; output1 holds one vector per sentence.
output1 = encode_texts(
    [
        "0, i don't like this financials",
        "1, i don't understand financial data",
        "2, i like this company and i think the stock price goes up"
    ]
)

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


In [None]:
# Check the result dimensions: (number of texts, embedding size).
output1.shape

(3, 768)