In [1]:
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures

import tensorflow as tf
import pandas as pd

model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

Metal device set to: Apple M2 Pro


All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
!pwd

/Users/tony/Project/Testing Bert


In [17]:
csv_file_path = "./data.csv" #dataset source:https://www.kaggle.com/datasets/sbhatti/financial-sentiment-analysis?resource=download
dataset = pd.read_csv(csv_file_path)

#remove neutral rows from dataset
dataset = dataset[dataset['Sentiment'] != 'neutral']
print(dataset)

                                               Sentence Sentiment
0     The GeoSolutions technology will leverage Bene...  positive
1     $ESI on lows, down $1.50 to $2.50 BK a real po...  negative
2     For the last quarter of 2010 , Componenta 's n...  positive
5       $SPY wouldn't be surprised to see a green close  positive
6     Shell's $70 Billion BG Deal Meets Shareholder ...  negative
...                                                 ...       ...
5832  Operating profit fell to EUR 38.1 mn from EUR ...  negative
5835  HSBC Says Unit to Book $585 Million Charge on ...  negative
5836  Daily Mail parent company in talks with potent...  positive
5837  RISING costs have forced packaging producer Hu...  negative
5841  HELSINKI AFX - KCI Konecranes said it has won ...  positive

[2712 rows x 2 columns]


In [4]:
def encode_dataset(data, tokenizer, max_length=512):
    input_ids, attention_masks, labels = [], [], []

    sentiment_map = {"positive": 1, "negative": 0}

    for index, row in data.iterrows():
        inputs = tokenizer.encode_plus(
            row["Sentence"],
            add_special_tokens=True,
            max_length=max_length,
            padding="max_length",
            truncation=True,
        )

        input_ids.append(inputs["input_ids"])
        attention_masks.append(inputs["attention_mask"])
        labels.append(sentiment_map[row["Sentiment"]])

    return (
        tf.data.Dataset.from_tensor_slices(
            ({"input_ids": input_ids, "attention_mask": attention_masks}, labels)
        ),
        max_length,
    )

In [5]:
tf_data, max_length = encode_dataset(dataset, tokenizer)
print(tf_data)

<_TensorSliceDataset element_spec=({'input_ids': TensorSpec(shape=(512,), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(512,), dtype=tf.int32, name=None)}, TensorSpec(shape=(), dtype=tf.int32, name=None))>


In [6]:
# train-test-val split
length = len(tf_data)
train_size = int(length*0.7)
test_size = int(length*0.15)
val_size = int(length*0.15)

BATCH_SIZE = 4

tf_data = tf_data.shuffle(len(tf_data)).batch(BATCH_SIZE).prefetch(tf.data.experimental.AUTOTUNE)
train_data = tf_data.take(train_size)
test_data = tf_data.skip(train_size)
val_data = test_data.skip(val_size)
test_data = test_data.take(test_size)

In [7]:
print(tf_data)
print(train_data)

<_PrefetchDataset element_spec=({'input_ids': TensorSpec(shape=(None, 512), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(None, 512), dtype=tf.int32, name=None)}, TensorSpec(shape=(None,), dtype=tf.int32, name=None))>
<_TakeDataset element_spec=({'input_ids': TensorSpec(shape=(None, 512), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(None, 512), dtype=tf.int32, name=None)}, TensorSpec(shape=(None,), dtype=tf.int32, name=None))>


In [8]:
tf.config.list_physical_devices(
    device_type=None
)

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [9]:
#Training
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)  #use tf.keras.optimizers.legacy.Adam for m1
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss, metrics=["accuracy"])

EPOCHS = 3

device = "/GPU:0" if tf.config.list_physical_devices("GPU") else "/CPU:0"
with tf.device(device):
    history = model.fit(tf_data, epochs=EPOCHS, validation_data=val_data)
    
# model.compile(optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0), 
#               loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), 
#               metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')])



Epoch 1/3


2023-05-24 03:23:55.636187: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/3
Epoch 3/3


In [12]:
pred_sentences = ['''Boldizsar Gyori Tue, May 23, 2023 at 10:10 PM GMT+8 (Updates with rate cut, new prices) By Boldizsar Gyori BUDAPEST, May 23 (Reuters) - The forint made a recovery on Tuesday after the Hungarian central bank cut its one-day deposit rate by an expected 100 basis points, becoming the first monetary authority in central Europe to loosen its policy. Analysts had expected the bank to start paring back emergency rate hikes taken in October last year when the forint was at record lows, as investor sentiment has seen a sea change with the currency strengthening. The one-day deposit rate was introduced last autumn and has helped attract money flows with its high rate, now at 17%. The Hungarian central bank's 13% base policy rate is also the highest in the European Union. The bank has reduced the one-day deposit rate even as inflation is yet to drop below 20%, while the economy has been stuck in a mild recession as higher prices hit consumers. "If the improvement in risk perceptions persists, the Bank will continue the gradual convergence of the interest rate conditions of one-day tenders to the base rate," central bank Governor Zsolt Matolcsy said at a press conference. The forint firmed to 373.75 from 376.6 versus the euro, after the rate cut. The curency is 16% stronger compared with record lows of about 434 hit last year. Analysts said Hungary's rate cut could weigh on other currencies in central Europe although the region took it in its stride on Tuesday. "We might argue that the NBH is only trying to 'take the top off' high interest rates in Hungary in order to support the economy," Commerzbank said in a note prior to the decision. The beginning of an easing cycle was flagged by the central bank in April, when it first delivered a technical cut of 450 bps to the top end of its rate corridor. Hungary's inflation peaked at 25.7%, but has only retreated slightly, to 24% in April. CEE SNAPSHO AT MARKETS T 1536 CET CURRENC IES Latest Previou Daily Change s bid close change in 2023 EURCZK Czech EURHUF Hungary 0 0 EURPLN Polish EURRON Romanian EURRSD Serbian 0 0 Note: calculated from 1800 daily CET change Latest Previou Daily Change s close change in 2023 .PX Prague 1321.67 1319.33 +0.18% +9.98% 00 .BUX Budapest 46397.3 46562.4 -0.35% +5.95% 1 5 .WIG20 Warsaw <.WIG20 2008.09 2004.72 +0.17% +12.06 > % .BETI Buchares 12313.7 12277.3 +0.30% +5.57% t 3 4 Spread Daily vs Bund change in Czech spread Republic CZ2YT= 2-year s CZ5YT= 5-year s CZ10YT s Poland PL2YT= 2-year s PL5YT= 5-year s PL10YT s FORWARD 3x6 6x9 9x12 3M interba nk Czech Hungary Poland Note: are for ask FRA prices quotes ******************************************** ****************** (Reporting by Boldizsar Gyori in Budapest, Jason Hovet in Prague; Editing by Shailesh Kuber and Vinay Dwivedi)''']

In [13]:
#Predictons
tf_batch = tokenizer(pred_sentences, max_length=512, padding=True, truncation=True, return_tensors='tf')
tf_outputs = model(tf_batch)
tf_predictions = tf.nn.softmax(tf_outputs[0], axis=-1)
labels = ['Negative','Positive']
label = tf.argmax(tf_predictions, axis=1)
label = label.numpy()
for i in range(len(pred_sentences)):
  print(pred_sentences[i], ": \n", labels[label[i]])


Boldizsar Gyori Tue, May 23, 2023 at 10:10 PM GMT+8 (Updates with rate cut, new prices) By Boldizsar Gyori BUDAPEST, May 23 (Reuters) - The forint made a recovery on Tuesday after the Hungarian central bank cut its one-day deposit rate by an expected 100 basis points, becoming the first monetary authority in central Europe to loosen its policy. Analysts had expected the bank to start paring back emergency rate hikes taken in October last year when the forint was at record lows, as investor sentiment has seen a sea change with the currency strengthening. The one-day deposit rate was introduced last autumn and has helped attract money flows with its high rate, now at 17%. The Hungarian central bank's 13% base policy rate is also the highest in the European Union. The bank has reduced the one-day deposit rate even as inflation is yet to drop below 20%, while the economy has been stuck in a mild recession as higher prices hit consumers. "If the improvement in risk perceptions persists, the