In [7]:
import pandas as pd
from datasets import load_dataset
import numpy as np

random_seed=4321

In [2]:

dataset = load_dataset('financial_phrasebank', 'sentences_50agree')

Reusing dataset financial_phrasebank (C:\Users\thush\.cache\huggingface\datasets\financial_phrasebank\sentences_50agree\1.0.0\8573a5b5922d152c7b77924429a18b5546458c179db2685eb266b227d51d1b6b)


## Print the data

In [3]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label'],
        num_rows: 4846
    })
})


In [4]:
for s,l in zip(dataset['train']['sentence'][:10], dataset['train']['label'][:10]):
    print(s)
    print('\tLabel: {}'.format(l))

According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
	Label: 1
Technopolis plans to develop in stages an area of no less than 100,000 square meters in order to host companies working in computer technologies and telecommunications , the statement said .
	Label: 1
The international electronic industry company Elcoteq has laid off tens of employees from its Tallinn facility ; contrary to earlier layoffs the company contracted the ranks of its office workers , the daily Postimees reported .
	Label: 0
With the new production plant the company would increase its capacity to meet the expected increase in demand and would improve the use of raw materials and therefore increase the production profitability .
	Label: 2
According to the company 's updated strategy for the years 2009-2012 , Basware targets a long-term net sales growth in the range of 20 % -40 % with an operating profit margin of 10 % -20 % of net sales .
	

In [5]:
print(pd.Series(dataset['train']['label']).value_counts())

1    2879
2    1363
0     604
dtype: int64


In [8]:
inputs, labels = np.array(dataset['train']['sentence']).reshape(-1,1), np.array(dataset['train']['label'])

In [9]:
from imblearn.under_sampling import OneSidedSelection, NearMiss, RandomUnderSampler
import numpy as np

n=75 # Number of instances for each class for train/validation sets

rus = RandomUnderSampler(sampling_strategy={0:n, 1:n, 2:n}, random_state=random_seed)
rus.fit_resample(inputs, labels)

# Get test indices
test_inds = rus.sample_indices_
test_x, test_y = inputs[test_inds], np.array(labels)[test_inds]
print("Test statistics")
print(pd.Series(test_y).value_counts())

# Get rest (train + valid)
rest_inds = [i for i in range(inputs.shape[0]) if i not in test_inds]
rest_x, rest_y = inputs[rest_inds], labels[rest_inds]

# Get valid indices
rus.fit_resample(rest_x, rest_y)
valid_inds = rus.sample_indices_
valid_x, valid_y = rest_x[valid_inds], rest_y[valid_inds]
print("Valid statistics")
print(pd.Series(valid_y).value_counts())

# Rest goes in training
train_inds = [i for i in range(rest_x.shape[0]) if i not in valid_inds]
train_x, train_y = rest_x[train_inds], rest_y[train_inds]
print("Train statistics")
print(pd.Series(train_y).value_counts())

Test statistics
2    75
1    75
0    75
dtype: int64
Valid statistics
2    75
1    75
0    75
dtype: int64
Train statistics
1    2729
2    1213
0     454
dtype: int64


## Sample sentences

In [65]:
test_x[:40,0].tolist()

['Finnish communication electronics components supplier Scanfil Oyj Tuesday said sales in the first half of 2006 will be 15 % lower than during the same period a year ago .',
 'Finnish Exel Composites , a technology company that designs , manufactures , and markets composite profiles and tubes for various industrial applications , reports its net sales decreased by 0.6 % in the second quarter of 2010 to EUR 19.2 mn from EUR 19.3 mn in the corresponding period in 2009 .',
 'Earnings per share ( EPS ) amounted to a loss of to EUR0 .06 .',
 "In the first half of 2008 , the Bank 's operating profit fell to EUR 11.8 mn from EUR 18.9 mn , while net interest income increased to EUR 20.9 mn from EUR 18.8 mn in the first half of 2007 .",
 "Last year 's third quarter result had been burdened by costs stemming from restructuring in the US .",
 "HELSINKI ( AFX ) - KCI Konecranes said that Franklin Resources Inc 's share of voting rights in the Finnish cranes company fell last week to 4.65 pct from

## Downloading the BERT model

URL: https://huggingface.co/bert-base-uncased

In [10]:
from transformers import BertTokenizer, TFBertModel

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertModel.from_pretrained("bert-base-uncased")


Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [66]:
text = "Nokia s U.S. shares were 3.3 percent lower at $ 12.73 today than what it was yesterday ."

## Understanding the tokenizer

In [67]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

encoded_ids = tokenizer(text, return_tensors='tf')
print(encoded_ids)

encoded_tokens = tokenizer.convert_ids_to_tokens(encoded_ids['input_ids'].numpy()[0])
print(encoded_tokens)

{'input_ids': <tf.Tensor: shape=(1, 27), dtype=int32, numpy=
array([[  101, 22098,  1055,  1057,  1012,  1055,  1012,  6661,  2020,
         1017,  1012,  1017,  3867,  2896,  2012,  1002,  2260,  1012,
         6421,  2651,  2084,  2054,  2009,  2001,  7483,  1012,   102]])>, 'token_type_ids': <tf.Tensor: shape=(1, 27), dtype=int32, numpy=
array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0]])>, 'attention_mask': <tf.Tensor: shape=(1, 27), dtype=int32, numpy=
array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1]])>}
['[CLS]', 'nokia', 's', 'u', '.', 's', '.', 'shares', 'were', '3', '.', '3', 'percent', 'lower', 'at', '$', '12', '.', '73', 'today', 'than', 'what', 'it', 'was', 'yesterday', '.', '[SEP]']


In [68]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertModel.from_pretrained("bert-base-uncased")

def get_model_output(text, model, tokenizer):
    # [CLS] is automatically added by the tokenizer
    encoded_input = tokenizer(text, return_tensors='tf')
    output = model(encoded_input)
    return output


output = get_model_output(text, model, tokenizer)

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [69]:
print(output.pooler_output.shape)
print(output.last_hidden_state.shape)
print(output.attentions.shape)

(1, 768)
(1, 27, 768)


AttributeError: 'NoneType' object has no attribute 'shape'

## Auxiliary outputs with BERT

In [70]:
from transformers import BertTokenizer, TFBertModel, BertConfig

config = BertConfig.from_pretrained('bert-base-uncased', output_attentions=True, output_hidden_states=False)

model = TFBertModel.from_pretrained("bert-base-uncased", config=config)

output = get_model_output(text, model, tokenizer)



Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [71]:
print(model.config)

BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_attentions": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.3.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



In [72]:
print(output.pooler_output.shape)
print(output.last_hidden_state.shape)
print(output.attentions[0].shape)

(1, 768)
(1, 27, 768)
(1, 12, 27, 27)


In [73]:
# tokens = tokenizer.tokenize("I like NLP")
encoded_ids = tokenizer(text, return_tensors='tf')
print(encoded_ids)

tokens = tokenizer.convert_ids_to_tokens(encoded_ids['input_ids'].numpy()[0])
#head_view(output.attentions[0], tokens)

{'input_ids': <tf.Tensor: shape=(1, 27), dtype=int32, numpy=
array([[  101, 22098,  1055,  1057,  1012,  1055,  1012,  6661,  2020,
         1017,  1012,  1017,  3867,  2896,  2012,  1002,  2260,  1012,
         6421,  2651,  2084,  2054,  2009,  2001,  7483,  1012,   102]])>, 'token_type_ids': <tf.Tensor: shape=(1, 27), dtype=int32, numpy=
array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0]])>, 'attention_mask': <tf.Tensor: shape=(1, 27), dtype=int32, numpy=
array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1]])>}


# Visualize attention

In [74]:
import torch
from bertviz import head_view

head_view([torch.from_numpy(layer_attn.numpy()) for layer_attn in output[-1]], tokens)

<IPython.core.display.Javascript object>

## Distance between two sentences

In [None]:
# two words that wouldn't be used in the same context

In [97]:
from collections import Counter
cnt = Counter([w for doc in train_x[:,0].tolist() for w in doc.split() ])
print(cnt.most_common(100))
print(len(cnt))

[('.', 4339), ('the', 4292), (',', 4225), ('of', 2899), ('in', 2461), ('and', 2364), ('to', 2263), ('a', 1467), ('The', 1248), ('for', 1019), ("'s", 916), ('is', 848), ('will', 795), ('EUR', 791), ('company', 735), ('from', 672), ('on', 613), ('its', 550), ('has', 525), ('be', 506), ('with', 498), ('by', 485), ('said', 484), (')', 461), ('(', 459), ('Finnish', 452), ('as', 446), ('mn', 424), ('that', 398), ('million', 398), ('at', 397), ('%', 384), ('sales', 368), (':', 333), ('was', 330), ('profit', 328), ('it', 325), ('net', 313), ('Finland', 306), ('an', 281), ('-', 277), ('are', 267), ('2009', 266), ('2008', 247), ('mln', 247), ('m', 246), ('``', 234), ('period', 231), ('year', 229), ('new', 228), ('2007', 215), ('share', 207), ('business', 207), ('2010', 204), ('have', 201), ('Oyj', 201), ('which', 200), ("''", 200), ('quarter', 197), ('market', 197), ('In', 196), ('also', 188), ('$', 188), ('shares', 181), ('services', 175), ('up', 167), ('first', 156), ('2006', 153), ('Group', 1

In [75]:
#net
#share
#market
#Nokia

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertModel.from_pretrained("bert-base-uncased")

def get_model_output(text, model, tokenizer):
    # [CLS] is automatically added by the tokenizer
    encoded_input = tokenizer(text, return_tensors='tf')
    output = model(encoded_input)
    return output


Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [107]:
from sklearn.metrics.pairwise import cosine_distances

def get_distance(s1,s2, model, tokenizer):
    o1 = get_model_output(s1, model, tokenizer)
    o2 = get_model_output(s2, model, tokenizer)

    d = cosine_distances(o1.pooler_output, o2.pooler_output)[0][0]
    return d


In [82]:
s1 = "my fishing net was torn"
s2 = "the profit margin was up"
d = get_distance(s1,s2)
print(d)

s1 = "my fishing net was torn"
s2 = "the net profit margin was up"
d = get_distance(s1,s2)
print(d)

s1 = "john gave his lunch to mary"
s2 = "john bougth Tesla shares"
d = get_distance(s1,s2)
print(d)

s1 = "john shares his lunch with mary"
s2 = "john bougth Tesla shares"
d = get_distance(s1,s2)
print(d)


0.10874623
0.051901698
0.13316393
0.044555068


In [106]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
  
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=758.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=252.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=437992753.0, style=ProgressStyle(descri…




In [110]:
print(model.config)

BertConfig {
  "_name_or_path": "ProsusAI/finbert",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "positive",
    "1": "negative",
    "2": "neutral"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "negative": 1,
    "neutral": 2,
    "positive": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.3.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



In [109]:
encoded_input = tokenizer(text)
print(encoded_input)
output = model(encoded_input)
print(output)

{'input_ids': [101, 22098, 1055, 1057, 1012, 1055, 1012, 6661, 2020, 1017, 1012, 1017, 3867, 2896, 2012, 1002, 2260, 1012, 6421, 2651, 2084, 2054, 2009, 2001, 7483, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


AttributeError: 

In [None]:
s1 = "my fishing net was torn"
s2 = "the profit margin was up"
d = get_distance(s1,s2, model, tokenizer)
print(d)

s1 = "my fishing net was torn"
s2 = "the net profit margin was up"
d = get_distance(s1,s2, model, tokenizer)
print(d)

s1 = "john gave his lunch to mary"
s2 = "john bougth Tesla shares"
d = get_distance(s1,s2, model, tokenizer)
print(d)

s1 = "john shares his lunch with mary"
s2 = "john bougth Tesla shares"
d = get_distance(s1,s2, model, tokenizer)
print(d)


# Training a FinBERT

In [84]:
from transformers import AlbertTokenizer, TFAlbertModel


model = TFAlbertModel.from_pretrained("albert-base-v2")


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=684.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=63048440.0, style=ProgressStyle(descrip…




Some layers from the model checkpoint at albert-base-v2 were not used when initializing TFAlbertModel: ['predictions']
- This IS expected if you are initializing TFAlbertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFAlbertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFAlbertModel were initialized from the model checkpoint at albert-base-v2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFAlbertModel for predictions without further training.


In [93]:
import os
# https://huggingface.co/blog/how-to-train
if not os.path.exists('data'):
    os.makedirs('data')

np.savetxt(os.path.join('data', 'documents.txt'), inputs, fmt="%s", encoding='utf-8')
    

In [98]:
from tokenizers import ByteLevelBPETokenizer

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(
    files=[os.path.join('data', 'documents.txt')], 
    vocab_size=15_000, 
    min_frequency=2, 
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"]
)

# Save files to disk
tokenizer.save_model("data", "fin-albert-base-v2")

['data\\fin-albert-base-v2-vocab.json', 'data\\fin-albert-base-v2-merges.txt']

In [99]:
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

tokenizer = ByteLevelBPETokenizer(
    "./data/fin-albert-base-v2-vocab.json",
    "./data/fin-albert-base-v2-merges.txt",
)
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)

In [104]:
encoded_ids = tokenizer.encode(text)
print(encoded_ids.ids)
print(encoded_ids.tokens)


[0, 1790, 268, 474, 18, 55, 18, 618, 760, 419, 18, 23, 745, 1371, 422, 583, 834, 18, 2677, 701, 801, 3084, 337, 456, 2076, 269, 2]
['<s>', 'Nokia', 'Ġs', 'ĠU', '.', 'S', '.', 'Ġshares', 'Ġwere', 'Ġ3', '.', '3', 'Ġpercent', 'Ġlower', 'Ġat', 'Ġ$', 'Ġ12', '.', '73', 'Ġtoday', 'Ġthan', 'Ġwhat', 'Ġit', 'Ġwas', 'Ġyesterday', 'Ġ.', '</s>']
