## Install required packages

In [None]:
# One-off environment setup: uncomment and run once, then comment out again.
#!pip3 install -U spacy
#!python -m spacy download en_core_web_sm

# !pip3 install -U 'transformers[torch]'
# !pip install -U optimum
# The notebook imports `datasets` (Hugging Face), so that is the package to install.
# !pip3 install -U datasets
# !pip3 install -U evaluate

# !pip3 install -U numpy
# !pip install -U scikit-learn

## Prepare training data for Spacy NER
The data was annotated using [Label Studio](https://labelstud.io/) and covers the first 232 items in sample.txt.  
The annotations are converted to spaCy format and saved to "./ner/train.spacy".

In [41]:
import json

# Parse the exported NER annotation tasks straight from the file handle.
with open('ner/ner_train.json', mode='r', encoding='utf-8') as ner_file:
    ner_train_data = json.load(ner_file)

print('Number of samples: ', len(ner_train_data))

Number of samples:  232


In [40]:
# Show one raw annotation task to illustrate the structure of the exported JSON.
print(ner_train_data[5])

{'id': 6, 'annotations': [{'id': 6, 'completed_by': 1, 'result': [{'value': {'start': 0, 'end': 4, 'text': 'Gold', 'labels': ['COM']}, 'id': 'RBQlNv0cLe', 'from_name': 'label', 'to_name': 'text', 'type': 'labels', 'origin': 'manual'}], 'was_cancelled': False, 'ground_truth': False, 'created_at': '2024-03-02T11:01:07.440534Z', 'updated_at': '2024-03-02T11:01:07.440550Z', 'draft_created_at': None, 'lead_time': 50.531, 'prediction': {}, 'result_count': 0, 'unique_id': '1d691744-a08c-46a3-91ad-4cb85d778ccc', 'import_id': None, 'last_action': None, 'task': 6, 'project': 2, 'updated_by': 1, 'parent_prediction': None, 'parent_annotation': None, 'last_created_by': None}], 'file_upload': '70091d08-sample.txt', 'drafts': [], 'predictions': [], 'data': {'text': 'Gold edges lower as firmer US yields pinch appeal https://t.co/P0HPkmXvBZ'}, 'meta': {}, 'created_at': '2024-03-02T10:50:00.501559Z', 'updated_at': '2024-03-02T11:01:07.460181Z', 'inner_id': 6, 'total_annotations': 1, 'cancelled_annotatio

In [32]:
# Show a second raw annotation task, this one with a longer text.
print(ner_train_data[16])

{'id': 17, 'annotations': [{'id': 17, 'completed_by': 1, 'result': [{'value': {'start': 0, 'end': 4, 'text': 'Gold', 'labels': ['COM']}, 'id': 'j7D1UyehxE', 'from_name': 'label', 'to_name': 'text', 'type': 'labels', 'origin': 'manual'}], 'was_cancelled': False, 'ground_truth': False, 'created_at': '2024-03-02T11:09:57.820429Z', 'updated_at': '2024-03-03T02:00:26.895744Z', 'draft_created_at': '2024-03-02T11:09:50.008467Z', 'lead_time': 188.464, 'prediction': {}, 'result_count': 0, 'unique_id': 'a9387a03-9297-4734-831a-574ddb42bcc6', 'import_id': None, 'last_action': None, 'task': 17, 'project': 2, 'updated_by': 1, 'parent_prediction': None, 'parent_annotation': None, 'last_created_by': None}], 'file_upload': '70091d08-sample.txt', 'drafts': [], 'predictions': [], 'data': {'text': 'Gold Price News and Forecast: XAU/USD is trapped in daily support and resistance. Posted by: EUR Editor  in EUR  1 min ago  XAU/USD struggles below $1,800 amid risk-off mood. Gold steps back from intraday high

In [33]:
import spacy
from spacy.tokens import DocBin

nlp = spacy.load("en_core_web_sm")


def _spans_overlap(first, second):
    """Return True when two spaCy spans share at least one token.

    ``Span.start`` / ``Span.end`` are token offsets with an exclusive end, so
    two spans that merely touch (one ends where the other starts) do not
    overlap and may both be assigned to ``doc.ents``.
    """
    return first.start < second.end and second.start < first.end


# Convert every annotated task into a spaCy Doc whose entities are the manual
# annotations plus the non-conflicting entities predicted by en_core_web_sm.
db = DocBin()
for item in ner_train_data:
    text = item['data']['text']
    annotations = item['annotations']

    # Texts shorter than 20 characters are left out of the training set.
    if len(text) < 20:
        continue

    doc = nlp(text)
    ents = []

    # Only the first annotation of a task is used. A task without any
    # annotation contributes no manual spans instead of raising IndexError.
    manual_results = annotations[0]['result'] if annotations else []

    for r in manual_results:
        value = r['value']
        span = doc.char_span(
            value['start'],
            value['end'],
            label=value['labels'][0]
            )
        if span is None:
            # The annotated character range could not be turned into a span,
            # so the annotation itself has to be corrected.
            raise ValueError(
                'The entity is marked improperly, which makes the span None. '
                f'[result] {r} [text] {text}'
            )
        ents.append(span)

    # Manual spans win over predicted ones. Assigning overlapping spans fails with:
    # [E1010] Unable to set entity information for token 0 which is included in more than one span in entities, blocked, missing or outside.
    # Eg:
    #   text: Gold Price News and Forecast: XAU/USD is trapped in daily support and resistance. ...
    #   ents: [Gold Price News, XAU/USD, daily, 1 min ago, XAU/USD, 1,800, 1,772, Friday, Asian, … Read Full Story, EUR Editor, Gold]
    #   the spans text[0:16] and text[0:4] overlap each other -> token 0 is included in more than one span
    for ent in doc.ents:
        if any(_spans_overlap(ent, included_span) for included_span in ents):
            continue
        ents.append(ent)

    # Documents without any entity are not written to the training file.
    if len(ents) == 0:
        continue
    doc.ents = ents
    db.add(doc)

db.to_disk("./ner/train.spacy")

## Prepare training data for Spacy TextCategorizer
The data was annotated using Label Studio and covers 45 of the first 232 items in sample.txt.  
The annotations are converted to spaCy format and saved to "./sentiment_analysis_spacy/train.spacy".  

In [49]:
import json

# Parse the exported sentiment annotation tasks straight from the file handle.
with open('sentiment_analysis_spacy/sentiment_train.json', mode='r', encoding='utf-8') as sentiment_file:
    sent_train_data = json.load(sentiment_file)

In [50]:
# Show one raw sentiment task to illustrate the structure of the exported JSON.
print(sent_train_data[5])

{'id': 2108, 'annotations': [{'id': 470, 'completed_by': 1, 'result': [{'value': {'choices': ['Negative']}, 'id': '8CdQ4W1i46', 'from_name': 'sentiment', 'to_name': 'text', 'type': 'choices', 'origin': 'manual'}], 'was_cancelled': False, 'ground_truth': False, 'created_at': '2024-03-03T08:17:06.304621Z', 'updated_at': '2024-03-03T08:24:43.405397Z', 'draft_created_at': None, 'lead_time': 97.557, 'prediction': {}, 'result_count': 0, 'unique_id': '14b4b0a3-db9d-4c72-936c-f9c8df8be723', 'import_id': 238, 'last_action': None, 'task': 2108, 'project': 6, 'updated_by': 1, 'parent_prediction': None, 'parent_annotation': None, 'last_created_by': None}], 'file_upload': '9a09ceda-sentiment_train.json', 'drafts': [], 'predictions': [], 'data': {'text': 'Gold edges lower as firmer US yields pinch appeal https://t.co/P0HPkmXvBZ'}, 'meta': {}, 'created_at': '2024-03-03T08:17:06.279707Z', 'updated_at': '2024-03-03T08:24:43.432821Z', 'inner_id': 6, 'total_annotations': 1, 'cancelled_annotations': 0, 't

In [51]:
from pathlib import Path
import spacy
from spacy.tokens import DocBin

def read_categories():
    """Read the sentiment category names from categories.txt, one per line.

    Returns:
        list[str]: Category names in file order, with surrounding whitespace
        stripped and blank lines ignored.
    """
    # read_text() opens and closes the file itself, so no handle is left open.
    raw = Path('sentiment_analysis_spacy/categories.txt').read_text(encoding='utf-8')
    return [line.strip() for line in raw.splitlines() if line.strip()]
categories = read_categories()
print('categories: ', categories)

# Build one tokenized Doc per labelled task and store its category scores.
nlp = spacy.blank("en")
db = DocBin()
for item in sent_train_data:
    text = item['data']['text']
    annotated = item['annotations'][0]

    # Leave out very short texts and tasks that carry no label.
    if len(text) < 20 or len(annotated['result']) == 0:
        continue

    label = annotated['result'][0]['value']['choices'][0]

    # Every category starts at 0; the annotated one is then set to 1.
    scores = dict.fromkeys(categories, 0)
    scores[label] = 1

    doc = nlp.make_doc(text)
    doc.cats = scores
    db.add(doc)

print('Number of samples: ', len(db))

db.to_disk("./sentiment_analysis_spacy/train.spacy")

categories:  ['Positive', 'Negative', 'Neutral']
Number of samples:  45


## Train Spacy NER model
The training setup is located in the directory './ner'; the two important files are config.cfg and train.spacy.  
Note: To avoid spending too much time on annotation, the training data is also used as validation data. In actual practice the validation data must differ from the training data in order to evaluate the model's generalisation.  
Trained model is stored in "./ner/output" for later use.

In [37]:
# Train with ner/config.cfg; --paths.train and --paths.dev both point at ./ner/train.spacy.
!python3 -m spacy train ner/config.cfg --output ./ner/output --paths.train ./ner/train.spacy --paths.dev ./ner/train.spacy

[38;5;4mℹ Saving to output directory: ner/output[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Frozen components: ['tok2vec'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  --------  ------  ------  ------  ------
  0       0      6.30   92.44   88.87   96.32    0.92
  1     200   2312.57   89.89   89.43   90.35    0.90
  3     400   2034.95   93.04   93.16   92.91    0.93
  5     600   1955.29   96.01   96.16   95.87    0.96
  7     800   1244.66   96.71   96.77   96.65    0.97
  9    1000    734.19   96.35   96.77   95.93    0.96
 11    1200   1279.18   97.92   98.39   97.47    0.98
 14    1400    944.25   98.16   98.78   97.56    0.98
 16    1600    733.11   98.04   97.84   98.25    0.98
 18    1800    881.16   98.41   98.61   98.22    0.98
 21    2000    621.95   98.17   98.22   98.13    0.98
[38;5;2m✔ Saved pipeline to output direct

## Train Spacy TextCategorizer model
The training setup is located in the directory './sentiment_analysis_spacy'; the two important files are config.cfg and train.spacy.  
Note: To avoid spending too much time on annotation, the training data is also used as validation data. In actual practice the validation data must differ from the training data in order to evaluate the model's generalisation.  
Trained model is stored in "./sentiment_analysis_spacy/output" for later use.

In [39]:
# Train with sentiment_analysis_spacy/config.cfg; --paths.train and --paths.dev both point at the same train.spacy file.
!python3 -m spacy train sentiment_analysis_spacy/config.cfg --output ./sentiment_analysis_spacy/output --paths.train ./sentiment_analysis_spacy/train.spacy --paths.dev ./sentiment_analysis_spacy/train.spacy

[38;5;4mℹ Saving to output directory: sentiment_analysis_spacy/output[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'textcat'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS TEXTCAT  CATS_SCORE  SCORE 
---  ------  ------------  ------------  ----------  ------
  0       0          0.00          0.22       16.17    0.16
  4     200         35.61         41.96       56.67    0.57
  8     400         64.90         28.88       87.16    0.87
 13     600         90.67         18.46       80.50    0.81
 17     800        165.87         10.96       94.91    0.95
 22    1000        180.13          3.98      100.00    1.00
 26    1200        307.44          3.18      100.00    1.00
 31    1400        347.69          3.67      100.00    1.00
 35    1600        115.13          3.29      100.00    1.00
 40    1800       5961.54         12.00       88.28    0.88
 45    2000       1124.46          5.50      

## Evaluate Sentiment Analysis on gold and silver commodities related content
Two models are used to recognise sentiment; their results are then compared against each other.  
One is the spaCy model trained above, the other is the [FinancialBERT](https://huggingface.co/ahmedrachid/FinancialBERT-Sentiment-Analysis) model from Hugging Face.  
Note: one disadvantage of the FinancialBERT model is that it cannot process text that has more than 512 tokens.

In [3]:
# Exclude the first 232 rows, which have been used for training.
# Those rows sit at indices 0..231, so the evaluation slice has to start at
# index 232; starting at 231 would keep the last training row.
padding = 232

with open('sample.txt', mode='r', encoding='utf-8') as fd:
    samples = fd.readlines()
samples = samples[padding:]

#### Load Spacy model

In [4]:
import spacy
# Load the NER pipeline stored under ner/output (the --output directory of the NER training run).
ner = spacy.load("ner/output/model-last")

#### Load Huggingface model

In [52]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline
from datasets import Dataset, DatasetDict

# Hub identifier shared by the model weights and the tokenizer.
financial_bert_id = "ahmedrachid/FinancialBERT-Sentiment-Analysis"

pretrainedFinancialBERT = BertForSequenceClassification.from_pretrained(financial_bert_id, num_labels=3)
pretrainedFinancialBERT.to_bettertransformer()
tokenizer = BertTokenizer.from_pretrained(financial_bert_id)

bert_sent = pipeline(
    "sentiment-analysis",
    model=pretrainedFinancialBERT,
    tokenizer=tokenizer,
)

# Quick sanity check of the pipeline on three example sentences.
sentences = [
    "Operating profit rose to EUR 13.1 mn from EUR 8.7 mn in the corresponding period in 2007 representing 7.7 % of net sales.",
    "Bids or offers include at least 1,000 shares and the value of the shares must correspond to at least EUR 4,000.",
    "Raute reported a loss per share of EUR 0.86 for the first half of 2009 , against EPS of EUR 0.74 in the corresponding period of 2008.",
]

results = bert_sent(sentences)
print(results)

The BetterTransformer implementation does not support padding during training, as the fused kernels do not support attention masks. Beware that passing padded batched data during training may result in unexpected outputs. Please refer to https://huggingface.co/docs/optimum/bettertransformer/overview for more details.


[{'label': 'positive', 'score': 0.9998133778572083}, {'label': 'neutral', 'score': 0.9997822642326355}, {'label': 'negative', 'score': 0.9877365231513977}]


#### Filter gold and silver commodities related content
Use the spaCy NER model trained above to keep only the samples that mention gold or silver as a commodity.

In [7]:
# Keep every sample in which the NER model finds a 'COM' entity whose text is
# exactly "gold" or "silver" (case-insensitive).
gold_silver_com_sample = []
com_label = 'COM'
target_commodities = {'gold', 'silver'}
for sample in samples:
    doc = ner(sample)
    # any() stops at the first match, so a sample that mentions gold/silver
    # several times is appended once instead of once per matching entity.
    if any(
        ent.label_ == com_label and ent.text.lower() in target_commodities
        for ent in doc.ents
    ):
        gold_silver_com_sample.append(sample)

#### Evaluate and collect data

In [57]:
import spacy
sent = spacy.load("sentiment_analysis_spacy/output/model-last")

text_length = 1000
data = []
for sample in gold_silver_com_sample:
    doc = sent(sample)
    # The spaCy prediction is the category with the highest score.
    spacy_label = max(doc.cats, key=doc.cats.get).lower()

    if len(sample.split()) >= 320:
        # model's max_seq_length is 512, FinancialBERT is not able to process too long documents
        bert_label, unmatched = 'n/a', ''
    else:
        bert_predicted = bert_sent(sample)
        bert_label = bert_predicted[0]['label'].lower()
        # 'O' marks agreement between the two models, 'X' a mismatch.
        unmatched = 'O' if spacy_label == bert_label else 'X'

    data.append([sample[:text_length], spacy_label, bert_label, unmatched])

#### Display data

In [24]:
# !pip3 install -U tabulate
import tabulate

headers = ['Sample', 'Spacy', 'FinancialBERT', 'Comparison']

# Five layout entries for four headers: showindex adds a row-index column.
column_alignment = ('center', 'left', 'center', 'center', 'center')
column_widths = [4, 60, 8, 8, 8]

table = tabulate.tabulate(
    data,
    headers=headers,
    showindex="always",
    tablefmt='simple',
    colalign=column_alignment,
    maxcolwidths=column_widths,
)
print(table)

     Sample                                                         Spacy     FinancialBERT    Comparison
---  ------------------------------------------------------------  --------  ---------------  ------------
 0   Silver Futures Discussions. silver 25.90                      negative      neutral           X
 1   Gold 💫 bagged, learned so much here, and nice people too     neutral       neutral           O
     https://t.co/g9qQ8sbDue
 2   Gold Futures Discussions. Bears game...                       negative      neutral           X
 3   Gold Rate Today, 30 April 2021: Gold, Silver fall, know –     negative      neutral           X
     what are the 10 grams gold rate today
     https://t.co/kSVOTVuXw5
 4   @BTC_Archive @PeterSchiff Gold price is suppressed by the     negative      neutral           X
     government. Buy Bitcoin
 5   Silver Futures Discussions. usd/inr at support so the         negative      neutral           X
     advantage to run by double engine ( plunging zi

## Fine tune FinancialBERT using the same training data that is used for training Spacy TextCategorizer
The model overview below shows that FinancialBERT consists of a BERT layer and a classifier layer. The fine-tuning experiment trains the classifier layer only; the BERT layer is kept unchanged during training in order to preserve the pretrained performance.

In [88]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline
from datasets import Dataset, DatasetDict

checkpoint = "ahmedrachid/FinancialBERT-Sentiment-Analysis"
model = BertForSequenceClassification.from_pretrained(checkpoint, num_labels=3)
tokenizer = BertTokenizer.from_pretrained(checkpoint)
print('Model summarization: \n\n', model)

# Freeze the embeddings, encoder and pooler of the BERT backbone so that their
# parameters are excluded from gradient updates.
for frozen_part in (model.bert.embeddings, model.bert.encoder, model.bert.pooler):
    for param in frozen_part.parameters():
        param.requires_grad = False


Model summarization: 

 BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30873, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): Laye

#### Prepare training data

In [89]:
from pathlib import Path

id2label = model.config.id2label
label2id = model.config.label2id

# Turn every labelled task into a {'text', 'label'} record, where the label is
# the numeric id the model config assigns to the lower-cased choice.
dataset = []
for item in sent_train_data:
    text = item['data']['text']
    annotated = item['annotations'][0]

    # Leave out very short texts and tasks that carry no label.
    if len(text) < 20 or len(annotated['result']) == 0:
        continue

    label = annotated['result'][0]['value']['choices'][0].lower()
    record = {'text': text, 'label': label2id[label]}
    dataset.append(record)

print('number of sample: ', len(dataset))
print(dataset[3])


number of sample:  45
{'text': "Gold Futures Discussions. So, any more short paper expire?\xa0Moy said, but it's only a matter of time before the short contracts keeping the price down expire.", 'label': 1}


In [90]:
def tokenize_function(examples):
    """Tokenize the "text" column of a batch with max-length padding and truncation."""
    texts = examples["text"]
    return tokenizer(texts, truncation=True, padding="max_length")

# Wrap the records in a Dataset and tokenize them batch-wise.
dataset = Dataset.from_list(dataset)
tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 45/45 [00:00<00:00, 708.93 examples/s]


#### Training (fine tuning) 
Note: `train_dataset` and `eval_dataset` are the same.

In [91]:
from transformers import TrainingArguments, Trainer
import numpy as np
import evaluate

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score a (logits, labels) pair with the loaded accuracy metric.

    The predicted class of each row is the index of its largest logit.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=-1)
    return metric.compute(references=references, predictions=predicted_ids)

batch_size = 10

# Evaluate after every epoch and keep at most two checkpoints on disk.
training_args = TrainingArguments(
    output_dir="output_trainer",
    num_train_epochs=120,
    evaluation_strategy="epoch",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    save_total_limit=2,
    # If True, use gradient checkpointing to save memory at the expense of slower backward pass
    gradient_checkpointing=True,
)

# The same tokenized dataset is used for training and for evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_datasets,
    eval_dataset=tokenized_datasets,
)

In [92]:
# Run the fine-tuning with the Trainer configured in the previous cell.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.659011,0.688889
2,No log,1.636894,0.688889
3,No log,1.614349,0.688889
4,No log,1.592872,0.688889
5,No log,1.570359,0.688889
6,No log,1.548936,0.688889
7,No log,1.530016,0.688889
8,No log,1.510552,0.688889
9,No log,1.491386,0.688889
10,No log,1.47193,0.688889


Checkpoint destination directory output_trainer/checkpoint-500 already exists and is non-empty. Saving will proceed but saved results may be invalid.


TrainOutput(global_step=600, training_loss=1.0319591395060221, metrics={'train_runtime': 225.6178, 'train_samples_per_second': 23.934, 'train_steps_per_second': 2.659, 'total_flos': 1420812455731200.0, 'train_loss': 1.0319591395060221, 'epoch': 120.0})

In [93]:
# Store the fine-tuned model under models/FinancialBERT so it can be reloaded with from_pretrained.
model.save_pretrained('models/FinancialBERT')

## Compare the spaCy model, the FinancialBERT model and the fine-tuned FinancialBERT model

#### Evaluate and collect data

In [94]:
import spacy
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline
from datasets import Dataset, DatasetDict

# Load Spacy model
sent = spacy.load("sentiment_analysis_spacy/output/model-last")

# Tokenizer shared by both FinancialBERT variants
tokenizer = BertTokenizer.from_pretrained("ahmedrachid/FinancialBERT-Sentiment-Analysis")

# Load original FinancialBERT
pretrainedFinancialBERT = BertForSequenceClassification.from_pretrained("ahmedrachid/FinancialBERT-Sentiment-Analysis",num_labels=3)
pretrainedFinancialBERT.to_bettertransformer()
bert_sent1 = pipeline("sentiment-analysis", model=pretrainedFinancialBERT, tokenizer=tokenizer)

# Load fine tuned model
tunedFinancialBERT = BertForSequenceClassification.from_pretrained('models/FinancialBERT')
tunedFinancialBERT.to_bettertransformer()
bert_sent2 = pipeline("sentiment-analysis", model=tunedFinancialBERT, tokenizer=tokenizer)


def _bert_vs_spacy(classifier, text, spacy_label):
    """Return the classifier's lower-cased label and an 'O'/'X' agreement marker.

    'O' means the label equals ``spacy_label``; 'X' means the two differ.
    """
    bert_label = classifier(text)[0]['label'].lower()
    return bert_label, ('O' if spacy_label == bert_label else 'X')


text_length = 1000
data2 = []
for sample in gold_silver_com_sample:

    doc = sent(sample)
    # The spaCy prediction is the category with the highest score.
    spacy_label = max(doc.cats.items(), key=lambda x: x[1])[0]
    spacy_label = spacy_label.lower()

    # model's max_seq_length is 512, FinancialBERT is not able to process too long documents
    if len(sample.split()) < 320:
        bert_label1, unmatched1 = _bert_vs_spacy(bert_sent1, sample, spacy_label)
        bert_label2, unmatched2 = _bert_vs_spacy(bert_sent2, sample, spacy_label)
    else:
        # Reset all four columns explicitly; otherwise the row would silently
        # reuse the values of the previous sample (or fail on the first one).
        bert_label1 = bert_label2 = 'n/a'
        unmatched1 = unmatched2 = ''

    data2.append([sample[:text_length], spacy_label, bert_label1, unmatched1, bert_label2, unmatched2])

The BetterTransformer implementation does not support padding during training, as the fused kernels do not support attention masks. Beware that passing padded batched data during training may result in unexpected outputs. Please refer to https://huggingface.co/docs/optimum/bettertransformer/overview for more details.
The BetterTransformer implementation does not support padding during training, as the fused kernels do not support attention masks. Beware that passing padded batched data during training may result in unexpected outputs. Please refer to https://huggingface.co/docs/optimum/bettertransformer/overview for more details.


#### Display data
Manually checking some samples on which the fine-tuned model and the original one disagree suggests that the fine-tuned one is slightly better.

In [95]:
# !pip3 install -U tabulate
import tabulate

headers = ['Sample', 'Spacy', 'FinancialBERT', 'vs Spacy', 'tuned FinancialBERT', 'vs Spacy']

# Seven layout entries for six headers: showindex adds a row-index column.
column_alignment = ('center', 'left', 'center', 'center', 'center', 'center', 'center')
column_widths = [4, 50, 8, 8, 8, 8, 8]

table = tabulate.tabulate(
    data2,
    headers=headers,
    showindex="always",
    tablefmt='simple',
    colalign=column_alignment,
    maxcolwidths=column_widths,
)
print(table)

     Sample                                               Spacy     FinancialBERT    vs Spacy    tuned FinancialBERT    vs Spacy
---  --------------------------------------------------  --------  ---------------  ----------  ---------------------  ----------
 0   Silver Futures Discussions. silver 25.90            negative      neutral          X              neutral             X
 1   Gold 💫 bagged, learned so much here, and nice      neutral       neutral          O              neutral             O
     people too https://t.co/g9qQ8sbDue
 2   Gold Futures Discussions. Bears game...             negative      neutral          X              neutral             X
 3   Gold Rate Today, 30 April 2021: Gold, Silver fall,  negative      neutral          X              neutral             X
     know – what are the 10 grams gold rate today
     https://t.co/kSVOTVuXw5
 4   @BTC_Archive @PeterSchiff Gold price is suppressed  negative      neutral          X              neutral             