## Modules 

In [1]:
from pathlib import Path
import shutil
import os
import logging
import sys
sys.path.append('..')

from textblob import TextBlob
from pprint import pprint
from sklearn.metrics import classification_report

from transformers import AutoModelForSequenceClassification

from finbert.finbert import *
import finbert.utils as tools

%load_ext autoreload
%autoreload 2

# Repository root: the parent of the directory the notebook is executed from.
project_dir = Path.cwd().parent

# Never truncate long text columns when displaying DataFrames.
# `None` is the supported "no limit" value; the former `-1` sentinel was
# deprecated in pandas 1.0 and is rejected by later releases.
pd.set_option('display.max_colwidth', None)

2022-10-02 15:14:53.812310: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-10-02 15:14:54.513140: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-10-02 15:14:54.513191: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-10-02 15:14:54.607079: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-10-02 15:14:56.042613: W tensorflow/stream_executor/platform/de

In [2]:
# Configure the root logger: timestamped records, ERROR level and above.
# NOTE(review): INFO records from `finbert.*` loggers still show up in the cell
# outputs below, so something else appears to configure logging as well - confirm.
logging.basicConfig(
    level=logging.ERROR,
    datefmt='%m/%d/%Y %H:%M:%S',
    format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
)

## Prepare the model

### Setting path variables:
1. `lm_path`: the path for the pre-trained language model (If vanilla Bert is used then no need to set this one).
2. `cl_path`: the path where the classification model is saved.
3. `cl_data_path`: the path of the directory that contains the data files of `train.csv`, `validation.csv`, `test.csv`.
---

When initializing `bertmodel`, we can either use the original pre-trained weights from Google by giving `bm = 'bert-base-uncased'`, or our further pre-trained language model by giving `bm = lm_path`.


---
All of the model's configuration is controlled through the `config` variable.

In [3]:
# Identifier of the pre-trained language model to start from.
lm_path = 'ProsusAI/finbert'
# Directory the fine-tuned classifier is written to.
cl_path = project_dir.joinpath('resources', 'models', 'classifier_model', 'finbert-sentiment')
# Directory holding train.csv / validation.csv / test.csv.
cl_data_path = project_dir.joinpath('data', 'sentiment_data')


###  Configuring training parameters

You can find the explanations of the training parameters in the class docstrings.

In [4]:
# Clean the cl_path so a previous run's artifacts are not mixed with this one.
try:
    shutil.rmtree(cl_path)
except FileNotFoundError:
    # Nothing to clean: the directory does not exist yet (e.g. first run).
    pass
except OSError as err:
    # Stay best-effort, but make the failure visible instead of hiding it:
    # stale files left behind could otherwise be mistaken for fresh output.
    print(f"Warning: could not remove {cl_path}: {err}")

# Load the pre-trained weights with a 3-way classification head
# (three labels are passed to `prepare_model` further down).
bertmodel = AutoModelForSequenceClassification.from_pretrained(
    lm_path,
    cache_dir=None,
    num_labels=3,
)

# All training settings live in a single Config object;
# see the class docstrings for the meaning of each parameter.
config = Config(
    # where to read data from / write the model to
    data_dir=cl_data_path,
    model_dir=cl_path,
    # model and task
    bert_model=bertmodel,
    output_mode='classification',
    max_seq_length=48,
    # optimisation schedule
    num_train_epochs=4,
    train_batch_size=32,
    learning_rate=2e-5,
    warm_up_proportion=0.2,
    discriminate=True,
    gradual_unfreeze=True,
    # -1 here; the log output below reports "distributed training: False"
    local_rank=-1,
)

`FinBert` is our main class that encapsulates all the functionality; it is instantiated below as `finbert`. The list of class labels should be passed to the `prepare_model` method through its `label_list` parameter.

In [5]:
# Wrap the configuration in the FinBert helper object used by every later cell.
finbert = FinBert(config)
finbert.base_model = lm_path
# Both flags were already passed as True to Config above; they are set again
# on the instance's config (discriminate first, then gradual_unfreeze).
finbert.config.discriminate = finbert.config.gradual_unfreeze = True

In [6]:
# Class labels handed to the model preparation step, in this exact order.
sentiment_labels = ['positive', 'negative', 'neutral']
finbert.prepare_model(label_list=sentiment_labels)

10/02/2022 15:15:00 - INFO - finbert.finbert -   device: cpu n_gpu: 0, distributed training: False, 16-bits training: False


## Fine-tune the model

In [7]:
# Get the training examples.
# The 'train' argument presumably selects train.csv under `cl_data_path` - confirm in FinBert.get_data.
train_data = finbert.get_data('train')

In [8]:
# Obtain the model object that the (optional) freezing cell and the training cell operate on.
model = finbert.create_the_model()



### [Optional] Fine-tune only a subset of the model
The variable `freeze` sets how many of the 12 encoder layers, counting from the first one, are frozen; the embeddings are frozen as well. You can skip this part if you want to fine-tune the whole model.

<span style="color:red">Important: </span>
Execute this step if you want a shorter training time at the expense of accuracy.

In [9]:
# Partial fine-tuning: disable gradient updates for the embeddings and for the
# first `freeze` encoder layers; all other parameters are left as they are.

freeze = 6

# Collect every sub-module to freeze, then switch its parameters off in one pass.
frozen_modules = [model.bert.embeddings]
frozen_modules += [model.bert.encoder.layer[idx] for idx in range(freeze)]

for module in frozen_modules:
    for weight in module.parameters():
        weight.requires_grad = False

### Training

In [10]:
# Fine-tune `model` on the training examples; the returned object is what the
# evaluation cell below receives as `model`.
trained_model = finbert.train(train_examples = train_data, model = model)

Token indices sequence length is longer than the specified maximum sequence length for this model (704 > 512). Running this sequence through the model will result in indexing errors
10/02/2022 15:15:01 - INFO - finbert.utils -   *** Example ***
10/02/2022 15:15:01 - INFO - finbert.utils -   guid: train-1
10/02/2022 15:15:01 - INFO - finbert.utils -   tokens: [CLS] first would like cong ##rat ##ulate excel ##len ##cy trek ##i ##en ex ##tre ##mist ##s region beyond full confidence full commitment family afghanistan able overcome legacy decades violence suffering restore historic position model cooperation different cultures regional crossroads hub trade transit transportation tourism asian continent [SEP]
10/02/2022 15:15:01 - INFO - finbert.utils -   input_ids: 101 2034 2052 2066 26478 8609 9869 24970 7770 5666 10313 2072 2368 4654 7913 23738 2015 2555 3458 2440 7023 2440 8426 2155 7041 2583 9462 8027 5109 4808 6114 9239 3181 2597 2944 6792 2367 8578 3164 16760 9594 3119 6671 5193 6813 

Iteration:   0%|          | 0/33 [00:00<?, ?it/s]

10/02/2022 15:17:06 - INFO - finbert.utils -   *** Example ***
10/02/2022 15:17:06 - INFO - finbert.utils -   guid: validation-1
10/02/2022 15:17:06 - INFO - finbert.utils -   tokens: [CLS] honour stand prestigious ro ##st ##rum today represent fellow country ##wo experience freedoms opportunities democracy sacrificed much build afghan truly able live peace freedom day every single afghan lives peace freedom day live work — day know achieve solidarity international friends partners day longer hope believe [SEP]
10/02/2022 15:17:06 - INFO - finbert.utils -   input_ids: 101 6225 3233 8919 20996 3367 6824 2651 5050 3507 2406 12155 3325 22467 6695 7072 20268 2172 3857 12632 5621 2583 2444 3521 4071 2154 2296 2309 12632 3268 3521 4071 2154 2444 2147 1517 2154 2113 6162 14657 2248 2814 5826 2154 2936 3246 2903 102
10/02/2022 15:17:06 - INFO - finbert.utils -   attention_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
10/02/2022 15:17:06 -

Validating:   0%|          | 0/7 [00:00<?, ?it/s]

Validation losses: [1.186575940677098]
No best model found


Epoch:  25%|█████████▎                           | 1/4 [02:15<06:45, 135.24s/it]

Iteration:   0%|          | 0/33 [00:00<?, ?it/s]

10/02/2022 15:19:44 - INFO - finbert.utils -   *** Example ***
10/02/2022 15:19:44 - INFO - finbert.utils -   guid: validation-1
10/02/2022 15:19:44 - INFO - finbert.utils -   tokens: [CLS] honour stand prestigious ro ##st ##rum today represent fellow country ##wo experience freedoms opportunities democracy sacrificed much build afghan truly able live peace freedom day every single afghan lives peace freedom day live work — day know achieve solidarity international friends partners day longer hope believe [SEP]
10/02/2022 15:19:44 - INFO - finbert.utils -   input_ids: 101 6225 3233 8919 20996 3367 6824 2651 5050 3507 2406 12155 3325 22467 6695 7072 20268 2172 3857 12632 5621 2583 2444 3521 4071 2154 2296 2309 12632 3268 3521 4071 2154 2444 2147 1517 2154 2113 6162 14657 2248 2814 5826 2154 2936 3246 2903 102
10/02/2022 15:19:44 - INFO - finbert.utils -   attention_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
10/02/2022 15:19:44 -

Validating:   0%|          | 0/7 [00:00<?, ?it/s]

Validation losses: [1.186575940677098, 1.0806583166122437]


Epoch:  50%|██████████████████▌                  | 2/4 [04:53<04:57, 148.53s/it]

Iteration:   0%|          | 0/33 [00:00<?, ?it/s]

10/02/2022 15:22:38 - INFO - finbert.utils -   *** Example ***
10/02/2022 15:22:38 - INFO - finbert.utils -   guid: validation-1
10/02/2022 15:22:38 - INFO - finbert.utils -   tokens: [CLS] honour stand prestigious ro ##st ##rum today represent fellow country ##wo experience freedoms opportunities democracy sacrificed much build afghan truly able live peace freedom day every single afghan lives peace freedom day live work — day know achieve solidarity international friends partners day longer hope believe [SEP]
10/02/2022 15:22:38 - INFO - finbert.utils -   input_ids: 101 6225 3233 8919 20996 3367 6824 2651 5050 3507 2406 12155 3325 22467 6695 7072 20268 2172 3857 12632 5621 2583 2444 3521 4071 2154 2296 2309 12632 3268 3521 4071 2154 2444 2147 1517 2154 2113 6162 14657 2248 2814 5826 2154 2936 3246 2903 102
10/02/2022 15:22:38 - INFO - finbert.utils -   attention_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
10/02/2022 15:22:38 -

Validating:   0%|          | 0/7 [00:00<?, ?it/s]

Epoch:  75%|███████████████████████████▊         | 3/4 [07:45<02:39, 159.66s/it]

Validation losses: [1.186575940677098, 1.0806583166122437, 1.1422536543437414]


Iteration:   0%|          | 0/33 [00:00<?, ?it/s]

10/02/2022 15:26:08 - INFO - finbert.utils -   *** Example ***
10/02/2022 15:26:08 - INFO - finbert.utils -   guid: validation-1
10/02/2022 15:26:08 - INFO - finbert.utils -   tokens: [CLS] honour stand prestigious ro ##st ##rum today represent fellow country ##wo experience freedoms opportunities democracy sacrificed much build afghan truly able live peace freedom day every single afghan lives peace freedom day live work — day know achieve solidarity international friends partners day longer hope believe [SEP]
10/02/2022 15:26:08 - INFO - finbert.utils -   input_ids: 101 6225 3233 8919 20996 3367 6824 2651 5050 3507 2406 12155 3325 22467 6695 7072 20268 2172 3857 12632 5621 2583 2444 3521 4071 2154 2296 2309 12632 3268 3521 4071 2154 2444 2147 1517 2154 2113 6162 14657 2248 2814 5826 2154 2936 3246 2903 102
10/02/2022 15:26:08 - INFO - finbert.utils -   attention_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
10/02/2022 15:26:08 -

Validating:   0%|          | 0/7 [00:00<?, ?it/s]

Epoch: 100%|█████████████████████████████████████| 4/4 [11:16<00:00, 169.01s/it]

Validation losses: [1.186575940677098, 1.0806583166122437, 1.1422536543437414, 1.1106150150299072]





## Test the model

`finbert.evaluate` returns a DataFrame in which the true label and the logit values are given for each example.

In [11]:
# Get the held-out test examples (same loader as for the training data).
test_data = finbert.get_data('test')

In [12]:
# Score the test examples with the fine-tuned model.
# The cells below read the `labels` and `predictions` columns of the result.
results = finbert.evaluate(examples=test_data, model=trained_model)

10/02/2022 15:26:24 - INFO - finbert.utils -   *** Example ***
10/02/2022 15:26:24 - INFO - finbert.utils -   guid: test-1
10/02/2022 15:26:24 - INFO - finbert.utils -   tokens: [CLS] stand general assembly today reminded wise men women displayed unique capacity messages love peace hope afghanistan yet asian crossroads dialogue among civilizations model harmony culture tolerance engagement confident plans programmes self - reliance reform bo ##lster ##ed commitment international partners chart path towards realizing full potential [SEP]
10/02/2022 15:26:24 - INFO - finbert.utils -   input_ids: 101 3233 2236 3320 2651 6966 7968 2273 2308 6913 4310 3977 7696 2293 3521 3246 7041 2664 4004 16760 7982 2426 24784 2944 9396 3226 13986 8147 9657 3488 8497 2969 1011 17975 5290 8945 29576 2098 8426 2248 5826 3673 4130 2875 9301 2440 4022 102
10/02/2022 15:26:24 - INFO - finbert.utils -   attention_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1

Testing:   0%|          | 0/8 [00:00<?, ?it/s]

### Prepare the classification report

In [13]:
def report(df, cols=('label', 'prediction', 'logits')):
    """Print loss, accuracy and a per-class classification report.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per evaluated example.
    cols : sequence of str
        Column names, in this order: true label, predicted label, per-class
        scores (one array-like per row) that are fed to the loss.

    Returns
    -------
    None
        Everything is printed.

    Notes
    -----
    The loss is a cross-entropy weighted with ``finbert.class_weights``, read
    from the notebook-level ``finbert`` object.
    """
    label_col, pred_col, logit_col = cols[0], cols[1], cols[2]

    # Stack the per-row score vectors into one (n_rows, n_classes) tensor.
    # Calling torch.tensor() on a list of numpy arrays is the slow path that
    # PyTorch warns about; converting row by row and stacking avoids it and
    # yields the same values and dtype.
    logits = torch.stack([torch.as_tensor(row) for row in df[logit_col]])
    targets = torch.tensor(list(df[label_col]))

    criterion = CrossEntropyLoss(weight=finbert.class_weights)
    loss = criterion(logits, targets)
    print("Loss:{0:.2f}".format(loss.item()))

    # Fraction of rows whose predicted label equals the true label.
    accuracy = (df[label_col] == df[pred_col]).mean()
    print("Accuracy:{0:.2f}".format(accuracy))

    print("\nClassification Report:")
    print(classification_report(df[label_col], df[pred_col]))

In [14]:
# Predicted class = position of the largest value in each row's `predictions` vector.
results['prediction'] = results['predictions'].map(lambda scores: np.argmax(scores, axis=0))

In [15]:
# Column mapping for `results`: true labels in 'labels', predicted class in
# 'prediction' (added in the previous cell), per-class scores in 'predictions'.
report(results,cols=['labels','prediction','predictions'])

Loss:1.11
Accuracy:0.22

Classification Report:
              precision    recall  f1-score   support

           0       0.35      0.11      0.17        97
           1       0.11      0.65      0.19        26
           2       0.51      0.21      0.30       105

    accuracy                           0.22       228
   macro avg       0.33      0.33      0.22       228
weighted avg       0.40      0.22      0.23       228



  loss = cs(torch.tensor(list(df[cols[2]])),torch.tensor(list(df[cols[0]])))
