In [1]:
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

### BART for Text Summarization

In [None]:
# Example usage
import torch
from transformers import BartForConditionalGeneration, BartTokenizer

# Prefer the GPU when CUDA reports one; otherwise everything runs on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Fetch the tokenizer and the summarization weights from the same
# 'facebook/bart-large-cnn' checkpoint, then place the model on `device`.
bart_tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
bart_model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
bart_model = bart_model.to(device)

def split_text(text, max_chunk_size=1024):
    """Cut `text` into consecutive pieces of at most `max_chunk_size` BART tokens.

    The text is tokenized with the module-level `bart_tokenizer`, the token
    list is sliced into fixed-size windows, and every window is converted back
    into a string.

    Args:
        text: The string to split.
        max_chunk_size: Maximum number of tokens per chunk.

    Returns:
        A list of chunk strings in their original order (empty when
        tokenization yields no tokens).
    """
    token_seq = bart_tokenizer.tokenize(text)
    window_starts = range(0, len(token_seq), max_chunk_size)
    return [
        bart_tokenizer.convert_tokens_to_string(token_seq[start:start + max_chunk_size])
        for start in window_starts
    ]

def _bart_summarize(text):
    """Summarize a single string with the module-level BART model and tokenizer."""
    inputs = bart_tokenizer([text], max_length=1024, return_tensors='pt', truncation=True)

    # Move input tensors to the same device as the model
    inputs = {k: v.to(device) for k, v in inputs.items()}

    summary_ids = bart_model.generate(
        inputs['input_ids'],
        attention_mask=inputs.get('attention_mask'),
        num_beams=4,
        max_length=200,
        early_stopping=True,
    )
    return bart_tokenizer.decode(summary_ids[0], skip_special_tokens=True)


def summarize_sentence(text_list):
    """Summarize a sequence of text chunks into one short string with BART.

    Chunks are packed, in order and separated by a single space, into buffers
    of fewer than 1000 characters. Each buffer is summarized separately and the
    partial summaries are joined with spaces. While the joined summary is
    longer than 1024 characters, the partial summaries are summarized again.

    Every chunk contributes to exactly one buffer: whatever is still buffered
    after the last chunk is summarized too, and an empty buffer is never sent
    to the model.

    Args:
        text_list: Sequence (e.g. list) of strings to summarize, in reading
            order.

    Returns:
        The combined summary string. It is '' for an empty `text_list`. It can
        exceed 1024 characters only when a further pass would not make the
        text any shorter.
    """
    max_buffer_chars = 1000
    target_chars = 1024

    summaries = []
    cur_sentence = ''
    for chunk in text_list:
        candidate = f'{cur_sentence} {chunk}' if cur_sentence else chunk
        if cur_sentence and len(candidate) >= max_buffer_chars:
            # The buffer is full: summarize it and start a new one with this chunk.
            summaries.append(_bart_summarize(cur_sentence))
            cur_sentence = chunk
        else:
            cur_sentence = candidate

    # Flush what is left so the trailing chunk(s) are not lost.
    if cur_sentence.strip():
        summaries.append(_bart_summarize(cur_sentence))

    summaries_text = ' '.join(summaries)
    if len(summaries_text) <= target_chars:
        return summaries_text

    # Stop if this pass did not shorten the text; recursing again could loop forever.
    if len(summaries_text) >= sum(len(chunk) for chunk in text_list):
        return summaries_text

    return summarize_sentence(summaries)

### LED for Text Summarization

In [None]:
import torch
from transformers import LEDTokenizer, LEDForConditionalGeneration
import pandas as pd
import logging

In [None]:
# Fetch the LED tokenizer and summarization weights from the
# 'allenai/led-base-16384' checkpoint, then place the model on `device`
# (the GPU if available).
led_tokenizer = LEDTokenizer.from_pretrained('allenai/led-base-16384')
led_model = LEDForConditionalGeneration.from_pretrained('allenai/led-base-16384')
led_model = led_model.to(device)
def _led_summarize(text, max_input_length, max_summary_length, min_summary_length,
                   length_penalty, num_beams):
    """Summarize a single string with the module-level LED model and tokenizer."""
    inputs = led_tokenizer(text, return_tensors="pt", max_length=max_input_length, truncation=True)

    # Keep the inputs on the same device as the model (works on CPU-only machines too).
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)

    # Generate summary
    summary_ids = led_model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=max_summary_length,
        min_length=min_summary_length,
        length_penalty=length_penalty,
        num_beams=num_beams,
        early_stopping=True,
    )

    # Decode summary
    return led_tokenizer.decode(summary_ids[0], skip_special_tokens=True)


def summarize_sentence_led(text):
    """Summarize `text` paragraph by paragraph with the LED model.

    The text is split on blank lines ('\\n\\n'). Each paragraph is split into
    sentences, and the sentences are packed in order into buffers of fewer
    than `max_input_length - 100` characters. Each buffer is summarized, and a
    '\\n\\n' marker is appended after every paragraph. All pieces are joined
    with single spaces. While the result is longer than 1024 characters, the
    result itself is summarized again.

    Every sentence ends up in exactly one buffer: the buffer left over after
    the last sentence of a paragraph is summarized too, and an empty buffer is
    never sent to the model.

    Progress and the intermediate summary text are printed.

    Args:
        text: The document to summarize.

    Returns:
        The summary string. It can exceed 1024 characters only when a further
        pass would not make the text any shorter.
    """
    # Predefined settings
    max_input_length = 8000
    max_summary_length = 800
    min_summary_length = 40
    length_penalty = 2.0
    num_beams = 4
    generation_settings = (max_input_length, max_summary_length, min_summary_length,
                           length_penalty, num_beams)

    print('====Start===')
    summaries = []
    for paragraph in text.split('\n\n'):
        # NOTE(review): sent_tokenize is not imported in the visible cells --
        # presumably nltk.tokenize.sent_tokenize; confirm it is in scope.
        text_list = sent_tokenize(paragraph)
        cur_sentence = ''
        for ii, chunk in enumerate(text_list):
            if ii % 1000 == 0:
                print(f'Progress: {ii}/{len(text_list)}')
            if cur_sentence and len(cur_sentence + chunk) >= max_input_length - 100:
                # The buffer is full: summarize it before starting a new one.
                summaries.append(_led_summarize(cur_sentence.strip(), *generation_settings))
                cur_sentence = ''
            cur_sentence += chunk + ' '

        # Flush what is left so the paragraph's final sentence(s) are not lost.
        if cur_sentence.strip():
            summaries.append(_led_summarize(cur_sentence.strip(), *generation_settings))
        summaries.append('\n\n')  # Add a separator for each paragraph summary

    summaries_text = ' '.join(summaries)
    print(summaries_text)
    if len(summaries_text) <= 1024:
        print('===Finished===')
        return summaries_text

    # Stop if this pass did not shorten the text; recursing again could loop forever.
    if len(summaries_text) >= len(text):
        print('===Finished===')
        return summaries_text

    return summarize_sentence_led(summaries_text)

In [3]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader,RandomSampler, SequentialSampler,TensorDataset
from transformers import BertTokenizer, AdamW ,BertModel, BertPreTrainedModel
from sklearn.model_selection import train_test_split
import numpy as np
from tqdm import tqdm

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")


class CustomFinBERT(BertPreTrainedModel):
    """BERT encoder followed by a small feed-forward head with 3 outputs.

    The head maps `config.hidden_size` -> 512 -> 256 -> 128 -> 3, applying
    ReLU after the first three linear layers. Dropout (p=0.1) is applied to
    the head's input and after the first two activations.
    """

    def __init__(self, config):
        super().__init__(config)

        # BERT backbone built from `config`
        self.bert = BertModel(config)

        # Feed-forward classification head
        self.dropout = nn.Dropout(0.1)
        self.fc1 = nn.Linear(config.hidden_size, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 128)
        self.fc4 = nn.Linear(128, 3)  # one output per class (3 classes)

        self.init_weights()

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        """Return the head's raw 3-way output for the given encoded inputs.

        Args:
            input_ids: Token id tensor passed to the BERT backbone.
            attention_mask: Optional attention mask forwarded to BERT.
            token_type_ids: Optional segment ids forwarded to BERT.

        Returns:
            The output of the final linear layer (no softmax applied).
        """
        encoder_outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        # Element 1 of the BERT outputs is the pooled output.
        hidden = self.dropout(encoder_outputs[1])

        # (layer, whether dropout follows its activation)
        head_layers = ((self.fc1, True), (self.fc2, True), (self.fc3, False))
        for layer, followed_by_dropout in head_layers:
            hidden = torch.relu(layer(hidden))
            if followed_by_dropout:
                hidden = self.dropout(hidden)

        return self.fc4(hidden)

# Prepare the dataset
def encode_texts(tokenizer, texts, max_length=512):
    """Tokenize every text and stack the encodings into three tensors.

    Each text is encoded on its own with `tokenizer.encode_plus`, truncated
    and padded to `max_length`. The per-text tensors are then concatenated
    along dimension 0.

    Args:
        tokenizer: Object exposing `encode_plus` with the keyword arguments
            used below.
        texts: Iterable of strings to encode.
        max_length: Length every sequence is truncated/padded to.

    Returns:
        A tuple `(input_ids, attention_masks, token_type_ids)` of concatenated
        tensors, one row per text in input order.
    """
    encoded_rows = [
        tokenizer.encode_plus(
            sample,
            max_length=max_length,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_token_type_ids=True,  # segment ids are needed by the model
            return_tensors='pt',
        )
        for sample in texts
    ]

    def _stack(field):
        # Join the per-text tensors for one field along dimension 0.
        return torch.cat([row[field] for row in encoded_rows], dim=0)

    return _stack('input_ids'), _stack('attention_mask'), _stack('token_type_ids')

def train(model, train_dataloader, optimizer, device):
    """Run one training pass over `train_dataloader` and return the mean batch loss.

    For every batch: gradients are reset, the cross-entropy loss between the
    model output and the labels is back-propagated, the gradient norm is
    clipped to 1.0, and the optimizer takes a step.

    Args:
        model: Module called as `model(input_ids, token_type_ids=..., attention_mask=...)`.
        train_dataloader: Iterable of `(input_ids, attention_mask, token_type_ids, labels)`
            batches that also supports `len()`.
        optimizer: Optimizer over the model's parameters.
        device: Device every batch tensor is moved to.

    Returns:
        Sum of the per-batch losses divided by `len(train_dataloader)`.
    """
    model.train()
    criterion = nn.CrossEntropyLoss()
    batch_losses = []

    for batch in train_dataloader:
        # Move the batch to the target device and unpack it.
        b_input_ids, b_input_mask, b_token_type_ids, b_labels = (t.to(device) for t in batch)

        model.zero_grad()

        logits = model(b_input_ids, token_type_ids=b_token_type_ids, attention_mask=b_input_mask)
        loss = criterion(logits, b_labels)
        batch_losses.append(loss.item())

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

    return sum(batch_losses) / len(train_dataloader)

def evaluate(model, validation_dataloader, device):
    """Compute the average cross-entropy loss and accuracy over a dataloader.

    Both metrics are averaged per sample (total over all samples divided by
    the number of samples), so a smaller final batch does not get the same
    weight as a full one. With equally sized batches the accuracy equals the
    mean of the per-batch accuracies.

    Args:
        model: Module called as `model(input_ids, token_type_ids=..., attention_mask=...)`
            that returns one row of class scores per sample.
        validation_dataloader: Iterable of
            `(input_ids, attention_mask, token_type_ids, labels)` batches.
        device: Device every batch tensor is moved to.

    Returns:
        A tuple `(avg_val_loss, avg_val_accuracy)` of Python floats.

    Raises:
        ZeroDivisionError: If the dataloader yields no samples.
    """
    model.eval()
    # Sum (not mean) per batch so the final division is over samples.
    criterion = nn.CrossEntropyLoss(reduction='sum')
    total_eval_loss = 0.0
    total_correct = 0
    total_samples = 0

    for batch in validation_dataloader:
        # Move batch to the target device
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_token_type_ids, b_labels = batch

        # No gradients are needed for the forward pass, loss or predictions.
        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=b_token_type_ids, attention_mask=b_input_mask)
            total_eval_loss += criterion(outputs, b_labels).item()
            preds = torch.argmax(outputs, dim=1)

        total_correct += (preds == b_labels).sum().item()
        total_samples += b_labels.size(0)

    avg_val_loss = total_eval_loss / total_samples
    avg_val_accuracy = total_correct / total_samples
    return avg_val_loss, avg_val_accuracy

Using device: cuda


### Test the Fine-Tuned Model on FOMC Statements Only

In [4]:
# Load the fine-tuned CustomFinBERT checkpoint from a local directory.
# NOTE(review): the path literal ends with a trailing space ('models/V1_Epoch75 ');
# confirm the directory on disk is really named that way.
model = CustomFinBERT.from_pretrained('models/V1_Epoch75 ')
# Move the model to `device`; as the cell's last expression this also displays the module.
model.to(device)

CustomFinBERT(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30873, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_aff

In [5]:
# Load the BERT tokenizer from the 'yiyanghkust/finbert-pretrain' checkpoint.
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-pretrain')

In [8]:
# Load the labelled data (FOMC statements plus other financial sentences) from a CSV file.
# The cells below read column 1 as the text and column 2 as the label.
full_df = pd.read_csv('data/ft.csv')

In [28]:
# Example of a non-FOMC sentence: pick one row by position and show its text and label.
ii = 1500
text, label = full_df.iloc[ii,1],full_df.iloc[ii,2]
text,label

('The sale price was not disclosed .', 0)

In [29]:
# Test the model on the non-FOMC example sentence.
# Tokenize the sentence and move the encoded tensors to the model's device.
inputs = tokenizer(text, return_tensors="pt").to(device)
# Inference only: disable autograd so no graph is built or kept for the output.
with torch.no_grad():
    outputs = model(**inputs)
outputs

tensor([[ 6.2081, -3.2658, -3.3158]], device='cuda:0',
       grad_fn=<AddmmBackward0>)

In [36]:
# Keep only the FOMC statements: rows from position 2265 onward,
# taking column 1 as the text and column 2 as the label.
validation_text, validation_label = (full_df.iloc[2265:, col].values for col in (1, 2))

In [54]:
# Preview the first 200 characters of the first FOMC statement.
validation_text[0][:200]

'A meeting of the Federal Open Market Committee was held in the offices of the Board of Governors of the Federal Reserve System in Washington, D. Madigan and Simpson, Associate Directors, Divisions of '

In [41]:
# The FOMC statements are long, so encode them with encode_texts (truncating/padding to its
# default max_length of 512), the same way the training data was encoded.
input_ids, attention_masks, token_type_ids = encode_texts(tokenizer, validation_text)
# Wrap the labels in a tensor so they can be batched together with the encoded inputs.
labels = torch.tensor(validation_label)

In [51]:
# Bundle the encoded inputs and labels into a dataset and iterate over it in order.
validation_data = TensorDataset(input_ids, attention_masks, token_type_ids, labels)
validation_sampler = SequentialSampler(validation_data)
# NOTE(review): batch_size=294 presumably matches the number of FOMC rows so that
# evaluation runs as a single batch -- confirm.
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=294)

In [52]:
# Score the model on the FOMC validation set and report the average loss and accuracy.
avg_val_loss, avg_val_acc = evaluate(model, validation_dataloader, device)
print(f"Validation loss: {avg_val_loss}, Validation Accuracy:{avg_val_acc}")

Validation loss: 0.1024499461054802, Validation Accuracy:0.9863945578231292
