<a href="https://colab.research.google.com/github/tamaskecskemeti/financial_nlp/blob/main/nlp_qlora.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# install required packages written in requirements
!pip install -r requirements.txt
!pip install accelerate -U

Collecting datasets (from -r requirements.txt (line 4))
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft (from -r requirements.txt (line 5))
  Downloading peft-0.11.1-py3-none-any.whl (251 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.6/251.6 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting wandb (from -r requirements.txt (line 6))
  Downloading wandb-0.17.0-py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.7/6.7 MB[0m [31m58.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting loguru (from -r requirements.txt (line 7))
  Downloading loguru-0.7.2-py3-none-any.whl (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.5/62.5 kB[0m [31m8.6 MB/s[0m eta [36m0:00

In [3]:
# from transformers import GPT2LMHeadModel, AutoTokenizer
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType
from transformers import TextDataset, DataCollatorForLanguageModeling, BitsAndBytesConfig
from transformers import Trainer, TrainingArguments
import torch
import torch.nn.functional as F
import itertools
import numpy as np
from pathlib import Path

In [4]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
device

'cuda:0'

In [17]:
def generate_text_from_input(tokenizer, model, input_text):
  input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)

  out = model.generate(input_ids,
                     max_new_tokens=100,
                     num_beams=5,
                     no_repeat_ngram_size=4,
                     top_k=50,
                     do_sample=True,
                     top_p=0.9,
                     temperature=1,
                     early_stopping=True,
                     pad_token_id=tokenizer.eos_token_id).to(device)

  out_text = list(map(tokenizer.decode, out))[0]

  return out_text

In [18]:
# rouge scores for a reference/generated sentence pair
# source google seq2seq source code.

# supporting function
def _split_into_words(sentences):
  """Splits multiple sentences into words and flattens the result"""
  return list(itertools.chain(*[_.split(" ") for _ in sentences]))

# supporting function
def _get_word_ngrams(n, sentences):
  """Calculates word n-grams for multiple sentences.
  """
  assert len(sentences) > 0
  assert n > 0

  words = _split_into_words(sentences)
  return _get_ngrams(n, words)

# supporting function
def _get_ngrams(n, text):
  """Calcualtes n-grams.
  Args:
    n: which n-grams to calculate
    text: An array of tokens
  Returns:
    A set of n-grams
  """
  ngram_set = set()
  text_length = len(text)
  max_index_ngram_start = text_length - n
  for i in range(max_index_ngram_start + 1):
    ngram_set.add(tuple(text[i:i + n]))
  return ngram_set

def rouge_n(reference_sentences, evaluated_sentences, n=2):
  """
  Args:
    evaluated_sentences: The sentences that have been picked by the summarizer
    reference_sentences: The sentences from the referene set
    n: Size of ngram.  Defaults to 2.
  Returns:
    recall rouge score(float)
  Raises:
    ValueError: raises exception if a param has len <= 0
  """
  if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0:
    raise ValueError("Collections must contain at least 1 sentence.")

  evaluated_ngrams = _get_word_ngrams(n, evaluated_sentences)
  reference_ngrams = _get_word_ngrams(n, reference_sentences)
  reference_count = len(reference_ngrams)
  evaluated_count = len(evaluated_ngrams)

  # gets the overlapping ngrams between evaluated and reference
  overlapping_ngrams = evaluated_ngrams.intersection(reference_ngrams)
  overlapping_count = len(overlapping_ngrams)

  # handle edge case. This isn't mathematically correct, but it's good enough
  if evaluated_count == 0:
    precision = 0.0
  else:
    precision = overlapping_count / evaluated_count

  if reference_count == 0:
    recall = 0.0
  else:
    recall = overlapping_count / reference_count

  f1_score = 2.0 * ((precision * recall) / (precision + recall + 1e-8))

  # just returning recall count in rouge, useful for our purpose
  return recall

In [19]:
# some text to test the model
text = Path("generate_text_en.txt").read_text()

In [23]:
tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-560m")
model = AutoModelForCausalLM.from_pretrained("bigscience/bloom-560m").to(device)



In [24]:
generated_text = generate_text_from_input(tokenizer, model, text)
generated_text

'One of the biggest names in Silicon Valley is placing a moonshot bet on bitcoin BTCUSD, +0.72% . \nFounders Fund, the venture-capital firm co-founded by Peter Thiel, has amassed hundreds of millions of dollars of the volatile cryptocurrency, people familiar with the matter said. The fund has invested in several cryptocurrencies, including Bitcoin (BTC), Ethereum (ETH), Litecoin (LTC), and Dogecoin (DOGE), among others. The fund also invested in Bitcoin Cash (BCH) and Litecoin Ether (LTCE).\nThe fund has also invested in the cryptocurrency Ether (ETH), which is now valued at around $9,000. The fund is also investing in Bitcoin Cash and Liteco'

In [25]:
# the reference text is used to evaluate the generated text
ref_text = Path("reference_text_en.txt").read_text()

In [26]:
# the rouge value can be between 0 and 1. The higher value is better
print(rouge_n(ref_text, generated_text))

0.6494845360824743


In [27]:
print(model)

BloomForCausalLM(
  (transformer): BloomModel(
    (word_embeddings): Embedding(250880, 1024)
    (word_embeddings_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    (h): ModuleList(
      (0-23): 24 x BloomBlock(
        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (self_attention): BloomAttention(
          (query_key_value): Linear(in_features=1024, out_features=3072, bias=True)
          (dense): Linear(in_features=1024, out_features=1024, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): BloomMLP(
          (dense_h_to_4h): Linear(in_features=1024, out_features=4096, bias=True)
          (gelu_impl): BloomGelu()
          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (

In [28]:
from transformers import Conv1D

def get_specific_layer_names(model):
    # Create a list to store the layer names
    layer_names = []

    # Recursively visit all modules and submodules
    for name, module in model.named_modules():
        # Check if the module is an instance of the specified layers
        if isinstance(module, (torch.nn.Linear, torch.nn.Embedding, torch.nn.Conv2d, Conv1D)):
            # model name parsing

            layer_names.append('.'.join(name.split('.')[4:]).split('.')[0])

    return layer_names

list(set(get_specific_layer_names(model)))

['', 'dense_h_to_4h', 'dense_4h_to_h', 'dense', 'query_key_value']

In [29]:
def load_dataset(file_path, tokenizer, block_size = 128):
  dataset = TextDataset(
        tokenizer = tokenizer,
        file_path = file_path,
        block_size = block_size,
    )
  return dataset


def load_data_collator(tokenizer):
  data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )
  return data_collator

def train(input_path,
          model_name,
          output_path,
          learning_rate,
          per_device_train_batch_size,
          num_train_epochs):

  tokenizer = AutoTokenizer.from_pretrained(model_name)
  train_dataset = load_dataset(input_path, tokenizer)
  data_collator = load_data_collator(tokenizer)

  tokenizer.save_pretrained(output_path)
  bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16)
  model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config)
  model = prepare_model_for_kbit_training(model)

  config = LoraConfig(r=8, lora_alpha=32, target_modules=["query_key_value"], lora_dropout=0.05, bias="none", task_type=TaskType.CAUSAL_LM)
  model = get_peft_model(model, config)
  model.save_pretrained(output_path)

  training_args = TrainingArguments(
          output_dir=output_path,
          learning_rate=learning_rate,
          overwrite_output_dir=False,
          per_device_train_batch_size=per_device_train_batch_size,
          num_train_epochs=num_train_epochs,
          optim="paged_adamw_8bit",
          optim_target_modules=["attn", "mlp"]
      )

  trainer = Trainer(
          model=model,
          args=training_args,
          data_collator=data_collator,
          train_dataset=train_dataset,
  )

  trainer.train()
  trainer.save_model()

In [30]:
input_path = "train_text_en.txt"

learning_rates = [1e-05, 2e-05, 3e-5]
batch_sizes = [4, 8]
combinations = [(lr, bs) for lr in learning_rates for bs in batch_sizes]

for lr, bs in combinations:
  output_path = f"result_en_{lr}_{bs}"
  train(
    input_path=input_path,
    model_name="bigscience/bloom-560m",
    output_path=output_path,
    learning_rate=lr,
    per_device_train_batch_size=bs,
    num_train_epochs=4
    )

`low_cpu_mem_usage` was None, now set to True since model is quantized.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss


`low_cpu_mem_usage` was None, now set to True since model is quantized.


Step,Training Loss


`low_cpu_mem_usage` was None, now set to True since model is quantized.


Step,Training Loss


`low_cpu_mem_usage` was None, now set to True since model is quantized.


Step,Training Loss


`low_cpu_mem_usage` was None, now set to True since model is quantized.


Step,Training Loss


`low_cpu_mem_usage` was None, now set to True since model is quantized.


Step,Training Loss




In [31]:
tokenizer_trained = AutoTokenizer.from_pretrained("result_en_1e-05_4")
model_trained = AutoModelForCausalLM.from_pretrained("result_en_1e-05_4").to(device)

generated_text = generate_text_from_input(tokenizer_trained, model_trained, text)
# the rouge value can be between 0 and 1. The higher value is better
print(rouge_n(ref_text, generated_text))

0.6666666666666666


In [32]:
tokenizer_trained = AutoTokenizer.from_pretrained("result_en_1e-05_8")
model_trained = AutoModelForCausalLM.from_pretrained("result_en_1e-05_8").to(device)

generated_text = generate_text_from_input(tokenizer_trained, model_trained, text)
# the rouge value can be between 0 and 1. The higher value is better
print(rouge_n(ref_text, generated_text))

0.6563573883161512


In [33]:
tokenizer_trained = AutoTokenizer.from_pretrained("result_en_2e-05_4")
model_trained = AutoModelForCausalLM.from_pretrained("result_en_2e-05_4").to(device)

generated_text = generate_text_from_input(tokenizer_trained, model_trained, text)
# the rouge value can be between 0 and 1. The higher value is better
print(rouge_n(ref_text, generated_text))

0.6701030927835051


In [34]:
tokenizer_trained = AutoTokenizer.from_pretrained("result_en_2e-05_8")
model_trained = AutoModelForCausalLM.from_pretrained("result_en_2e-05_8").to(device)

generated_text = generate_text_from_input(tokenizer_trained, model_trained, text)
# the rouge value can be between 0 and 1. The higher value is better
print(rouge_n(ref_text, generated_text))

0.6597938144329897


In [35]:
tokenizer_trained = AutoTokenizer.from_pretrained("result_en_3e-05_4")
model_trained = AutoModelForCausalLM.from_pretrained("result_en_3e-05_4").to(device)

generated_text = generate_text_from_input(tokenizer_trained, model_trained, text)
# the rouge value can be between 0 and 1. The higher value is better
print(rouge_n(ref_text, generated_text))

0.6460481099656358
