<a href="https://colab.research.google.com/github/tamaskecskemeti/financial_nlp/blob/main/nlp_basic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# install required packages written in requirements
!pip install -r requirements.txt



In [5]:
from transformers import GPT2LMHeadModel, set_seed, AutoTokenizer
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from transformers import pipeline
import torch
import torch.nn.functional as F
import itertools
import numpy as np
from pathlib import Path

In [6]:
set_seed(42)

In [4]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# device = "cpu"
device

'cuda:0'

# Function creator

In [7]:
def generate_text_from_input(tokenizer, model, input_text):
  input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)

  out = model.generate(input_ids,
                     max_new_tokens=100,
                     num_beams=5,
                     no_repeat_ngram_size=4,
                     top_k=50,
                     do_sample=True,
                     top_p=0.9,
                     temperature=1,
                     early_stopping=True,
                     pad_token_id=tokenizer.eos_token_id).to(device)

  out_text = list(map(tokenizer.decode, out))[0]

  return out_text

In [8]:
# rouge scores for a reference/generated sentence pair
# source google seq2seq source code.

# supporting function
def _split_into_words(sentences):
  """Splits multiple sentences into words and flattens the result"""
  return list(itertools.chain(*[_.split(" ") for _ in sentences]))

# supporting function
def _get_word_ngrams(n, sentences):
  """Calculates word n-grams for multiple sentences.
  """
  assert len(sentences) > 0
  assert n > 0

  words = _split_into_words(sentences)
  return _get_ngrams(n, words)

# supporting function
def _get_ngrams(n, text):
  """Calcualtes n-grams.
  Args:
    n: which n-grams to calculate
    text: An array of tokens
  Returns:
    A set of n-grams
  """
  ngram_set = set()
  text_length = len(text)
  max_index_ngram_start = text_length - n
  for i in range(max_index_ngram_start + 1):
    ngram_set.add(tuple(text[i:i + n]))
  return ngram_set

def rouge_n(reference_sentences, evaluated_sentences, n=2):
  """
  Args:
    evaluated_sentences: The sentences that have been picked by the summarizer
    reference_sentences: The sentences from the referene set
    n: Size of ngram.  Defaults to 2.
  Returns:
    recall rouge score(float)
  Raises:
    ValueError: raises exception if a param has len <= 0
  """
  if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0:
    raise ValueError("Collections must contain at least 1 sentence.")

  evaluated_ngrams = _get_word_ngrams(n, evaluated_sentences)
  reference_ngrams = _get_word_ngrams(n, reference_sentences)
  reference_count = len(reference_ngrams)
  evaluated_count = len(evaluated_ngrams)

  # gets the overlapping ngrams between evaluated and reference
  overlapping_ngrams = evaluated_ngrams.intersection(reference_ngrams)
  overlapping_count = len(overlapping_ngrams)

  # handle edge case. This isn't mathematically correct, but it's good enough
  if evaluated_count == 0:
    precision = 0.0
  else:
    precision = overlapping_count / evaluated_count

  if reference_count == 0:
    recall = 0.0
  else:
    recall = overlapping_count / reference_count

  f1_score = 2.0 * ((precision * recall) / (precision + recall + 1e-8))

  # just returning recall count in rouge, useful for our purpose
  return recall

# English model

In [17]:
# some text to test the model
text_en = Path("generate_text_en.txt").read_text()

In [8]:
tokenizer_1_en = AutoTokenizer.from_pretrained("ai-forever/mGPT")
model_1_en = GPT2LMHeadModel.from_pretrained("ai-forever/mGPT").to(device)
tokenizer_2_en = AutoTokenizer.from_pretrained("gpt2")
model_2_en = GPT2LMHeadModel.from_pretrained("gpt2").to(device)

In [18]:
generated_text_1_en = generate_text_from_input(tokenizer_1_en, model_1_en, text_en)
generated_text_1_en

'One of the biggest names in Silicon Valley is placing a moonshot bet on bitcoin BTCUSD, +0.72%. \nFounders Fund, the venture-capital firm co-founded by Peter Thiel, has amassed hundreds of millions of dollars of the volatile cryptocurrency, people familiar with the matter said.\n“Bitcoin is a great investment,” Thiel told CNBC. “It’s a great investment, but it’s not a good investment. It’s a good investment, but you’re not going to get a lot of money out of it.”\nBitcoin’s volatility has been a problem for investors in the cryptocurrencies since the beginning of the year, according to Thiel.\n“I don’t think that’s'

In [19]:
generated_text_2_en = generate_text_from_input(tokenizer_2_en, model_2_en, text_en)
generated_text_2_en

'One of the biggest names in Silicon Valley is placing a moonshot bet on bitcoin BTCUSD, +0.72%. \nFounders Fund, the venture-capital firm co-founded by Peter Thiel, has amassed hundreds of millions of dollars of the volatile cryptocurrency, people familiar with the matter said. \n\nThe fund, which has raised more than $100 million in funding, has raised $2.5 billion in the past two years, according to the people, who spoke on condition of anonymity because they were not authorized to discuss the matter publicly.\n\nIn a statement, the fund said: "We are pleased to announce that we have reached a critical milestone in the development of Bitcoin. We are now at the point where we are ready to invest in the next phase of the project'

In [20]:
# the reference text is used to evaluate the generated text
ref_text_en = Path("reference_text_en.txt").read_text()

In [21]:
# the rouge value can be between 0 and 1. The higher value is better
print(rouge_n(ref_text_en, generated_text_1_en))

0.6632302405498282


In [22]:
# the rouge value can be between 0 and 1. The higher value is better
print(rouge_n(ref_text_en, generated_text_2_en))

0.7250859106529209


# Hungarian model

In [23]:
# some text in hungarian to test the model
text_hu = Path("generate_text_hu.txt").read_text()


In [24]:
generated_text_1_hu = generate_text_from_input(tokenizer_1_en, model_1_en, text_hu)

In [25]:
generated_text_1_hu

'A Szilícium-völgy egyik legnagyobb neve a bitcoin-ra tesz, ami +0,72%-kal növekedett. \nA Founders Fund, a Peter Thiel által társalapított kockázati tőkecég több százmillió dollárt halmozott fel az ingatag kriptopénzből, mondták az ügyet ismerő személyek.\nA bitcoin árfolyama az elmúlt hetekben megduplázódott.\nAz elmúlt napokban a kriptovaluták árfolyama folyamatosan emelkedett.\nA New York-i tőzsdén a Bitcoin (BTC) árfolyama 0,7 százalékkal emelkedett hétfőn.\nA BTC árfolyama hétfőn 0,8 százalékot esett'

In [27]:
tokenizer_2_hu = AutoTokenizer.from_pretrained("NYTK/PULI-GPT-2")
model_2_hu = GPT2LMHeadModel.from_pretrained("NYTK/PULI-GPT-2").to(device)

generated_text_2_hu = generate_text_from_input(tokenizer_2_hu, model_2_hu, text_hu)

In [28]:
generated_text_2_hu

'A Szilícium-völgy egyik legnagyobb neve a bitcoin-ra tesz, ami +0,72%-kal növekedett. \nA Founders Fund, a Peter Thiel által társalapított kockázati tőkecég több százmillió dollárt halmozott fel az ingatag kriptopénzből, mondták az ügyet ismerő személyek.\nA CNBC-nek nyilatkozó szakértők szerint a kriptopénz nem is annyira kriptopénz, mint inkább egy olyan eszköz, amely a decentralizáltság és a decentralizált döntéshozatal révén képes alkalmazkodni a változó körülményekhez.\nA kriptopénz egy olyan eszközzé válhat, amely képes alkalmazkodni a változásokhoz, és képes alkalmazkodni a megváltozott körülményekhez.<|endoftext|>'

In [32]:
# the reference text is used to evaluate the generated text
ref_text_hu = Path("reference_text_hu.txt").read_text()

In [33]:
# the rouge value can be between 0 and 1. The higher value is better
print(rouge_n(ref_text_hu, generated_text_1_hu))

0.5847457627118644


In [34]:
# the rouge value can be between 0 and 1. The higher value is better
print(rouge_n(ref_text_hu, generated_text_2_hu))

0.632768361581921


# Fine-tune models

In [9]:
def load_dataset(file_path, tokenizer, block_size = 128):
  dataset = TextDataset(
        tokenizer = tokenizer,
        file_path = file_path,
        block_size = block_size,
    )
  return dataset


def load_data_collator(tokenizer):
  data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )
  return data_collator


def train(input_path,
          model_name,
          output_path,
          per_device_train_batch_size,
          num_train_epochs,
          save_steps):

  tokenizer = AutoTokenizer.from_pretrained(model_name)
  train_dataset = load_dataset(input_path, tokenizer)
  data_collator = load_data_collator(tokenizer)

  tokenizer.save_pretrained(output_path)
  model = GPT2LMHeadModel.from_pretrained(model_name)
  model.save_pretrained(output_path)

  training_args = TrainingArguments(
          output_dir=output_path,
          overwrite_output_dir=False,
          per_device_train_batch_size=per_device_train_batch_size,
          num_train_epochs=num_train_epochs,
      )

  trainer = Trainer(
          model=model,
          args=training_args,
          data_collator=data_collator,
          train_dataset=train_dataset,
  )

  trainer.train()
  trainer.save_model()

In [10]:
input_path_en = "train_text_en_coded.txt"
output_path_en = "result_en"
model_name_en = "gpt2"

input_path_hu = "train_text_hu.txt"
output_path_hu = "result_hu"
model_name_hu = "NYTK/PULI-GPT-2"
per_device_train_batch_size = 6
num_train_epochs = 64
save_steps = 250

In [None]:
train(
    input_path=input_path_en,
    model_name=model_name_en,
    output_path=output_path_en,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps
)

In [None]:
train(
    input_path=input_path_hu,
    model_name=model_name_hu,
    output_dir=output_path_hu,
    overwrite_output_dir=False,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps
)

In [None]:
tokenizer_trained_en = AutoTokenizer.from_pretrained(output_path_en)
model_trained_en = GPT2LMHeadModel.from_pretrained(output_path_en)

generated_text_en = generate_text_from_input(tokenizer_trained_en, model_trained_en, text_en)

In [None]:
generated_text_en

'The economics is:\na\nprice\ntheoretically\nconceivable\nwould be\na\nthing\n.\nFrom\nsure\n,\nthat\non\nApple\nvery\nquickly\nout\nmust\nfind\nsomething\na\n29\n'

In [None]:
tokenizer_trained_hu = AutoTokenizer.from_pretrained(output_path_hu)
model_trained_hu = GPT2LMHeadModel.from_pretrained(output_path_hu)

generated_text_hu = generate_text_from_input(tokenizer_trained_hu, model_trained_hu, text_hu)

In [None]:
# the rouge value can be between 0 and 1. The higher value is better
print(rouge_n(ref_text_en, generated_text_en))

In [None]:
# the rouge value can be between 0 and 1. The higher value is better
print(rouge_n(ref_text_hu, generated_text_hu))