# Experiment notes on fine-tuning

In [None]:
# !pip install datasets
# !pip install evaluate
# !pip install accelerate

In [3]:
# See python-version

from datasets import load_dataset
from transformers import (
  GPT2Tokenizer,
  GPT2ForSequenceClassification,
  TrainingArguments,
  Trainer
)
import evaluate
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


## Get dataset from HuggingFace

`load_datasets()` is the standard way to download from HuggingFace repos.

Hugging Face's datasets contain a training, testing, and evaluation section.


In [4]:
ds = load_dataset("ic-fspml/stock_news_sentiment")

In [6]:
# Data comes with train
df = pd.DataFrame(ds["train"])

In [7]:
def enum_label(x) -> int:
  if x == "neutral":
    return 0
  elif x == "strongly bearish":
    return -2
  elif x == "mildly bearish":
    return -1
  elif x == "mildly bullish":
    return 1
  elif x == "strongly bullish":
    return 2

def new_col(x):
  x["sentiment"] = x["label"]
  x["label"] = enum_label(x["label"])
  return x


In [8]:
# Enumerate labels since Transformers can't use str as labels
new_ds = ds.map(new_col)

# df["label_enum"] = df["label"].apply(enum_label)

Find the number of unique labels.

In [9]:
df["label"].nunique()

5

In [38]:
# View test
pd.DataFrame(new_ds["test"])

Unnamed: 0,ticker,name,type,sector,article_date,article_headline,label,sentiment
0,CIFC,CIFC Corp.,MUTUALFUND,,2016-08-22 14:22:00+00:00,Mid-Morning Market Update: Markets Mostly Lowe...,0,neutral
1,AAVL,"Avalanche Biotechnologies, Inc.",MUTUALFUND,,2015-05-13 11:05:00+00:00,Avalanche Biotechnologies Reports Q1 Loss $0.3...,-1,mildly bearish
2,QIHU,Qihoo 360 Technology Co. Ltd. American Deposit...,MUTUALFUND,,2013-07-21 14:23:00+00:00,"Benzinga's M&A Chatter for Friday July 19, 2013",0,neutral
3,AXTA,Axalta Coating Systems Ltd.,EQUITY,Basic Materials,2020-04-01 10:44:00+00:00,Exane BNP Paribas Upgrades Axalta Coating Sys ...,1,mildly bullish
4,IR,Ingersoll Rand Inc.,EQUITY,Industrials,2013-04-08 16:38:00+00:00,Halliburton and Other Stocks Added to Jefferie...,1,mildly bullish
...,...,...,...,...,...,...,...,...
30145,ARL,"American Realty Investors, Inc.",EQUITY,Real Estate,2019-09-17 18:41:00+00:00,Mid-Afternoon Market Update: Apogee Enterprise...,-1,mildly bearish
30146,GSAT,"Globalstar, Inc.",EQUITY,Communication Services,2018-07-27 18:31:00+00:00,Mid-Afternoon Market Update: NASDAQ Down 1.7%;...,0,neutral
30147,LEN,Lennar Corporation,EQUITY,Consumer Cyclical,2016-06-14 20:36:00+00:00,MKM Thinks Potential Earnings-Related Boost Fo...,1,mildly bullish
30148,COF,Capital One Financial Corporation,EQUITY,Financial Services,2019-01-23 13:16:00+00:00,Synchrony Financial to Sell its Walmart Loan P...,1,mildly bullish


In [11]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

## Tokenizer

Tokenizer is a core function for machine learning algorithms to translate data
from human-readible (string) to computer-readible (numbers).

A tokenizer takes a string and breaks up the pieces into tokens which can be
used by an algorithm. Tokenizer, much like the algorithms themselves, vary on
their characteristics and behaviors.

### Efficiency

This makes the parsing of values to be more efficient since numbers are faster
to process and store than strings.

### Examples

Some examples of a tokenizer include Byte-level (BPE) for ChatGPT-2.
Another is Hugging Face's Tokenizer.

### Process

Steps for a tokenizer include:

1. Normalization: Removes whitespace, converts to lowercase, and removes
   accented characters.

   `"Héllò hôw are yoü?"` -> `"hello, how are you?"`

1. Pro-tokenization: Split the string into smaller chunks such as words. In the
   following example, the offsets are kept track.

   `"hello, how are you?"` -> `[('Hello', (0, 5)), (',', (5, 6)), ('how', (7, 10)), ('are', (11, 14)), ('you', (16, 19)), ('?', (19, 20))]`

1. Modeling: Using a BERT tokenizer, will tokenize the sentence like this:

   `["hello"; ","; "how"; "are"; "you"; "?"]`

1. Post-processing: Adds commands for processing the text.

   `["CLS"; "hello"; ","; "how"; "are"; "you"; "?"; "SEP"]`

   The CLS stands for classification token and SEP stands for end of sentence.

https://medium.com/@awaldeep/hugging-face-understanding-tokenizers-1b7e4afdb154


In [39]:
tokenizer.pad_token = tokenizer.eos_token
def tokenize(examples):
    """Returns tokenized data for each row."""
    return tokenizer(examples["sentiment"], padding="max_length", truncation=True)

In [42]:
new_ds.map(tokenize, batched=True)

Map: 100%|██████████| 200998/200998 [00:19<00:00, 10453.65 examples/s]
Map: 100%|██████████| 20100/20100 [00:01<00:00, 10178.07 examples/s]
Map: 100%|██████████| 30150/30150 [00:03<00:00, 9729.64 examples/s]


DatasetDict({
    train: Dataset({
        features: ['ticker', 'name', 'type', 'sector', 'article_date', 'article_headline', 'label', 'sentiment', 'input_ids', 'attention_mask'],
        num_rows: 200998
    })
    validation: Dataset({
        features: ['ticker', 'name', 'type', 'sector', 'article_date', 'article_headline', 'label', 'sentiment', 'input_ids', 'attention_mask'],
        num_rows: 20100
    })
    test: Dataset({
        features: ['ticker', 'name', 'type', 'sector', 'article_date', 'article_headline', 'label', 'sentiment', 'input_ids', 'attention_mask'],
        num_rows: 30150
    })
})

In [13]:
# Training set
small_train_dataset = new_ds["train"].shuffle(seed=42).select(range(1000))
# Testing set
small_eval_dataset = new_ds["test"].shuffle(seed=42).select(range(1000))

In [20]:
model = GPT2ForSequenceClassification.from_pretrained("gpt2", num_labels=5)  # Labels are `labels` column

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
metric = evaluate.load("accuracy")

In [21]:
metric

EvaluationModule(name: "accuracy", module_type: "metric", features: {'predictions': Value(dtype='int32', id=None), 'references': Value(dtype='int32', id=None)}, usage: """
Args:
    predictions (`list` of `int`): Predicted labels.
    references (`list` of `int`): Ground truth labels.
    normalize (`boolean`): If set to False, returns the number of correctly classified samples. Otherwise, returns the fraction of correctly classified samples. Defaults to True.
    sample_weight (`list` of `float`): Sample weights Defaults to None.

Returns:
    accuracy (`float` or `int`): Accuracy score. Minimum possible value is 0. Maximum possible value is 1.0, or the number of examples input, if `normalize` is set to `True`.. A higher score means higher accuracy.

Examples:

    Example 1-A simple example
        >>> accuracy_metric = evaluate.load("accuracy")
        >>> results = accuracy_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0])
        >>> print(results)
    

In [18]:
def compute_metrics(eval):
    logits, labels = eval
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [21]:
training_args = TrainingArguments(
    output_dir="test_trainer",
    # evaluation_strategy="epoch",
    per_device_train_batch_size=1,  # Reduce batch size here
    per_device_eval_batch_size=1,  # Optionally, reduce for evaluation as well
    gradient_accumulation_steps=4,
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)

In [22]:
trainer.train()

  0%|          | 0/750 [00:00<?, ?it/s]

ValueError: You have to specify either input_ids or inputs_embeds