## Experiment on fine-tuning

In [1]:
# !pip install datasets
# !pip install evaluate
# !pip install accelerate

In [2]:
# See python-version

from datasets import load_dataset
from transformers import (
  GPT2Tokenizer,
  GPT2ForSequenceClassification,
  TrainingArguments,
  Trainer
)
import evaluate
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


`load_dataset()` is the standard way to download a dataset from the Hugging Face Hub.

In [3]:
# Download the stock-news sentiment dataset from the Hugging Face Hub.
DATASET_REPO = "ic-fspml/stock_news_sentiment"
ds = load_dataset(DATASET_REPO)

In [4]:
# Show the available splits together with their columns and row counts.
ds

DatasetDict({
    train: Dataset({
        features: ['ticker', 'name', 'type', 'sector', 'article_date', 'article_headline', 'label'],
        num_rows: 200998
    })
    validation: Dataset({
        features: ['ticker', 'name', 'type', 'sector', 'article_date', 'article_headline', 'label'],
        num_rows: 20100
    })
    test: Dataset({
        features: ['ticker', 'name', 'type', 'sector', 'article_date', 'article_headline', 'label'],
        num_rows: 30150
    })
})

In [5]:
# Copy the "train" split into a pandas DataFrame for exploration.
train_split = ds["train"]
df = pd.DataFrame(train_split)

In [6]:
# Sentiment classes ordered from most bearish to most bullish. The ids are
# contiguous and start at 0 because class-index targets for a classifier
# with `num_labels=5` must lie in the range 0..4; negative ids are invalid.
_LABEL_TO_ID = {
  "strongly bearish": 0,
  "mildly bearish": 1,
  "neutral": 2,
  "mildly bullish": 3,
  "strongly bullish": 4,
}


def enum_label(x):
  """Convert a sentiment label string into an integer class id.

  Args:
    x: One of "strongly bearish", "mildly bearish", "neutral",
      "mildly bullish" or "strongly bullish".

  Returns:
    The class id in the range 0..4, ordered from most bearish (0) to most
    bullish (4).

  Raises:
    ValueError: If `x` is not one of the five known labels. Failing loudly
      avoids silently writing `None` into the label column.
  """
  try:
    return _LABEL_TO_ID[x]
  except (KeyError, TypeError):
    # TypeError covers unhashable inputs; both cases mean "not a known label".
    raise ValueError(f"Unknown sentiment label: {x!r}") from None

In [7]:
# Transformers cannot train on string labels, so add an integer-coded
# version of the `label` column next to the original.
df["label_enum"] = df["label"].map(enum_label)

In [8]:
# Count the distinct values in the `label` column of the training frame.
df["label"].nunique()

5

In [9]:
# Display the DatasetDict again; the DataFrame steps above worked on `df`
# and did not add any column to `ds`.
ds

DatasetDict({
    train: Dataset({
        features: ['ticker', 'name', 'type', 'sector', 'article_date', 'article_headline', 'label'],
        num_rows: 200998
    })
    validation: Dataset({
        features: ['ticker', 'name', 'type', 'sector', 'article_date', 'article_headline', 'label'],
        num_rows: 20100
    })
    test: Dataset({
        features: ['ticker', 'name', 'type', 'sector', 'article_date', 'article_headline', 'label'],
        num_rows: 30150
    })
})

In [10]:
# Load the tokenizer that belongs to the pre-trained "gpt2" checkpoint.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

In [11]:
# Reuse the end-of-sequence token as the padding token so the tokenizer can
# pad inputs when `padding=` is requested.
tokenizer.pad_token = tokenizer.eos_token
def tokenize(examples):
    """Tokenize a batch of article headlines for the classifier.

    The model input is the headline text. The `label` column is the
    prediction target and must not be tokenized as input, otherwise the
    model is trained on the answer instead of the article.

    Args:
        examples: A batch (mapping of column name to list of values) that
            contains an "article_headline" column.

    Returns:
        Whatever the module-level `tokenizer` returns for the headlines,
        padded to the maximum length and truncated where necessary.
    """
    return tokenizer(examples["article_headline"], padding="max_length",
                     truncation=True)

In [12]:
# `map` returns a new DatasetDict and leaves the original untouched, so the
# result has to be bound to `ds`; otherwise every later use of `ds` would
# still lack the `input_ids` / `attention_mask` columns.
# NOTE(review): the `label` column still holds the original sentiment
# strings here -- confirm it is converted to integer class ids before training.
ds = ds.map(tokenize, batched=True)
ds

DatasetDict({
    train: Dataset({
        features: ['ticker', 'name', 'type', 'sector', 'article_date', 'article_headline', 'label', 'input_ids', 'attention_mask'],
        num_rows: 200998
    })
    validation: Dataset({
        features: ['ticker', 'name', 'type', 'sector', 'article_date', 'article_headline', 'label', 'input_ids', 'attention_mask'],
        num_rows: 20100
    })
    test: Dataset({
        features: ['ticker', 'name', 'type', 'sector', 'article_date', 'article_headline', 'label', 'input_ids', 'attention_mask'],
        num_rows: 30150
    })
})

In [13]:
# Shuffle each split with a fixed seed, then keep the first 1000 rows.
shuffled_train = ds["train"].shuffle(seed=42)
shuffled_test = ds["test"].shuffle(seed=42)

# Training subset (from the "train" split)
small_train_dataset = shuffled_train.select(range(1000))
# Evaluation subset (from the "test" split)
small_eval_dataset = shuffled_test.select(range(1000))

In [14]:
# Pre-trained GPT-2 with a freshly initialised 5-way classification head
# (one output per sentiment class).
model = GPT2ForSequenceClassification.from_pretrained("gpt2", num_labels=5)
# The classification head reads the last non-padding token of each sequence,
# which it can only locate when the model config knows the padding id.
# Without it the head uses the final position (a padding token for padded
# inputs) and batches larger than 1 are rejected.
model.config.pad_token_id = tokenizer.pad_token_id

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
# Load the evaluation metric by name.
METRIC_NAME = "accuracy"
metric = evaluate.load(METRIC_NAME)

In [None]:
# Display the loaded metric object.
metric

In [None]:
def compute_metrics(eval):
    """Score one set of evaluation outputs with the module-level `metric`.

    Args:
        eval: A pair `(logits, labels)`; it is unpacked into exactly two items.

    Returns:
        The result of `metric.compute` for the predicted and reference labels.
    """
    raw_scores, gold_labels = eval
    # The index of the highest score along the last axis is the predicted class.
    predicted_labels = np.argmax(raw_scores, axis=-1)
    return metric.compute(references=gold_labels, predictions=predicted_labels)

In [None]:
# Training configuration: outputs are written under "test_trainer". Each
# device processes one example per step, and gradients are accumulated over
# 4 steps before an optimizer update.
training_args = TrainingArguments(
    output_dir="test_trainer",
    # evaluation_strategy="epoch",
    gradient_accumulation_steps=4,
    per_device_train_batch_size=1,  # kept small
    per_device_eval_batch_size=1,   # kept small for evaluation as well
)

# Wire the model, the two data subsets and the metric function together.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
)

In [None]:
# Run fine-tuning with the trainer configured above.
trainer.train()