In [11]:
import evaluate
import numpy as np
import pandas as pd

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.functional import softmax
from torch.optim import AdamW

from sklearn.model_selection import train_test_split
# NOTE: this import must stay after the torch one above; the notebook relies on
# `Dataset` referring to `datasets.Dataset` (it calls `Dataset.from_pandas`).
from datasets import Dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    get_scheduler,
)
from tqdm.auto import tqdm

# Directory that holds the competition input files (note the trailing slash).
DATA_PATH = "/kaggle/input/futures-price-prediction/"
# Single seed shared by the train/validation splits and the RNGs seeded below.
RANDOM_STATE = 42

# Seed the torch and numpy generators as well, so that the freshly initialised
# classifier head, dropout and DataLoader shuffling are repeatable on a
# Restart & Run All. numpy is seeded last because it returns None, which keeps
# the cell from displaying the torch Generator object as its output.
torch.manual_seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

In [2]:
# Load the pre-processed posts. The location comes from the DATA_PATH config
# constant (which already ends with "/") instead of repeating the literal path.
df = pd.read_csv(DATA_PATH + "processed.csv")
df.head()

Unnamed: 0,text,is_quote_status,has_card,1_day_after,is_in_reply_to,is_urls,is_thumbnail_title,is_hashtags,year,month,day_of_week
0,metatckr zuckerberg has said that they would b...,0,0,0,0,0,0,0,2024,10,5
1,"you can follow all market news, including for ...",0,0,1,1,1,0,0,2024,9,5
2,just in: apple aapltckr is now reportedly no l...,0,0,1,0,0,0,0,2024,9,5
3,"apple, aapltckr, is no longer a ‘growth engine...",0,0,1,0,0,0,0,2024,9,3
4,here are the hottest and most active bullish a...,0,0,1,0,1,0,0,2024,9,0


In [3]:
# Keep only the model input ("text") and the target, renamed to "label".
df = df[["text", "1_day_after"]].rename(columns={"1_day_after": "label"})

# Two-stage split: hold out 20% as X_test, then carve 20% of the remainder
# off as the validation set. Both splits share RANDOM_STATE.
X_train, X_test = train_test_split(df, test_size=0.2, random_state=RANDOM_STATE)
Xtrain, Xval = train_test_split(X_train, test_size=0.2, random_state=RANDOM_STATE)

# preserve_index=False stops the shuffled pandas index from being stored as an
# extra "__index_level_0__" column, so there is nothing to remove afterwards
# (and nothing to fail on if the index ever happens to be a plain RangeIndex).
train = Dataset.from_pandas(Xtrain, preserve_index=False)
val = Dataset.from_pandas(Xval, preserve_index=False)

In [4]:
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")


def tokenize_function(examples, padding="max_length"):
    """Tokenize the ``text`` column of a batch of examples.

    Args:
        examples: Batch mapping that contains a ``"text"`` entry.
        padding: Forwarded to the tokenizer. The default ``"max_length"``
            keeps the previous behaviour for any existing caller; pass
            ``False`` to leave sequences unpadded so they can be padded
            per batch by a data collator.

    Returns:
        The tokenizer output for the batch, truncated to the tokenizer's
        maximum length.
    """
    return tokenizer(examples["text"], padding=padding, truncation=True)


def _prepare_split(dataset):
    """Tokenize one split and shape it into what the model's forward expects.

    Drops the raw ``text`` column, renames ``label`` to ``labels`` and switches
    the dataset to return torch tensors.
    """
    # Tokenize without padding: padding every short post to the maximum
    # length makes each batch far larger than it needs to be.
    prepared = dataset.map(tokenize_function, batched=True, fn_kwargs={"padding": False})
    prepared = prepared.remove_columns(["text"])
    prepared = prepared.rename_column("label", "labels")
    prepared.set_format("torch")
    return prepared


tokenized_train = _prepare_split(train)
tokenized_val = _prepare_split(val)

# Dynamic padding: each batch is padded only to the length of its longest
# sequence when it is collated.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

train_dataloader = DataLoader(
    tokenized_train, shuffle=True, batch_size=8, collate_fn=data_collator
)
eval_dataloader = DataLoader(tokenized_val, batch_size=8, collate_fn=data_collator)

tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Map:   0%|          | 0/5374 [00:00<?, ? examples/s]

Map:   0%|          | 0/1344 [00:00<?, ? examples/s]

In [16]:
# Load the FinBERT checkpoint with a 2-class head. The checkpoint's classifier
# has 3 outputs, so ignore_mismatched_sizes=True lets the head be re-created
# (and therefore randomly initialised) instead of raising on the shape mismatch.
model = AutoModelForSequenceClassification.from_pretrained(
    "ProsusAI/finbert",
    num_labels=2,
    ignore_mismatched_sizes=True,
)
model

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ProsusAI/finbert and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([2]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [17]:
# Move the model to the GPU *before* building the optimizer and only request
# the fused AdamW kernel when CUDA is actually in use. An unconditional
# fused=True can fail on CPU-only machines / PyTorch versions whose fused
# implementation requires CUDA parameters at construction time.
use_cuda = torch.cuda.is_available()
if use_cuda:
    model.cuda()

optimizer = AdamW(model.parameters(), lr=5e-5, fused=use_cuda)

In [18]:
# One scheduler step is taken per optimizer step, so the schedule length is
# (batches per epoch) x (number of epochs).
num_epochs = 3
num_training_steps = len(train_dataloader) * num_epochs

# "linear" schedule with no warmup steps.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [19]:
# Use the GPU when one is available, otherwise fall back to the CPU, and move
# the model's parameters there.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [20]:
# Display the selected device as a sanity check before training.
device

device(type='cuda')

In [24]:
# One progress-bar tick per optimizer step across all epochs.
progress_bar = tqdm(range(num_training_steps))
metric = evaluate.load("roc_auc")

for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    for batch in train_dataloader:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        # The batch carries "labels", and the loss is read from the model output.
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        # Step order: optimizer, then LR schedule, then clear the gradients.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update()

    # ---- validation pass ----
    model.eval()
    all_probs, all_labels = [], []

    with torch.no_grad():
        for batch in eval_dataloader:
            batch = {name: tensor.to(device) for name, tensor in batch.items()}
            outputs = model(**batch)

            logits = outputs.logits
            labels = batch["labels"].cpu()
            # Column 1 of the softmax is used as the score for label 1.
            probs = softmax(logits, dim=1)[:, 1].cpu()

            all_probs.extend(probs.numpy())
            all_labels.extend(labels.numpy())

    score = metric.compute(prediction_scores=all_probs, references=all_labels)
    print(f"Epoch {epoch}: ROC-AUC = {score['roc_auc']:.4f}")

  0%|          | 0/2016 [00:00<?, ?it/s]

Downloading builder script:   0%|          | 0.00/9.54k [00:00<?, ?B/s]

Epoch 0: ROC-AUC = 0.5078
Epoch 1: ROC-AUC = 0.5169
Epoch 2: ROC-AUC = 0.5221
