In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
!pip3 install -q -U torch --index-url https://download.pytorch.org/whl/cu117
!pip3 install -q -U -i https://pypi.org/simple/ bitsandbytes
!pip3 install -q -U transformers=="4.46.0"
!pip3 install -q -U trl=="0.12.1"
!pip3 install -q -U peft
!pip3 install -q -U tensorboard

Reason for being yanked: This version unfortunately does not work with 3.8 but we did not drop the support yet[0m[33m
[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow 2.16.1 requires tensorboard<2.17,>=2.16, but you have tensorboard 2.18.0 which is incompatible.[0m[31m
[0m

In [3]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
import re
from datasets import Dataset
from peft import LoraConfig
from trl import SFTTrainer
from trl import setup_chat_format
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, pipeline, logging
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [4]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Device:", device)

Device: cuda:0


In [5]:
path = "/kaggle/input/sentiment-analysis-for-financial-news/all-data.csv"

df = pd.read_csv(path, names=["sentiment", "text"], encoding="utf-8", encoding_errors="replace")

print(df.shape)
df.head()

(4846, 2)


Unnamed: 0,sentiment,text
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...


In [6]:
# Split data
train_df, test_df = train_test_split(df, test_size=0.1, stratify=df['sentiment'], random_state=42)

print(f"Train size: {len(train_df)}, Test size: {len(test_df)}")

Train size: 4361, Test size: 485


In [7]:
# Balance processing for training set
print(train_df['sentiment'].value_counts())

sentiment
neutral     2591
positive    1227
negative     543
Name: count, dtype: int64


In [8]:
n_samples_per_class = 543

balanced_df = train_df.groupby('sentiment', group_keys=False).apply(
    lambda x: x.sample(n=n_samples_per_class, random_state=42)
)

train_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)
print(train_df['sentiment'].value_counts())

sentiment
neutral     543
negative    543
positive    543
Name: count, dtype: int64


In [9]:
def prompt_with_label(data_point):
    return f"""
You are a financial sentiment analyzer. Based on the financial news, your task is to classify the sentiment of financial news articles into just ONE of the following categories: 'positive', 'negative', or 'neutral'. 

* For example:
The financial new: "{data_point["text"]}"
Output: {data_point["sentiment"]}
""".strip()

def prompt_without_label(data_point):
    return f"""
You are a financial sentiment analyzer. Based on the financial news, your task is to classify the sentiment of financial news articles into just ONE of the following categories: 'positive', 'negative', or 'neutral'. 

* For example:
The financial new: "The company 's net profit rose 11.4 % on the year to 82.2 million euros in 2005 on sales of 686.5 million euros , 13.8 % up on the year , the company said earlier ."
Output: positive

The financial new: "{data_point["text"]}"
Output:
""".strip()

train_df['text'] = train_df.apply(prompt_with_label, axis=1)
test_df['text'] = test_df.apply(prompt_without_label, axis=1)

train_df.iloc[0]['text']

'You are a financial sentiment analyzer. Based on the financial news, your task is to classify the sentiment of financial news articles into just ONE of the following categories: \'positive\', \'negative\', or \'neutral\'. \n\n* For example:\nThe financial new: "RFID ( Radio Frequency Identification ) is a method of so-called intelligent transport , whereby information can be read and saved remotely ."\nOutput: neutral'

In [10]:
model_name = "/kaggle/input/llama-3/transformers/8b-chat-hf/1"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device,
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config, 
)

max_seq_length = 256
tokenizer = AutoTokenizer.from_pretrained(model_name, max_seq_length=max_seq_length)
tokenizer.pad_token_id = tokenizer.eos_token_id

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [11]:
def predict(test_set, model, tokenizer):
    y_pred = []
    for i in range(len(test_set)):
        text = test_set['text'].iloc[i]
        inputs = tokenizer(text, return_tensors="pt").to(device)
        
        outputs = model.generate(**inputs, max_new_tokens=3, pad_token_id=model.config.eos_token_id)
        answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
        
        output_match = re.findall(r"Output:\s*(\w+)", answer) # get positive or negative or neutral
        last_output = output_match[-1] if output_match else 'none' # or none
        last_output = last_output.lower()
        if last_output in ['positive', 'negative', 'neutral']:
            y_pred.append(last_output)
        else:
            y_pred.append('none')
    return y_pred

In [12]:
y_pred = predict(test_df, model, tokenizer)

In [13]:
def evaluate(y_test, y_pred):
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Overall accuracy: {accuracy}\n")

    for label in set(y_test):
        label_y_true = [y for i, y in enumerate(y_test) if y == label]
        label_y_pred = [y_pred[i] for i in range(len(y_test)) if y_test[i] == label]
        print(f'Accuracy for "{label}": {accuracy_score(label_y_true, label_y_pred):.3f}')

In [14]:
evaluate(list(test_df['sentiment']), y_pred)

Overall accuracy: 0.7958762886597938

Accuracy for "negative": 0.918
Accuracy for "neutral": 0.944
Accuracy for "positive": 0.426


In [15]:
from datasets import Dataset

train_dataset = Dataset.from_pandas(train_df)

In [16]:
lora_config = LoraConfig(
    r=8,
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj",],
)

training_arguments = TrainingArguments(
    output_dir="outputs",                   
    per_device_train_batch_size=1,     
    gradient_accumulation_steps=8,    
    optim="paged_adamw_32bit",
    logging_steps=25,               
    learning_rate=2e-4,                
    fp16=True,
    warmup_ratio=0.03,            
    num_train_epochs=5, 
)

trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=train_dataset,
    dataset_text_field="text",
    peft_config=lora_config,
    tokenizer=tokenizer,
    max_seq_length=max_seq_length,
)

Map:   0%|          | 0/1629 [00:00<?, ? examples/s]

In [17]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011113568677776837, max=1.0…

Step,Training Loss
25,2.3399
50,1.0018
75,0.8873
100,0.8896
125,0.8843
150,0.8761
175,0.8839
200,0.8432
225,0.7553
250,0.7627


TrainOutput(global_step=1015, training_loss=0.5879278283988314, metrics={'train_runtime': 9739.484, 'train_samples_per_second': 0.836, 'train_steps_per_second': 0.104, 'total_flos': 3.1581535313977344e+16, 'train_loss': 0.5879278283988314, 'epoch': 4.984653161448741})

In [19]:
y_pred = predict(test_df, model, tokenizer)
evaluate(list(test_df['sentiment']), y_pred)

Overall accuracy: 0.8288659793814434

Accuracy for "negative": 0.951
Accuracy for "neutral": 0.906
Accuracy for "positive": 0.610
