In [None]:
import torch
import pandas as pd
import pyarrow.parquet as pq
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from pathlib import Path

# Hugging Face Hub checkpoint used for sentiment scoring.
MODEL = "ProsusAI/finbert"
# Number of texts sent through the model per forward pass.
BATCH_SIZE = 64

tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
# Inference only: switch off dropout so repeated runs give the same scores.
model.eval()

if torch.cuda.is_available():
    model = model.cuda()

# Class names in logit order, read from the checkpoint's own config instead of
# being hard-coded. Position k of the model output corresponds to
# `model.config.id2label[k]`; a hand-written list in a different order would
# silently attach every probability (and the argmax label) to the wrong name.
# NOTE(review): the previous literal was ["negative", "neutral", "positive"];
# confirm against the checkpoint's config.json whether earlier results were
# produced with a mismatched order and need to be regenerated.
labels = [model.config.id2label[idx] for idx in range(model.config.num_labels)]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [2]:
from google.colab import drive

# Mount Google Drive inside the Colab VM; the project data is read from and
# written to paths below this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Mounted at /content/drive


In [4]:
# Directory of parquet files to score; each file is later processed as one
# fragment of the dataset.
FINBERT_INPUT_DIR = "/content/drive/MyDrive/Project_Data/data_for_finbert/"
dataset = pq.ParquetDataset(FINBERT_INPUT_DIR)

In [5]:
from tqdm import tqdm

# One dict per input row: the row's "No" value, one probability per entry in
# `labels`, and the name of the highest-probability label.
results = []

# The device does not change while the loop runs, so query it once instead of
# once per batch.
use_cuda = torch.cuda.is_available()

for fragment in dataset.fragments:
    table = fragment.to_table()
    df = table.to_pandas()

    texts = df["Lsa_summary"].tolist()
    # Pull the identifier column out once per fragment. Looking it up with
    # `df.iloc[row]["No"]` builds a full row Series for every single record.
    row_ids = df["No"].tolist()

    # Progress bar for batches only
    for start in tqdm(range(0, len(texts), BATCH_SIZE), desc="Processing batches"):
        batch = texts[start:start + BATCH_SIZE]

        inputs = tokenizer(
            batch,
            padding=True,
            truncation=True,
            return_tensors="pt"
        )

        if use_cuda:
            inputs = {k: v.cuda() for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)
            probs = torch.softmax(outputs.logits, dim=1)

        # Bring the whole batch back to the host in two transfers. Calling
        # `.item()` on individual elements costs one transfer per value.
        batch_probs = probs.cpu().tolist()
        batch_best = probs.argmax(dim=1).tolist()

        for offset, (row_probs, best_idx) in enumerate(zip(batch_probs, batch_best)):
            result = {
                "No": row_ids[start + offset]
            }

            # Add all label probabilities (column k of `probs` <-> labels[k])
            result.update(zip(labels, row_probs))

            # Add the predicted sentiment
            result["predicted_sentiment"] = labels[best_idx]

            results.append(result)

Processing batches: 100%|██████████| 556/556 [09:00<00:00,  1.03it/s]
Processing batches: 100%|██████████| 552/552 [08:54<00:00,  1.03it/s]
Processing batches: 100%|██████████| 663/663 [11:10<00:00,  1.01s/it]
Processing batches: 100%|██████████| 579/579 [09:55<00:00,  1.03s/it]
Processing batches: 100%|██████████| 561/561 [09:49<00:00,  1.05s/it]
Processing batches: 100%|██████████| 583/583 [10:14<00:00,  1.05s/it]
Processing batches: 100%|██████████| 582/582 [10:28<00:00,  1.08s/it]
Processing batches: 100%|██████████| 565/565 [09:54<00:00,  1.05s/it]
Processing batches: 100%|██████████| 543/543 [09:42<00:00,  1.07s/it]
Processing batches: 100%|██████████| 531/531 [09:04<00:00,  1.03s/it]
Processing batches: 100%|██████████| 547/547 [09:00<00:00,  1.01it/s]
Processing batches: 100%|██████████| 527/527 [09:22<00:00,  1.07s/it]
Processing batches: 100%|██████████| 502/502 [08:46<00:00,  1.05s/it]
Processing batches: 100%|██████████| 454/454 [08:06<00:00,  1.07s/it]
Processing batches: 

In [6]:
# Persist every collected record as a single parquet file on Drive; the
# positional index carries no information, so it is not written.
SENTIMENT_OUTPUT_PATH = "/content/drive/MyDrive/Project_Data/sentiment_data.parquet"
sentiment_df = pd.DataFrame(results)
sentiment_df.to_parquet(SENTIMENT_OUTPUT_PATH, index=False)