In [None]:
!pip install datasets
!pip install accelerate -U
!pip install transformers[torch]

In [None]:
from datasets import load_dataset
from sklearn.metrics import classification_report
# Load the "MoritzLaurer/sentiment_economy_news" dataset, rename its `labels`
# column to `label`, and drop the columns that are listed below.
data = (
    load_dataset("MoritzLaurer/sentiment_economy_news")
    .rename_column("labels", "label")
    .remove_columns(
        ["articleid", "relevance", "positivity", "split", "positivity_rounded", "idx"]
    )
)

In [None]:
# Display the dataset object to inspect what was loaded.
data

In [None]:
# Peek at the first record of the "train" split.
data["train"][0]

In [None]:
import random

import numpy as np
import torch

# Seed every random number generator in play (Python, NumPy, PyTorch) from a
# single value; seeding PyTorch alone leaves the other two unseeded.
seed = 777
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# Ask cuDNN for deterministic algorithms and disable its benchmark-based
# algorithm selection so repeated runs pick the same kernels.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [None]:
from transformers import AutoTokenizer
# Load the tokenizer published with the "distilbert-base-uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [None]:
# Class index -> class name, and the inverse mapping derived from it so the two
# dictionaries cannot drift apart.
id2label = {0: "negative", 1: "positive"}
label2id = {name: index for index, name in id2label.items()}

In [None]:
def preprocess_function(examples):
  """Tokenize a batch of examples and convert their labels to integer ids.

  Args:
    examples: a batch mapping with a "text" list and a "label" list; every
      label must be a key of the module-level `label2id`.

  Returns:
    The tokenizer output for the batch, with "label" set to the list of
    integer class ids.

  Raises:
    KeyError: if a label is not present in `label2id`.
  """
  # Only truncate here. Padding is deliberately left out: padding inside a
  # batched map pads every example to the longest text of the whole map batch,
  # whereas the DataCollatorWithPadding used for training pads each mini-batch
  # to its own longest sequence.
  tokenized_batch = tokenizer(examples["text"], truncation=True)
  tokenized_batch["label"] = [label2id[label] for label in examples["label"]]
  return tokenized_batch

In [None]:
# Run preprocess_function over the dataset in batched mode, so the function
# receives lists of examples rather than one example at a time.
tokenized_data = data.map(preprocess_function, batched=True)

In [None]:
from transformers import DataCollatorWithPadding
# Collator built around the same tokenizer; it is handed to the Trainer below
# to assemble (and pad) each mini-batch.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
from transformers import AutoModelForSequenceClassification

# Load "distilbert-base-uncased" with a two-class sequence-classification head
# and attach the label-name mappings to the model config.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
)

In [None]:
import dataclasses

from transformers import TrainingArguments

# The per-epoch evaluation option is spelled `eval_strategy` in newer
# transformers releases and `evaluation_strategy` in older ones. Because the
# install cell does not pin a version, use whichever field the installed
# TrainingArguments dataclass actually declares.
_training_arg_fields = {field.name for field in dataclasses.fields(TrainingArguments)}
eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_fields else "evaluation_strategy"
)

# Hyperparameters for the fine-tuning run. Evaluation and checkpointing both
# happen once per epoch, and the best checkpoint is reloaded when training ends.
training_args = TrainingArguments(
    output_dir="DistilSentiment",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    weight_decay=0.01,
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
    report_to="none",
    **{eval_strategy_key: "epoch"},
)

In [None]:
import inspect

from transformers import Trainer

# Newer transformers releases accept the tokenizer as `processing_class`, while
# older ones only know `tokenizer`. Pass it under whichever keyword the
# installed Trainer accepts so the cell works with an unpinned install.
_trainer_params = inspect.signature(Trainer.__init__).parameters
tokenizer_kwarg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    # NOTE(review): the "test" split is evaluated every epoch and, because
    # load_best_model_at_end=True, it also drives checkpoint selection; the same
    # split is scored again later in the notebook. Consider a separate
    # validation split.
    eval_dataset=tokenized_data["test"],
    data_collator=data_collator,
    # compute_metrics=compute_metrics,
    **{tokenizer_kwarg: tokenizer},
)

In [None]:
# Run the fine-tuning loop configured by `training_args`.
trainer.train()

In [None]:
# import torch
# with torch.no_grad():
#     torch.cuda.empty_cache()

In [None]:
# Sample news passage used below as a single inference input.
text = """
Egg and lettuce prices, for example, have also declined significantly after having soared in 2022.
Among the reasons for those initial shocks: a historic outbreak of avian influenza in the U.S.,
which is extremely lethal among birds such as egg-laying hens, and an insect-borne virus that
raged through the Salinas Valley growing region in California, which accounts for about half of U.S. lettuce production.
"""

In [None]:
# Tokenize the sample passage into PyTorch tensors. Truncation is enabled to
# match the training-time preprocessing, so an over-long passage is cut down
# instead of being passed to the model at full length.
inputs = tokenizer(text, truncation=True, return_tensors="pt")
inputs

In [None]:
# Show the checkpoint path the trainer recorded as the best one.
trainer.state.best_model_checkpoint

In [None]:
import torch

# Prefer the checkpoint the trainer recorded as best. Fall back to the path
# that was previously hard-coded when no best checkpoint is recorded (for
# example, when `trainer` is a freshly built instance that never trained).
best_checkpoint = trainer.state.best_model_checkpoint
model_path = best_checkpoint if best_checkpoint else "./DistilSentiment/checkpoint-188"

# Run on the GPU when one is present, otherwise on the CPU, so the cell does
# not fail on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=2)
model = model.to(device)

# Inference only: no gradients are needed.
with torch.no_grad():
    logits = model(**inputs.to(device)).logits

# Take the argmax over the class dimension rather than over the flattened tensor.
predicted_class_id = logits.argmax(dim=-1).item()
model.config.id2label[predicted_class_id]

In [None]:
# Build a second Trainer around the reloaded model, used only for prediction.
# Note that this rebinds `trainer`, so the training-time instance is no longer
# reachable under that name.
trainer = Trainer(
    model=model,
    data_collator=data_collator,
)

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score

# Score the "test" split: take the highest-scoring class of each prediction row
# and compare it with the reference label ids.
preds = trainer.predict(tokenized_data["test"])
predicted_labels = preds.predictions.argmax(axis=1)
accuracy_score(preds.label_ids, predicted_labels)

In [None]:
# Print the scikit-learn classification report for the same test-split predictions.
print(classification_report(preds.label_ids, preds.predictions.argmax(1)))

In [None]:
# Save the evaluated model to a new directory.
# NOTE(review): this `trainer` was constructed without a tokenizer, so the
# tokenizer files are presumably not written alongside the model; confirm
# before loading from this directory.
trainer.save_model('./DistilSentiment073')