In [1]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
os.environ['TORCH_USE_CUDA_DSA'] = '1'

In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.21.2 (from datasets)
  Downloading huggingface_hub-0.23.0-py3-none-any.

In [2]:
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd
from datasets import Dataset

In [3]:
df = pd.read_csv('estonianvalence.csv', encoding="utf-8")

In [4]:
df = df[["valence", "paragraph"]]
df = df[df["valence"] != "vastuoluline"]
df = df.rename(columns={"valence": "label", "paragraph": "text"})
df = df.reset_index(drop=True)

In [5]:
df

Unnamed: 0,label,text
0,negatiivne,Enam kui kümme aastat tagasi tegutses huumoris...
1,positiivne,Isiklikult kohtasin natukegi Kukekese moodi po...
2,negatiivne,Kummaline on nüüd äkki lugeda politsei ja sise...
3,negatiivne,"Küsimus pole mitte leebetes karistustes, vaid ..."
4,negatiivne,Ajakirjanikuna ei saa ma politseile soovitada ...
...,...,...
3531,negatiivne,Hispaania peaminister Mariano Rajoy on Euroopa...
3532,negatiivne,"Hispaania keskpank on hoiatanud, et riik ei pr..."
3533,neutraalne,Gruusia parlamendivalimised võitnud koalitsioo...
3534,neutraalne,Gruusia meedia on juba spekuleerinud tulevase ...


In [7]:
sentiment_numbers = {'negatiivne': 0, 'neutraalne': 1, 'positiivne': 2}
df['label'] = df['label'].replace(sentiment_numbers)
df

Unnamed: 0,label,text
0,0,Enam kui kümme aastat tagasi tegutses huumoris...
1,2,Isiklikult kohtasin natukegi Kukekese moodi po...
2,0,Kummaline on nüüd äkki lugeda politsei ja sise...
3,0,"Küsimus pole mitte leebetes karistustes, vaid ..."
4,0,Ajakirjanikuna ei saa ma politseile soovitada ...
...,...,...
3531,0,Hispaania peaminister Mariano Rajoy on Euroopa...
3532,0,"Hispaania keskpank on hoiatanud, et riik ei pr..."
3533,1,Gruusia parlamendivalimised võitnud koalitsioo...
3534,1,Gruusia meedia on juba spekuleerinud tulevase ...


In [8]:
dataset = Dataset.from_pandas(df)
dataset = dataset.train_test_split(test_size=0.2)

In [9]:
small_train_dataset = dataset["train"]
small_test_dataset = dataset["test"]

In [10]:
small_train_dataset[0]

{'label': 0,
 'text': 'Jõhkra käitumisega nooruk on varem kriminaalkorras karistamata. Teda on karistatud väärteo, tubakaseaduse rikkumise eest 12 euro suuruse trahviga mullu mais. Juunis sai ta samuti 12 eurot trahvi.'}

In [11]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
tokenizer = AutoTokenizer.from_pretrained("EMBEDDIA/est-roberta")
#model = AutoModelForMaskedLM.from_pretrained("EMBEDDIA/est-roberta", num_labels=3)
model = AutoModelForSequenceClassification.from_pretrained("EMBEDDIA/est-roberta", num_labels=3)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at EMBEDDIA/est-roberta and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=512)

tokenized_train = small_train_dataset.map(preprocess_function, batched=True)
tokenized_test = small_test_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/2828 [00:00<?, ? examples/s]

Map:   0%|          | 0/708 [00:00<?, ? examples/s]

In [13]:
print("Batch size of tokenized_train dataset:", len(tokenized_train))
print("Batch size of tokenized_test dataset:", len(tokenized_test))

Batch size of tokenized_train dataset: 2828
Batch size of tokenized_test dataset: 708


In [14]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [15]:
import numpy as np
from datasets import load_metric

def compute_metrics(eval_pred):
    load_accuracy = load_metric("accuracy")
    load_f1 = load_metric("f1")

    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = load_accuracy.compute(predictions=predictions, references=labels)["accuracy"]
    f1 = load_f1.compute(predictions=predictions, references=labels, average="weighted")["f1"]

    return {"accuracy": accuracy, "f1": f1}

In [16]:
#!pip install transformers[torch]

In [17]:
import torch
torch.cuda.is_available()

True

In [18]:
#!pip uninstall transformers accelerate
#!pip install transformers accelerate



In [19]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="test",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    save_strategy="epoch",
)

trainer = Trainer(
   model=model,
   args=training_args,
   train_dataset=tokenized_train,
   eval_dataset=tokenized_test,
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics,
)

In [20]:
trainer.train()

Step,Training Loss
500,0.5971
1000,0.193
1500,0.0553


TrainOutput(global_step=1770, training_loss=0.24334039957509876, metrics={'train_runtime': 782.3692, 'train_samples_per_second': 36.147, 'train_steps_per_second': 2.262, 'total_flos': 1699067228420016.0, 'train_loss': 0.24334039957509876, 'epoch': 10.0})

In [21]:
trainer.evaluate()

  load_accuracy = load_metric("accuracy")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

{'eval_loss': 1.2142927646636963,
 'eval_accuracy': 0.7838983050847458,
 'eval_f1': 0.7822232365313707,
 'eval_runtime': 7.7963,
 'eval_samples_per_second': 90.812,
 'eval_steps_per_second': 5.772,
 'epoch': 10.0}

In [22]:
model.save_pretrained('/content/drive/MyDrive/MAKATÖÖ/roBertaSentiment_V2')