In [None]:
!pip install transformers datasets evaluate
!pip install transformers[torch]

In [None]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer
import pyarrow as pa
import pyarrow.dataset as ds
from datasets import Dataset
import datasets

# ---------------------------------------------------------------------------
# Data preparation
#   1. read the tweet CSV (columns used: "text" and "topic")
#   2. encode every topic as an integer id (ids follow order of first appearance)
#   3. drop rows whose text is too short
#   4. derive inverse-frequency class weights from the rows that are kept
#   5. build a seeded train/test split as a Hugging Face DatasetDict
# ---------------------------------------------------------------------------
MIN_TEXT_CHARS = 10   # rows with fewer characters of text are discarded
TEST_FRACTION = 0.2   # share of the kept rows held out for evaluation
SPLIT_SEED = 42       # fixed so that Restart & Run All reproduces the same split

raw_df = pd.read_csv("./osr_tweets_without_T_U_U_v2.csv", engine='python')
# Colab alternative:
# raw_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/data/osr_tweets_without_T_U_U_v2.csv", engine='python')

# Texts as plain strings (missing values become the string "nan", which the
# length filter below removes).
docs = raw_df["text"].astype(str)

# topic -> id, numbered in order of first appearance, plus the inverse mapping.
label2id = {}
for topic in raw_df["topic"]:
    if topic not in label2id:
        label2id[topic] = len(label2id)
id2label = {idx: topic for topic, idx in label2id.items()}

# Integer label per row, built as a new Series instead of overwriting the
# topic column element by element.
labels = pd.Series(
    [label2id[topic] for topic in raw_df["topic"]],
    index=raw_df.index,
    name="label",
    dtype="int64",
)

# Final modelling frame: columns "text" and "label", short texts removed.
df = pd.DataFrame({"text": docs, "label": labels})
df = df[df["text"].map(len) >= MIN_TEXT_CHARS]

# Per-class row counts, taken from the rows that survive the filter so that
# the weights describe the data the model is actually trained on.
kept_counts = df["label"].value_counts()
id2counter = {idx: int(kept_counts.get(idx, 0)) for idx in id2label}
label2counter = {id2label[idx]: count for idx, count in id2counter.items()}

# Inverse relative frequency, indexed by class id. A class with no surviving
# rows gets weight 0.0 (it cannot occur in a batch) rather than dividing by zero.
class_weight = [len(df) / count if count else 0.0 for count in id2counter.values()]

# Number of classes; the training cell passes this as `num_labels`.
id_counter = len(class_weight)

# In-memory pyarrow dataset of the same frame. Not referenced by the cells
# visible in this notebook; kept in case a later cell relies on it.
dataset = ds.dataset(pa.Table.from_pandas(df).to_batches())

### convert to Huggingface dataset
# preserve_index=False keeps the pandas index from becoming an extra column.
hg_dataset = Dataset.from_pandas(df, preserve_index=False)

split = hg_dataset.train_test_split(test_size=TEST_FRACTION, shuffle=True, seed=SPLIT_SEED)
train_dataset, test_dataset = split["train"], split["test"]
db = datasets.DatasetDict({"train": train_dataset, "test": test_dataset})

In [None]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login. The training cell sets push_to_hub=True
# and calls trainer.push_to_hub(), so this is presumably what supplies the
# credentials for those uploads -- run it before training.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from transformers import DataCollatorWithPadding
import evaluate
from transformers import create_optimizer
import tensorflow as tf
from transformers import TFAutoModelForSequenceClassification
from transformers.keras_callbacks import KerasMetricCallback
from transformers.keras_callbacks import PushToHubCallback
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch
from torch import nn

class CustomTrainer(Trainer):
    """Trainer that replaces the model's own loss with class-weighted cross-entropy.

    The per-class weights are read from the module-level ``class_weight`` list,
    whose position ``i`` holds the weight of class id ``i``.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the class-weighted cross-entropy loss for one batch.

        Args:
            model: The model to run; it may be a parallel wrapper around
                ``self.model``.
            inputs: Mapping of model inputs that includes a ``"labels"`` entry.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                only the loss.
            num_items_in_batch: Accepted because newer ``Trainer`` versions
                pass it to ``compute_loss``; it is not used here.

        Returns:
            The scalar loss, or ``(loss, outputs)`` if ``return_outputs`` is set.
        """
        labels = inputs.get("labels")
        # Forward pass without the labels, so the model does not also compute
        # its built-in unweighted loss. `inputs` itself is left unmodified.
        model_inputs = {key: value for key, value in inputs.items() if key != "labels"}
        outputs = model(**model_inputs)
        logits = outputs.get('logits')
        # Take device and dtype from the logits: a DataParallel wrapper has no
        # `.device` attribute, and the weights must match the logits anyway.
        weights = torch.tensor(class_weight, dtype=logits.dtype, device=logits.device)
        loss_fct = nn.CrossEntropyLoss(weight=weights)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

# Tokenizer for the same checkpoint name that the training cell fine-tunes.
tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased")


def preprocess_function(examples):
    """Tokenize the "text" field of a batch of examples with truncation enabled."""
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded


# Apply the tokenizer to every split, one batch of rows at a time.
tokenized_db = db.map(function=preprocess_function, batched=True)

# Padding collator with its default tensor type; the TensorFlow variant
# would additionally pass return_tensors="tf".
data_collator = DataCollatorWithPadding(tokenizer)

# Performance measure metrics
# Metrics reported after every evaluation pass, in output order.
_METRIC_NAMES = ("precision", "recall", "f1", "accuracy")
# name -> loaded metric object; filled on first use so each metric is loaded
# once per kernel instead of once per evaluation pass.
_loaded_metrics = {}


def _get_metric(name):
    """Return the `evaluate` metric called `name`, loading it on first use only."""
    if name not in _loaded_metrics:
        _loaded_metrics[name] = evaluate.load(name)
    return _loaded_metrics[name]


def custom_metrics(eval_pred):
    """Compute macro precision, recall and F1 plus accuracy for one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class of each
            row is the argmax of ``logits`` along axis 1.

    Returns:
        dict with the keys ``"precision"``, ``"recall"``, ``"f1"`` and
        ``"accuracy"``, in that order.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)

    scores = {}
    for name in _METRIC_NAMES:
        # Accuracy takes no averaging mode; the other three are macro-averaged.
        extra = {} if name == "accuracy" else {"average": "macro"}
        result = _get_metric(name).compute(predictions=predictions, references=labels, **extra)
        scores[name] = result[name]
    return scores

# Hyper-parameter grid: one fine-tuning run per (batch size, learning rate) pair.
batch_size_list = [16]
lr_list = [2e-5]
num_epochs = 10

for batch_size in batch_size_list:
    for lr in lr_list:
        # training using PyTorch
        # Start every run from a freshly loaded checkpoint. The number of
        # classes comes straight from the label mapping rather than from a
        # counter left over by an earlier loop.
        model = AutoModelForSequenceClassification.from_pretrained(
            "bert-large-uncased", num_labels=len(id2label), id2label=id2label, label2id=label2id
        )

        # NOTE(review): every grid combination writes to the same output_dir
        # (and therefore the same Hub repository); if the grid grows beyond one
        # combination, later runs will overwrite earlier ones -- confirm intent.
        training_args = TrainingArguments(
            output_dir="BERT_large_with_preprocessing_grid_search",
            learning_rate=lr,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            num_train_epochs=num_epochs,  # was a hard-coded 10 that ignored num_epochs
            weight_decay=0.01,
            evaluation_strategy="epoch",
            save_strategy="epoch",
            load_best_model_at_end=True,
            push_to_hub=True,
        )

        trainer = CustomTrainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_db["train"],
            eval_dataset=tokenized_db["test"],
            tokenizer=tokenizer,
            data_collator=data_collator,
            compute_metrics=custom_metrics,
        )

        trainer.train()
        trainer.push_to_hub()