In [None]:
# Show which NVIDIA GPU(s), if any, this runtime can see.
!nvidia-smi

In [None]:
!pip install datasets transformers

In [None]:
!wget -O train_en.jsonl.bz2 https://github.com/TurkuNLP/wikipedia-toxicity-data-fi/raw/main/train_en.jsonl.bz2
!wget -O test_en.jsonl.bz2 https://github.com/TurkuNLP/wikipedia-toxicity-data-fi/raw/main/test_en.jsonl.bz2

!bzip2 -dk train_en.jsonl.bz2
!bzip2 -dk test_en.jsonl.bz2

!ls -l

In [None]:
import torch

# Prefer the first CUDA device when one is available, otherwise use the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Output directory handed to TrainingArguments (the commented line is an
# alternative location on a mounted drive).
output_dir = "./results"
#output_dir = "./drive/MyDrive/utu/kandi/results1"

# Set to a checkpoint directory such as "./results/checkpoint-15000" to resume
# from it; None means start from the pretrained weights.
checkpoint_dir = None

# Name used to load both the tokenizer and the pretrained model.
tokenizer_name = "bert-base-uncased"

# Binary label columns of the data set; their order defines the output order
# of the classifier.
labels = [
    "label_identity_attack",
    "label_insult",
    "label_obscene",
    "label_severe_toxicity",
    "label_threat",
    "label_toxicity",
]

device

In [None]:
import os

# Expose the Weights & Biases API key through the environment.
# On Kaggle the key is read from the notebook's user secrets. The
# kaggle_secrets package is only importable inside Kaggle, so on any other
# runtime the lookup is skipped and a WANDB_API_KEY that is already present in
# the environment is left untouched (instead of the cell failing on import).
try:
    from kaggle_secrets import UserSecretsClient
except ImportError:
    UserSecretsClient = None

if UserSecretsClient is not None:
    user_secrets = UserSecretsClient()
    secret_value_0 = user_secrets.get_secret("WANDB_API_KEY")

    os.environ['WANDB_API_KEY'] = secret_value_0

In [None]:
import numpy as np
from datasets import load_dataset, Dataset

# Load the two decompressed JSON-lines files as the "train" and "test" splits.
dataset = load_dataset(
    "json",
    data_files={
        "train": "train_en.jsonl",
        "test": "test_en.jsonl",
    },
)

In [None]:
import pandas as pd

# Pull the training split into pandas to look at the class balance.
df_pandas = dataset['train'].to_pandas()
df_pandas[labels].sum().sort_values().plot(kind="barh");

# A row counts as toxic when at least one of the label columns is set.
positive_label_count = df_pandas[labels].sum(axis=1)
train_toxic = df_pandas[positive_label_count > 0]
train_clean = df_pandas[positive_label_count == 0]

pd.DataFrame(dict(
  toxic=[len(train_toxic)],
  clean=[len(train_clean)]
)).plot(kind='barh');


# Rebalance: keep every toxic row plus a fixed-seed sample of up to 40,000
# clean rows. The size is capped at the number of clean rows available so that
# .sample() cannot raise on a smaller input.
train_df = pd.concat([
  train_toxic,
  train_clean.sample(min(40_000, len(train_clean)), random_state=123)
])

# shuffle
# preserve_index=False: after filtering and shuffling the frame no longer has
# a plain range index, and from_pandas would otherwise store it as an extra
# "__index_level_0__" column that exists only in the train split.
dataset['train'] = Dataset.from_pandas(
    train_df.sample(frac=1, random_state=123),
    preserve_index=False,
)

print(dataset['train'][:5])
print(dataset)

In [None]:
from transformers import AutoTokenizer
    
# Tokenizer for the pretrained model named in the configuration cell.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

def preprocess(items):
    """Tokenize the "text" field of the given examples with truncation enabled.

    Used with ``Dataset.map(..., batched=True)``, so ``items["text"]`` is passed
    to the tokenizer as-is and the tokenizer's output is returned unchanged.
    """
    texts = items["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

def _label_vector(example):
    """Gather the per-label columns of one example into a single float vector."""
    values = [example[name] for name in labels]
    return {'labels': torch.FloatTensor(values)}

# Drop the metadata columns, fold the individual label columns into one
# "labels" vector, then remove the now-redundant label columns.
ds = (
    dataset
    .remove_columns(["id", "lang"])
    .map(_label_vector)
    .remove_columns(labels)
)

# Tokenize every split in batches.
tokenized_data = ds.map(preprocess, batched=True)

In [None]:
from transformers import DataCollatorWithPadding

# Batch collator built around the tokenizer; handed to the Trainer below as
# its data_collator.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Peek at the first processed training example.
tokenized_data['train'][0]

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, TrainerCallback

# Load the classifier: from the configured checkpoint when one is given,
# otherwise from the pretrained weights set up for multi-label classification
# with one output per entry in `labels`.
if checkpoint_dir is None:
    model = AutoModelForSequenceClassification.from_pretrained(
        tokenizer_name,
        num_labels=len(labels),
        problem_type="multi_label_classification",
        id2label=dict(enumerate(labels)),
        label2id={name: index for index, name in enumerate(labels)},
    )
else:
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint_dir)
# model = model.to(device)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
from scipy.special import expit as sigmoid

def compute_metrics(pred):
    """Compute multi-label classification metrics for one evaluation pass.

    Args:
        pred: Object exposing ``predictions`` (model outputs that are passed
            through a sigmoid; expected to have one column per entry in
            ``labels``) and ``label_ids`` (the reference labels).

    Returns:
        A flat ``{name: float}`` dict. Each cell of scikit-learn's
        classification report is exposed as ``"<row>_<statistic>"`` with spaces
        replaced by underscores (e.g. ``"label_insult_precision"``,
        ``"weighted_avg_f1-score"``). ``"eval_metric"`` holds the
        weighted-average F1 score, which is the key the training arguments
        use to pick the best checkpoint.
    """
    y_act = pred.label_ids
    # Independent sigmoid per label, thresholded at 0.5.
    y_pred = (sigmoid(pred.predictions) > 0.5).astype(float)

    # Same settings for the printed and the dict report so both are keyed by
    # label name. zero_division=0 yields the same numbers as the default but
    # without a warning for every label/sample that has no positives.
    report_kwargs = dict(target_names=labels, zero_division=0)

    print("--------")
    print(classification_report(y_act, y_pred, **report_kwargs))
    print("--------")

    report = classification_report(y_act, y_pred, output_dict=True, **report_kwargs)

    # Flatten the nested report so every logged metric is a plain scalar.
    metrics = {}
    for row_name, row in report.items():
        if isinstance(row, dict):
            for stat_name, value in row.items():
                metrics[f"{row_name}_{stat_name}".replace(" ", "_")] = float(value)
        else:
            metrics[row_name.replace(" ", "_")] = float(row)

    metrics["eval_metric"] = report["weighted avg"]["f1-score"]
    return metrics
    
training_args = TrainingArguments(
    # Output directory from the configuration cell.
    output_dir=output_dir,

    # Optimisation schedule.
    num_train_epochs=3, # TODO
    learning_rate=5e-5,
    warmup_steps=1000,
    weight_decay=0.01,
    #per_device_train_batch_size=16,
    #per_device_eval_batch_size=16,

    # Evaluate and save on the same 4000-step interval, and select the best
    # model by the "eval_metric" value returned from compute_metrics.
    evaluation_strategy="steps",
    eval_steps=4000,
    save_steps=4000,
    load_best_model_at_end=True,
    metric_for_best_model="eval_metric",
)

# Wire the model, data, collator and metric function together.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    compute_metrics=compute_metrics,
    #callbacks=[Callback],
)

# checkpoint_dir is None for a fresh run, which is the same as calling
# train() without arguments; otherwise training resumes from that checkpoint.
trainer.train(resume_from_checkpoint=checkpoint_dir)

In [None]:
#!zip -r ./checkpoint.zip ./results/checkpoint-12000