In [1]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def preprocess_function(examples):
  """Tokenize the ``"text"`` field of ``examples`` with the module-level tokenizer.

  Inputs longer than 512 tokens are truncated. No padding is requested here.

  Args:
    examples: Mapping that exposes the raw text under the ``"text"`` key.

  Returns:
    Whatever the tokenizer call returns for those texts.
  """
  texts = examples["text"]
  return tokenizer(texts, truncation=True, max_length=512)

In [2]:
import numpy as np
import evaluate

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
  """Score predictions with the module-level ``metric`` (loaded as "accuracy").

  Args:
    eval_pred: A two-item iterable ``(logits, labels)``.

  Returns:
    The result of ``metric.compute`` for the arg-max predictions.
  """
  scores, references = eval_pred
  # The predicted class is the index of the largest score along the last axis.
  predicted_ids = np.argmax(scores, axis=-1)
  return metric.compute(references=references, predictions=predicted_ids)

In [3]:
def category_to_label(category: str):
  """Translate a category string into an integer class id.

  The letters ``b``, ``t``, ``e``, ``m`` are checked in that order and the
  position (0-3) of the first one contained in ``category`` is returned.

  Args:
    category: Category string to classify.

  Returns:
    0, 1, 2 or 3 for the first matching letter, or -1 when none occurs.
  """
  # Order matters: this is a substring test, so an earlier letter wins
  # whenever the string contains more than one of them.
  for label, letter in enumerate("btem"):
    if letter in category:
      return label
  return -1

In [4]:
import pandas as pd
import datasets
from transformers import AutoModelForSequenceClassification, DataCollatorWithPadding, Trainer, TrainingArguments

def _load_split(path, require_known_labels=True):
  """Read one tab-separated, header-less two-column file into a DataFrame.

  The columns are named ``label`` and ``text`` and the ``label`` column is
  converted to integer class ids with ``category_to_label``.

  Args:
    path: Location of the TSV file.
    require_known_labels: When true, raise if any row maps to -1 (the value
      ``category_to_label`` returns for an unrecognised category).

  Returns:
    DataFrame with an integer ``label`` column and a ``text`` column.

  Raises:
    ValueError: If ``require_known_labels`` is true and a label is -1.
  """
  frame = pd.read_table(path, header=None, sep="\t")
  frame.columns = ["label", "text"]
  frame["label"] = frame["label"].map(category_to_label)
  if require_known_labels:
    unknown = int((frame["label"] == -1).sum())
    if unknown:
      # -1 is not a valid class id for the 4-label classifier built below,
      # so fail here with a readable message instead of deep inside training.
      raise ValueError(f"{path}: {unknown} row(s) have an unrecognised category")
  return frame


# --- Load the three splits -------------------------------------------------
df = _load_split('../chapter06/train.txt')
valid = _load_split('../chapter06/valid.txt')
# Test labels are dropped before prediction (see test_dataset), so an
# unrecognised category there is harmless and is not treated as an error.
test = _load_split('../chapter06/test.txt', require_known_labels=False)

# --- Convert to datasets.Dataset and tokenize ------------------------------
train_dataset = datasets.Dataset.from_pandas(df[["label", "text"]])
train_tokenized = train_dataset.map(preprocess_function, batched=True)
val_dataset = datasets.Dataset.from_pandas(valid[["label", "text"]])
val_tokenized = val_dataset.map(preprocess_function, batched=True)
# Only the text column is kept for the test split: predictions are made
# without labels.
test_dataset = datasets.Dataset.from_pandas(test[["text"]])
test_tokenized = test_dataset.map(preprocess_function, batched=True)

# Pads each batch with the tokenizer when batches are collated
# (preprocess_function truncates but does not pad).
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# --- Model and training configuration --------------------------------------
# num_labels=4 matches the four class ids produced by category_to_label.
model = AutoModelForSequenceClassification.from_pretrained(
  "bert-base-uncased", num_labels=4
)

training_args = TrainingArguments(
  output_dir="./results",
  learning_rate=4e-5,
  # 4 samples per step x 3 accumulation steps = 12 samples per optimizer
  # update on each device.
  per_device_train_batch_size=4,
  per_device_eval_batch_size=64,
  num_train_epochs=3,
  weight_decay=0.01,
  eval_strategy="steps",
  eval_steps=250,
  # load_best_model_at_end needs save_steps to be a round multiple of
  # eval_steps; 1000 is a multiple of 250.
  load_best_model_at_end=True,
  save_steps=1000,
  gradient_accumulation_steps=3,
  save_total_limit=3,
)

trainer = Trainer(
  model=model,
  args=training_args,
  train_dataset=train_tokenized,
  eval_dataset=val_tokenized,
  tokenizer=tokenizer,
  data_collator=data_collator,
  compute_metrics=compute_metrics,
)

trainer.train()

# --- Persist raw predictions -----------------------------------------------
# np.save appends ".npy" to a file name that does not already end with it.
oof_results = trainer.predict(test_dataset=val_tokenized)
np.save("oof_prediction", oof_results.predictions)

results = trainer.predict(test_dataset=test_tokenized)
np.save("test_prediction", results.predictions)


Map:   0%|          | 0/10684 [00:00<?, ? examples/s]

Map:   0%|          | 0/1336 [00:00<?, ? examples/s]

Map:   0%|          | 0/1335 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
250,No log,0.326558,0.908683
500,0.373000,0.273635,0.916916
750,0.373000,0.250217,0.925898
1000,0.238600,0.297081,0.922904
1250,0.238600,0.282947,0.931138
1500,0.154100,0.283721,0.934132
1750,0.154100,0.243296,0.936377
2000,0.090200,0.329903,0.940868
2250,0.090200,0.298026,0.942365
2500,0.069700,0.303154,0.943114
