In [None]:
from pathlib import Path
from sklearn.model_selection import train_test_split


def read_imdb_split(split_dir):
    """Read one split of the IMDB review dataset from disk.

    ``split_dir`` must contain a ``pos`` and a ``neg`` sub-directory; every
    entry inside them is read as the text of one review.

    Args:
        split_dir: Path (``str`` or ``Path``) of the split directory,
            e.g. ``'aclImdb/train'``.

    Returns:
        A ``(texts, labels)`` tuple of parallel lists. ``labels`` holds ``1``
        for reviews found under ``pos`` and ``0`` for those under ``neg``.
        All ``pos`` reviews come first; within a label the files are in
        sorted path order.

    Raises:
        FileNotFoundError: If ``pos`` or ``neg`` is missing under ``split_dir``.
    """
    split_dir = Path(split_dir)
    texts = []
    labels = []
    for label_dir in ["pos", "neg"]:
        # Compare by value. `is` tests object identity, which only appeared to
        # work through string interning and raises a SyntaxWarning on 3.8+.
        label = 0 if label_dir == "neg" else 1
        # iterdir() yields entries in arbitrary order; sort so the returned
        # lists are identical from run to run.
        for text_file in sorted((split_dir / label_dir).iterdir()):
            # Decode explicitly instead of relying on the platform locale.
            texts.append(text_file.read_text(encoding="utf-8"))
            labels.append(label)

    return texts, labels

# Load the IMDB train and test splits from the local aclImdb directory.
train_texts, train_labels = read_imdb_split('aclImdb/train')
test_texts, test_labels = read_imdb_split('aclImdb/test')

# Hold out 20% of the training examples as a validation set. No random_state
# is passed, so the partition is presumably different on every run.
# NOTE(review): train_texts / train_labels are rebound again by the later
# read_dataset cell, so this IMDB split is not what the model is trained on.
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=0.2)

# Show the first ten training texts as the cell output.
train_texts[:10]

In [12]:
import json
import glob
from pathlib import Path

def read_dataset(data_dir: Path):
    """Load paper abstracts and acceptance labels from a directory of JSON files.

    Every ``*.json`` file directly inside ``data_dir`` is parsed, and its
    ``["review"]["abstract"]`` and ``["review"]["accepted"]`` entries are
    collected.

    Args:
        data_dir: Directory containing one JSON file per paper.

    Returns:
        A ``(texts, labels)`` tuple of parallel lists: the abstracts and
        ``int(accepted)`` for each file, in sorted file-path order. Both
        lists are empty when the directory has no ``*.json`` files.

    Raises:
        KeyError: If a file lacks the ``review``, ``accepted`` or
            ``abstract`` key.
        json.JSONDecodeError: If a file is not valid JSON.
    """
    texts = []
    labels = []
    # glob returns paths in arbitrary, filesystem-dependent order; sort them so
    # the sample order (and any seeded split built on it) is reproducible.
    for file_path in sorted(glob.glob(f"{data_dir}/*.json")):
        # Decode explicitly as UTF-8 rather than using the platform locale.
        with open(file_path, encoding="utf-8") as f:
            paper_json = json.load(f)
        # The file is fully parsed, so the handle can be closed before use.
        review = paper_json["review"]
        accepted = review["accepted"]
        abstract = review["abstract"]

        texts.append(abstract)
        labels.append(int(accepted))
    return texts, labels

# Directory holding the paper JSON files (a relative path, so it is resolved
# against the notebook's working directory).
data_dir = Path("data/original")
# Read every paper's abstract and integer acceptance label. This rebinds
# train_texts / train_labels to the paper data.
train_texts, train_labels = read_dataset(data_dir)

In [16]:
# Tally how many labels equal 1 (accepted) and how many equal 0 (not accepted)
# to see how balanced the two classes are.
num_accepted = sum(1 for label in train_labels if label == 1)
num_not_accepted = sum(1 for label in train_labels if label == 0)

print(num_accepted, num_not_accepted)

2891 8887


In [18]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the papers for validation.
# - stratify: the labels are imbalanced (2891 accepted vs 8887 not accepted in
#   the count printed earlier), so keep the same class ratio in both splits.
# - random_state: fixed so that Restart & Run All reproduces the same split.
# NOTE: this rebinds train_texts / train_labels, so re-running only this cell
# would split the already-reduced training set a second time.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts,
    train_labels,
    test_size=0.2,
    stratify=train_labels,
    random_state=42,
)

In [19]:
from transformers import DistilBertTokenizerFast
# Load the fast tokenizer that belongs to the 'distilbert-base-uncased' checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
# Tokenize each split in a single call with truncation and padding enabled.
# Presumably every encoding within a split ends up the same length, capped at
# the model's maximum — the first training example printed in the next cell
# has 512 tokens. TODO confirm against the tokenizer documentation.
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

In [20]:
import textwrap
# Report the token count of the first training example followed by a blank
# line, then print its token list wrapped to textwrap's default width.
print('Length: ', len(train_encodings[0].tokens), end='\n\n')
print(textwrap.fill(str(train_encodings[0].tokens)))

Length:  512

['[CLS]', 'change', 'management', 'for', 'evolving', 'collaborative',
'business', 'process', 'development', 'is', 'crucial', 'when', 'the',
'business', 'logic', ',', 'trans', '##ection', '##s', 'and', 'work',
'##flow', 'change', 'due', 'to', 'changes', 'in', 'business',
'strategies', 'or', 'organizational', 'and', 'technical',
'environment', '.', 'during', 'the', 'change', 'implementation', ',',
'business', 'processes', 'are', 'analyzed', 'and', 'improved',
'ensuring', 'that', 'they', 'capture', 'the', 'proposed', 'change',
'and', 'they', 'do', 'not', 'contain', 'any', 'und', '##es', '##ired',
'functional', '##ities', 'or', 'change', 'side', '-', 'effects', '.',
'this', 'paper', 'presents', 'business', 'process', 'change',
'management', 'approach', 'for', 'the', 'efficient', 'and',
'effective', 'implementation', 'of', 'change', 'in', 'the',
'business', 'process', '.', 'the', 'key', 'technology', 'behind',
'our', 'approach', 'is', 'our', 'proposed', 'business', 'process',


In [22]:
import torch

class PaperDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with per-sample labels.

    ``encodings`` is any mapping-like object exposing ``.items()`` whose
    values can be indexed by sample position; ``labels`` is an indexable,
    sized sequence with one entry per sample.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The label sequence defines how many samples the dataset exposes.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, including a 'labels' entry."""
        sample = {}
        # One tensor per encoding field, taken at the requested position.
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap the tokenized training and validation splits, together with their
# labels, as PaperDataset instances.
train_dataset = PaperDataset(train_encodings, train_labels)
val_dataset = PaperDataset(val_encodings, val_labels)

In [None]:
import os
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

# Set the tokenizers parallelism flag before the Trainer is created.
# NOTE(review): presumably this silences the tokenizers fork warning when
# DataLoader workers start — confirm that "true" is the intended value.
os.environ["TOKENIZERS_PARALLELISM"] = "true"

# Hyper-parameters and logging/evaluation schedule for the fine-tuning run.
# NOTE(review): the recorded output shows a `--report_to` deprecation warning
# and a wandb network error; consider passing report_to explicitly — confirm.
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,                # logging interval, in training steps
    eval_steps=50,                   # evaluation interval, in training steps
    evaluation_strategy="steps"      # evaluate on a step schedule (see eval_steps)
)

# Start from the pretrained 'distilbert-base-uncased' weights with a sequence
# classification head. No num_labels is passed, so the library default applies
# (presumably 2, matching the 0/1 labels — TODO confirm).
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
)

# Run fine-tuning; losses are reported on the schedule configured above.
trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /home/jan/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.d423bdf2f58dc8b77d5f5d18028d7ae4a72dcfd8f468e81fe979ada957a8c361
Model config DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": fa

Step,Training Loss,Validation Loss


wandb: Network error (ConnectionError), entering retry loop.
