In [1]:
import sys

import os

# Root of the project checkout, added to sys.path so that project modules can be imported
# from the notebook. The location differs between machines (local checkout vs. "/app"
# inside the container), so it can be overridden with the DENDRITIC_APP_DIR environment
# variable; the default is the previously hard-coded path.
app = os.environ.get("DENDRITIC_APP_DIR", "/Users/lemberge/code/dendritic")  # or "/app"
if app not in sys.path:
    # Guarded so that re-running the cell does not add duplicate entries.
    sys.path.append(app)

In [2]:
# Show the working directory the kernel is running in.
!pwd

/app/notebooks


### Download OAPMC by FTP

Download OA content from PMC (approx. 100GB) by FTP from EBI.

Connect to the EBI by FTP:

```bash
ftp -i ftp.ebi.ac.uk
# Name (ftp.ebi.ac.uk:lemberge): anonymous
```

Then do an mget from the pmc/oa dir:

```bash
cd /pub/databases/pmc/oa
binary
quote PASV
mget *.xml.gz
quit
```

Expand the files on the local computer:

```bash
gunzip *.gz
```

In [None]:
# # split into train, valid, test
#  parser.add_argument('corpus', help='path to the corpus of documents to use.')
#     parser.add_argument('-X', '--extension', default='xml', help='Extension (WITHOUT THE DOT) for allowed files in the corpus.')
#     args = parser.parse_args()
#     corpus = args.corpus
#     ext = args.extension
#     distribute(Path("oapmc"), ext="xml")


### Extract individual articles from oapmc

test:

```bash
python -m src.training.cli.articles /data/xml/oapmc_test_corpus/ /data/xml/oapmc_articles_test --celery_batch_size=10
# 21457 examples saved to disk.
python -m src.training.cli.split /data/xml/oapmc_articles_test
```

```bash
python -m src.training.cli.articles /data/xml/oapmc230401 /data/xml/oapmc_articles --celery_batch_size=10
# 5142702 examples saved to disk.
python -m src.training.cli.split /data/xml/oapmc_articles/
```



### Extract twin pairs

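Three XPath expressions are given. The first identifies the 'container' elements from which pairs of elements will be extracted to generate twin examples. The second and third are evaluated relative to each container and select the two members of the pair (for example, the article title and its abstract, or a figure caption's title and its paragraphs).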

Test on oapmc_test_corpus first:

```bash
python -m src.training.cli.extract /data/xml/oapmc_articles_test /data/text/oapmc_twin_title_abstract_test --xpath "./front/article-meta" "./title-group/article-title" "./abstract[not(@abstract-type='graphical') and not(@abstract-type='teaser') and not(@abstract-type='author-highlights') and not(@abstract-type='precis')]" --celery_batch_size=10
```

```bash
python -m src.training.cli.extract /data/xml/oapmc_articles_test /data/text/oapmc_twin_fig_test --xpath "//fig/caption" "./title" "./p"
```

On the full oapmc:

```bash
python -m src.training.cli.extract /data/xml/oapmc_articles /data/text/oapmc_twin_title_abstract --xpath "./front/article-meta" "./title-group/article-title" "./abstract[not(@abstract-type='graphical') and not(@abstract-type='teaser') and not(@abstract-type='author-highlights') and not(@abstract-type='precis')]" --celery_batch_size=100
```

```bash
python -m src.training.cli.extract /data/xml/oapmc_articles /data/text/oapmc_twin_fig --xpath "//fig/caption" "./title" "./p" --celery_batch_size=100
```


In [7]:
# List the files available for the training split.
!ls /data/emboj_abstracts/train

examples.txt


In [2]:
from datasets import load_dataset

# Map each split name to its plain-text file, relative to the data directory.
split_files = {"train": "train/examples.txt", "test": "test/examples.txt"}

# Load both splits with the generic "text" builder; the result is a DatasetDict
# whose only feature is "text".
dataset = load_dataset("text", data_dir="/data/emboj_abstracts/", data_files=split_files)
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 12881
    })
    test: Dataset({
        features: ['text'],
        num_rows: 1802
    })
})

In [3]:
# Checkpoint identifier used below to load both the tokenizer and the model config.
MODEL_NAME = "roberta-base"

In [4]:
from transformers import AutoTokenizer
# Load the tokenizer that belongs to MODEL_NAME and report its basic limits.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
print(f"tokenizer vocab size: {tokenizer.vocab_size}")
# `model_max_length` is an attribute of the tokenizer instance itself. The legacy
# `max_model_input_sizes` table is keyed by a fixed list of checkpoint names and raises
# KeyError for any checkpoint it does not list (e.g. a local path or a custom model).
print(f"max length: {tokenizer.model_max_length}")

tokenizer vocab size: 50265
max length: 512


In [5]:
def tokenization(examples):
    """Tokenize a batch of raw examples for masked-language-model training.

    Args:
        examples: a batch from `dataset.map(..., batched=True)`; its "text" entry is
            passed to the tokenizer.

    Returns:
        The tokenizer output for the batch, truncated to the tokenizer's maximum input
        length and including the special-tokens mask (used by the MLM collator so that
        special tokens are never masked).
    """
    return tokenizer(
        examples["text"],
        # Read the limit from the tokenizer instance rather than from the legacy
        # per-checkpoint table, which raises KeyError for checkpoints it does not list.
        max_length=tokenizer.model_max_length,
        truncation=True,
        return_special_tokens_mask=True,
    )


tokenized = dataset.map(tokenization, batched=True)
tokenized

Map:   0%|          | 0/12881 [00:00<?, ? examples/s]

Map:   0%|          | 0/1802 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'input_ids', 'attention_mask', 'special_tokens_mask'],
        num_rows: 12881
    })
    test: Dataset({
        features: ['text', 'input_ids', 'attention_mask', 'special_tokens_mask'],
        num_rows: 1802
    })
})

In [6]:
# Drop the raw text once it has been tokenized; only the model inputs are needed from here on.
# Guarded so that re-running this cell (after "text" is already gone) does not raise.
if "text" in tokenized["train"].column_names:
    tokenized = tokenized.remove_columns(["text"])
tokenized

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'special_tokens_mask'],
        num_rows: 12881
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'special_tokens_mask'],
        num_rows: 1802
    })
})

In [7]:
from transformers import DataCollatorForLanguageModeling
# Collator for masked language modelling: builds batches from the tokenized rows and
# selects 15% of the tokens as prediction targets.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

In [8]:
from transformers import RobertaForMaskedLM, AutoConfig

# Only the architecture/config of MODEL_NAME is fetched here; the model is built from that
# config rather than via `from_pretrained`.
# NOTE(review): this presumably means the weights start untrained — confirm that training
# from scratch is the intent.
config = AutoConfig.from_pretrained(MODEL_NAME)
model = RobertaForMaskedLM(config)

In [9]:
# Total number of scalar parameters across every parameter tensor of the model.
model_size = sum(param.numel() for param in model.parameters())
print(f"Model size: {model_size/1000**2:.1f}M parameters")

Model size: 124.7M parameters


In [10]:
from transformers import Trainer, TrainingArguments
from datetime import datetime

# One TensorBoard directory per run, stamped with the current time (":" replaced by "-").
run_log_dir = f"/runs/lm-dendroberta-{datetime.now().isoformat().replace(':','-')}"

# NOTE(review): the recorded run below finished after 202 steps, so with these values the
# training loss is never logged (logging_steps=1000), warm-up never completes
# (warmup_steps=1_000) and no intermediate checkpoint is written (save_steps=5_000).
# Presumably these are sized for the full corpus — confirm.
args = TrainingArguments(
    # where things are written
    output_dir="/models",
    logging_dir=run_log_dir,
    push_to_hub=False,
    # batching and precision
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=1,
    fp16=False,
    # optimisation schedule
    num_train_epochs=1,
    learning_rate=5e-4,
    lr_scheduler_type="cosine",
    warmup_steps=1_000,
    weight_decay=0.1,
    # evaluation, logging and checkpoint cadence (in optimizer steps)
    evaluation_strategy="steps",
    eval_steps=100,
    prediction_loss_only=True,
    logging_steps=1000,
    save_steps=5_000,
)

In [11]:
# Wire the model, the data and the MLM collator into a Trainer configured by `args`.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

Detected kernel version 4.15.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [12]:
# Run training as configured in `args`; the returned TrainOutput is displayed as the cell result.
trainer.train()

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
100,No log,7.845899
200,No log,6.837622


TrainOutput(global_step=202, training_loss=8.293279364557549, metrics={'train_runtime': 191.6164, 'train_samples_per_second': 67.223, 'train_steps_per_second': 1.054, 'total_flos': 2811832490250828.0, 'train_loss': 8.293279364557549, 'epoch': 1.0})

In [17]:
import torch
# Report accelerator support as a tuple: (PyTorch built with MPS, CUDA usable at runtime).
# `torch.backends` is addressed directly; `torch.torch` only resolves because of an
# incidental self-reference attribute and should not be relied on.
(
    torch.backends.mps.is_built(),
    torch.cuda.is_available(),
)

  return torch._C._cuda_getDeviceCount() > 0


(False, False)

In [18]:
from torch.utils.data import DataLoader
from transformers import default_data_collator

batch_size = 16

# Training batches: shuffled, padded and masked on the fly by the MLM collator.
train_dataloader = DataLoader(
    tokenized["train"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)

# Evaluation batches must go through the MLM collator as well: the tokenized rows have
# different lengths (no padding was applied at tokenization time) and carry no "labels",
# so `default_data_collator` can neither stack them into tensors nor give the model
# anything to compute a loss from.
# NOTE(review): masking is re-drawn at random on every pass, so the evaluation loss
# fluctuates slightly between epochs; pre-mask the test split once if a fixed benchmark
# is needed.
eval_dataloader = DataLoader(
    tokenized["test"],
    batch_size=batch_size,
    collate_fn=data_collator,
)

In [19]:
from torch.optim import AdamW

# AdamW over all model parameters, used by the hand-written Accelerate training loop below.
# NOTE(review): `model` is the same object that `trainer.train()` already updated above —
# confirm that continuing from those weights is intended.
optimizer = AdamW(model.parameters(), lr=5e-5)

In [20]:
from accelerate import Accelerator

# Let Accelerate wrap the training objects; the prepared versions replace the originals
# under the same names.
accelerator = Accelerator()
prepared = accelerator.prepare(model, optimizer, train_dataloader, eval_dataloader)
model, optimizer, train_dataloader, eval_dataloader = prepared

Detected kernel version 4.15.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [21]:
from transformers import get_scheduler

num_train_epochs = 3
# One scheduler step is taken per training batch, so the schedule length is
# epochs x batches-per-epoch.
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

# Linear schedule without warm-up, spanning the whole run.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [None]:
from torch.utils.tensorboard import SummaryWriter

def log(loss, epoch, writer, split):
    """Write one loss value to TensorBoard under the tag ``Loss/<split>``.

    This is a plain module-level function, so it takes no ``self``; the training loop
    calls it as ``log(loss, epoch, writer, "train")``.

    Args:
        loss: object exposing the value to record as ``loss.data``.
        epoch: step index the value is recorded at.
        writer: object providing ``add_scalar(tag, value, step)``.
        split: name of the data split, used as the tag suffix (e.g. "train").
    """
    # write to tensorboard
    writer.add_scalar(f"Loss/{split}", loss.data, epoch)

    # TODO: also record a validation loss (e.g. by calling this with split="valid").



In [23]:
from tqdm.auto import tqdm
import torch
import math

# Names the loop needs that no earlier cell defines.
# NOTE(review): the checkpoint goes to the Trainer's output directory and the TensorBoard
# events to a sibling of the Trainer's log directory — adjust if other locations are wanted.
output_dir = args.output_dir
writer = SummaryWriter(log_dir=f"{args.logging_dir}-accelerate")

progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # Training
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
    # Records the loss of the last training batch of this epoch.
    log(loss, epoch, writer, "train")

    # Evaluation
    model.eval()
    losses = []
    for batch in eval_dataloader:
        with torch.no_grad():
            outputs = model(**batch)

        loss = outputs.loss
        # The model returns one mean loss per batch; repeat it per example so that the
        # gathered tensor can be cut back to the true dataset size below.
        losses.append(accelerator.gather(loss.repeat(batch_size)))

    losses = torch.cat(losses)
    # Drop the surplus entries produced by repeating the (shorter) last batch.
    losses = losses[: len(tokenized["test"])]
    try:
        perplexity = math.exp(torch.mean(losses))
    except OverflowError:
        perplexity = float("inf")

    print(f">>> Epoch {epoch}: Perplexity: {perplexity}")

    # Save and upload
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)
        # repo.push_to_hub(
        #     commit_message=f"Training in progress epoch {epoch}", blocking=False
        # )

# Flush pending TensorBoard events to disk.
writer.close()

  0%|          | 0/2418 [00:00<?, ?it/s]

KeyboardInterrupt: 