In [1]:
import sys

# Make the project root importable so the `src.*` imports used later in this
# notebook resolve (presumably /app is where the `src` package lives).
app = "/app"
if sys.path.count(app) == 0:
    sys.path.append(app)
sys.path

['/app',
 '/opt/conda/lib/python38.zip',
 '/opt/conda/lib/python3.8',
 '/opt/conda/lib/python3.8/lib-dynload',
 '',
 '/opt/venv/lib/python3.8/site-packages']

In [None]:
# Show the kernel's working directory; the relative paths used further down
# (e.g. "../data/...", "../models") are resolved against it.
!pwd

### Download OAPMC by FTP

Download the open-access (OA) content from PMC (approx. 100 GB) by FTP from EBI.

Connect to the EBI by FTP:

```bash
ftp -i ftp.ebi.ac.uk
# Name (ftp.ebi.ac.uk:lemberge): anonymous
```

Then do an `mget` from the `pmc/oa` directory:

```bash
cd /pub/databases/pmc/oa
binary
quote PASV
mget *.xml.gz
quit
```

Expand the files on the local computer:

```bash
gunzip *.gz
```

In [None]:
# # split into train, valid, test
#  parser.add_argument('corpus', help='path to the corpus of documents to use.')
#     parser.add_argument('-X', '--extension', default='xml', help='Extension (WITHOUT THE DOT) for allowed files in the corpus.')
#     args = parser.parse_args()
#     corpus = args.corpus
#     ext = args.extension
#     distribute(Path("oapmc"), ext="xml")


### Extract individual articles from oapmc

Test on the small test corpus first, then run on the full corpus:

```bash
python -m src.training.cli.articles /data/xml/oapmc_test_corpus/ /data/xml/oapmc_articles_test --celery_batch_size=10
# 21457 examples saved to disk.
python -m src.training.cli.split /data/xml/oapmc_articles_test
```

```bash
python -m src.training.cli.articles /data/xml/oapmc230401 /data/xml/oapmc_articles --celery_batch_size=10
# 5142702 examples saved to disk.
python -m src.training.cli.split /data/xml/oapmc_articles/
```



### Extract twin pairs

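Three XPath expressions are given. The first identifies the 'container' elements from which pairs of elements will be extracted to generate twin examples. The second and third are relative to each container and select the two members of the pair (e.g. the article title and its abstract, or a figure caption's title and its paragraphs).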

Test on oapmc_test_corpus first:

```bash
python -m src.training.cli.extract /data/xml/oapmc_articles_test /data/text/oapmc_twin_title_abstract_test --xpath "./front/article-meta" "./title-group/article-title" "./abstract[not(@abstract-type='graphical') and not(@abstract-type='teaser') and not(@abstract-type='author-highlights') and not(@abstract-type='precis')]" --celery_batch_size=10
```

```bash
python -m src.training.cli.extract /data/xml/oapmc_articles_test /data/text/oapmc_twin_fig_test --xpath "//fig/caption" "./title" "./p"
```

On the full oapmc:

```bash
python -m src.training.cli.extract /data/xml/oapmc_articles /data/text/oapmc_twin_title_abstract --xpath "./front/article-meta" "./title-group/article-title" "./abstract[not(@abstract-type='graphical') and not(@abstract-type='teaser') and not(@abstract-type='author-highlights') and not(@abstract-type='precis')]" --celery_batch_size=100
```

```bash
python -m src.training.cli.extract /data/xml/oapmc_articles /data/text/oapmc_twin_fig --xpath "//fig/caption" "./title" "./p" --celery_batch_size=100
```


In [None]:
# Sanity check: list the corpus folder before loading it.
# NOTE(review): this is an absolute path, while the loader in the next cell uses
# the relative "../data/text/emboj_abstracts/" — confirm both point to the same directory.
!ls /data/text/emboj_abstracts

In [None]:
from datasets import load_dataset

# Load the plain-text corpus with the `text` builder; the file paths are
# relative to `data_dir` and define the "train" and "test" splits.
split_files = {'train': 'train/examples.txt', 'test': 'test/examples.txt'}
dataset = load_dataset("text", data_dir="../data/text/emboj_abstracts/", data_files=split_files)
dataset

In [None]:
# Reference checkpoint: its tokenizer and model configuration are reused below.
MODEL_NAME = "roberta-base"

In [None]:
from transformers import AutoTokenizer
# Load the tokenizer that ships with the reference checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
print(f"tokenizer vocab size: {tokenizer.vocab_size}")
# Read the limit from the tokenizer instance instead of indexing the static
# `max_model_input_sizes` table by checkpoint name: that table only knows a
# fixed set of hub shortcut names (and may be empty in recent transformers
# releases), so the lookup raises KeyError for local paths or other checkpoints.
print(f"max length: {tokenizer.model_max_length}")

In [None]:
# tokenize the dataset
def tokenization(examples):
    """Tokenize a batch of raw text examples for masked language modeling.

    Args:
        examples: Batch dictionary handed over by ``dataset.map(batched=True)``;
            only its ``"text"`` entry is read.

    Returns:
        The tokenizer output for the batch, truncated to the tokenizer's
        maximum length and including the ``special_tokens_mask``.
    """
    return tokenizer(
        examples["text"],
        # Taken from the tokenizer instance rather than from the static
        # `max_model_input_sizes[MODEL_NAME]` table, which raises KeyError for
        # checkpoints that are not listed in it (e.g. local paths).
        max_length=tokenizer.model_max_length,
        truncation=True,
        return_special_tokens_mask=True,
    )


tokenized = dataset.map(tokenization, batched=True)
tokenized

In [None]:
# Drop the raw strings now that the token-level fields exist.
# Note: re-running this cell without re-running the tokenization cell fails,
# because the column is already gone.
columns_to_drop = ["text"]
tokenized = tokenized.remove_columns(columns_to_drop)
tokenized

In [None]:
from transformers import DataCollatorForLanguageModeling
# Collator for masked language modeling: batches are assembled with the
# tokenizer and tokens are selected for masking with the probability below.
MLM_PROBABILITY = 0.15
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=True, mlm_probability=MLM_PROBABILITY)

In [None]:
from transformers import (
    # RobertaForMaskedLM,
    AutoConfig
)
from src.models.modeling_dendroberta import RobertaForMaskedLM

# Fetch only the configuration of the reference checkpoint.
config = AutoConfig.from_pretrained(MODEL_NAME)
# The model is built from the config alone (no `from_pretrained` call here).
# NOTE(review): RobertaForMaskedLM is the project's class from
# src.models.modeling_dendroberta, not the transformers one — confirm how it
# initializes its weights.
model = RobertaForMaskedLM(config=config)

In [None]:
# Total number of scalar parameters across all parameter tensors of the model.
model_size = sum(param.numel() for param in model.parameters())
print(f"Model size: {model_size/1000**2:.1f}M parameters")

In [None]:
from transformers import Trainer, TrainingArguments
from datetime import datetime

# Timestamp for the log directory; ':' is replaced with '-' in the ISO string.
run_stamp = datetime.now().isoformat().replace(':', '-')

args = TrainingArguments(
    # Output locations
    output_dir="../models",
    logging_dir=f"../runs/lm-dendroberta-{run_stamp}",
    push_to_hub=False,
    # Batching
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=1,
    # Optimization schedule
    num_train_epochs=1,
    learning_rate=5e-4,
    lr_scheduler_type="cosine",
    warmup_steps=1_000,
    weight_decay=0.1,
    fp16=False,
    # Evaluation, logging and checkpoint cadence (in optimizer steps)
    evaluation_strategy="steps",
    eval_steps=100,
    prediction_loss_only=True,
    logging_steps=1000,
    save_steps=5_000,
)

In [None]:
# Wire the model, the tokenized splits, the MLM collator and the training
# arguments into the high-level Trainer.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [None]:
# Run the Trainer-based training with the arguments defined in `args`
# (one epoch, evaluation every 100 steps).
trainer.train()

In [None]:
import torch
# Report accelerator support of this PyTorch install as (mps_built, cuda_available).
# Uses `torch.backends` directly: the previous `torch.torch.backends` spelling only
# worked through an accidental self-reference attribute on the package.
# Note: `mps.is_built()` reports whether the build includes MPS support, which is
# not the same as a usable device (`torch.backends.mps.is_available()`).
(
    torch.backends.mps.is_built(),
    torch.cuda.is_available(),
)

In [None]:
from torch.utils.data import DataLoader
from transformers import default_data_collator

batch_size = 8

# Training batches: shuffled, then padded and randomly masked by the MLM collator.
train_dataloader = DataLoader(
    tokenized["train"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)

# Evaluation batches must go through the MLM collator as well. The tokenized
# examples are unpadded (variable length), carry no `labels`, and still contain
# `special_tokens_mask`; `default_data_collator` cannot stack them into tensors
# and would leave the model without labels to compute a loss from.
# Masking is random, so the evaluation loss varies slightly between passes.
eval_dataloader = DataLoader(
    tokenized["test"],
    batch_size=batch_size,
    collate_fn=data_collator,
)

In [None]:
from torch.optim import AdamW

# Optimizer for the hand-written training loop further down.
# Note: `model` is the same object that was passed to the Trainer above, so if
# the Trainer run was executed this continues from those weights.
LEARNING_RATE = 5e-5
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

In [None]:
from accelerate import Accelerator

# Let Accelerate wrap the model, optimizer and both dataloaders, then rebind
# the original names to the prepared objects.
accelerator = Accelerator()
prepared = accelerator.prepare(model, optimizer, train_dataloader, eval_dataloader)
model, optimizer, train_dataloader, eval_dataloader = prepared

In [None]:
from transformers import get_scheduler

num_train_epochs = 3
# One scheduler step is taken per training batch, so the number of batches in
# the (prepared) dataloader is the number of updates per epoch.
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_update_steps_per_epoch * num_train_epochs

# Linear schedule over the whole run, with no warmup.
scheduler_kwargs = dict(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
lr_scheduler = get_scheduler("linear", **scheduler_kwargs)

In [None]:
from tqdm.auto import tqdm
import torch
import math

# Hand-written training loop driven by Accelerate: train for
# `num_train_epochs`, then after each epoch evaluate, print the perplexity and
# save the model and tokenizer.

# Names the loop relies on that no earlier cell defines; without them the loop
# raises NameError on a fresh kernel.
eval_dataset = tokenized["test"]  # reference length for trimming the gathered losses
output_dir = args.output_dir  # same directory as configured for the Trainer run ("../models")

progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # Training
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
    model.eval()
    losses = []
    for batch in eval_dataloader:
        with torch.no_grad():
            outputs = model(**batch)

        loss = outputs.loss
        # Repeat the batch loss once per example slot so that, after gathering
        # across processes, there is one entry per evaluation example.
        losses.append(accelerator.gather(loss.repeat(batch_size)))

    losses = torch.cat(losses)
    # Trim the entries that exceed the real dataset size (the last batch can be
    # smaller than `batch_size`).
    losses = losses[: len(eval_dataset)]
    try:
        perplexity = math.exp(torch.mean(losses))
    except OverflowError:
        perplexity = float("inf")

    print(f">>> Epoch {epoch}: Perplexity: {perplexity}")

    # Save and upload
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)
        # repo.push_to_hub(
        #     commit_message=f"Training in progress epoch {epoch}", blocking=False
        # )