In [None]:
!pip install -qU datasets evaluate rouge_score bert_score

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone


In [None]:
from transformers import (AutoTokenizer, AutoModelForSeq2SeqLM,
                          Trainer, TrainingArguments, BertTokenizer,
                          EncoderDecoderModel)
from datasets import load_dataset, Dataset
from evaluate import load

import pandas as pd

In [None]:
# Merge both CSV exports, drop rows without a full text, renumber the index and
# cast every column to str (missing cells thereby become the string "nan").
papers = (
    pd.concat(
        [
            pd.read_csv(csv_path)
            for csv_path in (
                "/content/alzheimer_biomarker_1000.csv",
                "/content/alzheimer_biomarker_2000.csv",
            )
        ]
    )
    .dropna(subset=["fulltext"])
    .reset_index(drop=True)
    .astype(str)
)

In [None]:
def create_dataset(papers):
    """Build a ``Dataset`` of (text, summary) pairs from a papers frame.

    Args:
        papers: DataFrame with a ``fulltext`` and an ``abstract`` column.

    Returns:
        The result of ``Dataset.from_dict`` with a ``text`` column (the full
        texts) and a ``summary`` column (the abstracts), in row order.

    Raises:
        KeyError: if ``fulltext`` or ``abstract`` is not a column of ``papers``.
    """
    # Read each column directly instead of walking the frame twice with
    # ``iterrows()``, which materialises a Series per row to read one cell.
    return Dataset.from_dict({
        "text": papers["fulltext"].tolist(),
        "summary": papers["abstract"].tolist(),
    })


# Keep only papers with a real abstract and full text; the frame holds strings,
# so missing cells appear as the literal "nan". The fixed random_state makes
# the 5-paper sample (and everything computed from it) reproducible across runs.
sample_papers = papers[
    (papers["abstract"] != "nan") & (papers["fulltext"] != "nan")
].sample(5, random_state=42)
dataset = create_dataset(sample_papers)


In [None]:
# The same checkpoint initialises the tokenizer, the encoder and the decoder.
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)
model = EncoderDecoderModel.from_encoder_decoder_pretrained(
    encoder_pretrained_model_name_or_path=model_name,
    decoder_pretrained_model_name_or_path=model_name,
)

In [None]:
def preprocess_function(examples):
    """Tokenize papers into encoder inputs and decoder targets.

    Uses the module-level ``tokenizer``.

    Args:
        examples: mapping with ``"text"`` (source documents) and ``"summary"``
            (target abstracts).

    Returns:
        dict with ``input_ids`` / ``attention_mask`` (texts, max_length 512),
        ``decoder_input_ids`` / ``decoder_attention_mask`` (summaries,
        max_length 128) and ``labels``: the summary ids with every padding
        position replaced by ``-100``.
    """
    inputs = tokenizer(examples["text"], max_length=512, truncation=True, padding="max_length")
    outputs = tokenizer(examples["summary"], max_length=128, truncation=True, padding="max_length")

    pad_id = tokenizer.pad_token_id

    def mask_padding(token_ids):
        # -100 is the default ignore_index of the cross-entropy loss, so padded
        # positions no longer contribute to the training loss.
        return [-100 if token_id == pad_id else token_id for token_id in token_ids]

    target_ids = outputs.input_ids
    if target_ids and isinstance(target_ids[0], (list, tuple)):
        labels = [mask_padding(sequence) for sequence in target_ids]  # batched call
    else:
        labels = mask_padding(target_ids)  # single, un-batched example

    # NOTE(review): decoder_input_ids are the unshifted target ids. Recent
    # transformers releases appear to build shifted decoder inputs from
    # `labels` when this key is omitted - confirm against the installed version.
    return {
        "input_ids": inputs.input_ids,
        "attention_mask": inputs.attention_mask,
        "decoder_input_ids": outputs.input_ids,
        "decoder_attention_mask": outputs.attention_mask,
        "labels": labels,
    }

In [None]:
def finetune_bert(dataset):
    """Fine-tune the module-level ``model`` on ``dataset``.

    Args:
        dataset: object exposing ``map``; it is mapped in batches through
            ``preprocess_function`` before being handed to the trainer.

    Returns:
        The module-level ``(model, tokenizer)`` pair, after ``Trainer.train()``
        has run.
    """
    # Tokenise in batches so the trainer receives model-ready features.
    train_features = dataset.map(preprocess_function, batched=True)

    hyperparameters = dict(
        output_dir="./results_bert",
        num_train_epochs=3,
        per_device_train_batch_size=4,
        save_steps=500,
        save_total_limit=2,
    )

    Trainer(
        model=model,
        args=TrainingArguments(**hyperparameters),
        train_dataset=train_features,
    ).train()

    return model, tokenizer



In [None]:
# Run the fine-tuning and keep handles to the returned model and tokenizer.
bert_model, bert_tokenizer = finetune_bert(dataset=dataset)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertLMHeadModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['bert.encoder.layer.0.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.0.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.0.crossattention.output.dense.bias', 'bert.encoder.layer.0.crossattention.output.dense.weight', 'bert.encoder.layer.0.crossattention.self.key.bias', 'bert.encoder.layer.0.crossattention.self.key.weight', 'bert.encoder.layer.0.crossattention.self.query.bias', 'bert.encoder.layer.0.crossattention.self.query.weight', 'bert.encoder.layer.0.crossattention.self.value.bias', 'bert.encoder.layer.0.crossattention.self.value.weight', 'bert.encoder.layer.1.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.1.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.1.crossattention.output.dense.bias', 'bert.encoder.layer.1.crossattention.output.dense.weight', 'bert.encoder.layer.1.crossattention.self.key.bias', 'bert.e

Map:   0%|          | 0/5 [00:00<?, ? examples/s]



Step,Training Loss


In [None]:
# Copy the tokenizer's special-token ids onto the model config:
# [CLS] starts decoding, [SEP] ends it, [PAD] pads.
for _config_field, _token_id in (
    ("decoder_start_token_id", bert_tokenizer.cls_token_id),
    ("bos_token_id", bert_tokenizer.cls_token_id),
    ("eos_token_id", bert_tokenizer.sep_token_id),
    ("pad_token_id", bert_tokenizer.pad_token_id),
):
    setattr(bert_model.config, _config_field, _token_id)

In [None]:
def generate_summary(model, tokenizer, text, max_length=150, prefix="summarize: "):
    """Generate a summary of ``text`` with 4-beam search.

    Args:
        model: model exposing ``generate``, ``eval`` and a ``config`` that
            carries the special-token ids.
        tokenizer: tokenizer used to encode ``text`` and decode the result.
        text: document to summarise; truncated to 512 tokens.
        max_length: ``max_length`` passed to ``model.generate``.
        prefix: string prepended to ``text`` before tokenisation. Defaults to
            ``"summarize: "`` (the previous hard-coded value); pass ``""`` for
            a model that was fine-tuned on un-prefixed inputs.

    Returns:
        The first generated sequence decoded to a string, special tokens
        skipped.
    """
    inputs = tokenizer(prefix + text, return_tensors="pt", max_length=512, truncation=True)

    # The model may live on an accelerator after training; generation fails if
    # the input tensors stay on a different device.
    device = getattr(model, "device", None)
    if device is not None:
        inputs = inputs.to(device)

    # Disable dropout so that repeated calls produce the same summary.
    model.eval()

    summary_ids = model.generate(
        inputs["input_ids"],
        # Pass the mask explicitly instead of letting generate() infer it.
        attention_mask=inputs["attention_mask"],
        max_length=max_length,
        min_length=40,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
        decoder_start_token_id=model.config.decoder_start_token_id,
        bos_token_id=model.config.bos_token_id,
        eos_token_id=model.config.eos_token_id,
        pad_token_id=model.config.pad_token_id,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Load the two evaluation metrics by name, ROUGE first and then BERTScore.
rouge, bertscore = (load(metric_name) for metric_name in ("rouge", "bertscore"))

Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

In [None]:
# Full text of the first sampled paper.
text = sample_papers["fulltext"].iloc[0]