In [None]:
pip install pymongo

Collecting pymongo
  Downloading pymongo-4.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (22 kB)
Collecting dnspython<3.0.0,>=1.16.0 (from pymongo)
  Downloading dnspython-2.6.1-py3-none-any.whl.metadata (5.8 kB)
Downloading pymongo-4.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dnspython-2.6.1-py3-none-any.whl (307 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m307.7/307.7 kB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dnspython, pymongo
Successfully installed dnspython-2.6.1 pymongo-4.8.0


# Loading data

In [None]:
pip install datasets

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2

In [None]:
from pymongo import MongoClient
from datasets import Dataset
import json
import re

import os

# Read the connection string from the environment so credentials never have to
# be typed into (or committed with) the notebook. Falls back to the original
# empty string when MONGODB_URI is not set.
MONGODB_URI = os.environ.get('MONGODB_URI', '')

# Use the client as a context manager so the connection is closed once the
# documents have been fetched.
with MongoClient(MONGODB_URI) as client:
    db = client['Ecmo-info']
    collection = db['ecmo']

    # Retrieve all docs, excluding _id, first_image and body_text
    documents = collection.find({}, {'_id': 0, 'first_image': 0, 'body_text': 0})

    # Materialise the cursor into a list of dicts while the connection is open
    data = list(documents)

# Save data to a JSON file
file_name = 'ecmo_data.json'
with open(file_name, 'w', encoding='utf-8') as json_file:
    json.dump(data, json_file, indent=4)


# Pre-processing data

In [None]:
import json
import re
from datasets import Dataset, load_dataset
from transformers import T5Tokenizer

# Raw export produced by the data-loading step
file_name = 'ecmo_data.json'
# Pretrained checkpoint whose tokenizer is used for preprocessing
model_name = "t5-base"

# Read the raw records back into memory
with open(file_name) as json_file:
    data = json.loads(json_file.read())

# Build the tokenizer from the pretrained checkpoint
tokenizer = T5Tokenizer.from_pretrained(model_name)

def clean_reference(reference):
    """Normalise one reference string to clean, single-spaced ASCII text.

    Steps, in order:
      1. remove literal ``\\uXXXX`` escape text,
      2. map typographic dashes/minus signs to an ASCII hyphen,
      3. apply NFKD normalisation so no-break spaces become plain spaces and
         accented letters decompose into an ASCII base letter plus a mark,
      4. drop whatever non-ASCII characters remain,
      5. collapse whitespace runs to a single space and strip the ends.

    Steps 2 and 3 exist because deleting non-ASCII characters outright fuses
    neighbouring tokens (e.g. a page range "938–949" became "938949" and
    "et al." written with a no-break space became "etal.").

    Args:
        reference: The raw reference text. ``None`` or an empty string is
            accepted and yields ``""``.

    Returns:
        The cleaned string; ``""`` when nothing usable is left.
    """
    import unicodedata  # local import keeps the function self-contained

    if not reference:
        return ""

    # Remove literal "\uXXXX" escape sequences that survived as plain text
    reference = re.sub(r'\\u[0-9A-Fa-f]{4}', '', reference)
    # Hyphen/dash variants (U+2010..U+2015) and the minus sign (U+2212) -> "-"
    reference = re.sub(r'[\u2010-\u2015\u2212]', '-', reference)
    # Compatibility decomposition: NBSP -> " ", "ã" -> "a" + combining tilde
    reference = unicodedata.normalize('NFKD', reference)
    # Strip anything still outside the ASCII range (combining marks, symbols)
    reference = re.sub(r'[^\x00-\x7F]+', '', reference)
    # Normalize whitespace
    reference = re.sub(r'\s+', ' ', reference)
    return reference.strip()

def clean_and_concatenate_data(data):
    """Build (title, references) training records from raw documents.

    For every document the individual references are cleaned with
    ``clean_reference``, empty results are discarded, and the survivors are
    joined with single spaces into one string.

    Documents are skipped (instead of raising ``KeyError``) when they have no
    ``title`` field, no ``references`` field, or no reference that is
    non-empty after cleaning.

    Args:
        data: Iterable of dicts, each expected to carry a ``'title'`` and a
            ``'references'`` entry (an iterable of strings).

    Returns:
        A list of ``{"title": ..., "references": ...}`` dicts, in input order.
    """
    cleaned_data = []
    for item in data:
        title = item.get('title')
        if title is None:
            # No target text to learn from
            continue

        # Treat a missing or null references field as "no references"
        references = item.get('references') or []

        # Clean each reference exactly once and drop the empty results
        cleaned_references = [
            cleaned for cleaned in map(clean_reference, references) if cleaned
        ]
        # Concatenate all references into a single string
        concatenated_references = " ".join(cleaned_references)

        # Only add item if there are non-empty references
        if concatenated_references:
            cleaned_data.append({"title": title, "references": concatenated_references})
    return cleaned_data

# Clean the data
cleaned_data = clean_and_concatenate_data(data)

# Convert the records to a Hugging Face Dataset so its split/map utilities can be used
dataset = Dataset.from_list(cleaned_data)

# Fixed seed so train/validation/test membership is the same on every run;
# without it the reported scores are computed on a different test set each time.
SPLIT_SEED = 42

# 80% train, 20% held out
train_test_valid = dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)
# Halve the held-out part: 10% validation, 10% test
test_valid = train_test_valid['test'].train_test_split(test_size=0.5, seed=SPLIT_SEED)

# Merge the splits into one mapping keyed by split name
dataset = {
    'train': train_test_valid['train'],
    'validation': test_valid['train'],
    'test': test_valid['test'],
}

#Tokenize datasets
def preprocess_function(examples):
    """Tokenize a batch of examples for sequence-to-sequence training.

    The ``references`` column is the model input (padded/truncated to 512
    tokens) and the ``title`` column is the target (padded/truncated to 64
    tokens). Targets are tokenized through the ``text_target`` argument, which
    replaces the deprecated ``as_target_tokenizer()`` context manager.

    Args:
        examples: A batch mapping with ``'references'`` and ``'title'`` lists
            of strings (as passed by ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output for the inputs with an added ``"labels"`` entry,
        in which every pad token id is replaced by -100.
    """
    inputs = examples['references']
    targets = examples['title']
    model_inputs = tokenizer(inputs, max_length=512, padding="max_length", truncation=True)

    labels = tokenizer(text_target=targets, max_length=64, padding="max_length", truncation=True)

    # Replace pad token ids in labels with -100 so they are ignored by the loss function
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [(token_id if token_id != pad_id else -100) for token_id in label_ids]
        for label_ids in labels["input_ids"]
    ]
    return model_inputs

# Run the batched preprocessing over every split, keeping the split names
tokenized_datasets = {
    split_name: split_data.map(preprocess_function, batched=True)
    for split_name, split_data in dataset.items()
}

# Write the cleaned (pre-tokenization) records to a new JSON file
output_file_name = 'processed_ecmo_data.json'
with open(output_file_name, 'w') as json_file:
    json_file.write(json.dumps(cleaned_data, indent=4))

print(f"Processed data successfully saved to {output_file_name}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/803 [00:00<?, ? examples/s]



Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/101 [00:00<?, ? examples/s]

Processed data successfully saved to processed_ecmo_data.json


# Fine-tuning using training and validation datasets

In [None]:
from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments, EarlyStoppingCallback


# Load the pretrained model that will be fine-tuned
model = T5ForConditionalGeneration.from_pretrained("t5-base")


training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",     # Evaluate at the end of each epoch
    save_strategy="epoch",           # Save at the end of each epoch (matches the evaluation schedule)
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=5,
    logging_dir="./logs",
    logging_steps=100,  # Log every 100 steps
    load_best_model_at_end=True,
    # Make explicit which metric drives best-checkpoint selection and early
    # stopping: validation loss, where lower is better.
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    gradient_accumulation_steps=2,  # Accumulates gradients for 2 steps before updating
    # NOTE: the former `eval_steps=100` was removed; it has no effect when the
    # evaluation strategy is "epoch".
)

# Trainer object that manages the training loop
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    tokenizer=tokenizer,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],  # Stop if no improvement for 3 evaluations
)

trainer.train()

# Persist the fine-tuned weights and the tokenizer side by side so the
# evaluation section can reload both from "fine_tuned_t5"
model.save_pretrained("fine_tuned_t5")
tokenizer.save_pretrained("fine_tuned_t5")




Epoch,Training Loss,Validation Loss
0,No log,2.058086
2,2.634200,1.728742
4,1.683400,1.697689


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


('fine_tuned_t5/tokenizer_config.json',
 'fine_tuned_t5/special_tokens_map.json',
 'fine_tuned_t5/spiece.model',
 'fine_tuned_t5/added_tokens.json')

# Evaluation

In [None]:
!pip3 install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=467962f46867dc38386109bed1b5f2a7c524be88288581a1c015a9aea92d05ce
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
from datasets import load_metric

# Load the fine-tuned model and its tokenizer from the saved directory
model_name = "fine_tuned_t5"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Set model to eval mode
model.eval()

test_dataset = tokenized_datasets['test']

# trust_remote_code=True: without it the ROUGE loading script stops for an
# interactive "[y/N]" confirmation, which blocks an unattended Run All.
rouge = load_metric('rouge', trust_remote_code=True)

def evaluate_model_with_rouge(test_dataset, model, tokenizer):
    """Generate a title for every test example and score it with ROUGE.

    Each example's ``"references"`` text is tokenized (truncated to 512
    tokens), a title is generated with beam search, and the generated titles
    are compared against the ``"title"`` field using the notebook-level
    ``rouge`` metric object.

    Inputs are tokenized without padding, since each example is processed on
    its own, and the attention mask is passed to ``generate`` explicitly. Both
    tensors are moved to ``model.device`` so the function also works when the
    model lives on an accelerator.

    Args:
        test_dataset: Iterable of examples with ``"references"`` and
            ``"title"`` string fields.
        model: The sequence-to-sequence model used for generation.
        tokenizer: The tokenizer matching ``model``.

    Returns:
        Dict with keys ``"ROUGE-1"``, ``"ROUGE-2"``, ``"ROUGE-3"`` and
        ``"ROUGE-L"`` holding the mid F-measure of each score times 100.
    """
    all_predictions = []
    all_references = []

    for example in test_dataset:
        references_input = example["references"]
        true_title = example["title"]

        # Tokenize the input; a single sequence needs truncation but no padding
        encoded = tokenizer(references_input, return_tensors="pt", truncation=True, max_length=512)

        # Generate the title using the model's generate method
        generated_ids = model.generate(
            input_ids=encoded["input_ids"].to(model.device),
            attention_mask=encoded["attention_mask"].to(model.device),
            max_length=50,
            num_beams=5,
            early_stopping=True,
        )
        generated_title = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

        # Collect generated and reference titles
        all_predictions.append(generated_title)
        all_references.append(true_title)

    # Calculate ROUGE scores
    rouge_output = rouge.compute(
        predictions=all_predictions,
        references=all_references,
        rouge_types=["rouge1", "rouge2", "rouge3", "rougeL"]
    )

    # Return only the F1-measures for ROUGE-1, ROUGE-2, ROUGE-3, and ROUGE-L
    return {
        "ROUGE-1": rouge_output['rouge1'].mid.fmeasure * 100,
        "ROUGE-2": rouge_output['rouge2'].mid.fmeasure * 100,
        "ROUGE-3": rouge_output['rouge3'].mid.fmeasure * 100,
        "ROUGE-L": rouge_output['rougeL'].mid.fmeasure * 100,
    }


def generate_title(references, model, tokenizer):
    """Generate a title for one concatenated reference string.

    The input is truncated to 512 tokens, the same limit used when the
    training and evaluation inputs are tokenized, so arbitrarily long
    reference lists are handled the same way here. The attention mask is
    passed to ``generate`` explicitly and the tensors are moved to
    ``model.device``.

    Args:
        references: The reference text to generate a title for.
        model: The sequence-to-sequence model used for generation.
        tokenizer: The tokenizer matching ``model``.

    Returns:
        The decoded title string, with special tokens removed.
    """
    # Tokenize references with the same length limit as training/evaluation
    encoded = tokenizer(references, return_tensors="pt", truncation=True, max_length=512)

    # Generate title with beam search
    outputs = model.generate(
        input_ids=encoded["input_ids"].to(model.device),
        attention_mask=encoded["attention_mask"].to(model.device),
        max_length=50,
        num_beams=5,
        early_stopping=True,
    )

    # Decode generated title
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  rouge = load_metric('rouge')


Downloading builder script:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

The repository for rouge contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/rouge.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


In [None]:
if __name__ == "__main__":
    # Score the test split, then report each ROUGE F1 value on its own line
    rouge_scores = evaluate_model_with_rouge(test_dataset, model, tokenizer)

    for metric_name in ("ROUGE-1", "ROUGE-2", "ROUGE-3", "ROUGE-L"):
        print(f"{metric_name} F1 Score: {rouge_scores[metric_name]:.2f}")

    # Sample concatenated reference list used to demonstrate title generation
    references_input = (
       "^ Furie B, Furie BC (2008). \"Mechanisms of thrombus formation\". New England Journal of Medicine. 359 (9): 938949. doi:10.1056/NEJMra0801082. PMID18753650. ^ Handin RI (2005). \"Chapter 53: bleeding and thrombosis\". In Kasper DL, Braunwald E, Fauci AS, etal. (eds.). Harrison's Principles of Internal Medicine (16thed.). New York: McGraw-Hill. ISBN978-0-07-139140-5. ^ Hughes ES (February 1, 1949). \"Venous obstruction in the upper extremity; Paget-Schroetter's syndrome; a review of 320 cases\". Surgery, Gynecology & Obstetrics. 88 (2): 89127. ISSN0039-6087. PMID18108679. ^ \"shunt\". National Cancer Institute. Retrieved July 5, 2021. ^ Webster GJ, Burroughs AK, Riordan SM (January 2005). \"Review article: portal vein thrombosis new insights into aetiology and management\". Alimentary Pharmacology & Therapeutics. 21 (1): 19. CiteSeerX10.1.1.536.2660. doi:10.1111/j.1365-2036.2004.02301.x. PMID15644039. S2CID5673778. Archived from the original on December 10, 2012. ^ DeLeve LD, Valla DC, Garcia-Tsao G (2009). \"Vascular disorders of the liver\". Hepatology. 49 (5): 172964. doi:10.1002/hep.22772. PMC6697263. PMID19399912.{{cite journal}}: CS1 maint: multiple names: authors list (link) ^ \"Renal vein thrombosis: MedlinePlus Medical Encyclopedia\". medlineplus.gov. Retrieved May 27, 2019. ^ Canho P, Ferro JM, Lindgren AG, etal. (August 2005). \"Causes and predictors of death in cerebral venous thrombosis\". Stroke. 36 (8): 17201725. doi:10.1161/01.STR.0000173152.84438.1c. PMID16002765."
    )
    generated_title = generate_title(references_input, model, tokenizer)
    print("Generated Title:", generated_title)


ROUGE-1 F1 Score: 48.29
ROUGE-2 F1 Score: 25.54
ROUGE-3 F1 Score: 10.56
ROUGE-L F1 Score: 48.05
Generated Title: vein thrombosis


In [None]:
import shutil

import os

# Specify the path to the checkpoint directory
checkpoint_dir = "/content/results/checkpoint-250"  # Replace with your specific checkpoint directory

# Name the archive after the checkpoint directory itself so the zip file
# cannot be mislabelled as a different checkpoint when the path above changes
archive_base = os.path.basename(os.path.normpath(checkpoint_dir))
output_filename = f"{archive_base}.zip"

# Compress the checkpoint directory into a zip file (make_archive appends ".zip")
shutil.make_archive(archive_base, 'zip', checkpoint_dir)

print(f"Checkpoint {checkpoint_dir} has been compressed into {output_filename}")


Checkpoint /content/results/checkpoint-250 has been compressed into checkpoint-1000.zip


In [None]:
from google.colab import files

# Download the zip archive named by `output_filename` (set in the previous
# cell) through Colab's `files` helper
files.download(output_filename)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

!cp -r /content/results/checkpoint-250 /content/drive/MyDrive/


Mounted at /content/drive
