# Text Summarization fine-tuning script

This script contains the source code for fine-tuning a T5 model on an **end-to-end automatic summarization task** in both English and Japanese.

## Sample usage of SPL for fine-tuning:

| inputlookup summarization_jp
| fields body_text summary_text 
| rename summary_text as summary body_text as text
| head 10
| fit MLTKContainer algo=appNLP_summarization_final max_epochs=1 lang=jp base_model=t5_summarization_jp batch_size=4 summary from text into app:t5_summarization_jp_finetuned_final as extracted_summary

## Sample usage of SPL for applying:

| inputlookup summarization_jp
| fields body_text summary_text 
| rename summary_text as summary body_text as text
| head 10
| apply t5_summarization_jp_finetuned_final

## Stage 0 - import libraries
At stage 0 we import all the libraries that the subsequent code depends on.

In [1]:
# this definition exposes all python module imports that should be available in all subsequent commands

import json
import numpy as np
import pandas as pd
from pathlib import Path
import re
import math
import time
import copy
from tqdm import tqdm
import pandas as pd
import tarfile
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch import optim
from torch.utils.data import DataLoader
from transformers import T5ForConditionalGeneration, T5Tokenizer, AutoModelForSeq2SeqLM, AutoTokenizer
# from torchmetrics.text.rouge import ROUGEScore
# tensorboard related
from torch.utils.tensorboard import SummaryWriter
import tensorboard
import datetime
import logging
import sys
import io
import os
import psutil
import shutil
# Fine-tune parameters initialization
# Module-level defaults. Several of them are shadowed by local variables of the
# same name inside init(), fit() and apply(), as noted per constant below.

# Root of the model storage; init() and fit() build their own local MODEL_NAME
# under "/srv/app/model/data/summarization/<lang>/<base_model>".
MODEL_NAME = "/srv/app/model/data"
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Token limits handed to the tokenizer in fit() (max_length with truncation and
# "max_length" padding) for the source texts and the target summaries.
max_length_src = 400
max_length_target = 200

# Default batch sizes for the training and validation DataLoaders; fit()
# overrides them when a `batch_size` parameter is supplied.
batch_size_train = 4
batch_size_valid = 4

# fit() takes the epoch count from the `max_epochs` parameter instead of this
# value; `patience` is the early-stopping threshold that fit() compares its
# no-improvement counter against.
epochs = 100
patience = 20

# Placeholder only: fit() and apply() compute the real model directory locally.
MODEL_DIRECTORY = "/"

class T5FineTuner(nn.Module):
    """Thin ``nn.Module`` wrapper around a T5 conditional-generation model.

    The wrapped Hugging Face model is exposed as ``self.model`` so callers can
    reach its own methods (e.g. ``save_pretrained``) directly.
    """

    def __init__(self, MODEL_NAME):
        """Load the pretrained weights stored at ``MODEL_NAME``.

        ``local_files_only=True`` is passed so loading is restricted to files
        that are already present on disk.
        """
        super().__init__()

        pretrained = T5ForConditionalGeneration.from_pretrained(
            MODEL_NAME, local_files_only=True
        )
        self.model = pretrained

    def forward(self, input_ids, attention_mask=None, decoder_input_ids=None,
                decoder_attention_mask=None, labels=None):
        """Delegate the forward pass to the wrapped model.

        ``input_ids`` is passed positionally and every other argument by
        keyword, unchanged; the wrapped model's output is returned as-is.
        """
        optional_inputs = {
            "attention_mask": attention_mask,
            "decoder_input_ids": decoder_input_ids,
            "decoder_attention_mask": decoder_attention_mask,
            "labels": labels,
        }
        return self.model(input_ids, **optional_inputs)

## Stage 1 - get a data sample from Splunk
In Splunk run a search to pipe a prepared dataset into this environment. (internal testing only)

| inputlookup summarization_en
| fields text summary
| head 5
| fit MLTKContainer algo=appNLP_summarization_final mode=stage max_epochs=1 lang=en base_model=t5_summarization_en summary from text into app:t5_summarization_en_finetuned_final as extracted_summary

In [5]:
# this cell is not executed from MLTK and should only be used for staging data into the notebook environment
def stage(name, data_dir="/srv/notebooks/data"):
    """Load a staged dataset and its parameters into the notebook.

    Reads ``<data_dir>/<name>.csv`` into a DataFrame and
    ``<data_dir>/<name>.json`` into a Python object.

    Args:
        name: Base file name (without extension) of the staged files.
        data_dir: Directory holding the staged files. Defaults to the
            location this notebook has always read from.

    Returns:
        A ``(df, param)`` tuple: the parsed CSV and the decoded JSON.

    Raises:
        FileNotFoundError: If either staged file does not exist.
    """
    print("DEBUG stage call")
    print("DEBUG " + name)
    base_path = os.path.join(data_dir, name)
    # Read as UTF-8 explicitly: the datasets contain Japanese text, and the
    # platform default encoding is not guaranteed to be UTF-8.
    with open(base_path + ".csv", 'r', encoding="utf-8") as f:
        df = pd.read_csv(f)
    with open(base_path + ".json", 'r', encoding="utf-8") as f:
        param = json.load(f)
    return df, param

In [6]:
# Load the staged CSV/JSON pair for this model name into `df` (data) and `param` (parameters).
df, param = stage("t5_summarization_jp_finetuned_final")

DEBUG stage call
DEBUG t5_summarization_jp_finetuned_final


## Stage 2 - create and initialize a model

In [7]:
def init(df,param):
    """Create the T5 model to fine-tune and move it to the selected device.

    Args:
        df: Training dataframe; only its shape is logged here.
        param: Parameter dict. ``param['options']['params']`` must contain
            ``base_model``, ``max_epochs`` and ``lang``.

    Returns:
        A ``T5FineTuner`` loaded from
        ``/srv/app/model/data/summarization/<lang>/<base_model>`` and placed
        on the module-level ``device``.

    Raises:
        KeyError: If one of the required parameters is missing.
    """
    params = param['options']['params']
    tag = "-- process=fine_tuning_progress model={} max_epoch={} -- ".format(params['base_model'], params['max_epochs'])

    print(tag + "Training data loaded with shape: " + str(df.shape))
    print(tag + "Input parameters: ", params)
    # str() keeps the log lines working when a value arrives as a non-string
    # (e.g. an integer epoch count).
    print(tag + "Epoch number: " + str(params['max_epochs']))
    print(tag + "Base model: " + str(params['base_model']))

    print(tag + "Model Initialization: started")
    MODEL_NAME = os.path.join("/srv/app/model/data/summarization", params['lang'], params['base_model'])
    print(tag + "Model file in " + MODEL_NAME)
    model = T5FineTuner(MODEL_NAME)
    model = model.to(device)
    print(tag + "Model Initialization: successfully finished")

    # GPU memory calculation. Querying CUDA on a CPU-only host raises, so the
    # figures are reported as 0 when no GPU is available.
    if torch.cuda.is_available():
        t = torch.cuda.get_device_properties(0).total_memory
        r = torch.cuda.memory_reserved(0)
        a = torch.cuda.memory_allocated(0)
    else:
        t = r = a = 0
    # 15-minute load average relative to the number of CPUs, as a percentage.
    load15 = psutil.getloadavg()[2]
    cpu_usage = (load15/os.cpu_count()) * 100
    stat = shutil.disk_usage("/")

    print(tag + "#GPU memory --Total memory: {}, --Memory reserved: {}, --Memory allocated: {}. #CPU: {}% occupied. #disk {}".format(t,r,a,cpu_usage,stat))

    return model

In [8]:
# Build the base model named in the staged parameters and place it on `device`.
model = init(df,param)

-- process=fine_tuning_progress model=t5_summarization_jp max_epoch=1 -- Training data loaded with shape: (10, 2)
-- process=fine_tuning_progress model=t5_summarization_jp max_epoch=1 -- Input parameters:  {'algo': 'appNLP_summarization_final', 'mode': 'stage', 'max_epochs': '1', 'lang': 'jp', 'base_model': 't5_summarization_jp', 'batch_size': '4'}
-- process=fine_tuning_progress model=t5_summarization_jp max_epoch=1 -- Epoch number: 1
-- process=fine_tuning_progress model=t5_summarization_jp max_epoch=1 -- Base model: t5_summarization_jp
-- process=fine_tuning_progress model=t5_summarization_jp max_epoch=1 -- Model Initialization: started
-- process=fine_tuning_progress model=t5_summarization_jp max_epoch=1 -- Model file in /srv/app/model/data/summarization/jp/t5_summarization_jp
-- process=fine_tuning_progress model=t5_summarization_jp max_epoch=1 -- Model Initialization: successfully finished
-- process=fine_tuning_progress model=t5_summarization_jp max_epoch=1 -- #GPU memory --Tota

## Stage 3 - fit the model

In [9]:
def fit(model,df,param):
    """Fine-tune ``model`` on the ``text`` -> ``summary`` pairs in ``df``.

    The data is cleaned, split 85/15 into training and validation sets and
    trained with Adam for up to ``max_epochs`` epochs. Whenever the validation
    loss improves, the tokenizer and the model are written to
    ``/srv/app/model/data/summarization/<lang>/<model_name>``. Training stops
    early once the no-improvement counter exceeds the module-level
    ``patience``.

    Args:
        model: ``T5FineTuner`` instance (as returned by ``init``), already on
            ``device``. It is trained in place.
        df: DataFrame with ``text`` and ``summary`` columns; rows with a null
            in either column are dropped.
        param: Parameter dict. ``param['options']['params']`` must contain
            ``base_model``, ``max_epochs`` and ``lang`` and may contain
            ``batch_size``; ``param['options']['model_name']`` names the
            output directory.

    Returns:
        An empty dict.

    Raises:
        KeyError: If a required parameter is missing.
        ValueError: If fewer than two usable rows remain after cleaning.
    """
    params = param['options']['params']
    tag = "-- process=fine_tuning_progress model={} max_epoch={} -- ".format(params['base_model'], params['max_epochs'])

    # Start from the module-level defaults so that omitting `batch_size` no
    # longer leaves the batch sizes undefined.
    train_batch_size = batch_size_train
    valid_batch_size = batch_size_valid
    if "batch_size" in params:
        print(tag + "setting batch size to ", params['batch_size'])
        train_batch_size = valid_batch_size = int(params['batch_size'])

    def preprocess_text(text):
        # Strip line breaks, tabs and full-width spaces, then normalise case.
        # str() guards against non-string cells (e.g. numbers).
        text = re.sub(r'[\r\t\n\u3000]', '', str(text))
        return text.lower().strip()

    data = df.dropna(subset=['text', 'summary'])
    if len(data) < 2:
        raise ValueError("fit needs at least two rows with non-null 'text' and 'summary' to build a train/validation split")
    data = data.assign(
        text=data['text'].map(preprocess_text),
        summary=data['summary'].map(preprocess_text))

    MODEL_NAME = os.path.join("/srv/app/model/data/summarization", params['lang'], params['base_model'])
    tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME, is_fast=True)
    print(tag + "tokenizer intialized")
    print(tag + "Data vectorization: started")

    def generate_batch(batch):
        # Collate (source, target) string pairs into two padded tensor batches.
        sources, targets = zip(*batch)
        batch_src = tokenizer(
            list(sources), max_length=max_length_src, truncation=True, padding="max_length", return_tensors="pt"
        )
        batch_tgt = tokenizer(
            list(targets), max_length=max_length_target, truncation=True, padding="max_length", return_tensors="pt"
        )
        return batch_src, batch_tgt

    X_train, X_test, y_train, y_test = train_test_split(
        data['text'], data['summary'], test_size=0.15, random_state=42, shuffle=True
    )

    train_data = list(zip(X_train, y_train))
    valid_data = list(zip(X_test, y_test))

    train_iter = DataLoader(train_data, batch_size=train_batch_size, shuffle=True, collate_fn=generate_batch)
    # Validation is not shuffled: a fixed batch composition keeps the averaged
    # validation loss comparable from one epoch to the next.
    valid_iter = DataLoader(valid_data, batch_size=valid_batch_size, shuffle=False, collate_fn=generate_batch)
    print(tag + "Data vectorization: finished.")
    print(tag + "#Training data: " + str(len(train_data)) + ", #Test data: " + str(len(valid_data)))

    PAD_IDX = tokenizer.pad_token_id

    def batch_loss(src, tgt):
        # Padding positions are set to -100 so the loss ignores them.
        target_ids = tgt['input_ids'].to(device)
        labels = target_ids.masked_fill(target_ids == PAD_IDX, -100)
        outputs = model(
            input_ids=src['input_ids'].to(device),
            attention_mask=src['attention_mask'].to(device),
            decoder_attention_mask=tgt['attention_mask'].to(device),
            labels=labels
        )
        return outputs['loss']

    # Training function: one pass over `batches`, returns the mean batch loss.
    def train(batches, optimizer, i):
        model.train()

        total = len(batches)
        losses = 0
        for step, (src, tgt) in enumerate(batches, start=1):
            optimizer.zero_grad()
            loss = batch_loss(src, tgt)
            loss.backward()
            optimizer.step()
            losses += loss.item()

            print(tag + "Processed {}% of the {}-th epoch. Finished {} out of {} batches. Loss: {} ".format(round(step/total*100), i, step, total, round(losses / step,2)), flush=True)

        return losses / total

    # Loss function: mean batch loss over `batches` without gradient tracking.
    def evaluate(batches):
        model.eval()
        losses = 0
        with torch.no_grad():
            for src, tgt in batches:
                losses += batch_loss(src, tgt).item()

        return losses / len(batches)

    def log_resources():
        # Querying CUDA on a CPU-only host raises, so report zeros there.
        if torch.cuda.is_available():
            t = torch.cuda.get_device_properties(0).total_memory
            r = torch.cuda.memory_reserved(0)
            a = torch.cuda.memory_allocated(0)
        else:
            t = r = a = 0
        load15 = psutil.getloadavg()[2]
        cpu_usage = (load15/os.cpu_count()) * 100
        stat = shutil.disk_usage("/")
        print(tag + "#GPU memory --Total memory: {}, --Memory reserved: {}, --Memory allocated: {}. #CPU: {}% occupied. #disk {}".format(t,r,a,cpu_usage,stat), flush=True)

    epochs = int(params['max_epochs'])
    MODEL_DIRECTORY = os.path.join("/srv/app/model/data/summarization", params['lang'], param['options']['model_name'])

    optimizer = optim.Adam(model.parameters())

    best_loss = float('Inf')
    counter = 1

    print(tag + 'Model fine-tuning started with {} epochs'.format(epochs))

    for loop in range(1, epochs + 1):

        start_time = time.time()
        loss_train = train(train_iter, optimizer, loop)
        elapsed_time = time.time() - start_time

        loss_valid = evaluate(valid_iter)

        log_resources()

        improved = best_loss > loss_valid
        minutes = int(elapsed_time // 60)
        print(tag + '[{}/{}] train loss: {:.4f}, valid loss: {:.4f} [{}{:.0f}s] counter: {} {}'.format(
            loop, epochs, loss_train, loss_valid,
            str(minutes) + 'm' if minutes > 0 else '',
            elapsed_time % 60,
            counter,
            '**' if improved else ''
        ),flush=True)

        if improved:
            best_loss = loss_valid
            counter = 1
            # Persist the best weights immediately instead of keeping a deep
            # copy of the whole model in (GPU) memory; the directory therefore
            # always holds the best checkpoint seen so far.
            tokenizer.save_pretrained(MODEL_DIRECTORY)
            print(tag + "tokenizer saved in " + MODEL_DIRECTORY, flush=True)
            model.model.save_pretrained(MODEL_DIRECTORY)
            print(tag + "model saved in " + MODEL_DIRECTORY, flush=True)
        else:
            if counter > patience:
                break

            counter += 1

    print(tag + "Model fine-tuning successfully finished")
    returns = {}
    return returns

In [10]:
# Fine-tune the model on the staged data; the returned (empty) dict is not kept.
fit(model,df,param)

-- process=fine_tuning_progress model=t5_summarization_jp max_epoch=1 -- setting batch size to  4
-- process=fine_tuning_progress model=t5_summarization_jp max_epoch=1 -- tokenizer intialized
-- process=fine_tuning_progress model=t5_summarization_jp max_epoch=1 -- Data vectorization: started
-- process=fine_tuning_progress model=t5_summarization_jp max_epoch=1 -- Data vectorization: finished.
-- process=fine_tuning_progress model=t5_summarization_jp max_epoch=1 -- #Training data: 8, #Test data: 2
-- process=fine_tuning_progress model=t5_summarization_jp max_epoch=1 -- Model fine-tuning started with 1 epochs


RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 14.56 GiB total capacity; 1.46 GiB already allocated; 18.50 MiB free; 1.57 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

## Stage 4 - apply the model

In [11]:
def apply(model,df,param):
    """Generate a summary for every row of ``df`` with the fine-tuned model.

    The tokenizer and model are loaded from
    ``/srv/app/model/data/summarization/<lang>/<model_name>`` on every call;
    the ``model`` argument is accepted for interface compatibility but is not
    used.

    Args:
        model: Ignored.
        df: Input dataframe.
        param: Parameter dict. Reads ``param['options']['params']``
            (``base_model``, ``max_epochs``, ``lang``),
            ``param['options']['model_name']`` and
            ``param['feature_variables'][0]`` (the text column to summarise).

    Returns:
        A DataFrame with a single ``summary`` column, one row per input row,
        in input order.

    Raises:
        KeyError: If a required parameter or the feature column is missing.
    """
    print("DEBUG: enter apply")
    print(param)
    params = param['options']['params']
    tag = "-- process=fine_tuning_progress model={} max_epoch={} -- ".format(params['base_model'], params['max_epochs'])
    MODEL_DIRECTORY = os.path.join("/srv/app/model/data/summarization", params['lang'], param['options']['model_name'])
    print(MODEL_DIRECTORY)
    tokenizer = T5Tokenizer.from_pretrained(MODEL_DIRECTORY)
    # Run generation on the same device the rest of the notebook uses, rather
    # than always on the CPU.
    summarizer = T5ForConditionalGeneration.from_pretrained(MODEL_DIRECTORY).to(device)
    print("DEBUG: model inited")
    texts = df[param['feature_variables'][0]].values.tolist()
    summaries = []
    print(tag + "apply function read inputs")
    for text in texts:
        # str() tolerates non-string cells such as NaN or numbers.
        batch = tokenizer(str(text), max_length=400, truncation=True, return_tensors="pt")
        outputs = summarizer.generate(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
            max_length=400,
            repetition_penalty=8.0,
            num_beams=15,
        )
        summaries.extend(
            tokenizer.decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
            for ids in outputs
        )
    returns = pd.DataFrame(data={"summary": summaries})
    print(tag + "apply function successfully finished")

    return returns

In [22]:
# Summarise the staged rows; `returns` is a DataFrame with a `summary` column.
returns = apply(model,df,param)

debug: start apply
debug: read input
numpy version: 1.22.1
debug: start tokenizing
debug: finish tok and start summarizing
debug: finish summarizing and start decoding
debug: finish decoding
debug: finished
無線のプレミアムバージョンに加入しているユーザー番号は100145。通常加入よりも高速であるはずなのに、とても遅いという。お客様が契約したパッケージの説明とまったく同じだそう


## Stage 5 - save the model

In [14]:
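# save model to name in expected convention "<algo_name>_<model_name>.h5"
# (intentionally a no-op in this notebook: fit() already writes the tokenizer and model to the model directory)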
def save(model, name):
    """Placeholder save hook: writes nothing and returns an empty dict."""
    nothing_saved = dict()
    return nothing_saved

## Stage 6 - load the model

In [15]:
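# load model from name in expected convention "<algo_name>_<model_name>.h5"
# (returns an empty placeholder in this notebook: apply() loads the tokenizer and model from disk itself)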
def load(path):
    """Placeholder load hook: logs the call and returns an empty dict.

    ``path`` is accepted but not read.
    """
    print("DEBUG: load")
    return dict()

## Stage 7 - provide a summary of the model

In [16]:
# return model summary
def summary(model=None):
    """Placeholder summary hook: returns an empty dict for any ``model``."""
    return dict()

## End of Stages
All subsequent cells are not tagged and can be used for further freeform code