# T5 model for text simplification

## Import modules

In [1]:
!pip install --upgrade pip
!pip install --upgrade torch
!pip install SentencePiece
!pip uninstall transformers -y
!pip install transformers
!pip install rich
!pip install rouge
!pip install evaluate
!pip install Levenshtein
!pip install spacy
!pip install nltk
!python -m spacy download en_core_web_sm

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Found existing installation: transformers 4.28.1
Uninstalling transformers-4.28.1:
  Successfully uninstalled transformers-4.28.1
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting transformers
  Using cached https://pypi.tuna.tsinghua.edu.cn/packages/d8/a7/a6ff727fd5d96d6625f4658944a2ae230f0c75743a9a117fbda013b03d3d/transformers-4.28.1-py3-none-any.whl (7.0 MB)
Installing collected packages: transformers
Successfully installed transformers-4.28.1
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple


Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m


[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [2]:
import os
import re
import json
import string
import numpy as np
import pandas as pd
from tqdm import tqdm
from rich import box
from rich.console import Console
from rich.table import Column, Table
from collections import defaultdict
import torch
import torch.nn.functional as F
from torch import cuda
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import T5TokenizerFast, T5ForConditionalGeneration
from string import punctuation
import Levenshtein
import spacy
import time

import multiprocessing
from multiprocessing import Pool, Lock
import threading
from queue import Queue
from functools import lru_cache

## Data Processor

In [3]:
class DataProcessor:
    def __init__(self, data_path='train/simpletext_task3_train.json', qrels_path='train/simpletext_task3_qrels.json'):
        self.data_path = data_path
        self.qrels_path = qrels_path
        
    def load_data(self, path):
        """
        Load data from a JSON file
        """
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        return data

    def get_query_dict(self, file):
        """
        Count the number of occurrences of each query text in the data
        """
        query_list = [data['query_text'] for data in file]
        query_dict = defaultdict(int)
        for query in query_list:
            query_dict[query] += 1
        return query_dict

    def preprocess_data(self, data):
        """
        Preprocess the source sentences by adding the query text and replacing periods with commas
        """
        data['source_snt'] = data['source_snt'].str.replace('.',',', regex=False)
        data['source_snt'] = data[['source_snt', 'query_text']].agg(' related to '.join, axis=1)+"."
        return data

    def load_data_and_get_dataframe(self):
        """
        Load the data and the query relevance judgments from JSON files and merge them into a single dataframe
        """
        data = self.load_data(self.data_path)
        query_dict = self.get_query_dict(data)
        query_json = json.dumps(query_dict, indent=4)
        qrels = self.load_data(self.qrels_path)
        merged_data = pd.concat([pd.DataFrame(data), pd.DataFrame(qrels)], axis=1)
        preprocessed_data = self.preprocess_data(merged_data)
        df = preprocessed_data[['source_snt', 'simplified_snt']]
        return data, query_dict, query_json, df

    def get_max_and_avg_length(self, data, column_name):
        """
        Compute the maximum and average length of the sentences in a given column of the dataframe
        """
        lengths = [len(s.split()) for s in data[column_name]]
        max_length = max(lengths)
        average_length = np.mean(lengths)
        return max_length, average_length

    def print_data_info(self):
        """
        Load the data, preprocess it, and compute various statistics on it
        """
        data, query_dict, query_json, df = self.load_data_and_get_dataframe()
        print(f"Data size: {len(data)}")
        print(f"The number of query texts: {len(query_dict)}")
        print(f"The amount of data per query text: {query_json}")
    
        max_length, average_length = self.get_max_and_avg_length(df, 'source_snt')
        print('Max length of source sentence: ',max_length,'\n'+'Avg length of source sentence: ',average_length)

        max_length, average_length = self.get_max_and_avg_length(df, 'simplified_snt')
        print('Max length of simplified sentence: ',max_length,'\n'+'Avg length of simplified sentence ',average_length)

        return df

In [4]:
dp = DataProcessor()
df = dp.print_data_info()
df

FileNotFoundError: [Errno 2] No such file or directory: 'train/simpletext_task3_train.json'

 ## Text Complexity Scorer

In [None]:
from spacy.cli import download, link
from spacy.util import is_package, get_package_path
from source.helper import yield_lines, load_dump, dump
from source.resources import download_glove, DUMPS_DIR
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))

In [255]:
class TextComplexityScorer:
    def __init__(self):
        self.MODEL = 'en_core_web_sm'
        self.DUMPS_DIR = DUMPS_DIR
        self.WORD_EMBEDDINGS_NAME = "glove.42B.300d"
        self.word2rank = None

    def spacy_process(self, text):
        """
        Pre-processes the text using the spacy library.
        """
        if not spacy.util.is_package(self.MODEL):
            spacy.cli.download(self.MODEL)
            spacy.cli.link(self.MODEL, self.MODEL, force=True,
                            model_path=spacy.util.get_package_path(self.MODEL))
        return spacy.load(self.MODEL)(str(text))

    @lru_cache(maxsize=1024)
    def get_dependency_tree_depth(self, sentence):
        """
        Computes the dependency tree depth of the given sentence.
        """
        def tree_height(node):
            if len(list(node.children)) == 0:
                return 0
            return 1 + max(tree_height(child) for child in node.children)

        tree_depths = [tree_height(spacy_sentence.root) for spacy_sentence in self.spacy_process(sentence).sents]
        if len(tree_depths) == 0:
            return 0
        return max(tree_depths)

    @lru_cache(maxsize=1)
    def get_word2rank(self, vocab_size=np.inf):
        """
        Downloads and pre-processes the GloVe word embeddings.
        """
        model_filepath = self.DUMPS_DIR / f"{self.WORD_EMBEDDINGS_NAME}.pk"
        if model_filepath.exists():
            return load_dump(model_filepath)

        print("Downloading glove.42B.300d ...")
        download_glove(model_name=self.WORD_EMBEDDINGS_NAME, dest_dir=str(self.DUMPS_DIR))
        print("Preprocessing word2rank...")

        word2rank = {}
        with yield_lines(self.DUMPS_DIR / f"{self.WORD_EMBEDDINGS_NAME}.txt") as lines_generator:
            for i, line in enumerate(lines_generator):
                if vocab_size is not None and i >= vocab_size:
                    break
                word = line.split(' ')[0]
                word2rank[word] = i
                
        dump(word2rank, model_filepath)
        txt_file = DUMPS_DIR / f'{WORD_EMBEDDINGS_NAME}.txt'
        zip_file = DUMPS_DIR / f'{WORD_EMBEDDINGS_NAME}.zip'
        if txt_file.exists(): txt_file.unlink()
        if zip_file.exists(): zip_file.unlink()
        return word2rank

    @lru_cache(maxsize=2048)
    def get_rank(self, word, normalized=False):
        """
        Computes the rank of the given word in the word2rank vocabulary.
        """
        if self.word2rank is None:
            self.word2rank = self.get_word2rank()
        max_rank = len(self.word2rank)

        rank = self.word2rank.get(word, max_rank)
        rank = np.log(1 + rank)
        if normalized:
            rank = rank / np.log(1 + max_rank)

        return rank

    @lru_cache(maxsize=2048)
    def get_word_complexity_score(self, sentence):
        """
        Computes the complexity score of the given sentence based on the ranks of its words in the word2rank vocabulary.
        """
        words = [word.lower() for word in tokenize(sentence)
                 if word.lower() not in stopwords and
                 not all(char in string.punctuation for char in word)]

        words = [word for word in words if word in self.get_word2rank()]

        if not words:
            return 1.0
        return np.quantile([get_rank(word) for word in words], 0.75)

    def get_text_complexity(self, simple_sentences, complex_sentences):
        """
        Computes the complexity score of the given text
        """
        scores = []
        for i in range(len(simple_sentences)):
            simple_score = self.get_word_complexity_score(simple_sentences[i])
            complex_score = self.get_word_complexity_score(complex_sentences[i])
            if complex_score == 0:
                score = 0
            else:
                score = simple_score / complex_score
            score = min(float(f'%.2f' % score), 1.0)
            scores.append(score)
        return np.mean(scores)

In [256]:
complexity_scorer = TextComplexityScorer()
rank = complexity_scorer.get_word_complexity_score('Current academic and industrial research is interested in autonomous vehicles.')
rank

7.829503794772458

In [257]:
simple_sentences=['Current academic and industrial research is interested in autonomous vehicles.']
complex_sentences=['In the modern era of automation and robotics, autonomous vehicles are currently the focus of academic and industrial research, related to drones.']
complexity_scorer.get_text_complexity(simple_sentences,complex_sentences)

0.88

## Features

In [268]:
class RatioFeature:
    def __init__(self, feature_extractor, target_ratio=0.80):
        self.feature_extractor = feature_extractor
        self.target_ratio = str(target_ratio)

    def encode_sentence(self, sentence):
        return f'{self.name}_{self.target_ratio}'

    def encode_sentence_pair(self, complex_sentence, simple_sentence):
        return f'{self.name}_{self.feature_extractor(complex_sentence, simple_sentence)}', simple_sentence

    def decode_sentence(self, encoded_sentence):
        return encoded_sentence

    @property
    def name(self):
        class_name = self.__class__.__name__.replace('RatioFeature', '')
        name = ""
        for word in re.findall('[A-Z][^A-Z]*', class_name):
            if word: name += word[0]
        if not name: name = class_name
        return name
    
    
class WordRatioFeature(RatioFeature):
    def __init__(self, *args, **kwargs):
        super().__init__(self.get_word_length_ratio, *args, **kwargs)

    def get_word_length_ratio(self, complex_sentence, simple_sentence):
        return round(safe_division(len(tokenize(simple_sentence)), len(tokenize(complex_sentence))))

    
class CharRatioFeature(RatioFeature):
    def __init__(self, *args, **kwargs):
        super().__init__(self.get_char_length_ratio, *args, **kwargs)

    def get_char_length_ratio(self, complex_sentence, simple_sentence):
        return round(safe_division(len(simple_sentence), len(complex_sentence)))


class LevenshteinRatioFeature(RatioFeature):
    def __init__(self, *args, **kwargs):
        super().__init__(self.get_levenshtein_ratio, *args, **kwargs)

    def get_levenshtein_ratio(self, complex_sentence, simple_sentence):
        return round(Levenshtein.ratio(complex_sentence, simple_sentence))


class WordRankRatioFeature(RatioFeature):
    def __init__(self, *args, **kwargs):
        super().__init__(self.get_word_rank_ratio, *args, **kwargs)

    def get_word_rank_ratio(self, complex_sentence, simple_sentence):
        return round(min(safe_division(self.get_lexical_complexity_score(simple_sentence),
                                       self.get_lexical_complexity_score(complex_sentence)), 2))

    def get_lexical_complexity_score(self, sentence):
        words = tokenize(remove_stopwords(remove_punctuation(sentence)))
        words = [word for word in words if word in get_word2rank()]
        if len(words) == 0:
            return np.log(1 + len(get_word2rank()))
        return np.quantile([self.get_rank(word) for word in words], 0.75)

    @lru_cache(maxsize=5000)
    def get_rank(self, word):
        rank = get_word2rank().get(word, len(get_word2rank()))
        return np.log(1 + rank)


class DependencyTreeDepthRatioFeature(RatioFeature):
    def __init__(self, *args, **kwargs):
        super().__init__(self.get_dependency_tree_depth_ratio, *args, **kwargs)

    def get_dependency_tree_depth_ratio(self, complex_sentence, simple_sentence):
        return round(
            safe_division(self.get_dependency_tree_depth(simple_sentence),
                          self.get_dependency_tree_depth(complex_sentence)))
    
    @lru_cache(maxsize=1024)
    def get_dependency_tree_depth(self, sentence):
        def get_subtree_depth(node):
            if len(list(node.children)) == 0:
                return 0
            return 1 + max([get_subtree_depth(child) for child in node.children])

        tree_depths = [get_subtree_depth(spacy_sentence.root) for spacy_sentence in self.spacy_process(sentence).sents]
        if len(tree_depths) == 0:
            return 0
        return max(tree_depths)
    
    @lru_cache(maxsize=10 ** 6)
    def spacy_process(self, text):
        return get_spacy_model()(text)

## Preprocessor

In [269]:
class Preprocessor:
    def __init__(self, features_kwargs=None):
        super().__init__()

        self.features = self.get_features(features_kwargs)
        if features_kwargs:
            self.hash = generate_hash(str(features_kwargs).encode())
        else:
            self.hash = "no_feature"

    def get_class(self, class_name, *args, **kwargs):
        return globals()[class_name](*args, **kwargs)

    def get_features(self, feature_kwargs):
        features = []
        for feature_name, kwargs in feature_kwargs.items():
            features.append(self.get_class(feature_name, **kwargs))
        return features

    def encode_sentence(self, sentence):
        if self.features:
            line = ''
            for feature in self.features:
                line += feature.encode_sentence(sentence) + ' '
            line += sentence
            return line.rstrip()
        else:
            return sentence

    def encode_sentence_pair(self, complex_sentence, simple_sentence):
        if self.features:
            line = ''
            for feature in self.features:
                processed_complex, _ = feature.encode_sentence_pair(complex_sentence, simple_sentence)
                line += processed_complex + ' '
            line += complex_sentence
            return line.rstrip()

        else:
            return complex_sentence

    def decode_sentence(self, encoded_sentence):
        for feature in self.features:
            decoded_sentence = feature.decode_sentence(encoded_sentence)
        return decoded_sentence

    def encode_file(self, input_filepath, output_filepath):
        with open(output_filepath, 'w') as f:
            for line in yield_lines(input_filepath):
                f.write(self.encode_sentence(line) + '\n')

    def decode_file(self, input_filepath, output_filepath):
        with open(output_filepath, 'w') as f:
            for line in yield_lines(input_filepath):
                f.write(self.decode_sentence(line) + '\n')

    def encode_dataframe(self,dataset):
        processed_complex_sentences = []
        for complex_sentence, simple_sentence in tqdm(zip(dataset['source_snt'], dataset['simplified_snt']),total=len(dataset)):
            processed_complex_sentence = self.encode_sentence_pair(complex_sentence, simple_sentence)
            processed_complex_sentences.append(processed_complex_sentence)
        return processed_complex_sentences

    def preprocess_dataset(self, dataset):
        new_df=dataset.copy()
        new_df['source_snt']= self.encode_dataframe(dataset)
        return new_df

In [270]:
features_kwargs = {
        'WordRatioFeature': {'target_ratio': '1.05'},
        'CharRatioFeature': {'target_ratio': '0.95'},
        'LevenshteinRatioFeature': {'target_ratio': '0.75'},
        'WordRankRatioFeature': {'target_ratio': '0.85'},
        'DependencyTreeDepthRatioFeature': {'target_ratio': '0.85'}
    }
# features_kwargs = {}
preprocessor = Preprocessor(features_kwargs)

In [271]:
preprocessor = Preprocessor(features_kwargs)
# preprocessor.encode_sentence_pair('In the modern era of automation and robotics, autonomous vehicles are currently the focus of academic and industrial research, related to drones.','Current academic and industrial research is interested in autonomous vehicles.')
preprocessor.encode_sentence('In the modern era of automation and robotics, autonomous vehicles are currently the focus of academic and industrial research, related to drones.')

'W_1.05 C_0.95 L_0.75 WR_0.85 DTD_0.85 In the modern era of automation and robotics, autonomous vehicles are currently the focus of academic and industrial research, related to drones.'

In [272]:
simple_sentence='Current academic and industrial research is interested in autonomous vehicles.'
complex_sentence='In the modern era of automation and robotics, autonomous vehicles are currently the focus of academic and industrial research, related to drones.'
round(min(safe_division(get_lexical_complexity_score(simple_sentence),
                                       get_lexical_complexity_score(complex_sentence)),2))

'0.88'

simple_sentence='Current academic and industrial research is interested in autonomous vehicles.'
complex_sentence='In the modern era of automation and robotics, autonomous vehicles are currently the focus of academic and industrial research, related to drones.'
round(min(safe_division(get_complexity_score(simple_sentence),
                                       get_complexity_score(complex_sentence)), 2))

In [273]:
new_df = preprocessor.preprocess_dataset(df)

100%|██████████| 648/648 [00:12<00:00, 53.39it/s]


In [274]:
new_df

Unnamed: 0,source_snt,simplified_snt
0,W_0.44 C_0.54 L_0.47 WR_0.88 DTD_0.80 In the m...,Current academic and industrial research is in...
1,W_0.44 C_0.44 L_0.52 WR_1.00 DTD_0.36 With the...,Drones are increasingly used in the civilian a...
2,W_0.77 C_0.78 L_0.73 WR_0.98 DTD_0.75 Due to g...,Governments set guidelines on the operation ce...
3,W_0.51 C_0.61 L_0.61 WR_1.00 DTD_0.50 In an at...,Researchers propose data-driven solutions allo...
4,W_0.64 C_0.67 L_0.51 WR_0.96 DTD_0.86 Derived ...,"The algorithm, based on the Inception model, d..."
...,...,...
643,W_0.38 C_0.37 L_0.54 WR_1.00 DTD_0.62 Bodybuil...,Bodybuilders train with moderate loads and sho...
644,W_0.62 C_0.57 L_0.73 WR_0.98 DTD_0.36 Powerlif...,"Powerlifters, on the other hand, train with hi..."
645,W_0.79 C_0.77 L_0.81 WR_0.95 DTD_1.00 Although...,Although both groups are known to display impr...
646,W_0.85 C_0.75 L_0.78 WR_0.93 DTD_0.75 It has b...,It has been shown that many factors mediate th...


## Test data

In [277]:
# test_l=load_data('test/simpletext-task3-test-large.json')
# test_m=load_data('test/simpletext-task3-test-medium.json')
test_s=load_data('test/simpletext-task3-test-small.json')
def load_test_set(file):
    dataset=pd.DataFrame(file,dtype='object')
    for index, row in dataset.iterrows():
        if not pd.isnull(row['query_text']):
            dataset.at[index, 'source_snt'] = row['source_snt'].replace('.', ',') + ' related to ' + row['query_text'] + '.'
    dataset=dataset[['source_snt']]
    return dataset
test_small=load_test_set(test_s)
# test_medium=load_test_set(test_m)
# test_large=load_test_set(test_l)

In [278]:
# define a rich console logger
console = Console(record=True)

# to display dataframe in ASCII format
def display_df(df):
    """display dataframe in ASCII format"""

    console = Console()
    table = Table(
        Column("source_text", justify="left"),
        Column("target_text", justify="left"),
        title="Sample Data",
        pad_edge=False,
        box=box.ASCII,
    )

    for i, row in enumerate(df.values.tolist()):
        table.add_row(row[0], row[1])

    console.print(table)

# Setting up the device for GPU usage
device = 'cuda' if cuda.is_available() else 'cpu'

## Dataset

In [279]:
class TrainDataSetClass(Dataset):
    """
    Creating a custom dataset for reading the dataset and
    loading it into the dataloader to pass it to the
    neural network for finetuning the model

    """

    def __init__(
        self, dataframe, tokenizer, source_len, target_len, source_text, target_text
    ):
        """
        Initializes a Dataset class

        Args:
            dataframe (pandas.DataFrame): Input dataframe
            tokenizer (transformers.tokenizer): Transformers tokenizer
            source_len (int): Max length of source text
            target_len (int): Max length of target text
            source_text (str): column name of source text
            target_text (str): column name of target text
        """
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.summ_len = target_len
        self.target_text = self.data[target_text]
        self.source_text = self.data[source_text]

    def __len__(self):
        """returns the length of dataframe"""

        return len(self.target_text)

    def __getitem__(self, index):
        """return the input ids, attention masks and target ids"""

        row_source_text = str(self.source_text[index])
        target_text = str(self.target_text[index])

        # cleaning data so as to ensure data is in string type
        source_text = " ".join(row_source_text.split())
        target_text = " ".join(target_text.split())

        source = self.tokenizer.batch_encode_plus(
            [source_text],
            max_length=self.source_len,
            pad_to_max_length=True,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        target = self.tokenizer.batch_encode_plus(
            [target_text],
            max_length=self.summ_len,
            pad_to_max_length=True,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )

        source_ids = source["input_ids"].squeeze()
        source_mask = source["attention_mask"].squeeze()
        target_ids = target["input_ids"].squeeze()
        target_mask = target["attention_mask"].squeeze()

        return {
            "source_text":row_source_text,
            "source_ids": source_ids.to(dtype=torch.long),
            "source_mask": source_mask.to(dtype=torch.long),
            "target_ids": target_ids.to(dtype=torch.long),
            "target_mask": target_mask.to(dtype=torch.long),
        }

In [280]:
class TestDataSetClass(Dataset):
    """
    Creating a custom dataset for reading the dataset and
    loading it into the dataloader to pass it to the
    neural network for finetuning the model

    """

    def __init__(
        self, dataframe, tokenizer, source_len, source_text
    ):
        """
        Initializes a Dataset class

        Args:
            dataframe (pandas.DataFrame): Input dataframe
            tokenizer (transformers.tokenizer): Transformers tokenizer
            source_len (int): Max length of source text
            source_text (str): column name of source text

        """
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.source_text = self.data[source_text]
        
    def __len__(self):
        """returns the length of dataframe"""

        return len(self.source_text)

    def __getitem__(self, index):
        """return the input ids, attention masks"""

        source_text = str(self.source_text[index])

        # cleaning data so as to ensure data is in string type
        source_text = " ".join(source_text.split())

        source = self.tokenizer.batch_encode_plus(
            [source_text],
            max_length=self.source_len,
            pad_to_max_length=True,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )

        source_ids = source["input_ids"].squeeze()
        source_mask = source["attention_mask"].squeeze()

        return {
            "source_ids": source_ids.to(dtype=torch.long),
            "source_mask": source_mask.to(dtype=torch.long),
        }

## Train, validation and test

In [281]:
# training logger to log training progress
def training_logger_init():
    training_logger = Table(
        Column("Epoch", justify="center"),
        Column("Steps", justify="center"),
        Column("Loss", justify="center"),
        Column("ROUGE_Loss", justify="center"),
        Column("ROUGE_1", justify="center"),
        Column("ROUGE_2", justify="center"),
        Column("ROUGE_L", justify="center"),
        Column("SARI", justify="center"),
        Column("BLEU", justify="center"),
        Column("FKGL", justify="center"),
        title="Training Status",
        pad_edge=False,
        box=box.ASCII,
    )
    return training_logger

# training logger to log training progress
def epoch_training_logger_init():
    epoch_training_logger = Table(
        Column("Epoch", justify="center"),
        Column("Train_Loss", justify="center"),
        Column("Valid_Loss", justify="center"),
        Column("ROUGE_1", justify="center"),
        Column("ROUGE_2", justify="center"),
        Column("ROUGE_L", justify="center"),
        Column("SARI", justify="center"),
        Column("BLEU", justify="center"),
        Column("FKGL", justify="center"),
        title="Training Epoch Status",
        pad_edge=False,
        box=box.ASCII,
    )
    return epoch_training_logger

In [282]:
from rouge import Rouge
from evaluate import load
from nltk.translate.bleu_score import sentence_bleu

def calculate_rouge(predicted, target):
    rouger = Rouge()
    scores = rouger.get_scores(predicted, target, avg=True)
    return scores['rouge-1']['f'],scores['rouge-2']['f'],scores['rouge-l']['f']

def compute_bleu(predicted, target):
    bleu_scores = []
    for i in range(len(predicted)):
        bleu_scores.append(sentence_bleu([target[i]],predicted[i], weights=(0.25, 0.25, 0.25, 0.25)))
    return np.mean(bleu_scores)

def compute_sari(sources, predicted, target):
    sari = load("sari")
    sari_scores=sari.compute(sources=sources, predictions=predicted, references=[[i] for i in target])
    return sari_scores['sari']

sources=["About 95 species are currently accepted.","About 95 species are currently accepted."]
predictions=["About 95 species are currently accepted.","About 95 species are currently accepted."]
references=["About 95 species are currently known.","About 95 species are currently known."]
compute_bleu(predictions,references)
# compute_sari(sources,predictions,references)

0.7721102818691421

In [283]:
from evaluation.sari import corpus_sari
from evaluation.bleu import corpus_bleu
from evaluation.fkgl import corpus_fkgl
sources=["About 95 species are currently accepted.","About 95 species are currently accepted."]
predictions=["About 95 species are currently accepted.","About 95 species are currently accepted."]
references=["About 95 species are currently known.","About 95 species are currently known."]
corpus_sari(sources,predictions,[references],lowercase=True)
corpus_bleu(predictions,[references],lowercase=True)
# corpus_fkgl(sources)

64.34588841607616

In [284]:
def train(epoch, tokenizer, model, device, loader, optimizer,model_params):
    model.train()
    total_loss=[]
    for _, data in enumerate(loader, 0):
        y = data["target_ids"].to(device, dtype=torch.long)
        y_ids = y[:, :-1].contiguous()
        lm_labels = y[:, 1:].clone().detach()
        lm_labels[y[:, 1:] == tokenizer.pad_token_id] = -100
        ids = data["source_ids"].to(device, dtype=torch.long)
        mask = data["source_mask"].to(device, dtype=torch.long)
        target_mask = data["target_mask"].to(device, dtype=torch.long)[:, 1:]
        source_text = data["source_text"]

        outputs = model(
            input_ids=ids,
            attention_mask=mask,
            decoder_input_ids=y_ids,
            labels=lm_labels,
            decoder_attention_mask=target_mask,
        )
        loss = outputs[0]
        
        generated_ids = model.generate(
                  input_ids = ids,
                  attention_mask = mask, 
                  max_length=model_params["MAX_TARGET_TEXT_LENGTH"], 
                  num_beams=model_params["NUM_BEAMS"],
                  repetition_penalty=model_params["REPETITION_PENALTY"], 
                  length_penalty=model_params["LENGTH_PENALTY"], 
                  early_stopping=True,
                  do_sample=False,
                  temperature=0.25,
                  top_k=120,
                  top_p=0.98,
                  )

        # calculate rouge score
        preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]
        target = [tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True)for t in y]
        rouge1,rouge2,rougeL = calculate_rouge(preds, target)
#         bleu_score = compute_bleu(preds, target)
#         sari_score = compute_sari(source_text,preds, target)
        sari_score=corpus_sari(source_text,preds, [target],lowercase=False)
        bleu_score=corpus_bleu(preds, [target],lowercase=False)
        fkgl_score=corpus_fkgl(preds)

        # add rouge loss to total loss
        rouge_loss = 1.0 - np.mean([rouge1,rouge2,rougeL])
        complexity_score = get_lexical_complexity_score_batch(target,preds)#higher is better
        
        lambda_ = 0.7
        loss = lambda_ * loss + (1-lambda_)*(1-complexity_score)

        if _ % 10 == 0:
            training_logger.add_row(str(epoch), str(_), str(loss),str(rouge_loss),str(rouge1),str(rouge2),str(rougeL),str(sari_score),str(bleu_score),str(fkgl_score))
            console.print(training_logger)

        optimizer.zero_grad()
#         (loss + rouge_loss).backward()
        loss.backward()
        optimizer.step()

        total_loss.append(loss)
    average_loss=sum(total_loss)/len(total_loss)

    return average_loss

In [285]:
def validate(tokenizer, model, device, loader,model_params):

    """
    Function to evaluate model for predictions

    """
    model.eval()
    predictions = []
    actuals = []
    total_loss=[]
    rouge1_scores = []
    rouge2_scores = []
    rougeL_scores = []
    bleu_scores = []
    sari_scores = []
    fkgl_scores = []
    with torch.no_grad():
        for _, data in enumerate(loader, 0):
            y = data["target_ids"].to(device, dtype=torch.long)
            y_ids = y[:, :-1].contiguous()
            lm_labels = y[:, 1:].clone().detach()
            lm_labels[y[:, 1:] == tokenizer.pad_token_id] = -100
            ids = data["source_ids"].to(device, dtype=torch.long)
            mask = data["source_mask"].to(device, dtype=torch.long)
            target_mask = data["target_mask"].to(device, dtype=torch.long)[:, 1:]
            source_text = data["source_text"]
            
            outputs = model(
            input_ids=ids,
            attention_mask=mask,
            decoder_input_ids=y_ids,
            labels=lm_labels,
            decoder_attention_mask=target_mask,
            )
            loss = outputs[0]

            generated_ids = model.generate(
                  input_ids = ids,
                  attention_mask = mask, 
                  max_length=model_params["MAX_TARGET_TEXT_LENGTH"], 
                  num_beams=model_params["NUM_BEAMS"],
                  repetition_penalty=model_params["REPETITION_PENALTY"], 
                  length_penalty=model_params["LENGTH_PENALTY"], 
                  early_stopping=True,
                  do_sample=False,
                  temperature=0.25,
                  top_k=120,
                  top_p=0.98,
                  )
            preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]
            target = [tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True)for t in y]
            rouge1,rouge2,rougeL = calculate_rouge(preds, target)
            sari_score=corpus_sari(source_text,preds, [target],lowercase=False)
            bleu_score=corpus_bleu(preds, [target],lowercase=False)
            fkgl_score=corpus_fkgl(preds)
#             bleu_score = compute_bleu(preds, target)
#             sari_score = compute_sari(source_text,preds, target)

            predictions.extend(preds)
            actuals.extend(target)
            
            total_loss.append(loss)
            rouge1_scores.append(rouge1)
            rouge2_scores.append(rouge2)
            rougeL_scores.append(rougeL)
            sari_scores.append(sari_score)
            bleu_scores.append(bleu_score)
            fkgl_scores.append(fkgl_score)
            
    average_loss=sum(total_loss)/len(total_loss)
    average_rouge1=sum(rouge1_scores)/len(rouge1_scores)
    average_rouge2=sum(rouge2_scores)/len(rouge2_scores)
    average_rougeL=sum(rougeL_scores)/len(rougeL_scores)
    average_bleu=sum(bleu_scores)/len(bleu_scores)
    average_sari=sum(sari_scores)/len(sari_scores)
    average_fkgl=sum(fkgl_scores)/len(fkgl_scores)
    return predictions, actuals,average_loss, average_rouge1,average_rouge2,average_rougeL,average_bleu,average_sari,average_fkgl

In [286]:
def testing(tokenizer, model, device, loader,model_params):

    """
    Function to test model for predictions

    """
    model.eval()
    predictions = []
    with torch.no_grad():
        for _, data in enumerate(loader, 0):
            ids = data['source_ids'].to(device, dtype = torch.long)
            mask = data['source_mask'].to(device, dtype = torch.long)

            generated_ids = model.generate(
                input_ids = ids,
                attention_mask = mask, 
                max_length=model_params["MAX_TARGET_TEXT_LENGTH"], 
                num_beams=model_params["NUM_BEAMS"],
                repetition_penalty=model_params["REPETITION_PENALTY"], 
                length_penalty=model_params["LENGTH_PENALTY"], 
                early_stopping=True,
                do_sample=False,
                temperature=0.25,
                top_k=120,
                top_p=0.98,
                )
            preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]
            if _%10==0:
                console.print(f'Completed {_}')

            predictions.extend(preds)
    return predictions

## ModelTrainer and ModelTest

In [287]:
def ModelTrainer(
    dataframe, source_text, target_text, model,tokenizer,model_params, output_dir
):

    """
    T5 trainer

    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    # Set random seeds and deterministic pytorch for reproducibility
    torch.manual_seed(model_params["SEED"])  # pytorch random seed
    np.random.seed(model_params["SEED"])  # numpy random seed
    torch.backends.cudnn.deterministic = True

    # logging
    console.log(f"""[Model]: Loading {model_params["MODEL"]}...\n""")
    model = model.to(device)

     # logging
    console.log(f"[Data]: Reading data...\n")

    # Importing the raw dataset
    dataframe = dataframe[[source_text, target_text]]

    # Creation of Dataset and Dataloader
    # Defining the train size. So 80% of the data will be used for training and the rest for validation.
    train_size = 0.8
    train_dataset = dataframe.sample(frac=train_size, random_state=model_params["SEED"])
    train_dataset["source_snt"] = "simplify: " + train_dataset["source_snt"]
    train_dataset = train_dataset.reset_index(drop=True)
    
    val_dataset = dataframe.drop(train_dataset.index).reset_index(drop=True)
    if model_params["CONTROL_TOKENS"]:
        val_dataset["source_snt"]=[preprocessor.encode_sentence(i[38:]) for i in val_dataset["source_snt"]]
    val_dataset["source_snt"] = "simplify: " + val_dataset["source_snt"]
    
    display_df(train_dataset.head(2))

    console.print(f"FULL Dataset: {dataframe.shape}")
    console.print(f"TRAIN Dataset: {train_dataset.shape}")
    console.print(f"VALID Dataset: {val_dataset.shape}\n")

    # Creating the Training and Validation dataset for further creation of Dataloader
    training_set = TrainDataSetClass(
        train_dataset,
        tokenizer,
        model_params["MAX_SOURCE_TEXT_LENGTH"],
        model_params["MAX_TARGET_TEXT_LENGTH"],
        source_text,
        target_text,
    )
    val_set = TrainDataSetClass(
        val_dataset,
        tokenizer,
        model_params["MAX_SOURCE_TEXT_LENGTH"],
        model_params["MAX_TARGET_TEXT_LENGTH"],
        source_text,
        target_text,
    )

    # Defining the parameters for creation of dataloaders
    train_params = {
        "batch_size": model_params["TRAIN_BATCH_SIZE"],
        "shuffle": True,
        "num_workers": 0,
    }

    val_params = {
        "batch_size": model_params["VALID_BATCH_SIZE"],
        "shuffle": False,
        "num_workers": 0,
    }

    # Creation of Dataloaders for testing and validation. This will be used down for training and validation stage for the model.
    training_loader = DataLoader(training_set, **train_params)
    val_loader = DataLoader(val_set, **val_params)

    # Defining the optimizer that will be used to tune the weights of the network in the training session.
    optimizer = torch.optim.AdamW(
        params=model.parameters(), lr=model_params["LEARNING_RATE"]
    )

    # Training loop
    console.log(f"[Initiating Fine Tuning]...\n")

    for epoch in range(model_params["TRAIN_EPOCHS"]):
        train_loss =train(epoch, tokenizer, model, device, training_loader, optimizer,model_params)
        predictions, actuals, valid_loss, average_rouge1,average_rouge2,average_rougeL,average_bleu,average_sari,average_fkgl = validate(tokenizer, model, device, val_loader,model_params)
        epoch_training_logger.add_row(str(epoch), str(train_loss), str(valid_loss), str(average_rouge1),str(average_rouge2),str(average_rougeL),str(average_sari),str(average_bleu),str(average_fkgl))
        console.print(epoch_training_logger)
        
    final_df = pd.DataFrame({"Generated Text": predictions, "Actual Text": actuals})
    final_df.to_csv(os.path.join(output_dir, "valid_predictions.csv"))

    console.log(f"[Saving Model]...\n")
    # Saving the model after training
    path = os.path.join(output_dir, "model_files")
    model.save_pretrained(path)
    tokenizer.save_pretrained(path)

    console.save_text(os.path.join(output_dir, "logs.txt"))

    console.print(
        f"""[Model] Model saved @ {os.path.join(output_dir, "model_files")}\n"""
    )
    console.print(
        f"""[Validation] Generation on Validation data saved @ {os.path.join(output_dir,'valid_predictions.csv')}\n"""
    )
    console.print(f"""[Logs] Logs saved @ {os.path.join(output_dir,'logs.txt')}\n""")

In [288]:
def ModelTest(
   test,size,source_text,model,tokenizer,model_params, output_dir
):
    """
    T5 test

    """        
    # logging
    console.log(f"""[Model]: Testing {model_params["MODEL"]}...\n""")
    model = model.to(device)

    # logging
    console.log(f"[Data]: Reading {size} test data...\n")

    # Creation of Dataset and Dataloader
    test = test.reset_index(drop=True)
    console.print(f"Test {size} Dataset: {test.shape}")

    # Creating the Training and Validation dataset for further creation of Dataloader
    test_set = TestDataSetClass(
        test,
        tokenizer,
        model_params["MAX_SOURCE_TEXT_LENGTH"],
        source_text,
    )

    # Defining the parameters for creation of dataloaders
    test_params = {
        "batch_size": model_params["TEST_BATCH_SIZE"],
        "shuffle": False,
        "num_workers": 0,
    }
    # Creation of Dataloaders for testing and validation. This will be used down for training and validation stage for the model.
    test_loader = DataLoader(test_set, **test_params)

    # Testing loop
    console.log(f"[Initiating Testing]...\n")
    predictions = testing(tokenizer, model, device, test_loader,model_params)
    final_df = pd.DataFrame({"simplified_snt": predictions})
    final_df.to_csv(os.path.join(output_dir, f"test_{size}_predictions.csv"))

    console.log(f"[Testing Completed.]\n")
    console.print(
        f"""[Test] Generation on Test data saved @ {os.path.join(output_dir,f'test_{size}_predictions.csv')}\n"""
    )

In [289]:
def test(test_set,model,tokenizer,size,output_dir):
    test_set=test_set.copy()
    if model_params["CONTROL_TOKENS"]:
        test_set["source_snt"]=[preprocessor.encode_sentence(i) for i in test_set["source_snt"]]
    test_set["source_snt"] = "simplify: " + test_set["source_snt"]
    ModelTest(
        test=test_set,
        source_text="source_snt",
        size=size,
        model=model,
        tokenizer=tokenizer,
        model_params=model_params,
        output_dir=output_dir,
    )

## Load model

In [290]:
def load_model(output_dir,model_class,tokenizer_class,model_name):

    tokenizer = tokenizer_class.from_pretrained(model_name)
    model_config = model_class.config_class.from_pretrained(model_name)

    model_path = os.path.join(output_dir, "model_files")

    model = model_class.from_pretrained(model_path, config=model_config)

    return model, tokenizer

def generate_summary(text,model,tokenizer):
    if model_params["CONTROL_TOKENS"]:
        text=preprocessor.encode_sentence(text)
    input_ids = tokenizer.encode("simplify: "+text, return_tensors='pt', 
                                    max_length=tokenizer.model_max_length, 
                                    truncation=True)
    summary_ids = model.generate(input_ids, 
                                 max_length=model_params["MAX_TARGET_TEXT_LENGTH"], 
                                 num_beams=model_params["NUM_BEAMS"],
                                 repetition_penalty=model_params["REPETITION_PENALTY"], 
                                 length_penalty=model_params["LENGTH_PENALTY"],
                                 do_sample=False,
                                 temperature=0.25,
                                 top_k=120,
                                 top_p=0.98,)

    summary = tokenizer.decode(summary_ids.squeeze(), skip_special_tokens=True, clean_up_tokenization_spaces=True)
    print(summary)

## T5 model with control tokens

### T5 Training loop

In [None]:
# tokenzier for encoding the text
model_params = {
    "MODEL": "t5-base",  # model_type: t5-base/t5-large
    "TRAIN_BATCH_SIZE": 8,  # training batch size
    "VALID_BATCH_SIZE": 8,  # validation batch size
    "TEST_BATCH_SIZE": 8,  # test batch size
    "TRAIN_EPOCHS": 5,  # number of training epochs
    "LEARNING_RATE": 3e-4,  # learning rate
    "MAX_SOURCE_TEXT_LENGTH": 100,  # max length of source text
    "MAX_TARGET_TEXT_LENGTH": 75,  # max length of target text
    "SEED": 42,  # set seed for reproducibility
    "NUM_BEAMS":8,
    "REPETITION_PENALTY":2.5, 
    "LENGTH_PENALTY":1, 
    "CONTROL_TOKENS":True,
}

# Defining the model. We are using t5-base model and added a Language model layer on top for generation of Summary.
# Further this model is sent to device (GPU/TPU) for using the hardware.
model = T5ForConditionalGeneration.from_pretrained(model_params["MODEL"])
tokenizer = T5TokenizerFast.from_pretrained(model_params["MODEL"])
training_logger = training_logger_init()
epoch_training_logger=epoch_training_logger_init()

# control tokens
features_kwargs = {
        'WordRatioFeature': {'target_ratio': '1.05'},
        'CharRatioFeature': {'target_ratio': '0.95'},
        'LevenshteinRatioFeature': {'target_ratio': '0.75'},
        'WordRankRatioFeature': {'target_ratio': '0.95'},
        'DependencyTreeDepthRatioFeature': {'target_ratio': '0.85'}
    }
# features_kwargs = {}
preprocessor = Preprocessor(features_kwargs)

ModelTrainer(
    dataframe=new_df,
    source_text="source_snt",
    target_text="simplified_snt",
    model=model,
    tokenizer=tokenizer,
    model_params=model_params,
    output_dir="T5_outputs",
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


### T5 Testing

In [31]:
model,tokenizer=load_model("T5_outputs",T5ForConditionalGeneration,T5Tokenizer,"t5-small")
test(test_small,model,tokenizer,'small',"T5_outputs")
# test(test_medium,model,tokenizer,'medium',"T5_outputs")
# test(test_medium,model,tokenizer,'large',"T5_outputs")

In [54]:
# model,tokenizer=load_model("T5_outputs",T5ForConditionalGeneration,T5TokenizerFast,"t5-base")
text='This is moving us to a tipping point and to a crossroads: we must decide between a society in which the actions are determined in a top-down way and then implemented by coercion or manipulative technologies (such as personalized ads and nudging) or a society, in which decisions are taken in a free and participatory way and mutually coordinated.'
generate_summary(text,model,tokenizer)

This is moving us to a tipping point and to a crossroads: we must choose between a society in which the actions are determined top-down by coercion or manipulative technologies (such as personalized ads and nudging) or in which decisions are taken freely and cooperatively coordinated.


In [33]:
def output_format(size, run):
    #load test results
    file=load_data(f'test/simpletext-task3-test-{size}.json')

    df1=pd.read_csv(f"T5_outputs/test_{size}_predictions.csv")
    df2=pd.DataFrame(file)
    snt_id=df2[['snt_id']]
    
    df1 = df1.drop(df1.columns[0], axis=1)
    df1.insert(0, 'run_id', f'QH_task_3_run{run}')
    df1.insert(1, 'manual', 0)
    df1.insert(2, 'snt_id', snt_id)

    df1.to_json(f'{size}_pre_with_id.json', orient='records')

In [34]:
run = 1
for size in ['small']:
    output_format(size, run)

## T5 model without control tokens

In [None]:
# tokenzier for encoding the text
model_params = {
    "MODEL": "t5-small",  # model_type: t5-base/t5-large
    "TRAIN_BATCH_SIZE": 8,  # training batch size
    "VALID_BATCH_SIZE": 8,  # validation batch size
    "TEST_BATCH_SIZE": 8,  # test batch size
    "TRAIN_EPOCHS": 3,  # number of training epochs
    "LEARNING_RATE": 1e-4,  # learning rate
    "MAX_SOURCE_TEXT_LENGTH": 70,  # max length of source text
    "MAX_TARGET_TEXT_LENGTH": 55,  # max length of target text
    "SEED": 42,  # set seed for reproducibility
    "NUM_BEAMS":8,
    "REPETITION_PENALTY":2.5, 
    "LENGTH_PENALTY":1, 
    "CONTROL_TOKENS":False,
}

# Defining the model. We are using t5-base model and added a Language model layer on top for generation of Summary.
# Further this model is sent to device (GPU/TPU) for using the hardware.
model = T5ForConditionalGeneration.from_pretrained(model_params["MODEL"])
tokenizer = T5TokenizerFast.from_pretrained(model_params["MODEL"])
training_logger = training_logger_init()
epoch_training_logger=epoch_training_logger_init()

# features_kwargs = {}
preprocessor = Preprocessor(features_kwargs)

ModelTrainer(
    dataframe=df,
    source_text="source_snt",
    target_text="simplified_snt",
    model=model,
    tokenizer=tokenizer,
    model_params=model_params,
    output_dir="T5_outputs",
)

## BART model

### BART training

In [33]:
# tokenzier for encoding the text
model_params = {
    "MODEL": 'facebook/bart-large-cnn',  # model_type: t5-base/t5-large
    "TRAIN_BATCH_SIZE": 8,  # training batch size
    "VALID_BATCH_SIZE": 8,  # validation batch size
    "TEST_BATCH_SIZE": 8,  # test batch size
    "TRAIN_EPOCHS": 1,  # number of training epochs
    "LEARNING_RATE": 1e-5,  # learning rate
    "MAX_SOURCE_TEXT_LENGTH": 100,  # max length of source text
    "MAX_TARGET_TEXT_LENGTH": 75,  # max length of target text
    "SEED": 42,  # set seed for reproducibility
    "NUM_BEAMS":4,
    "REPETITION_PENALTY":2.5, 
    "LENGTH_PENALTY":0.75, 
}

from transformers import BartTokenizer, BartForConditionalGeneration

# 加载BART模型和分词器
model = BartForConditionalGeneration.from_pretrained(model_params["MODEL"])
tokenizer = BartTokenizer.from_pretrained(model_params["MODEL"])
model.model.decoder.generation_mode=False

training_logger = training_logger_init()
epoch_training_logger=epoch_training_logger_init()

train_data=df.copy()
train_data["source_snt"] = tokenizer.bos_token + train_data["source_snt"] + tokenizer.eos_token
ModelTrainer(
    dataframe=train_data,
    source_text="source_snt",
    target_text="simplified_snt",
    model=model,
    tokenizer=tokenizer,
    model_params=model_params,
    output_dir="Bart_outputs",
)

### Bart Testing

In [None]:
# model,tokenizer=load_model("Bart_outputs")
model,tokenizer=load_model("Bart_outputs",BartForConditionalGeneration,BartTokenizer,'facebook/bart-large-cnn')
test(test_small,model,tokenizer,'small',"Bart_outputs")
test(test_medium,model,tokenizer,'medium',"Bart_outputs")
test(test_medium,model,tokenizer,'large',"Bart_outputs")

In [36]:
model,tokenizer=load_model("Bart_outputs",BartForConditionalGeneration,BartTokenizer,'facebook/bart-large-cnn')
text='<s>In the modern era of automation and robotics, autonomous vehicles are currently the focus of academic and industrial research, related to drones.</s>'
generate_summary(text,model,tokenizer)

In the modern era of automation and robotics, autonomous vehicles are currently the focus of academic and industrial research. It is an important field of study because drones can be automated or robotically controlled. However, in the industrial and academic studies, people with human form of control are not allowed to drive autonomous vehicles.


## FlaxT5 model

### Flax T5 training

In [40]:
# tokenzier for encoding the text
model_params = {
    "MODEL": 'google/t5-v1-1.1.0',  # model_type: t5-base/t5-large
    "TRAIN_BATCH_SIZE": 8,  # training batch size
    "VALID_BATCH_SIZE": 8,  # validation batch size
    "TEST_BATCH_SIZE": 8,  # test batch size
    "TRAIN_EPOCHS": 1,  # number of training epochs
    "LEARNING_RATE": 3e-5,  # learning rate
    "MAX_SOURCE_TEXT_LENGTH": 100,  # max length of source text
    "MAX_TARGET_TEXT_LENGTH": 75,  # max length of target text
    "SEED": 42,  # set seed for reproducibility
    "NUM_BEAMS":4,
    "REPETITION_PENALTY":2.5, 
    "LENGTH_PENALTY":0.75, 
}

from transformers import GPT2Tokenizer, GPT2LMHeadModel

# 加载预训练的GPT-2模型和tokenizer
from transformers import FlaxT5ForConditionalGeneration, T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained(model_name)
model = FlaxT5ForConditionalGeneration.from_pretrained(model_name)

training_logger = training_logger_init()
epoch_training_logger=epoch_training_logger_init()


ModelTrainer(
    dataframe=train_data,
    source_text="source_snt",
    target_text="simplified_snt",
    model=model,
    tokenizer=tokenizer,
    model_params=model_params,
    output_dir="GPT2_outputs",
)

TypeError: forward() got an unexpected keyword argument 'decoder_input_ids'