In [1]:
!pip install transformers
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer, T5Config
from datasets import load_dataset
from transformers import TrainingArguments, Trainer

In [3]:
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
config = T5Config.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name, config=config)

In [5]:
from datasets import load_dataset

dataset = load_dataset("csv", data_files={"data": "table.csv"})
train_dataset, validation_dataset = dataset["data"].train_test_split(test_size=0.1).values()




  0%|          | 0/1 [00:00<?, ?it/s]

In [6]:
def preprocess_function(examples):
    inputs = [f"inject knowledge: {input_text}" for input_text in examples["input_text"]]
    targets = examples["target_text"]
    input_tokenized = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    target_tokenized = tokenizer(targets, max_length=512, truncation=True, padding="max_length")
    return {"input_ids": input_tokenized["input_ids"], "attention_mask": input_tokenized["attention_mask"], "labels": target_tokenized["input_ids"]}

train_tokenized_dataset = train_dataset.map(preprocess_function, batched=True)
validation_tokenized_dataset = validation_dataset.map(preprocess_function, batched=True)


Map:   0%|          | 0/36 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

In [7]:
!pip install transformers[torch]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [8]:
!pip install accelerate -U

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [10]:
training_args = TrainingArguments(
    output_dir="output",
    num_train_epochs=300,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    logging_dir="logs",
    learning_rate=3e-5,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized_dataset,
    eval_dataset=validation_tokenized_dataset,
)


In [11]:
trainer.train()



Epoch,Training Loss,Validation Loss
1,No log,7.115379
2,No log,2.867525
3,No log,1.084715
4,No log,0.327612
5,No log,0.206846
6,No log,0.194575
7,No log,0.193249
8,No log,0.192819
9,No log,0.190618
10,No log,0.189683


In [12]:
trainer.save_model("fine_tuned_t5")

In [19]:
def generate_text(prompt, model, tokenizer, max_length=512, device="cpu"):
    model.to(device)
    input_text = f"inject knowledge: {prompt}"
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)
    output_ids = model.generate(input_ids, max_length=max_length, num_return_sequences=1)
    generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return generated_text

prompt = "Explain process to verify details of documents submitted by applicant?"
generated_text = generate_text(prompt, model, tokenizer)
print(generated_text)


The applicant should submit documents submitted by applicant and verify the details of documents submitted by applicant.
