## Libraries

In [None]:
# Detect whether the notebook runs on Google Colab.
# Only a missing `google.colab` package means "not on Colab". Any other
# failure (e.g. the Drive mount being refused) is allowed to surface instead
# of being silently turned into IN_COLAB = False by a bare `except:`.
try:
    import google.colab
    from google.colab import drive
except ImportError:
    IN_COLAB = False
else:
    # Mount Google Drive so the project folder is reachable from the runtime.
    drive.mount('/content/drive', force_remount = True)
    IN_COLAB = True

Mounted at /content/drive


In [None]:
if IN_COLAB:
  !pip install transformers
  !pip install datasets
  !pip install evaluate
  !pip install sentencepiece
  !pip install wandb
  !pip install faiss-cpu
  !pip install bitsandbytes
  !pip install accelerate
  !pip install sentencepiece
  !pip install protobuf
  !pip install peft

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl (31.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m50.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.11.0
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.m

In [None]:
import os

# Environment switches, set before torch is imported below.
os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"
os.environ["TRANSFORMERS_NO_TF"] = "1"
os.environ["WANDB_DISABLED"] = "true"
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
os.environ["TF_USE_LEGACY_KERAS"] = "1"
os.environ["HF_HUB_DISABLE_XET"] = "1"
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import torch

# Project root: the mounted Drive folder on Colab, otherwise a user-supplied path.
if IN_COLAB:
    root_path = '/content/drive/My Drive/instructABSA'
else:
    root_path = 'Enter local path'  # <-- replace with the local project directory

# Fail with an actionable message instead of an opaque os.chdir error when the
# placeholder above was not replaced (or the Drive folder is missing).
if not os.path.isdir(root_path):
    raise FileNotFoundError(
        f"Project root {root_path!r} does not exist; "
        "set `root_path` to the instructABSA project directory."
    )

# `is_built()` only says PyTorch was compiled with MPS support; `is_available()`
# additionally requires that an MPS device can actually be used on this machine.
use_mps = torch.backends.mps.is_available()
os.chdir(root_path)

In [None]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd

from InstructABSA.data_prep import DatasetLoader
from InstructABSA.utils import T5GeneratorLLMOps, T5Classifier
from instructions import InstructionsHandler

## Training

In [None]:
# Identify the run: task, experiment label and the base checkpoint.
task_name = 'aoste-4'
experiment_name = 'aoste-flan-t5-base'
model_checkpoint = 'google/flan-t5-base'

# Output directory: ./Models/<task>/<checkpoint with '/' turned into '-'>-<experiment>
model_out_path = os.path.join(
    './Models',
    task_name,
    '-'.join(model_checkpoint.split('/')) + '-' + experiment_name,
)

print('Experiment Name: ', experiment_name)
print('Model output path: ', model_out_path)

Experiment Name:  aoste-flan-t5-base
Model output path:  ./Models/aoste-4/google-flan-t5-base-aoste-flan-t5-base


In [None]:
# Read the train / test splits from JSON into DataFrames.
id_train_file_path = './Data/train_aoste.json'  ### <-- Update with your data
id_test_file_path = './Data/test_aoste.json'   ### <-- Update with your data

id_tr_df, id_te_df = map(pd.read_json, (id_train_file_path, id_test_file_path))

# The instructions handler provides the prompt text placed around each input.
instruct_handler = InstructionsHandler()

# Choose instruction_set1 for InstructABSA-1 or instruction_set2 for InstructABSA-2
# (set 2 is the one loaded here).
instruct_handler.load_instruction_set2()

# Set bos_instruct1 for lapt14 and bos_instruct2 for rest14. For other datasets, modify the instructions.py file.
loader = DatasetLoader(id_tr_df, id_te_df)


def _to_aoste_format(split_df):
    """Run one split through the loader's AOSTE formatter with the shared column/prompt settings."""
    return loader.create_data_in_aoste_format(
        df=split_df,
        key='term',
        label_key='polarity',
        text_col='raw_words',
        aspect_col='aspects',
        opinion_col='opinions',
        bos_instruction=instruct_handler.aoste['bos_instruct1'],
        eos_instruction=instruct_handler.aoste['eos_instruct'],
    )


# Reformat whichever of the two splits the loader actually holds, in place on the loader.
for _split_attr in ('train_df_id', 'test_df_id'):
    _split_df = getattr(loader, _split_attr)
    if _split_df is not None:
        setattr(loader, _split_attr, _to_aoste_format(_split_df))

In [None]:
# Instantiate the T5 helper used below for tokenization and training.
# NOTE(review): WANDB_DISABLED is set to "true" earlier in this notebook while a
# W&B project name is passed here — confirm which behaviour is intended.
t5_exp = T5GeneratorLLMOps(
    model_checkpoint=model_checkpoint,
    max_new_tokens=128,
    use_lora=True,       # enable LoRA
    use_dora=False,      # set to True to try DoRA instead
    lora_r=8,
    dora_r=4,
    quantize_fp16=True,  # load the model in 16-bit
    # Semantic cache location.
    # NOTE(review): this is an absolute path at the filesystem root, unlike the
    # relative './Models' used for model_out_path — confirm this is intended.
    cache_folder='/Models/faiss_cache',
    wandb_project=experiment_name,  # project name in WandB
)

# Tokenize the datasets with the helper's input tokenization function.
id_ds, id_tokenized_ds, ood_ds, ood_tokenized_ds = loader.set_data_for_training_semeval(
    t5_exp.tokenize_function_inputs
)

# Keyword arguments later unpacked into t5_exp.train(...).
training_args = dict(
    output_dir=model_out_path,
    eval_strategy='epoch',
    learning_rate=5e-5,
    lr_scheduler_type='cosine',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    num_train_epochs=30,
    weight_decay=0.01,
    warmup_ratio=0.1,
    save_strategy='no',
    load_best_model_at_end=False,
    push_to_hub=False,
    eval_accumulation_steps=1,
    predict_with_generate=True,
    use_mps_device=use_mps,
)

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mlpphat22[0m ([33mbigdata-project[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Map:   0%|          | 0/973 [00:00<?, ? examples/s]

Map:   0%|          | 0/243 [00:00<?, ? examples/s]

In [None]:
# Display the tokenized dataset container (splits, feature names and row counts).
id_tokenized_ds

DatasetDict({
    train: Dataset({
        features: ['raw_words', 'words', 'aspects', 'opinions', 'labels', 'text', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 973
    })
    test: Dataset({
        features: ['raw_words', 'words', 'aspects', 'opinions', 'labels', 'text', 'input_ids', 'attention_mask'],
        num_rows: 243
    })
})

In [None]:
# Split the test dataset in half.
# A fixed seed makes the shuffle, and therefore the contents of both halves,
# reproducible across kernel restarts; without it every run evaluates on a
# different test/validation partition.
train_test_split = id_tokenized_ds['test'].train_test_split(test_size=0.5, seed=42)
train_test_split

DatasetDict({
    train: Dataset({
        features: ['raw_words', 'words', 'aspects', 'opinions', 'labels', 'text', 'input_ids', 'attention_mask'],
        num_rows: 121
    })
    test: Dataset({
        features: ['raw_words', 'words', 'aspects', 'opinions', 'labels', 'text', 'input_ids', 'attention_mask'],
        num_rows: 122
    })
})

In [None]:
# Store the two halves under their final names: the split's 'train' half
# becomes the new 'test' set, and the split's 'test' half becomes 'validation'.
id_tokenized_ds['test'], id_tokenized_ds['validation'] = (
    train_test_split['train'],
    train_test_split['test'],
)
id_tokenized_ds

DatasetDict({
    train: Dataset({
        features: ['raw_words', 'words', 'aspects', 'opinions', 'labels', 'text', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 973
    })
    test: Dataset({
        features: ['raw_words', 'words', 'aspects', 'opinions', 'labels', 'text', 'input_ids', 'attention_mask'],
        num_rows: 121
    })
    validation: Dataset({
        features: ['raw_words', 'words', 'aspects', 'opinions', 'labels', 'text', 'input_ids', 'attention_mask'],
        num_rows: 122
    })
})

In [None]:
!python -m pip install --upgrade pip
!python -m pip install -U accelerate
!python -m pip install -U transformers

Collecting pip
  Downloading pip-25.1.1-py3-none-any.whl.metadata (3.6 kB)
Downloading pip-25.1.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.1.1
Collecting accelerate
  Downloading accelerate-1.7.0-py3-none-any.whl.metadata (19 kB)
Downloading accelerate-1.7.0-py3-none-any.whl (362 kB)
Installing collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 1.6.0
    Uninstalling accelerate-1.6.0:
      Successfully uninstalled accelerate-1.6.0
Successfully installed accelerate-1.7.0


In [None]:
# Train model: pass the tokenized splits to the T5 helper's train() method and
# unpack the `training_args` dict as keyword arguments. The returned object is
# kept in `model_trainer`.
model_trainer = t5_exp.train(id_tokenized_ds, **training_args)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.



Model training started ....


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,No log,2.9375
2,No log,1.738281
3,No log,1.288086
4,No log,1.102539
5,2.184700,1.00293
6,2.184700,0.947266
7,2.184700,0.925781
8,2.184700,0.881348
9,1.065900,0.864258
10,1.065900,0.821289
