# Task 1

安裝語音處理與模型訓練相關的 Python 套件，包含：

datasets: HuggingFace 資料集

librosa: 音訊讀取

transformers: Whisper 模型

jiwer: 評估語音辨識錯誤率（如 WER/MER）

In [None]:
!pip install  datasets evaluate jiwer librosa matplotlib
!pip install --upgrade bitsandbytes transformers==4.50.0 accelerate opencc
!pip install audiomentations
!pip install torchaudio
!pip install safetensors peft accelerate 



## Imoprt Package

In [2]:
import sys
sys.path.append('./AICUP') 

In [3]:
import os
import json
import torch
import random
import librosa
import zipfile
import evaluate
import numpy as np
import pandas as pd
from datasets import Dataset, Audio, load_dataset, Features, Value, concatenate_datasets
from AICUP import (DataCollatorSpeechSeq2SeqWithPadding,
      transcribe_with_timestamps,
      collate_batch_with_prompt_template,
      generate_annotated_audio_transcribe_parallel,OpenDeidBatchSampler)
from transformers import (
    WhisperFeatureExtractor, WhisperTokenizer, WhisperProcessor,
    WhisperForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer, AutoTokenizer
)

  from .autonotebook import tqdm as notebook_tqdm


## Download Audio File

In [None]:
zip_path = r"Training_Dataset.zip"
extract_dir = r"Training_Dataset"
os.makedirs(extract_dir, exist_ok=True)

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
  zip_ref.extractall(extract_dir)

audio_files = []
for root, dirs, files in os.walk(extract_dir):
  for file in files:
    if file.lower().endswith('.wav'):
      audio_files.append(os.path.join(root, file))

print(f"Found {len(audio_files)} audio files")
# for path in audio_files:
#   print(path)

Found 0 audio files


In [5]:
def set_torch_seed(seed=0):
  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  torch.backends.cudnn.deterministic = True
  torch.backends.cudnn.benchmark = False
  if torch.cuda.is_available():
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
set_torch_seed(42)

In [6]:
train_audio_folder = r"Training_Dataset\audio"
train_transcription_file = r"Training_Dataset\task1_answer.txt"
transcripts, dataset_list = {}, []

with open(train_transcription_file, "r", encoding="utf-8") as f:
  for line in f:
    if line.strip():
      parts = line.strip().split("\t", 1)
      # print(parts)
      if len(parts) == 2:
        filename, transcript = parts
        transcripts[filename] = transcript

for file in sorted(os.listdir(train_audio_folder)):
  if file.endswith(".wav") and file.split(".")[0] in transcripts:
    try:
      file_path = os.path.join(train_audio_folder, file)
      audio_array, sr = librosa.load(file_path, sr=16000)
      dataset_list.append({"audio":
                 {
                  "path":file_path,
                  "array":audio_array,
                  "sampling_rate":sr
                 },
                 "sentence": transcripts[file.split(".")[0]]})
    except Exception as e:
      print(e)
      print(f"Can't read {file_path}:{e}")

dataset = Dataset.from_pandas(pd.DataFrame(dataset_list))

In [7]:
print(dataset)

Dataset({
    features: ['audio', 'sentence'],
    num_rows: 1539
})


In [8]:
dataset[0]['audio']['path']

'Training_Dataset\\audio\\1000.wav'

In [9]:

dataset[0]['audio']['array'][:20]

[8.841743692755699e-07,
 -4.0726736187934875e-06,
 -2.194661647081375e-06,
 8.789938874542713e-07,
 -4.919711500406265e-07,
 2.3731263354420662e-07,
 7.35744833946228e-08,
 -1.1399970389902592e-06,
 -6.317859515547752e-06,
 -3.2662064768373966e-06,
 6.971240509301424e-07,
 -5.945330485701561e-07,
 7.411435944959521e-07,
 -1.0784860933199525e-06,
 1.874344889074564e-06,
 -6.328860763460398e-06,
 -6.247661076486111e-06,
 1.782434992492199e-06,
 -9.54663846641779e-07,
 5.38479071110487e-07]

In [10]:
print("Audio Sample:",len(dataset[0]['audio']['array']))
print("Audio Duration:",len(dataset[0]['audio']['array'])/16000)

Audio Sample: 375968
Audio Duration: 23.498


In [11]:
split_ratio = 0.8
train_size = int(len(dataset) * split_ratio)
dataset = dataset.train_test_split(train_size=train_size,
     test_size=len(dataset) - train_size, 
     shuffle=True, 
     seed=42)
train_dataset = dataset["train"]
test_dataset = dataset["test"]

print(f"Train: {len(train_dataset)} samples, Test: {len(test_dataset)} samples")

Train: 1231 samples, Test: 308 samples


In [12]:
test_dataset[0]

{'audio': {'array': [-0.0002897786907851696,
   -0.0005036171060055494,
   -0.0004905923269689083,
   -0.0005729645490646362,
   -0.0005184835754334927,
   -0.0005295630544424057,
   -0.000534487422555685,
   -0.00042258482426404953,
   -0.0003298153169453144,
   -0.00035896780900657177,
   -0.00024633342400193214,
   -0.0001579797826707363,
   -0.00010512629523873329,
   -8.964911103248596e-06,
   9.787734597921371e-05,
   0.00010839616879820824,
   0.0002268604002892971,
   0.000322938896715641,
   0.0002529188059270382,
   0.0003058691509068012,
   0.0003188366536051035,
   0.0002481886185705662,
   0.0003550788387656212,
   0.0003586788661777973,
   0.00023046229034662247,
   0.00024468451738357544,
   0.00016804644837975502,
   0.0001358063891530037,
   8.22087749838829e-05,
   -1.1086929589509964e-05,
   2.483837306499481e-06,
   1.9224360585212708e-05,
   -6.415508687496185e-05,
   -0.00010079378262162209,
   -0.00018653180450201035,
   -0.0003216792829334736,
   -0.000227001029

In [13]:
train_audio_folder = r"LibriSpeech_Dataset\audio"
train_transcription_file = r"LibriSpeech_Dataset\LibriSpeech.txt"
transcripts, dataset_list = {}, []

with open(train_transcription_file, "r", encoding="utf-8") as f:
  for line in f:
    if line.strip():
      parts = line.strip().split("\t", 1)
      # print(parts)
      if len(parts) == 2:
        filename, transcript = parts
        transcripts[filename] = transcript

for file in sorted(os.listdir(train_audio_folder)):
  if file.endswith(".wav") and file.split(".")[0] in transcripts:
    try:
      file_path = os.path.join(train_audio_folder, file)
      audio_array, sr = librosa.load(file_path, sr=16000)
      dataset_list.append({"audio":
                 {
                  "path":file_path,
                  "array":audio_array,
                  "sampling_rate":sr
                 },
                 "sentence": transcripts[file.split(".")[0]]})
    except Exception as e:
      print(e)
      print(f"Can't read {file_path}:{e}")

dataset = Dataset.from_pandas(pd.DataFrame(dataset_list))

# split_ratio = 0.8
# train_size = int(len(dataset) * split_ratio)
# dataset = dataset.train_test_split(train_size=train_size,
#      test_size=len(dataset) - train_size, 
#      shuffle=True, 
#      seed=42)

LibriSpeech_train_dataset = dataset

print(f"LibriSpeech Train: {len(LibriSpeech_train_dataset)} samples")

LibriSpeech Train: 5567 samples


## Model Import

In [14]:
model_name = "openai/whisper-small"  #"small", "medium", "large"
feature_extractor = WhisperFeatureExtractor.from_pretrained(model_name)
tokenizer = WhisperTokenizer.from_pretrained(model_name, use_fast=True)
tokenizer.padding_side = 'left'
processor = WhisperProcessor.from_pretrained(model_name)
model = WhisperForConditionalGeneration.from_pretrained(model_name)
model.generation_config.language = None
model.generation_config.task = "transcribe"
model.generation_config.forced_decoder_ids = None

In [None]:
model.config.dropout = 0.1
model.config.attention_dropout = 0.1
model.config.activation_dropout = 0.1
model.config.decoder_attention_dropout = 0.1
model.config.encoder_attention_dropout = 0.1


In [16]:
print(model.config)

WhisperConfig {
  "_attn_implementation_autoset": true,
  "activation_dropout": 0.1,
  "activation_function": "gelu",
  "apply_spec_augment": false,
  "architectures": [
    "WhisperForConditionalGeneration"
  ],
  "attention_dropout": 0.1,
  "begin_suppress_tokens": [
    220,
    50257
  ],
  "bos_token_id": 50257,
  "classifier_proj_size": 256,
  "d_model": 768,
  "decoder_attention_dropout": 0.1,
  "decoder_attention_heads": 12,
  "decoder_ffn_dim": 3072,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 12,
  "decoder_start_token_id": 50258,
  "dropout": 0.1,
  "encoder_attention_dropout": 0.1,
  "encoder_attention_heads": 12,
  "encoder_ffn_dim": 3072,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_token_id": 50257,
  "forced_decoder_ids": [
    [
      1,
      50259
    ],
    [
      2,
      50359
    ],
    [
      3,
      50363
    ]
  ],
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "mask_feature_length": 10,
  "mask_feature_min_masks": 0,
  "mask_feature_

### Pre-evaluate

In [17]:
from jiwer import mer
from transformers.models.whisper.english_normalizer import BasicTextNormalizer


def calculate_mer(ground_truth_texts, predicted_texts):
  """ Mix Error Rate (MER) English only"""
  mer_scores = {}
  total_mer = 0
  count = 0

  normalizer = BasicTextNormalizer()

  for filename, ref_text in ground_truth_texts.items():
    if filename in predicted_texts:
      pred_text = predicted_texts[filename]
      ref_text = normalizer(ref_text)
      pred_text = normalizer(pred_text)
      mer_score = mer(ref_text, pred_text)
      mer_scores[filename] = mer_score
      total_mer += mer_score
    else:
      mer_scores[filename] = 1
      total_mer += 1
    count += 1

  average_mer = total_mer / count if count != 0 else 0
  return mer_scores, average_mer

def evaluate_mer(model, processor, dataset):
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  model.to(device)
  predictions = {}
  references = {}

  for sample in dataset:
    filename = sample["audio"]["path"].split("/")[-1].split(".")[0]
    audio_array = sample['audio']['array']
    sr = sample['audio']['sampling_rate']
    input_features = processor.feature_extractor(
        audio_array,
        sampling_rate=sr,
        return_tensors="pt"
    ).input_features.to(model.device)
    with torch.no_grad():
      predicted_ids = model.generate(input_features)
      transcription = processor.tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)[0].strip()

    predictions[filename] = transcription
    references[filename] = sample["sentence"].strip()

  mer_scores, avg_mer = calculate_mer(references, predictions)
  return mer_scores, avg_mer, predictions, references

In [None]:
mer_scores, avg_mer, predictions, references = evaluate_mer(model, processor, test_dataset)
print("All mer scores:\n",mer_scores)
print("Average mer scores:",avg_mer)
print("Predict :\n",predictions)
print("Answer :\n",references)

You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, 50259], [2, 50359], [3, 50363]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


All mer scores:
 {'Training_Dataset\\audio\\14137': 0.05747126436781609, 'Training_Dataset\\audio\\16159': 0.024390243902439025, 'Training_Dataset\\audio\\81228': 1.0, 'Training_Dataset\\audio\\13240': 0.1323529411764706, 'Training_Dataset\\audio\\18098': 0.34615384615384615, 'Training_Dataset\\audio\\36127': 0.01639344262295082, 'Training_Dataset\\audio\\37724': 0.0, 'Training_Dataset\\audio\\54856': 0.03571428571428571, 'Training_Dataset\\audio\\15599': 0.06153846153846154, 'Training_Dataset\\audio\\12309': 0.03333333333333333, 'Training_Dataset\\audio\\37013': 0.06666666666666667, 'Training_Dataset\\audio\\18397': 0.036585365853658534, 'Training_Dataset\\audio\\40211': 0.018518518518518517, 'Training_Dataset\\audio\\41350': 0.05172413793103448, 'Training_Dataset\\audio\\54877': 0.07692307692307693, 'Training_Dataset\\audio\\40959': 0.0, 'Training_Dataset\\audio\\33239': 0.02857142857142857, 'Training_Dataset\\audio\\40092': 0.010638297872340425, 'Training_Dataset\\audio\\3984': 0.25

## Dataset

In [20]:
print(processor.feature_extractor(test_dataset[2]['audio']['array'],
    sampling_rate=test_dataset[2]['audio']["sampling_rate"])['input_features'])
print(tokenizer(test_dataset[2]['sentence']).input_ids)

[[[ 0.58007383  0.15902716  0.6395861  ... -0.73975885 -0.73975885
   -0.73975885]
  [ 0.5141828   0.69328636  0.7562165  ... -0.73975885 -0.73975885
   -0.73975885]
  [ 0.6525948   0.6869429   0.63942    ... -0.73975885 -0.73975885
   -0.73975885]
  ...
  [-0.41415346 -0.18334866 -0.22415698 ... -0.73975885 -0.73975885
   -0.73975885]
  [-0.2303704  -0.20207572 -0.20472145 ... -0.73975885 -0.73975885
   -0.73975885]
  [-0.20497358 -0.38095844 -0.32534552 ... -0.73975885 -0.73975885
   -0.73975885]]]
[50258, 50363, 44, 284, 17029, 27781, 2412, 15590, 13183, 115, 18140, 239, 43, 1837, 49, 84, 35603, 45, 687, 1124, 304, 533, 10108, 9990, 7526, 2131, 5155, 11100, 48351, 16276, 25941, 18614, 235, 17015, 21686, 11217, 20145, 26256, 22060, 39708, 251, 75, 326, 5053, 10960, 2930, 246, 10960, 10213, 21975, 39708, 251, 11217, 2930, 246, 11217, 13167, 99, 41323, 10213, 6963, 9572, 5000, 13167, 99, 41323, 12608, 117, 19976, 250, 1541, 2412, 3338, 75, 326, 5053, 9765, 32187, 15555, 82, 325, 540, 8

In [None]:
import torch
import torchaudio
import random
from datasets import Audio
from transformers import WhisperProcessor, WhisperTokenizer
from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Gain
import numpy as np

def preprocess_dataset(batch):

  audio = batch["audio"]
  batch["input_features"] = processor.feature_extractor(audio["array"],
    sampling_rate=audio["sampling_rate"]).input_features[0]
  batch["labels"] = tokenizer(batch["sentence"]).input_ids

  return batch

augment = Compose([
    AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.3),
    TimeStretch(min_rate=0.9, max_rate=1.1, p=0.3),
    PitchShift(min_semitones=-1, max_semitones=1, p=0.3),
])

def augment_audio(batch):
  audio = batch["audio"]
  samples = np.array(audio["array"], dtype=np.float32)
  augmented_samples = augment(samples=samples, sample_rate=audio["sampling_rate"])
  batch["audio"]["array"] = augmented_samples
  return batch

# 定義 torchaudio 增強策略
def augment_audio_torchaudio(example):
  waveform = torch.tensor(example["audio"]["array"]).float().unsqueeze(0)  # (1, N)
  sample_rate = example["audio"]["sampling_rate"]

  augmentations = [
    torchaudio.transforms.Vol(gain=random.uniform(-5, 5), gain_type="db"),
    torchaudio.transforms.FrequencyMasking(freq_mask_param=15),
    torchaudio.transforms.TimeMasking(time_mask_param=35),
  ]

  # 隨機選擇 1~3 個增強
  for transform in random.sample(augmentations, k=random.randint(1, len(augmentations))):
    try:
      waveform = transform(waveform)
    except:
      continue

  example["audio"]["array"] = waveform.squeeze().numpy()
  return example

def transform_with_aug(example):
  return preprocess_dataset(augment_audio_torchaudio(example))

# 整合增強與 Whisper 預處理
def transform_with_two_aug(example):
  audio = example["audio"]["array"]
  sample_rate = example["audio"]["sampling_rate"]

  # 50% 機率執行 audiomentations
  if random.random() < 0.5:
    augmented_audio = augment(samples=audio, sample_rate=sample_rate)
    example["audio"]["array"] = augmented_audio

  # 50% 機率執行 torchaudio
  if random.random() < 0.5:
    waveform = torch.tensor(example["audio"]["array"]).float().unsqueeze(0)
    transforms = [
        torchaudio.transforms.Vol(gain=random.uniform(-5, 5), gain_type="db"),
        torchaudio.transforms.FrequencyMasking(freq_mask_param=15),
        torchaudio.transforms.TimeMasking(time_mask_param=35),
    ]
    for t in random.sample(transforms, k=random.randint(1, len(transforms))):
      try:
        waveform = t(waveform)
      except:
        continue
    example["audio"]["array"] = waveform.squeeze().numpy()
  return preprocess_dataset(example)


In [None]:
# train_dataset = train_dataset.map(augment_audio)
# train_dataset = train_dataset.map(transform_with_aug)

# 確保音訊欄位是 Audio 類型
train_dataset = train_dataset.cast_column("audio", Audio(sampling_rate=16000))
LibriSpeech_train_dataset = LibriSpeech_train_dataset.cast_column("audio", Audio(sampling_rate=16000))

train_dataset_orig = train_dataset.map(preprocess_dataset, remove_columns=test_dataset.column_names)
train_dataset_aug = train_dataset.map(transform_with_aug, remove_columns=test_dataset.column_names)
# train_dataset_aug = train_dataset.map(transform_with_two_aug, remove_columns=dataset.column_names["train"])
train_dataset = concatenate_datasets([train_dataset_orig, train_dataset_aug])

# LibriSpeech_train_dataset = LibriSpeech_train_dataset.map(preprocess_dataset, remove_columns=LibriSpeech_train_dataset.column_names)
# train_dataset_aug = train_dataset.map(preprocess_dataset, remove_columns=train_dataset.column_names)
# train_dataset = concatenate_datasets([train_dataset_aug, LibriSpeech_train_dataset])

# train_dataset = train_dataset.map(transform_with_aug, remove_columns=test_dataset.column_names)
test_dataset = test_dataset.map(preprocess_dataset, remove_columns=test_dataset.column_names)

Map: 100%|██████████| 5567/5567 [00:30<00:00, 184.22 examples/s]
Map: 100%|██████████| 1231/1231 [00:08<00:00, 140.46 examples/s]
Map: 100%|██████████| 308/308 [00:33<00:00,  9.29 examples/s]


In [23]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=model.config.decoder_start_token_id,
)

In [24]:
d = [test_dataset[i] for i in range(2)]
batch = data_collator(d)
print(batch['input_features'])
print(batch['labels'])

tensor([[[-0.5641, -0.6291, -0.5743,  ..., -0.8220, -0.8220, -0.8220],
         [-0.5496, -0.6818, -0.3148,  ..., -0.8220, -0.8220, -0.8220],
         [-0.4647, -0.7626, -0.0974,  ..., -0.8220, -0.8220, -0.8220],
         ...,
         [-0.8220, -0.8220, -0.8220,  ..., -0.8220, -0.8220, -0.8220],
         [-0.8220, -0.8220, -0.8220,  ..., -0.8220, -0.8220, -0.8220],
         [-0.8220, -0.8220, -0.8220,  ..., -0.8220, -0.8220, -0.8220]],

        [[-0.8233, -0.8233, -0.8233,  ..., -0.8233, -0.8233, -0.8233],
         [-0.8233, -0.8233, -0.8233,  ..., -0.8233, -0.8233, -0.8233],
         [-0.8233, -0.8233, -0.8233,  ..., -0.8233, -0.8233, -0.8233],
         ...,
         [-0.8233, -0.8233, -0.8233,  ..., -0.8233, -0.8233, -0.8233],
         [-0.8233, -0.8233, -0.8233,  ..., -0.8233, -0.8233, -0.8233],
         [-0.8233, -0.8233, -0.8233,  ..., -0.8233, -0.8233, -0.8233]]])
tensor([[50363,   474,  1578,   466,   406,  1419,  1096,   309,   293,   286,
           841,  6884,   293,   300, 

In [25]:
from jiwer import mer
from transformers.models.whisper.english_normalizer import BasicTextNormalizer


normalizer = BasicTextNormalizer()

def mixed_tokenizer(text):
  text = normalizer(text.strip())
  tokens = []
  temp_token = ""
  # print(text)
  for char in text:
    if '\u4e00' <= char <= '\u9fff':
      if temp_token:
          tokens.append(temp_token)
          temp_token = ""
      tokens.append(char)
    elif char.isspace():
      if temp_token:
        tokens.append(temp_token)
        temp_token = ""
    else:
      temp_token += char
  if temp_token:
    tokens.append(temp_token)
  return tokens

def compute_metrics(eval_pred):
  predictions, labels = eval_pred

  decoded_preds = processor.batch_decode(predictions, skip_special_tokens=True)
  decoded_labels = processor.batch_decode(labels, skip_special_tokens=True)

  decoded_preds = [normalizer(pred.strip()) for pred in decoded_preds]
  decoded_labels = [normalizer(label.strip()) for label in decoded_labels]

  paired = [
      (ref, hyp) for ref, hyp in zip(decoded_labels, decoded_preds)
      if ref.strip() != "" and hyp.strip() != ""
  ]

  filtered_labels, filtered_preds = zip(*paired) if paired else ([], [])
  if len(filtered_labels) == 0:
    return {"mer": 1.0}

  ref_tokens = [mixed_tokenizer(t) for t in filtered_labels]
  pred_tokens = [mixed_tokenizer(t) for t in filtered_preds]
  # print(ref_tokens)
  # print(pred_tokens)
  ref_strs = [" ".join(tokens) for tokens in ref_tokens]
  pred_strs = [" ".join(tokens) for tokens in pred_tokens]

  try:
    score = mer(ref_strs, pred_strs)
  except Exception as e:
    print("Error during MER computation:", e)
    score = 1.0

  return {"mer": score}

## Finetune decoder only (option)

In [26]:
for param in model.model.encoder.parameters():
  param.requires_grad = False

In [27]:
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
total_params = sum(p.numel() for p in model.parameters())

print(f"Trainable parameters: {trainable_params:,}")
print(f"Total parameters:     {total_params:,}")
print(f"Trainable ratio:      {trainable_params / total_params:.2%}")

Trainable parameters: 153,580,800
Total parameters:     241,734,912
Trainable ratio:      63.53%


In [28]:
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name)


model.decoder.embed_tokens.weight
model.decoder.embed_positions.weight
model.decoder.layers.0.self_attn.k_proj.weight
model.decoder.layers.0.self_attn.v_proj.weight
model.decoder.layers.0.self_attn.v_proj.bias
model.decoder.layers.0.self_attn.q_proj.weight
model.decoder.layers.0.self_attn.q_proj.bias
model.decoder.layers.0.self_attn.out_proj.weight
model.decoder.layers.0.self_attn.out_proj.bias
model.decoder.layers.0.self_attn_layer_norm.weight
model.decoder.layers.0.self_attn_layer_norm.bias
model.decoder.layers.0.encoder_attn.k_proj.weight
model.decoder.layers.0.encoder_attn.v_proj.weight
model.decoder.layers.0.encoder_attn.v_proj.bias
model.decoder.layers.0.encoder_attn.q_proj.weight
model.decoder.layers.0.encoder_attn.q_proj.bias
model.decoder.layers.0.encoder_attn.out_proj.weight
model.decoder.layers.0.encoder_attn.out_proj.bias
model.decoder.layers.0.encoder_attn_layer_norm.weight
model.decoder.layers.0.encoder_attn_layer_norm.bias
model.decoder.layers.0.fc1.weight
model.decoder.

## Training & args

In [29]:
train_dataset
# train_dataset[:20]

Dataset({
    features: ['input_features', 'labels'],
    num_rows: 6798
})

In [30]:
from transformers import EarlyStoppingCallback


training_args = Seq2SeqTrainingArguments(
  output_dir= f"Small LibriSpeech Ver 4",
  report_to="none",
  num_train_epochs=15,
  per_device_train_batch_size=6,
  per_device_eval_batch_size=6,
  evaluation_strategy="epoch",
  save_strategy="epoch",
  predict_with_generate=True,
  logging_dir=f"Small LibriSpeech Ver 4/logs",
  logging_steps=1,
  fp16=True,
  learning_rate=1e-5,
  warmup_ratio=0.1,
  gradient_accumulation_steps=4,
  dataloader_num_workers=0,
  metric_for_best_model="mer",
  greater_is_better=False,
  load_best_model_at_end=True,
  remove_unused_columns=False,
  weight_decay=0.01,
  lr_scheduler_type="linear",
  warmup_steps=100
)

trainer = Seq2SeqTrainer(
  model=model,
  args=training_args,
  train_dataset=train_dataset,
  eval_dataset=test_dataset,
  tokenizer=processor.tokenizer,
  data_collator=data_collator,
  compute_metrics=compute_metrics,
  callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

  trainer = Seq2SeqTrainer(


In [31]:
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss,Mer
1,0.0661,0.427748,0.102511
2,0.0406,0.41873,0.132029
3,0.0217,0.42824,0.129351
4,0.0211,0.455737,0.112314


There were missing keys in the checkpoint model loaded: ['proj_out.weight'].


TrainOutput(global_step=1136, training_loss=0.2686336365754677, metrics={'train_runtime': 2782.7685, 'train_samples_per_second': 36.643, 'train_steps_per_second': 1.525, 'total_flos': 7.84721420550144e+18, 'train_loss': 0.2686336365754677, 'epoch': 4.0})

In [32]:
save_directory = r"Small LibriSpeech Ver 4"

model.save_pretrained(save_directory)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)

tokenizer.save_pretrained(save_directory)
processor.save_pretrained(save_directory)

print(f"Model saved to {save_directory}")

Model saved to Small LibriSpeech Ver 4


## Output Result

In [None]:
valid_dataset_list = []
# t1_vaild_audio_folder = r"Private Dataset\audio_ZH"
t1_vaild_audio_folder = r"Private Dataset\audio_EN"
for file in sorted(os.listdir(t1_vaild_audio_folder)):
  if file.endswith(".wav"):
    try:
      file_path = os.path.join(t1_vaild_audio_folder, file)
      audio_array, sr = librosa.load(file_path, sr=16000)
      valid_dataset_list.append({"audio": {"path":file_path,'array':audio_array,'sampling_rate':sr},
                                 "sentence": ""})
    except Exception as e:
      print(e)
      print(f"Can't read {file_path}")

valid_dataset = Dataset.from_pandas(pd.DataFrame(valid_dataset_list))

In [34]:
valid_dataset

Dataset({
    features: ['audio', 'sentence'],
    num_rows: 775
})

In [None]:
# 簡體轉繁體
from opencc import OpenCC

cc = OpenCC('s2t')

In [None]:
model_name = r"Small_Final_EnglishOnly"
# model_name = r"Large_Final_ChineseOnly"
model = WhisperForConditionalGeneration.from_pretrained(model_name)
processor = WhisperProcessor.from_pretrained(model_name)
model.generation_config.language = 'en'
# model.generation_config.language = 'zh'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

output_file = fr"Private Dataset\Task1_Answer\task1_answer_{model_name}.txt"
json_output_file = fr"Private Dataset\Task1_Answer\task1_answer_timestamps_{model_name}.json"
_mapping = {}

with open(output_file, "w", encoding="utf-8") as f:
  for _file in valid_dataset:
    result = transcribe_with_timestamps(_file,model,processor)
    # 簡體轉繁體
    # result['text'] = cc.convert(result['text'])
    filename = os.path.splitext(os.path.basename(_file["audio"]["path"]))[0]
    _mapping[filename] = result
    f.write(f"{filename}\t{result['text']}\n")
    
with open(json_output_file, "w", encoding="utf-8") as f:
  json.dump(_mapping, f, ensure_ascii=False)

# Task 2

## Read Dataset

In [None]:
task2_train_answer = r"Final_Dataset_EN\task2_answer.txt"
task2_train_transcribe = r"Final_Dataset_EN\task1_anwer.txt"
task2_train_data = r'Final_Dataset_EN\task2_train.tsv'
generate_annotated_audio_transcribe_parallel(task2_train_answer, task2_train_transcribe, task2_train_data, num_processes=4)

process annotation file...
annotation file done
processing each medical file
1093	Alright, thank you. Ok can you stop it.	PHI:Null

2244
All medical file done
write out to tsv format...
tsv format dataset done


In [None]:
from datasets import load_dataset, Features, Value

task2_data = load_dataset("csv",data_files=task2_train_data, delimiter='\t',
  features = Features({'fid': Value('string'),'content': Value('string'),'label':Value('string')}),
  column_names=['fid','content','label'])

Generating train split: 2237 examples [00:00, 139010.58 examples/s]


In [None]:
task2_data["train"][0]

{'fid': '19',
 'content': 'Any overture of something that\'s kind of like a little white flag or peace offering to just get a week of peace, I\'m not talking about permanent "I\'m going to placate and cow tow to you and to talk my needs in other..." No. Just talking about lets...',
 'label': 'PHI:Null'}

In [None]:
from collections import Counter


ctr = Counter()

for i in task2_data['train']:
  phi_labelwvalue = i['label'].split("\\n")
  phi_label = [j.split(":")[0] for j in phi_labelwvalue]
  ctr.update(phi_label)

print(ctr)

Counter({'DATE': 1137, 'PHI': 852, 'DURATION': 480, 'DOCTOR': 474, 'PATIENT': 286, 'TIME': 268, 'ID_NUMBER': 240, 'PERSONALNAME': 235, 'FAMILYNAME': 158, 'CITY': 113, 'HOSPITAL': 106, 'MEDICAL_RECORD_NUMBER': 101, 'STATE': 100, 'DEPARTMENT': 99, 'STREET': 90, 'ZIP': 84, 'SET': 70, 'AGE': 50, 'PROFESSION': 37, 'ORGANIZATION': 31, 'LOCATION-OTHER': 20, 'COUNTRY': 13, 'COUNTY': 3, 'PHONE': 2, 'PROFSSION': 2, 'URL': 1, 'DISTRICT': 1})


In [None]:
total_label = sum(ctr.values())
total_label

5053

In [None]:
rare_ratio = 0.1
RARE_THRESHOLD = total_label * rare_ratio
rare_degree1_labels = {label for label, count in ctr.items() if count <= RARE_THRESHOLD}
print(rare_degree1_labels)

rare_ratio = 0.06
RARE_THRESHOLD = total_label * rare_ratio
rare_degree2_labels = {label for label, count in ctr.items() if count <= RARE_THRESHOLD}
print(rare_degree2_labels)

rare_ratio = 0.04
RARE_THRESHOLD = total_label * rare_ratio
rare_degree3_labels = {label for label, count in ctr.items() if count <= RARE_THRESHOLD}
print(rare_degree3_labels)

rare_ratio = 0.02
RARE_THRESHOLD = total_label * rare_ratio
rare_degree4_labels = {label for label, count in ctr.items() if count <= RARE_THRESHOLD}
print(rare_degree4_labels)

{'STREET', 'MEDICAL_RECORD_NUMBER', 'AGE', 'PHONE', 'HOSPITAL', 'PERSONALNAME', 'SET', 'TIME', 'ID_NUMBER', 'DEPARTMENT', 'DISTRICT', 'FAMILYNAME', 'ZIP', 'PATIENT', 'PROFESSION', 'LOCATION-OTHER', 'ORGANIZATION', 'CITY', 'URL', 'DOCTOR', 'COUNTY', 'COUNTRY', 'PROFSSION', 'STATE', 'DURATION'}
{'STREET', 'MEDICAL_RECORD_NUMBER', 'AGE', 'PHONE', 'HOSPITAL', 'PERSONALNAME', 'SET', 'TIME', 'ID_NUMBER', 'DEPARTMENT', 'DISTRICT', 'FAMILYNAME', 'ZIP', 'PATIENT', 'PROFESSION', 'LOCATION-OTHER', 'ORGANIZATION', 'CITY', 'URL', 'COUNTY', 'COUNTRY', 'PROFSSION', 'STATE'}
{'STREET', 'MEDICAL_RECORD_NUMBER', 'AGE', 'PHONE', 'HOSPITAL', 'SET', 'DEPARTMENT', 'DISTRICT', 'ZIP', 'FAMILYNAME', 'PROFESSION', 'LOCATION-OTHER', 'ORGANIZATION', 'CITY', 'URL', 'COUNTY', 'COUNTRY', 'PROFSSION', 'STATE'}
{'ORGANIZATION', 'DEPARTMENT', 'STREET', 'MEDICAL_RECORD_NUMBER', 'URL', 'SET', 'PROFESSION', 'COUNTY', 'AGE', 'COUNTRY', 'PROFSSION', 'ZIP', 'STATE', 'DISTRICT', 'LOCATION-OTHER', 'PHONE'}


In [None]:
def rare_sample(sample, rare_labels):
    if not sample['label'] or sample['label'] == 'PHI:Null':
        return False
    label_lines = sample['label'].split('\n')
    return any(label_line.split(':')[0] in rare_labels for label_line in label_lines)

rare_degree1_data = [sample for sample in task2_data['train'] if rare_sample(sample, rare_degree1_labels)]
print(rare_degree1_data[0])

rare_degree2_data = [sample for sample in task2_data['train'] if rare_sample(sample, rare_degree2_labels)]
print(rare_degree2_data[0])

rare_degree3_data = [sample for sample in task2_data['train'] if rare_sample(sample, rare_degree3_labels)]
print(rare_degree3_data[0])

rare_degree4_data = [sample for sample in task2_data['train'] if rare_sample(sample, rare_degree4_labels)]
print(rare_degree4_data[0])

{'fid': '121', 'content': 'It makes it really hard when I lose things. Even this morning, he knocked over my glasses. I had them on the dresser and he was trying to find them this morning because I can\'t see anything without my glasses. He was like, "Are you sure you didn\'t put them somewhere else?" and I\'m like, "Yes, I put them in one of two places. They are in neither. I heard you knock them over last night when you came back and the lights were off." He got annoyed that I was blaming him and I got annoyed that he got annoyed for me blaming him when it was his fault. He eventually found them obviously because I\'m wearing them but that was a stressful way to wake up.', 'label': 'TIME:this morning\\nTIME:this morning\\nTIME:last night'}
{'fid': '121', 'content': 'It makes it really hard when I lose things. Even this morning, he knocked over my glasses. I had them on the dresser and he was trying to find them this morning because I can\'t see anything without my glasses. He was lik

## Model Imoprt

In [None]:
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, AutoConfig
import torch
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model

task2_model_name = "deepseek-ai/deepseek-llm-7b-base"
# task2_model_name = "deepseek-ai/deepseek-llm-7b-chat"
bos = '<|endoftext|>'
eos = '<|END|>'
pad = '<|pad|>'
sep = '\n\n####\n\n'

special_tokens_dict = {
    'eos_token': eos,
    'bos_token': bos,
    'pad_token': pad,
    'sep_token': sep
}

tokenizer = AutoTokenizer.from_pretrained(task2_model_name, use_fast=False, trust_remote_code=True)

tokenizer.padding_side = 'left'
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)


bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    llm_int8_enable_fp32_cpu_offload=True
)
config = AutoConfig.from_pretrained(
    task2_model_name,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
    sep_token_id=tokenizer.sep_token_id,
    output_hidden_states=False
)

model = AutoModelForCausalLM.from_pretrained(
    task2_model_name,
    quantization_config=bnb_config,
    config=config,
    device_map="auto",       
    trust_remote_code=True
)
model.resize_token_embeddings(len(tokenizer))

model = prepare_model_for_kbit_training(model)

lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj","k_proj", "o_proj"],
    # target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]

    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)

print(f"{tokenizer.pad_token}: {tokenizer.pad_token_id}")

Loading checkpoint shards: 100%|██████████| 2/2 [00:10<00:00,  5.28s/it]


<|pad|>: 100017


In [None]:
def collate_batch_with_prompt_template(batch, tokenizer, template =
  "<|endoftext|> __CONTENT__\n\n####\n\n__LABEL__ <|END|>", IGNORED_PAD_IDX = -100):

  texts = [template.replace("__LABEL__", data['label']).replace("__CONTENT__",
    data['content']) for data in list(batch)]
  encoded_seq = tokenizer(texts, padding=True)

  indexed_tks = torch.tensor(encoded_seq['input_ids'])
  attention_mask = torch.tensor(encoded_seq['attention_mask'])
  encoded_label = torch.tensor(encoded_seq['input_ids'])
  encoded_label[encoded_label == tokenizer.pad_token_id] = IGNORED_PAD_IDX

  return indexed_tks, encoded_label, attention_mask

In [None]:
from torch.utils.data import Dataset, DataLoader


train_data = list(task2_data['train'])
train_dataloader = DataLoader(train_data, batch_size=2, shuffle=False, collate_fn=lambda batch: collate_batch_with_prompt_template(batch, tokenizer))
titer = iter(train_dataloader)
tks, labels, masks= next(titer)
print(tks.shape)
print(tks)
print(labels)
print(masks)
print(tokenizer.decode(tks[0]))
print(tokenizer.decode(tks[1]))
next(iter(titer))


torch.Size([2, 127])
tensor([[100017, 100017, 100017, 100017, 100017, 100017, 100017, 100017, 100017,
         100017, 100017, 100017, 100017, 100017, 100017, 100017, 100017, 100017,
         100017, 100017, 100017, 100017, 100017, 100017, 100017, 100017, 100017,
         100017, 100017, 100017, 100017, 100017, 100017, 100017, 100017, 100017,
         100017, 100017, 100017, 100017, 100017, 100017, 100017, 100017, 100017,
         100017, 100017, 100017, 100017, 100017, 100017, 100017, 100017, 100017,
         100017, 100017, 100017, 100000, 100016,   6295,  60656,    453,    280,
           1874,    344,      6,     82,   2447,    280,    837,    245,   1585,
           3682,   9347,    410,   6942,   9139,    276,    953,    752,    245,
           2562,    280,   6942,     11,    304,      6,     76,    441,   6807,
            786,  13128,    440,     40,      6,     76,   1872,    276,  30444,
            387,    285,  13917,   9620,    276,    340,    285,    276,   3075,
       

(tensor([[100000, 100016,    809,   3190,    359,   1921,   2074,    754,    304,
            8433,   2012,     13,   6809,    437,   4421,     11,    362,  28561,
             855,    601,  22954,     13,    304,    661,    768,    331,    254,
           76734,    285,    362,    438,   3507,    276,   1275,    768,    437,
            4421,   1373,    304,    481,      6,     83,   1019,   3270,   1673,
             601,  22954,     13,   1063,    438,    837,     11,    440,   7425,
             340,   2049,    340,   3320,      6,     83,   1957,    768,  11155,
            1979,   1967,    285,    304,      6,     76,    837,     11,    440,
            5661,     11,    304,   1957,    768,    279,    634,    280,    984,
            6171,     13,   1955,    418,    279,   8533,     13,    304,   4101,
             340,  14348,    768,    855,   1562,   2653,    754,    340,   2373,
            1062,    285,    254,  11493,    778,    842,    883,   1063,   2149,
           42600

In [None]:
results = tokenizer(
    [f"{bos} Yeah, I imagine it would — sorry, go ahead. So it's supposed to work immediately, right? Yep. So we'll see if I'm productive tomorrow. I hope I'm productive today. I've actually been trying to plan. If I do the titles today, then I can do my laundry tomorrow. Right. I probably could bring my computer and do titles while I'm doing my laundry. If I was — but I won't do that.{sep}DATE:tomorrow\nDATE:today\nDate:today {eos}",
    f"{bos} I imagine it{sep}PHI:Null {eos}"]
    ,padding=True
)
print(results['attention_mask'][0])
print(results['attention_mask'][1])
print(tokenizer.decode(results['input_ids'][0]))
print(tokenizer.decode(results['input_ids'][1]))

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
<｜begin▁of▁sentence｜><|endoftext|> Yeah, I imagine it would — sorry, go ahead. So it's supposed to work immediately, right? Yep. So we'll see if I'm productive tomorrow. I hope I'm productive today. I've actually been trying to plan. If I do the titles today, then I can do my laundry

In [None]:
BATCH_SIZE = 8
bucket_train_dataloader = DataLoader(train_data,
  batch_sampler=OpenDeidBatchSampler(train_data, BATCH_SIZE),
  collate_fn=lambda batch: collate_batch_with_prompt_template(batch, tokenizer),
  pin_memory=True)

In [None]:
from torch.utils.data import DataLoader

rare_degree1_class_dataloader = DataLoader(rare_degree1_data,
    batch_sampler=OpenDeidBatchSampler(rare_degree1_data, BATCH_SIZE),
    collate_fn=lambda batch: collate_batch_with_prompt_template(batch, tokenizer),
    pin_memory=True
)

rare_degree2_class_dataloader = DataLoader(rare_degree2_data,
    batch_sampler=OpenDeidBatchSampler(rare_degree2_data, BATCH_SIZE),
    collate_fn=lambda batch: collate_batch_with_prompt_template(batch, tokenizer),
    pin_memory=True
)

rare_degree3_class_dataloader = DataLoader(rare_degree3_data,
    batch_sampler=OpenDeidBatchSampler(rare_degree3_data, BATCH_SIZE),
    collate_fn=lambda batch: collate_batch_with_prompt_template(batch, tokenizer),
    pin_memory=True
)

rare_degree4_class_dataloader = DataLoader(rare_degree4_data,
    batch_sampler=OpenDeidBatchSampler(rare_degree4_data, BATCH_SIZE),
    collate_fn=lambda batch: collate_batch_with_prompt_template(batch, tokenizer),
    pin_memory=True
)


In [None]:
model.print_trainable_parameters()

trainable params: 7,864,320 || all params: 6,898,724,864 || trainable%: 0.1140


## Training

In [None]:
import os
import torch
from tqdm import tqdm
from transformers import get_scheduler

def train(
    model,
    tokenizer,
    train_dataloader,
    output_dir,
    val_dataloader=None,
    epochs=50,
    learning_rate=2e-4,
    gradient_accumulation_steps=4,
    max_grad_norm=1.0,
    early_stop_patience=3,
    scheduler_warmup_steps=200,
    scheduler_type="cosine",
    device=None
):
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    os.makedirs(output_dir, exist_ok=True)

    model.to(device)
    model.train()

    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    total_steps = len(train_dataloader) * epochs // gradient_accumulation_steps
    lr_scheduler = get_scheduler(
        scheduler_type,
        optimizer=optimizer,
        num_warmup_steps=scheduler_warmup_steps,
        num_training_steps=total_steps,
    )

    best_loss = float("inf")
    early_stop_counter = 0

    for epoch in range(epochs):
        model.train()
        total_loss = 0.0

        progress_bar = tqdm(enumerate(train_dataloader), total=len(train_dataloader), desc=f"Epoch {epoch+1}")

        for step, (seqs, labels, masks) in progress_bar:
            input_ids = seqs.to(device)
            labels = labels.to(device)
            attention_mask = masks.to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss / gradient_accumulation_steps
            loss.backward()
            total_loss += loss.item()

            if (step + 1) % gradient_accumulation_steps == 0 or (step + 1) == len(train_dataloader):
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
                optimizer.step()
                lr_scheduler.step()
                model.zero_grad()

            progress_bar.set_postfix(
                loss=loss.item() * gradient_accumulation_steps,
                lr=lr_scheduler.get_last_lr()[0]
            )

        avg_train_loss = total_loss / len(train_dataloader)

        # ==== Validation loss ====
        # 此版本無計算 Validation loss 
        if val_dataloader is not None:
            model.eval()
            val_loss = 0.0
            with torch.no_grad():
                for seqs, labels, masks in val_dataloader:
                    input_ids = seqs.to(device)
                    labels = labels.to(device)
                    attention_mask = masks.to(device)
                    outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                    val_loss += outputs.loss.item()

            val_loss /= len(val_dataloader)
            current_loss = val_loss
            print(f"\nEpoch {epoch+1} average train loss: {avg_train_loss:.4f}, val loss: {val_loss:.4f}")
        else:
            current_loss = avg_train_loss
            print(f"\nEpoch {epoch+1} average train loss: {avg_train_loss:.4f}")

        # ==== Save best model ====
        if current_loss < best_loss:
            best_loss = current_loss
            early_stop_counter = 0
            best_path = os.path.join(output_dir, "best_adapter")
            model.save_pretrained(best_path)
            tokenizer.save_pretrained(best_path)
            print(f"Best model saved at {best_path} with train loss {current_loss:.4f}")
        else:
            early_stop_counter += 1
            print(f"No improvement. Early stop patience: {early_stop_counter}/{early_stop_patience}")

        # ==== Early stopping ====
        if early_stop_counter >= early_stop_patience:
            print(f"Early stopping triggered at epoch {epoch+1}. Best loss: {best_loss:.4f}")
            break

    # Save final model
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    print(f"Final model saved at {output_dir}")


In [None]:
# 第一階段訓練（全資料）
train(
    model,
    tokenizer,
    train_dataloader=bucket_train_dataloader,
    val_dataloader=None,
    output_dir="Final_deepseek-llm-7b-base_EnglishOnly_Step1",
    learning_rate=2e-4,
    epochs=100
)

Epoch 1:   0%|          | 0/280 [00:00<?, ?it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)
Epoch 1: 100%|██████████| 280/280 [14:52<00:00,  3.19s/it, loss=4.79, lr=7e-5]  



Epoch 1 average train loss: 0.8531




Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step1\best_adapter with train loss 0.8531


Epoch 2: 100%|██████████| 280/280 [14:49<00:00,  3.18s/it, loss=2.89, lr=0.00014] 



Epoch 2 average train loss: 0.6369
Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step1\best_adapter with train loss 0.6369


Epoch 3: 100%|██████████| 280/280 [14:48<00:00,  3.17s/it, loss=2.73, lr=0.0002]  



Epoch 3 average train loss: 0.5751
Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step1\best_adapter with train loss 0.5751


Epoch 4: 100%|██████████| 280/280 [14:55<00:00,  3.20s/it, loss=2.68, lr=0.0002]



Epoch 4 average train loss: 0.5473
Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step1\best_adapter with train loss 0.5473


Epoch 5: 100%|██████████| 280/280 [14:51<00:00,  3.18s/it, loss=1.58, lr=0.0002]



Epoch 5 average train loss: 0.5261
Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step1\best_adapter with train loss 0.5261


Epoch 6: 100%|██████████| 280/280 [14:45<00:00,  3.16s/it, loss=1.3, lr=0.000199]  



Epoch 6 average train loss: 0.5054
Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step1\best_adapter with train loss 0.5054


Epoch 7: 100%|██████████| 280/280 [14:45<00:00,  3.16s/it, loss=1.12, lr=0.000199] 



Epoch 7 average train loss: 0.4843
Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step1\best_adapter with train loss 0.4843


Epoch 8: 100%|██████████| 280/280 [14:53<00:00,  3.19s/it, loss=1.42, lr=0.000199] 



Epoch 8 average train loss: 0.4610
Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step1\best_adapter with train loss 0.4610


Epoch 9: 100%|██████████| 280/280 [14:45<00:00,  3.16s/it, loss=1.11, lr=0.000198] 



Epoch 9 average train loss: 0.4356
Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step1\best_adapter with train loss 0.4356


Epoch 10: 100%|██████████| 280/280 [14:51<00:00,  3.18s/it, loss=0.817, lr=0.000197]



Epoch 10 average train loss: 0.4068
Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step1\best_adapter with train loss 0.4068


Epoch 11: 100%|██████████| 280/280 [14:56<00:00,  3.20s/it, loss=0.802, lr=0.000197]



Epoch 11 average train loss: 0.3784
Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step1\best_adapter with train loss 0.3784


Epoch 12: 100%|██████████| 280/280 [14:52<00:00,  3.19s/it, loss=0.614, lr=0.000196]



Epoch 12 average train loss: 0.3497
Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step1\best_adapter with train loss 0.3497


Epoch 13: 100%|██████████| 280/280 [14:53<00:00,  3.19s/it, loss=0.602, lr=0.000195]



Epoch 13 average train loss: 0.3215
Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step1\best_adapter with train loss 0.3215


Epoch 14: 100%|██████████| 280/280 [14:50<00:00,  3.18s/it, loss=0.553, lr=0.000194]



Epoch 14 average train loss: 0.2978
Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step1\best_adapter with train loss 0.2978


Epoch 15: 100%|██████████| 280/280 [14:48<00:00,  3.17s/it, loss=0.443, lr=0.000192]



Epoch 15 average train loss: 0.2730
Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step1\best_adapter with train loss 0.2730


Epoch 16: 100%|██████████| 280/280 [14:49<00:00,  3.18s/it, loss=0.605, lr=0.000191]



Epoch 16 average train loss: 0.2478
Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step1\best_adapter with train loss 0.2478


Epoch 17: 100%|██████████| 280/280 [14:52<00:00,  3.19s/it, loss=0.643, lr=0.00019] 



Epoch 17 average train loss: 0.2258
Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step1\best_adapter with train loss 0.2258


Epoch 18: 100%|██████████| 280/280 [14:38<00:00,  3.14s/it, loss=0.488, lr=0.000188]



Epoch 18 average train loss: 0.2038
Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step1\best_adapter with train loss 0.2038


Epoch 19: 100%|██████████| 280/280 [14:45<00:00,  3.16s/it, loss=0.398, lr=0.000187]



Epoch 19 average train loss: 0.1864
Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step1\best_adapter with train loss 0.1864


Epoch 20: 100%|██████████| 280/280 [14:40<00:00,  3.15s/it, loss=0.334, lr=0.000185]



Epoch 20 average train loss: 0.1674
Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step1\best_adapter with train loss 0.1674


Epoch 21: 100%|██████████| 280/280 [14:38<00:00,  3.14s/it, loss=0.513, lr=0.000183]



Epoch 21 average train loss: 0.1496
Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step1\best_adapter with train loss 0.1496


Epoch 22: 100%|██████████| 280/280 [14:34<00:00,  3.12s/it, loss=0.396, lr=0.000181]



Epoch 22 average train loss: 0.1365
Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step1\best_adapter with train loss 0.1365


Epoch 23: 100%|██████████| 280/280 [14:42<00:00,  3.15s/it, loss=0.394, lr=0.00018] 



Epoch 23 average train loss: 0.1219
Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step1\best_adapter with train loss 0.1219


Epoch 24: 100%|██████████| 280/280 [14:46<00:00,  3.17s/it, loss=0.51, lr=0.000178] 



Epoch 24 average train loss: 0.1114
Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step1\best_adapter with train loss 0.1114


Epoch 25: 100%|██████████| 280/280 [14:35<00:00,  3.13s/it, loss=0.547, lr=0.000175]



Epoch 25 average train loss: 0.0999
Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step1\best_adapter with train loss 0.0999


Epoch 26: 100%|██████████| 280/280 [14:30<00:00,  3.11s/it, loss=0.373, lr=0.000173]



Epoch 26 average train loss: 0.0890
Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step1\best_adapter with train loss 0.0890


Epoch 27: 100%|██████████| 280/280 [14:45<00:00,  3.16s/it, loss=0.657, lr=0.000171]



Epoch 27 average train loss: 0.0791
Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step1\best_adapter with train loss 0.0791


Epoch 28: 100%|██████████| 280/280 [14:42<00:00,  3.15s/it, loss=0.371, lr=0.000169]



Epoch 28 average train loss: 0.0732
Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step1\best_adapter with train loss 0.0732


Epoch 29: 100%|██████████| 280/280 [14:39<00:00,  3.14s/it, loss=0.308, lr=0.000166]



Epoch 29 average train loss: 0.0663
Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step1\best_adapter with train loss 0.0663


Epoch 30: 100%|██████████| 280/280 [14:34<00:00,  3.12s/it, loss=0.524, lr=0.000164]



Epoch 30 average train loss: 0.0614
Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step1\best_adapter with train loss 0.0614


Epoch 31: 100%|██████████| 280/280 [14:37<00:00,  3.13s/it, loss=0.376, lr=0.000161]



Epoch 31 average train loss: 0.0573
Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step1\best_adapter with train loss 0.0573


Epoch 32: 100%|██████████| 280/280 [14:39<00:00,  3.14s/it, loss=0.481, lr=0.000159]



Epoch 32 average train loss: 0.0550
Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step1\best_adapter with train loss 0.0550


Epoch 33: 100%|██████████| 280/280 [14:42<00:00,  3.15s/it, loss=0.251, lr=0.000156]



Epoch 33 average train loss: 0.0509
Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step1\best_adapter with train loss 0.0509


Epoch 34: 100%|██████████| 280/280 [14:42<00:00,  3.15s/it, loss=0.377, lr=0.000153]



Epoch 34 average train loss: 0.0486
Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step1\best_adapter with train loss 0.0486


Epoch 35: 100%|██████████| 280/280 [14:37<00:00,  3.13s/it, loss=0.295, lr=0.000151]



Epoch 35 average train loss: 0.0464
Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step1\best_adapter with train loss 0.0464


Epoch 36: 100%|██████████| 280/280 [14:45<00:00,  3.16s/it, loss=0.449, lr=0.000148]



Epoch 36 average train loss: 0.0439
Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step1\best_adapter with train loss 0.0439


Epoch 37: 100%|██████████| 280/280 [14:43<00:00,  3.15s/it, loss=0.349, lr=0.000145] 



Epoch 37 average train loss: 0.0427
Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step1\best_adapter with train loss 0.0427


Epoch 38: 100%|██████████| 280/280 [14:40<00:00,  3.14s/it, loss=0.486, lr=0.000142]



Epoch 38 average train loss: 0.0402
Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step1\best_adapter with train loss 0.0402


Epoch 39: 100%|██████████| 280/280 [14:48<00:00,  3.17s/it, loss=0.344, lr=0.000139]



Epoch 39 average train loss: 0.0387
Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step1\best_adapter with train loss 0.0387


Epoch 40: 100%|██████████| 280/280 [14:49<00:00,  3.18s/it, loss=0.218, lr=0.000136] 



Epoch 40 average train loss: 0.0380
Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step1\best_adapter with train loss 0.0380


Epoch 41: 100%|██████████| 280/280 [14:40<00:00,  3.14s/it, loss=0.311, lr=0.000133] 



Epoch 41 average train loss: 0.0370
Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step1\best_adapter with train loss 0.0370


Epoch 42: 100%|██████████| 280/280 [14:44<00:00,  3.16s/it, loss=0.167, lr=0.00013]  



Epoch 42 average train loss: 0.0370
Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step1\best_adapter with train loss 0.0370


Epoch 43: 100%|██████████| 280/280 [15:01<00:00,  3.22s/it, loss=0.239, lr=0.000127] 



Epoch 43 average train loss: 0.0372
No improvement. Early stop patience: 1/3


Epoch 44: 100%|██████████| 280/280 [15:01<00:00,  3.22s/it, loss=0.509, lr=0.000124] 



Epoch 44 average train loss: 0.0366
Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step1\best_adapter with train loss 0.0366


Epoch 45: 100%|██████████| 280/280 [14:44<00:00,  3.16s/it, loss=0.528, lr=0.000121] 



Epoch 45 average train loss: 0.0329
Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step1\best_adapter with train loss 0.0329


Epoch 46: 100%|██████████| 280/280 [14:49<00:00,  3.18s/it, loss=0.483, lr=0.000117] 



Epoch 46 average train loss: 0.0318
Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step1\best_adapter with train loss 0.0318


Epoch 47: 100%|██████████| 280/280 [15:05<00:00,  3.23s/it, loss=0.288, lr=0.000114] 



Epoch 47 average train loss: 0.0311
Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step1\best_adapter with train loss 0.0311


Epoch 48: 100%|██████████| 280/280 [14:59<00:00,  3.21s/it, loss=0.397, lr=0.000111] 



Epoch 48 average train loss: 0.0302
Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step1\best_adapter with train loss 0.0302


Epoch 49: 100%|██████████| 280/280 [14:59<00:00,  3.21s/it, loss=0.351, lr=0.000108] 



Epoch 49 average train loss: 0.0295
Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step1\best_adapter with train loss 0.0295


Epoch 50: 100%|██████████| 280/280 [14:45<00:00,  3.16s/it, loss=0.331, lr=0.000105] 



Epoch 50 average train loss: 0.0288
Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step1\best_adapter with train loss 0.0288


Epoch 51: 100%|██████████| 280/280 [14:37<00:00,  3.14s/it, loss=0.554, lr=0.000101] 



Epoch 51 average train loss: 0.0282
Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step1\best_adapter with train loss 0.0282


Epoch 52: 100%|██████████| 280/280 [14:35<00:00,  3.13s/it, loss=0.451, lr=9.82e-5] 



Epoch 52 average train loss: 0.0280
Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step1\best_adapter with train loss 0.0280


Epoch 53: 100%|██████████| 280/280 [14:36<00:00,  3.13s/it, loss=0.45, lr=9.49e-5]  



Epoch 53 average train loss: 0.0273
Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step1\best_adapter with train loss 0.0273


Epoch 54: 100%|██████████| 280/280 [14:40<00:00,  3.15s/it, loss=0.504, lr=9.17e-5] 



Epoch 54 average train loss: 0.0271
Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step1\best_adapter with train loss 0.0271


Epoch 55: 100%|██████████| 280/280 [14:37<00:00,  3.14s/it, loss=0.447, lr=8.85e-5] 



Epoch 55 average train loss: 0.0273
No improvement. Early stop patience: 1/3


Epoch 56: 100%|██████████| 280/280 [14:28<00:00,  3.10s/it, loss=0.398, lr=8.53e-5] 



Epoch 56 average train loss: 0.0267
Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step1\best_adapter with train loss 0.0267


Epoch 57: 100%|██████████| 280/280 [14:52<00:00,  3.19s/it, loss=0.404, lr=8.21e-5] 



Epoch 57 average train loss: 0.0262
Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step1\best_adapter with train loss 0.0262


Epoch 58: 100%|██████████| 280/280 [15:36<00:00,  3.34s/it, loss=0.348, lr=7.89e-5] 



Epoch 58 average train loss: 0.0261
Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step1\best_adapter with train loss 0.0261


Epoch 59: 100%|██████████| 280/280 [15:24<00:00,  3.30s/it, loss=0.472, lr=7.58e-5] 



Epoch 59 average train loss: 0.0259
Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step1\best_adapter with train loss 0.0259


Epoch 60: 100%|██████████| 280/280 [15:15<00:00,  3.27s/it, loss=0.316, lr=7.26e-5] 



Epoch 60 average train loss: 0.0257
Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step1\best_adapter with train loss 0.0257


Epoch 61: 100%|██████████| 280/280 [15:13<00:00,  3.26s/it, loss=0.219, lr=6.95e-5] 



Epoch 61 average train loss: 0.0254
Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step1\best_adapter with train loss 0.0254


Epoch 62: 100%|██████████| 280/280 [15:00<00:00,  3.22s/it, loss=0.24, lr=6.65e-5]  



Epoch 62 average train loss: 0.0251
Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step1\best_adapter with train loss 0.0251


Epoch 63: 100%|██████████| 280/280 [14:59<00:00,  3.21s/it, loss=0.253, lr=6.34e-5] 



Epoch 63 average train loss: 0.0250
Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step1\best_adapter with train loss 0.0250


Epoch 64: 100%|██████████| 280/280 [14:38<00:00,  3.14s/it, loss=0.419, lr=6.05e-5] 



Epoch 64 average train loss: 0.0250
Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step1\best_adapter with train loss 0.0250


Epoch 65: 100%|██████████| 280/280 [14:58<00:00,  3.21s/it, loss=0.402, lr=5.75e-5] 



Epoch 65 average train loss: 0.0249
Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step1\best_adapter with train loss 0.0249


Epoch 66: 100%|██████████| 280/280 [15:00<00:00,  3.22s/it, loss=0.414, lr=5.46e-5] 



Epoch 66 average train loss: 0.0247
Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step1\best_adapter with train loss 0.0247


Epoch 67: 100%|██████████| 280/280 [15:06<00:00,  3.24s/it, loss=0.352, lr=5.17e-5] 



Epoch 67 average train loss: 0.0245
Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step1\best_adapter with train loss 0.0245


Epoch 68: 100%|██████████| 280/280 [15:00<00:00,  3.22s/it, loss=0.382, lr=4.89e-5] 



Epoch 68 average train loss: 0.0245
Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step1\best_adapter with train loss 0.0245


Epoch 69: 100%|██████████| 280/280 [15:05<00:00,  3.23s/it, loss=0.308, lr=4.62e-5] 



Epoch 69 average train loss: 0.0244
Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step1\best_adapter with train loss 0.0244


Epoch 70: 100%|██████████| 280/280 [15:08<00:00,  3.24s/it, loss=0.504, lr=4.35e-5] 



Epoch 70 average train loss: 0.0243
Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step1\best_adapter with train loss 0.0243


Epoch 71: 100%|██████████| 280/280 [15:06<00:00,  3.24s/it, loss=0.44, lr=4.08e-5]  



Epoch 71 average train loss: 0.0243
No improvement. Early stop patience: 1/3


Epoch 72: 100%|██████████| 280/280 [15:09<00:00,  3.25s/it, loss=0.44, lr=3.83e-5]  



Epoch 72 average train loss: 0.0242
Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step1\best_adapter with train loss 0.0242


Epoch 73: 100%|██████████| 280/280 [15:15<00:00,  3.27s/it, loss=0.398, lr=3.58e-5] 



Epoch 73 average train loss: 0.0241
Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step1\best_adapter with train loss 0.0241


Epoch 74:   7%|▋         | 20/280 [01:24<18:24,  4.25s/it, loss=0.0828, lr=3.56e-5] 


KeyboardInterrupt: 

In [None]:
from peft import PeftModel
# 第二階段訓練（稀有類別資料）

# <-- 載入前一階段最佳模型載入 LoRA 權重 -->
best_dir = "Final_deepseek-llm-7b-base_EnglishOnly_Step1"
adapter_path = os.path.join(best_dir, "best_adapter")
model = PeftModel.from_pretrained(model, adapter_path)

train(
    model,
    tokenizer,
    train_dataloader=rare_degree1_class_dataloader,
    val_dataloader=None,
    output_dir="Final_deepseek-llm-7b-base_EnglishOnly_Step2",
    learning_rate=1e-4,
    epochs=50
)

Epoch 1:   0%|          | 0/120 [00:00<?, ?it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)
Epoch 1: 100%|██████████| 120/120 [07:10<00:00,  3.59s/it, loss=3.93, lr=1.5e-5] 



Epoch 1 average train loss: 0.8815




Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step2\best_adapter with train loss 0.8815


Epoch 2: 100%|██████████| 120/120 [07:00<00:00,  3.51s/it, loss=3.37, lr=3e-5]   



Epoch 2 average train loss: 0.8816
No improvement. Early stop patience: 1/3


Epoch 3: 100%|██████████| 120/120 [07:04<00:00,  3.54s/it, loss=4.17, lr=4.5e-5] 



Epoch 3 average train loss: 0.8816
No improvement. Early stop patience: 2/3


Epoch 4: 100%|██████████| 120/120 [07:03<00:00,  3.53s/it, loss=4.4, lr=6e-5]    



Epoch 4 average train loss: 0.8820
No improvement. Early stop patience: 3/3
Early stopping triggered at epoch 4. Best loss: 0.8815
Final model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step2


In [None]:
from peft import PeftModel
# 第三階段訓練（稀有類別資料）

# <-- 載入前一階段最佳模型載入 LoRA 權重 -->
best_dir = "Final_deepseek-llm-7b-base_EnglishOnly_Step2"
adapter_path = os.path.join(best_dir, "best_adapter")
model = PeftModel.from_pretrained(model, adapter_path)

train(
    model,
    tokenizer,
    train_dataloader=rare_degree2_class_dataloader,
    val_dataloader=None,
    output_dir="Final_deepseek-llm-7b-base_EnglishOnly_Step3",
    learning_rate=5e-5,
    epochs=50
)

Epoch 1: 100%|██████████| 83/83 [05:09<00:00,  3.73s/it, loss=4.28, lr=5.25e-6]



Epoch 1 average train loss: 0.8684
Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step3\best_adapter with train loss 0.8684


Epoch 2: 100%|██████████| 83/83 [05:02<00:00,  3.65s/it, loss=4.28, lr=1.05e-5]



Epoch 2 average train loss: 0.8683
Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step3\best_adapter with train loss 0.8683


Epoch 3: 100%|██████████| 83/83 [05:03<00:00,  3.65s/it, loss=4.28, lr=1.57e-5]



Epoch 3 average train loss: 0.8686
No improvement. Early stop patience: 1/3


Epoch 4: 100%|██████████| 83/83 [05:00<00:00,  3.62s/it, loss=4.28, lr=2.1e-5] 



Epoch 4 average train loss: 0.8677
Best model saved at Final_deepseek-llm-7b-base_EnglishOnly_Step3\best_adapter with train loss 0.8677


Epoch 5: 100%|██████████| 83/83 [25:12<00:00, 18.22s/it, loss=4.28, lr=2.63e-5]



Epoch 5 average train loss: 0.8683
No improvement. Early stop patience: 1/3


Epoch 6: 100%|██████████| 83/83 [09:59<00:00,  7.22s/it, loss=4.28, lr=3.15e-5]  



Epoch 6 average train loss: 0.8678
No improvement. Early stop patience: 2/3


Epoch 7:  30%|███       | 25/83 [1:00:44<14:34, 15.08s/it, loss=3.65, lr=3.3e-5]   

In [None]:
from peft import PeftModel
# 第四階段訓練（稀有類別資料）

# <-- 載入前一階段最佳模型載入 LoRA 權重 -->
adapter_path = "Final_deepseek-llm-7b-base_EnglishOnly_Step3/best_adapter"
model = PeftModel.from_pretrained(model, adapter_path)

train(
    model,
    tokenizer,
    train_dataloader=rare_degree3_class_dataloader,
    val_dataloader=None,
    output_dir="Final_deepseek-llm-7b-base_EnglishOnly_Step4",
    learning_rate=4e-5,
    epochs=40
)

In [None]:
from peft import PeftModel
# 第五階段訓練（稀有類別資料）

# <-- 載入前一階段最佳模型載入 LoRA 權重 -->
adapter_path = "Final_deepseek-llm-7b-base_EnglishOnly_Step4/best_adapter"
model = PeftModel.from_pretrained(model, adapter_path)

train(
    model,
    tokenizer,
    train_dataloader=rare_degree3_class_dataloader,
    val_dataloader=None,
    output_dir="Final_deepseek-llm-7b-base_EnglishOnly_Step5",
    learning_rate=3e-5,
    epochs=40
)

## Load finetune model

In [None]:
import torch
print(torch.cuda.is_available())
print(torch.__version__)                
print(torch.cuda.is_available())       
print(torch.version.cuda)          

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig,AutoConfig
from peft import PeftModel

bos = '<|endoftext|>'
eos = '<|END|>'
pad = '<|pad|>'
sep = '\n\n####\n\n'

# base_model_name = "deepseek-ai/deepseek-llm-7b-chat"
base_model_name = "deepseek-ai/deepseek-llm-7b-base"
adapter_path = r"Final_deepseek-llm-7b-base_EnglishOnly_Step1\best_adapter"

tokenizer = AutoTokenizer.from_pretrained(adapter_path, trust_remote_code=True)
tokenizer.padding_side = 'left'
config = AutoConfig.from_pretrained(
    base_model_name,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
    sep_token_id=tokenizer.sep_token_id,
    output_hidden_states=False,
    # llm_int8_enable_fp32_cpu_offload=True
)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    # llm_int8_enable_fp32_cpu_offload=True
)

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    config=config,
    quantization_config=bnb_config,
    trust_remote_code=True,
    device_map="auto",
    # offload_folder="offload"
)

base_model.resize_token_embeddings(len(tokenizer))

model = PeftModel.from_pretrained(base_model, adapter_path,file_name="adapter_model.safetensors")
model.eval().cuda()

## Output result

In [None]:
task2_valid_data = r'Task1_Answer\task1_answer_Small_Final_EnglishOnly.txt'
valid_data = load_dataset("csv", data_files=task2_valid_data, delimiter='\t',
  features = Features({'fid': Value('string'),'content': Value('string')}),
  column_names=['fid', 'content'])
valid_list = list(valid_data['train'])
valid_data

DatasetDict({
    train: Dataset({
        features: ['fid', 'content'],
        num_rows: 639
    })
})

In [None]:
with open(r'Task1_Answer\task1_answer_timestamps_Small_Final_EnglishOnly.json', 'r', encoding='utf-8') as file:
  audio_timestamps = json.load(file)

In [None]:
import re
# 去除標點只留文字
def normalize_token(token):
  return re.sub(r"[^\w]", "", token.lower())  

In [None]:
def is_overlapped(st, ed, existing_spans):
    for exist_st, exist_ed in existing_spans:
        if exist_st is None or exist_ed is None:
            continue

        if st is None or ed is None:
            continue

        if st >= exist_st and ed <= exist_ed:
            return True  
        if st < exist_ed and ed > exist_st:
            return True  


In [None]:
# 補標註 DOCTOR
import re

def regex_postprocess_doctor1(text, words, existing_annos):
    new_annos = []
    existing_spans = {(anno["st_time"], anno["ed_time"]) for anno in existing_annos}

    doctor_pattern = re.compile(
        r'\bDr\.?\s+(?:[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*|'        
        r'[A-Z]\.\s*[A-Z][a-z]+|'                        
        r'[A-Z]{2,})\b'                                       
    )

    for match in doctor_pattern.finditer(text):
        matched_text = match.group()
        match_start, match_end = match.start(), match.end()

        # 對齊 tokens：與 match 有重疊的 word
        span_start, span_end = None, None
        for word in words:
            w_start, w_end = word["start"], word["end"]
            if w_start is None or w_end is None:
                continue
            if w_end > match_start and w_start < match_end:
                if span_start is None:
                    span_start = w_start
                span_end = w_end

        if span_start is not None and span_end is not None:
            if (span_start, span_end) not in existing_spans:
                new_annos.append({
                    "phi": "DOCTOR",
                    "st_time": span_start,
                    "ed_time": span_end,
                    "entity": matched_text.strip()
                })
                existing_spans.add((span_start, span_end))

    return existing_annos + new_annos


In [None]:
# 補標註 DOCTOR
import re

def regex_postprocess_doctor2(text, words, existing_annos):
    new_annos = []
    existing_spans = {(anno["st_time"], anno["ed_time"]) for anno in existing_annos}
    exclusion_set = {"PM", "AM", "ID", "ZIP"}
    short_code_pattern = re.compile(r'\b[A-Z]{2,4}\b')

    for match in short_code_pattern.finditer(text):
        matched_text = match.group()
        if matched_text in exclusion_set:
            continue
        for i in range(len(words)):
            word = words[i]['word']
            if normalize_token(word) == normalize_token(matched_text):
                st, ed = words[i]['start'], words[i]['end']
                if (st, ed) not in existing_spans:
                    new_annos.append({
                        "phi": "DOCTOR",
                        "st_time": st,
                        "ed_time": ed,
                        "entity": matched_text
                    })
                break

    return existing_annos + new_annos



In [None]:
# 補標註 DEPARTMENT
import re

def regex_postprocess_department(text, words, existing_annos):
    new_annos = []
    existing_spans = {(anno["st_time"], anno["ed_time"]) for anno in existing_annos}

    pattern = re.compile(
        r'\b(?:[A-Z][a-z]+(?: [A-Z][a-z]+)*|SydPath|St\. Vincent|hunter|st\. vincent|sydpath)' 
        r'( Pathology)?'                      
        r' department\b',                     
        re.IGNORECASE
    )


    for match in pattern.finditer(text):
        matched_text = match.group()
        matched_span = match.span()
        matched_tokens = matched_text.strip().split()
        n = len(matched_tokens)

        for i in range(len(words) - n + 1):
            segment_tokens = [normalize_token(words[i + j]['word']) for j in range(n)]
            if " ".join(segment_tokens).lower() == normalize_token(matched_text).lower():
                st, ed = words[i]['start'], words[i + n - 1]['end']
                if (st, ed) not in existing_spans:
                    new_annos.append({
                        "phi": "DEPARTMENT",
                        "st_time": st,
                        "ed_time": ed,
                        "entity": matched_text
                    })
                    existing_spans.add((st, ed))
                break

    return existing_annos + new_annos


In [None]:
# 補標註 HOSPITAL
import re

def regex_postprocess_hospital(text, words, existing_annos):
    new_annos = []
    existing_spans = {(anno["st_time"], anno["ed_time"]) for anno in existing_annos}

    pattern = re.compile(
        r'\b(?:'
        r'(?:The\s)?(?:[A-Z][a-z]+|[A-Z]{2,}|Children|Memorial|Port|Westmead|Health|Service)' 
        r'(?:\s[A-Z][a-z]+)*'              
        r'\s(?:Hospital|Memorial Hospital|Health Service)' 
        r')\b',
        re.IGNORECASE
    )


    for match in pattern.finditer(text):
        matched_text = match.group()
        matched_span = match.span()

        matched_tokens = matched_text.strip().split()
        n = len(matched_tokens)

        for i in range(len(words) - n + 1):
            segment_tokens = [normalize_token(words[i + j]['word']) for j in range(n)]
            if " ".join(segment_tokens).lower() == normalize_token(matched_text).lower():
                st, ed = words[i]['start'], words[i + n - 1]['end']
                if (st, ed) not in existing_spans:
                    new_annos.append({
                        "phi": "HOSPITAL",
                        "st_time": st,
                        "ed_time": ed,
                        "entity": matched_text
                    })
                    existing_spans.add((st, ed))
                break

    return existing_annos + new_annos


In [None]:
# 補標註 COUNTRY
import re

def regex_postprocess_country(text, words, existing_annos):
    new_annos = []
    existing_spans = {(anno["st_time"], anno["ed_time"]) for anno in existing_annos}

    country_names = [
        "Afghanistan", "Albania", "Algeria", "Andorra", "Angola", "Antigua and Barbuda",
        "Argentina", "Armenia", "Australia", "Austria", "Azerbaijan", "Bahamas",
        "Bahrain", "Bangladesh", "Barbados", "Belarus", "Belgium", "Belize",
        "Benin", "Bhutan", "Bolivia", "Bosnia and Herzegovina", "Botswana", "Brazil",
        "Brunei", "Bulgaria", "Burkina Faso", "Burundi", "Cabo Verde", "Cambodia",
        "Cameroon", "Canada", "Central African Republic", "Chad", "Chile", "China",
        "Colombia", "Comoros", "Congo", "Costa Rica", "Croatia",
        "Cuba", "Cyprus", "Czech Republic", "Democratic Republic of the Congo",
        "Denmark", "Djibouti", "Dominica", "Dominican Republic", "Ecuador", "Egypt",
        "El Salvador", "Equatorial Guinea", "Eritrea", "Estonia", "Eswatini", "Ethiopia",
        "Fiji", "Finland", "France", "Gabon", "Gambia", "Georgia", "Germany", "Ghana",
        "Greece", "Grenada", "Guatemala", "Guinea", "Guinea-Bissau", "Guyana", "Haiti",
        "Honduras", "Hungary", "Iceland", "India", "Indonesia", "Iran", "Iraq",
        "Ireland", "Israel", "Italy", "Jamaica", "Japan", "Jordan", "Kazakhstan",
        "Kenya", "Kiribati", "Kuwait", "Kyrgyzstan", "Laos", "Latvia", "Lebanon",
        "Lesotho", "Liberia", "Libya", "Liechtenstein", "Lithuania", "Luxembourg",
        "Madagascar", "Malawi", "Malaysia", "Maldives", "Mali", "Malta", "Marshall Islands",
        "Mauritania", "Mauritius", "Mexico", "Micronesia", "Moldova", "Monaco",
        "Mongolia", "Montenegro", "Morocco", "Mozambique", "Myanmar", "Namibia",
        "Nauru", "Nepal", "Netherlands", "New Zealand", "Nicaragua", "Niger", "Nigeria",
        "North Korea", "North Macedonia", "Norway", "Oman", "Pakistan", "Palau",
        "Palestine State", "Panama", "Papua New Guinea", "Paraguay", "Peru", "Philippines",
        "Poland", "Portugal", "Qatar", "Romania", "Russia", "Rwanda", "Saint Kitts and Nevis",
        "Saint Lucia", "Saint Vincent and the Grenadines", "Samoa", "San Marino",
        "Sao Tome and Principe", "Saudi Arabia", "Senegal", "Serbia", "Seychelles",
        "Sierra Leone", "Singapore", "Slovakia", "Slovenia", "Solomon Islands",
        "Somalia", "South Africa", "South Korea", "South Sudan", "Spain", "Sri Lanka",
        "Sudan", "Suriname", "Sweden", "Switzerland", "Syria", "Tajikistan", "Tanzania",
        "Thailand", "Timor-Leste", "Togo", "Tonga", "Trinidad and Tobago", "Tunisia",
        "Turkey", "Turkmenistan", "Tuvalu", "Uganda", "Ukraine", "United Arab Emirates",
        "United Kingdom", "United States", "Uruguay", "Uzbekistan", "Vanuatu",
        "Vatican City", "Venezuela", "Vietnam", "Yemen", "Zambia", "Zimbabwe", "Taiwan"
]

    for date_phrase in country_names:
        escaped_phrase = re.escape(date_phrase)
        pattern = re.compile(rf'\b{escaped_phrase}\b', re.IGNORECASE)

        for match in pattern.finditer(text):
            matched_text = match.group()

            date_tokens = matched_text.lower().split()
            n = len(date_tokens)

            for i in range(len(words) - n + 1):
                segment_tokens = [normalize_token(words[i + j]['word']).lower() for j in range(n)]
                if segment_tokens == date_tokens:
                    st, ed = words[i]['start'], words[i + n - 1]['end']
                    if (st, ed) not in existing_spans:
                        new_annos.append({
                            "phi": "COUNTRY",
                            "st_time": st,
                            "ed_time": ed,
                            "entity": matched_text
                        })
                        existing_spans.add((st, ed))
                    break

    return existing_annos + new_annos


In [None]:
# 補標註 CITY
import re

def regex_postprocess_city(text, words, existing_annos):
    new_annos = []
    existing_spans = {(anno["st_time"], anno["ed_time"]) for anno in existing_annos}

    city_names = [
    "Washington, D.C.", "New York", "Los Angeles", "Chicago", "Houston",
    "Phoenix", "Philadelphia", "San Antonio", "San Diego", "Dallas",
    "San Jose", "Austin", "Jacksonville", "Fort Worth", "Columbus",
    "Charlotte", "San Francisco", "Indianapolis", "Seattle", "Denver",
    "Canberra", "Sydney", "Melbourne", "Brisbane", "Perth",
    "Adelaide", "Gold Coast", "Newcastle", "Wollongong", "Logan City",
    "London", "Birmingham", "Manchester", "Glasgow", "Leeds",
    "Liverpool", "Sheffield", "Bristol", "Edinburgh", "Leicester",
    "Coventry", "Kingston upon Hull", "Bradford", "Cardiff", "Belfast"
    ]

    for date_phrase in city_names:
        escaped_phrase = re.escape(date_phrase)
        pattern = re.compile(rf'\b{escaped_phrase}\b', re.IGNORECASE)

        for match in pattern.finditer(text):
            matched_text = match.group()

            date_tokens = matched_text.lower().split()
            n = len(date_tokens)

            for i in range(len(words) - n + 1):
                segment_tokens = [normalize_token(words[i + j]['word']).lower() for j in range(n)]
                if segment_tokens == date_tokens:
                    st, ed = words[i]['start'], words[i + n - 1]['end']
                    if (st, ed) not in existing_spans:
                        new_annos.append({
                            "phi": "COUNTRY",
                            "st_time": st,
                            "ed_time": ed,
                            "entity": matched_text
                        })
                        existing_spans.add((st, ed))
                    break

    return existing_annos + new_annos


In [None]:
# 補標註 ZIP
import re
from collections import defaultdict
from rapidfuzz import fuzz

def regex_postprocess_zip(text, words, existing_annos):
    new_annos = []
    existing_spans = {(anno["st_time"], anno["ed_time"]) for anno in existing_annos}

    pattern = re.compile(r'\b\d{4}\b')

    for match in pattern.finditer(text):
        matched_text = match.group()

        if 1900 <= int(matched_text) <= 2065:
            continue

        for i in range(len(words)):
            word = words[i]['word']
            if normalize_token(word) == normalize_token(matched_text):
                st, ed = words[i]['start'], words[i]['end']
                if (st, ed) not in existing_spans:
                    new_annos.append({
                        "phi": "ZIP",
                        "st_time": st,
                        "ed_time": ed,
                        "entity": matched_text
                    })
                break

    return existing_annos + new_annos

In [None]:
# 補標註AGE
import re

def regex_postprocess_age(text, words, existing_annos):
    new_annos = []
    existing_spans = {(anno["st_time"], anno["ed_time"]) for anno in existing_annos}

    patterns = [
        r'\b\d{1,3}\s+years\s+old\b',
        r'\b\d{1,3}-year-old\b',
        r'\b(?:one|two|three|four|five|six|seven|eight|nine|ten|'
        r'eleven|twelve|thirteen|fourteen|fifteen|sixteen|seventeen|eighteen|nineteen|'
        r'twenty|thirty|forty|fifty|sixty|seventy|eighty|ninety|hundred)(?:\s*[- ])?(?:year[s]?)?\s+old\b'
    ]

    for raw_pattern in patterns:
        pattern = re.compile(raw_pattern, re.IGNORECASE)
        for match in pattern.finditer(text):
            matched_text = match.group()
            matched_tokens = matched_text.replace('-', ' ').split()
            n = len(matched_tokens)

            for i in range(len(words) - n + 1):
                segment_tokens = [normalize_token(words[i + j]['word']).lower() for j in range(n)]
                if segment_tokens == [t.lower() for t in matched_tokens]:
                    st, ed = words[i]['start'], words[i + n - 1]['end']
                    if (st, ed) not in existing_spans:
                        new_annos.append({
                            "phi": "AGE",
                            "st_time": st,
                            "ed_time": ed,
                            "entity": matched_text
                        })
                        existing_spans.add((st, ed))
                    break

    return existing_annos + new_annos


In [None]:
# 補標註 DATE
import re

def regex_postprocess_date(text, words, existing_annos):
    new_annos = []
    existing_spans = {(anno["st_time"], anno["ed_time"]) for anno in existing_annos}

    date_list = [
        "now", "today", "tomorrow", "yesterday",
        "last week", "last month", "last year",
        "this week", "this month", "this year",
        "next week", "next month", "next year",
        "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday",
        "Mondays", "Tuesdays", "Wednesdays", "Thursdays", "Fridays", "Saturdays", "Sundays",
        "next Monday", "next Tuesday", "next Wednesday", "next Thursday", "next Friday", "next Saturday", "next Sunday",
        "next few Wednesdays", "last Friday",
        "weekend", "weekday", "in the moment"
        "the day before", "the day after", "the previous day", "the next day",
        "the same day", "same day", "another week", "a couple of days",
        "in two weeks", "in a month", "in a year",
        "January", "February", "March", "April", "May", "June",
        "July", "August", "September", "October", "November", "December",
        "first semester", "second semester", "academic year",
        "Easter", "Christmas", "New Year's Day", "Thanksgiving", "Halloween",
        "by the end of the week", "by next year", "at the beginning of the month"
    ]


    for date_phrase in date_list:
        escaped_phrase = re.escape(date_phrase)
        pattern = re.compile(rf'\b{escaped_phrase}\b', re.IGNORECASE)

        for match in pattern.finditer(text):
            matched_text = match.group()

            date_tokens = matched_text.lower().split()
            n = len(date_tokens)

            for i in range(len(words) - n + 1):

                segment_tokens = [normalize_token(words[i + j]['word']).lower() for j in range(n)]
                if segment_tokens == date_tokens:
                    st, ed = words[i]['start'], words[i + n - 1]['end']
                    if not is_overlapped(st, ed, existing_spans):
                        if (st, ed) not in existing_spans:
                            new_annos.append({
                                "phi": "DATE",
                                "st_time": st,
                                "ed_time": ed,
                                "entity": matched_text
                            })
                            existing_spans.add((st, ed))
                        break

    return existing_annos + new_annos

In [None]:
# 補標註 DATE
import re

def regex_postprocess_date_standard(text, words, existing_annos):
    new_annos = []
    existing_spans = {(anno["st_time"], anno["ed_time"]) for anno in existing_annos}

    pattern = re.compile(r'\b(January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},\s+\d{4}\b')

    for match in pattern.finditer(text):
        matched_text = match.group()

        date_tokens = matched_text.split()
        if len(date_tokens) == 3 and "," in date_tokens[1]:
            date_tokens[1] = date_tokens[1].replace(",", "")
        
        n = len(date_tokens)
        for i in range(len(words) - n + 1):
            segment_tokens = [normalize_token(words[i + j]['word']).lower() for j in range(n)]
            if segment_tokens == [t.lower() for t in date_tokens]:
                st, ed = words[i]['start'], words[i + n - 1]['end']
                if (st, ed) not in existing_spans:
                    new_annos.append({
                        "phi": "DATE",
                        "st_time": st,
                        "ed_time": ed,
                        "entity": matched_text
                    })
                    existing_spans.add((st, ed))
                break

    return existing_annos + new_annos


In [None]:
# 補標註 TIME
import re

def regex_postprocess_time1(text, words, existing_annos):
    new_annos = []
    existing_spans = {(anno["st_time"], anno["ed_time"]) for anno in existing_annos}

    time_list = [
        "this morning", "this afternoon", "this evening", "tonight",
        "last night","in the morning",
        "Monday morning", "Tuesday morning", "Wednesday morning", "Thursday morning", "Friday morning", "Saturday morning", "Sunday morning",
        "Monday afternoon", "Tuesday afternoon", "Wednesday afternoon", "Thursday afternoon", "Friday afternoon", "Saturday afternoon", "Sunday afternoon",
        "Monday evening", "Tuesday evening", "Wednesday evening", "Thursday evening", "Friday evening", "Saturday evening", "Sunday evening",
        "Monday night", "Tuesday night", "Wednesday night", "Thursday night", "Friday night", "Saturday night", "Sunday night", "another night","same morning"        
    ]


    for date_phrase in time_list:
        escaped_phrase = re.escape(date_phrase)
        pattern = re.compile(rf'\b{escaped_phrase}\b', re.IGNORECASE)

        for match in pattern.finditer(text):
            matched_text = match.group()

            date_tokens = matched_text.lower().split()
            n = len(date_tokens)

            for i in range(len(words) - n + 1):
                segment_tokens = [normalize_token(words[i + j]['word']).lower() for j in range(n)]
                if segment_tokens == date_tokens:
                    st, ed = words[i]['start'], words[i + n - 1]['end']
                    if (st, ed) not in existing_spans:
                        new_annos.append({
                            "phi": "TIME",
                            "st_time": st,
                            "ed_time": ed,
                            "entity": matched_text
                        })
                        existing_spans.add((st, ed))
                    break

    return existing_annos + new_annos

In [None]:
# 補標註 TIME
import re
from collections import defaultdict
from rapidfuzz import fuzz

def regex_postprocess_time2(text, words, existing_annos):
    new_annos = []
    existing_spans = {(anno["st_time"], anno["ed_time"]) for anno in existing_annos}
    
    pattern = re.compile(r'\b(?:[0-2]?[0-9]):[0-5][0-9](?:\s?[AaPp]\.?[Mm]\.?)?\b')

    for match in pattern.finditer(text):
        matched_text = match.group()
        matched_tokens = matched_text.split()  
        for i in range(len(words)):
            word = words[i]['word']
            if normalize_token(word) == normalize_token(matched_text):
                st, ed = words[i]['start'], words[i]['end']
                if (st, ed) not in existing_spans:
                    new_annos.append({
                        "phi": "TIME",
                        "st_time": st,
                        "ed_time": ed,
                        "entity": matched_text
                    })
                break

    return existing_annos + new_annos


In [None]:
# 補標註 DURATION
import re

def regex_postprocess_duration1(text, words, existing_annos):
    new_annos = []
    existing_spans = {(anno["st_time"], anno["ed_time"]) for anno in existing_annos}

    duration_list = [
        "the same day", "same day", "another week", "a couple of days", "a whole week", 
        "past few weeks", "past few days", "past few months", "past few years", "a long time"
        "in two weeks", "in a month", "in a year", "in the past", "quite a while", "long periods of time"    
    ]


    for date_phrase in duration_list:
        escaped_phrase = re.escape(date_phrase)
        pattern = re.compile(rf'\b{escaped_phrase}\b', re.IGNORECASE)

        for match in pattern.finditer(text):
            matched_text = match.group()

            date_tokens = matched_text.lower().split()
            n = len(date_tokens)

            for i in range(len(words) - n + 1):
                segment_tokens = [normalize_token(words[i + j]['word']).lower() for j in range(n)]
                if segment_tokens == date_tokens:
                    st, ed = words[i]['start'], words[i + n - 1]['end']
                    if (st, ed) not in existing_spans:
                        new_annos.append({
                            "phi": "DURATION",
                            "st_time": st,
                            "ed_time": ed,
                            "entity": matched_text
                        })
                        existing_spans.add((st, ed))
                    break

    return existing_annos + new_annos

In [None]:
# 補標註 DURATION
import re
from collections import defaultdict
from rapidfuzz import fuzz

def regex_postprocess_duration2(text, words, existing_annos):
    new_annos = []
    existing_spans = {(anno["st_time"], anno["ed_time"]) for anno in existing_annos}
    
    number_pattern = r"(?:\d+|one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve|a|an|few|several|couple|last)"
    time_unit_pattern = r"(?:second|seconds|minute|minutes|hour|hours|day|days|week|weeks|month|months|year|years|session|sessions)"

    pattern = re.compile(
        rf"\b(?:{number_pattern}(?:\s+of)?(?:\s+{number_pattern})?\s+{time_unit_pattern}s?)\b",
        re.IGNORECASE
    )
    
    for match in pattern.finditer(text):
        matched_text = match.group()
        matched_tokens = matched_text.split()
        for i in range(len(words)):
            word = words[i]['word']
            if normalize_token(word) == normalize_token(matched_text):
                st, ed = words[i]['start'], words[i]['end']
                if (st, ed) not in existing_spans:
                    new_annos.append({
                        "phi": "DURATION",
                        "st_time": st,
                        "ed_time": ed,
                        "entity": matched_text
                    })
                break

    return existing_annos + new_annos


In [None]:
# 補標註 PHONE
import re

def regex_postprocess_phone(text, words, existing_annos):
    new_annos = []
    existing_spans = {(anno["st_time"], anno["ed_time"]) for anno in existing_annos}

    phone_list = [
        "911", "110"
    ]


    for date_phrase in phone_list:
        escaped_phrase = re.escape(date_phrase)
        pattern = re.compile(rf'\b{escaped_phrase}\b', re.IGNORECASE)

        for match in pattern.finditer(text):
            matched_text = match.group()

            date_tokens = matched_text.lower().split()
            n = len(date_tokens)

            for i in range(len(words) - n + 1):
                segment_tokens = [normalize_token(words[i + j]['word']).lower() for j in range(n)]
                if segment_tokens == date_tokens:
                    st, ed = words[i]['start'], words[i + n - 1]['end']
                    if (st, ed) not in existing_spans:
                        new_annos.append({
                            "phi": "PHONE",
                            "st_time": st,
                            "ed_time": ed,
                            "entity": matched_text
                        })
                        existing_spans.add((st, ed))
                    break

    return existing_annos + new_annos

In [None]:
# 補標註 MEDICAL_RECORD_NUMBER
import re
from collections import defaultdict
from rapidfuzz import fuzz

def regex_postprocess_MRN(text, words, existing_annos):
    new_annos = []
    existing_spans = {(anno["st_time"], anno["ed_time"]) for anno in existing_annos}
    
    pattern = re.compile(r'\b\d{6,7}\.[A-Z]{3}\b')

    for match in pattern.finditer(text):
        matched_text = match.group()
        matched_tokens = matched_text.split()
        for i in range(len(words)):
            word = words[i]['word']
            if normalize_token(word) == normalize_token(matched_text):
                st, ed = words[i]['start'], words[i]['end']
                if (st, ed) not in existing_spans:
                    new_annos.append({
                        "phi": "MEDICAL_RECORD_NUMBER",
                        "st_time": st,
                        "ed_time": ed,
                        "entity": matched_text
                    })
                break

    return existing_annos + new_annos


In [None]:
# 補標註 ID_NUMBER
import re
from collections import defaultdict
from rapidfuzz import fuzz

def regex_postprocess_ID(text, words, existing_annos):
    new_annos = []
    existing_spans = {(anno["st_time"], anno["ed_time"]) for anno in existing_annos}
    
    pattern = re.compile(
        r'\b(?:\d{8}|'                
        r'\d{2}-\d{7}|'                
        r'\d{2}[A-Z]\d{5,6}[A-Z]?|'  
        r'\d{2}[A-Z]\d{5,6})\b'        
    )

    for match in pattern.finditer(text):
        matched_text = match.group()
        matched_tokens = matched_text.split()  
        for i in range(len(words)):
            word = words[i]['word']
            if normalize_token(word) == normalize_token(matched_text):
                st, ed = words[i]['start'], words[i]['end']
                if (st, ed) not in existing_spans:
                    new_annos.append({
                        "phi": "ID_NUMBER",
                        "st_time": st,
                        "ed_time": ed,
                        "entity": matched_text
                    })
                break

    return existing_annos + new_annos


In [None]:
from collections import defaultdict
from rapidfuzz import fuzz
import re

def normalize_token(token):
  return re.sub(r"[^\w]", "", token.lower()) 

def fuzzy_match(a, b, threshold=85):
  return fuzz.ratio(a, b) >= threshold


train_phi_category = ['PATIENT', 'DOCTOR', 'USERNAME','FAMILYNAME','PERSONALNAME','PROFESSION',
             'ROOM', 'DEPARTMENT', 'HOSPITAL', 'ORGANIZATION', 'STREET', 'CITY',
             'DISTRICT','COUNTY','STATE', 'COUNTRY', 'ZIP', 'LOCATION-OTHER',
             'AGE',
             'DATE', 'TIME', 'DURATION', 'SET',
             'PHONE', 'FAX', 'EMAIL', 'URL', 'IPADDR',
             'SSN', 'MEDICALRECORD', 'HEALTHPLAN', 'ACCOUNT',
             'LICENSE', 'VEHICLE', 'DEVICE', 'BIOID', 'IDNUM',
             'OTHER']

def get_anno_format(infos, audio_timestamps):
  anno_list = []
  phi_dict = defaultdict(list)
  clean_infos = infos.replace('\\n', '\n')
  for line in clean_infos.split("\n"):
    if ":" not in line:
      continue
    key, value = line.split(":", 1)
    key = key.strip()
    value = value.strip().rstrip("\\")
    if key in train_phi_category and value:
      phi_dict[key].append(value)


  remaining_timestamps = audio_timestamps.copy()
  used_indices = set()

  for phi_key, phi_values in phi_dict.items():
    for phi_value in phi_values:
      phi_tokens = phi_value.lower().strip().split()

      for i in range(len(remaining_timestamps) - len(phi_tokens) + 1):
        if any((i + j) in used_indices for j in range(len(phi_tokens))):
          continue

        match = True

        for j, phi_token in enumerate(phi_tokens):
          tsd_word = remaining_timestamps[i + j]['word']
          if not fuzzy_match(normalize_token(tsd_word), normalize_token(phi_token)):
            match = False
            break

        if match:
          anno_list.append({
              "phi": phi_key,
              "st_time": remaining_timestamps[i]['start'],
              "ed_time": remaining_timestamps[i + len(phi_tokens) - 1]['end'],
              "entity": phi_value
          })
          for j in range(len(phi_tokens)):
            used_indices.add(i + j)
          break

  # 加入正規表達式補標
  full_text = " ".join([w["word"] for w in audio_timestamps])
  anno_list = regex_postprocess_doctor1(full_text, audio_timestamps, anno_list)
  anno_list = regex_postprocess_doctor2(full_text, audio_timestamps, anno_list)
  anno_list = regex_postprocess_hospital(full_text, audio_timestamps, anno_list)
  anno_list = regex_postprocess_department(full_text, audio_timestamps, anno_list)
  anno_list = regex_postprocess_country(full_text, audio_timestamps, anno_list)
  anno_list = regex_postprocess_city(full_text, audio_timestamps, anno_list)
  anno_list = regex_postprocess_age(full_text, audio_timestamps, anno_list)
  anno_list = regex_postprocess_date_standard(full_text, audio_timestamps, anno_list)
  anno_list = regex_postprocess_date(full_text, audio_timestamps, anno_list)
  anno_list = regex_postprocess_zip(full_text, audio_timestamps, anno_list)
  anno_list = regex_postprocess_time1(full_text, audio_timestamps, anno_list)
  anno_list = regex_postprocess_time2(full_text, audio_timestamps, anno_list)
  anno_list = regex_postprocess_duration1(full_text, audio_timestamps, anno_list)
  anno_list = regex_postprocess_duration2(full_text, audio_timestamps, anno_list)
  anno_list = regex_postprocess_phone(full_text, audio_timestamps, anno_list)
  anno_list = regex_postprocess_MRN(full_text, audio_timestamps, anno_list)
  anno_list = regex_postprocess_ID(full_text, audio_timestamps, anno_list)
  return anno_list

def aicup_predict(model, tokenizer, _input, audio_timestamps, template = "<|endoftext|> __CONTENT__\n\n####\n\n"):
  seeds = [template.replace("__CONTENT__", data['content']) for data in _input]
  sep = tokenizer.sep_token
  eos = tokenizer.eos_token
  pad = tokenizer.pad_token
  model.eval()
  device = model.device
  texts = tokenizer(seeds, return_tensors = 'pt', padding=True).to(device)
  outputs = []

  with torch.no_grad():
    output_tokens = model.generate(**texts, max_new_tokens=32, do_sample=True,
    temperature=0.1,     
    top_p=1,    
    top_k=100,
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id)
    preds = tokenizer.batch_decode(output_tokens, skip_special_tokens=False)
    for idx , pred in enumerate(preds):
      if "Null" in pred:
        continue
      phi_infos = pred[pred.index(sep)+len(sep):].replace(pad, "").replace(eos, "").strip()
      # print(phi_infos)
      words = []
      segments = audio_timestamps[_input[idx]['fid']]['segments']
      for seg in segments:
        words.extend(seg['words'])

      annotations = get_anno_format(phi_infos, words)
      # annotations = get_anno_format(phi_infos,audio_timestamps[_input[idx]['fid']]['segments'])
      for annotation in annotations:
        outputs.append(f'{_input[idx]["fid"]}\t{annotation["phi"]}\t{annotation["st_time"]}\t{annotation["ed_time"]}\t{annotation["entity"]}')
  return outputs

In [None]:
print("audio_timestamps keys:", list(audio_timestamps.keys())[:10])

audio_timestamps keys: ['60014', '60015', '60018', '60022', '60049', '60079', '60084', '60102', '60147', '60167']


In [None]:
import torch
from tqdm import tqdm

BATCH_SIZE = 16
task2_answer_filename = r"task2_answer.txt"
with open(task2_answer_filename,'w',encoding='utf8') as f:
  for i in tqdm(range(0, len(valid_list), BATCH_SIZE)):
    with torch.no_grad():
      data = valid_list[i:i+BATCH_SIZE]
      outputs = aicup_predict(model, tokenizer, data, audio_timestamps)
      # print(f"[DEBUG] Batch {i} outputs: {outputs}")
      for o in outputs:
        # print(o)
        f.write(o)
        f.write('\n')

  0%|          | 0/40 [00:05<?, ?it/s]


NameError: name 'regex_postprocess_doctor1' is not defined

In [None]:
# 使用集合去重並保留順序
seen = set()
unique_lines = []

with open(task2_answer_filename, 'r', encoding='utf-8') as f:
    for line in f:
        if line not in seen:
            seen.add(line)
            unique_lines.append(line)

# 將不重複的行寫回原檔案
with open(task2_answer_filename, 'w', encoding='utf-8') as f:
    f.writelines(unique_lines)
