In [1]:
from transformers import T5ForConditionalGeneration, T5Tokenizer, DataCollatorForSeq2Seq
from datasets import Dataset
from sklearn.model_selection import train_test_split
import pandas as pd
import torch
from torch.utils.data import DataLoader
from accelerate import Accelerator
from torch.optim import AdamW
from transformers import get_scheduler
from tqdm.auto import tqdm
from evaluate import load
import os

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("PYTORCH_CUDA_ALLOC_CONF:", os.environ.get("PYTORCH_CUDA_ALLOC_CONF"))
print("CUDA_VISIBLE_DEVICES:", os.environ.get("CUDA_VISIBLE_DEVICES"))
print(f"Using device: {device}")

file_path = 'llm_evaluation_summary.csv'

data = pd.read_csv(file_path)
print("Initial Data Sample:\n", data.head())
print(f"Dataset size before cleaning: {data.shape}")

data = data.dropna(subset=["True Transcription", "Predicted Transcription"]) # .head(500)
data = data[data["True Transcription"].str.strip() != ""]
data = data[data["Predicted Transcription"].str.strip() != ""]
print(f"Dataset size after cleaning: {data.shape}")

data["input_text"] = data["Predicted Transcription"].apply(lambda text: f"correct: {text}")
data["target_text"] = data["True Transcription"]
print("Processed Data Sample:\n", data.head())


train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
hf_train_data = Dataset.from_pandas(train_data)
hf_test_data = Dataset.from_pandas(test_data)

print("Training Data Columns:", hf_train_data.column_names)

model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)

def tokenize_function(examples):
    model_inputs = tokenizer(
        examples["input_text"],
        max_length=64,
        truncation=True,
        padding="max_length",
    )
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples["target_text"],
            max_length=64,
            truncation=True,
            padding="max_length",
        )
    model_inputs["labels"] = [
        [(label if label != tokenizer.pad_token_id else -100) for label in seq]
        for seq in labels["input_ids"]
    ]
    return model_inputs

hf_train_data = hf_train_data.map(tokenize_function, batched=True)
hf_test_data = hf_test_data.map(tokenize_function, batched=True)

hf_train_data = hf_train_data.select_columns(["input_ids", "attention_mask", "labels"])
hf_test_data = hf_test_data.select_columns(["input_ids", "attention_mask", "labels"])

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding="max_length", max_length=64)
train_dataloader = DataLoader(hf_train_data, batch_size=8, shuffle=True, collate_fn=data_collator)
eval_dataloader = DataLoader(hf_test_data, batch_size=8, collate_fn=data_collator)

print("Sample batch from DataLoader:")
for batch in train_dataloader:
    for key, value in batch.items():
        print(f"{key}: {value.shape}")
    break

accelerator = Accelerator()
optimizer = AdamW(model.parameters(), lr=5e-5)

num_training_steps = len(train_dataloader) * 3
lr_scheduler = get_scheduler("linear", optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)


wer_metric = load("wer")
cer_metric = load("cer")

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    
    labels = [[(token if token != -100 else tokenizer.pad_token_id) for token in label] for label in labels]
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    print("Sample Predictions:", decoded_preds[:5])
    print("Sample Labels:", decoded_labels[:5])
    
    wer = wer_metric.compute(predictions=decoded_preds, references=decoded_labels)
    cer = cer_metric.compute(predictions=decoded_preds, references=decoded_labels)

    return {"wer": wer, "cer": cer}


model.train()
for epoch in range(3):
    loop = tqdm(train_dataloader, leave=True)
    for batch in loop:
        outputs = model(**batch)
        loss = outputs.loss
        optimizer.zero_grad()
        accelerator.backward(loss)
        optimizer.step()
        lr_scheduler.step()

        loop.set_description(f"Epoch {epoch}")
        loop.set_postfix(loss=loss.item())


model.eval()
eval_loss = 0
predictions = []
labels_list = []
input_texts = []  

with torch.no_grad():
    for batch in eval_dataloader:
        batch = {key: value.to(device) for key, value in batch.items()}
        outputs = model(**batch)
        eval_loss += outputs.loss.item()
        
        
        batch_predictions = torch.argmax(outputs.logits, dim=-1).tolist()
        predictions.extend(batch_predictions)
        labels_list.extend(batch["labels"].tolist())
        
        
        if len(input_texts) < 5:  
            input_ids = batch["input_ids"]
            input_texts.extend(tokenizer.batch_decode(input_ids, skip_special_tokens=True))


eval_loss /= len(eval_dataloader)
print(f"Final Evaluation Loss: {eval_loss}")


metrics = compute_metrics((predictions, labels_list))
print(f"WER: {metrics['wer']}, CER: {metrics['cer']}")


decoded_predictions = tokenizer.batch_decode(predictions[:5], skip_special_tokens=True)
decoded_labels = tokenizer.batch_decode(
    [[(token if token != -100 else tokenizer.pad_token_id) for token in label] for label in labels_list[:5]],
    skip_special_tokens=True,
)


print("\nSample Input, Prediction, and Label:")
for i, (inp, pred, label) in enumerate(zip(input_texts[:5], decoded_predictions, decoded_labels)):
    print(f"Sample {i+1}:")
    print(f"Input: {inp}")
    print(f"Prediction: {pred}")
    print(f"Label: {label}")
    print("-" * 50)


model.save_pretrained("./transcription_correction_model")
tokenizer.save_pretrained("./transcription_correction_model")
raise SystemExit("Stop")

  from .autonotebook import tqdm as notebook_tqdm
2024-12-26 21:56:28.064620: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-26 21:56:28.076275: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-26 21:56:28.079778: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-26 21:56:28.089480: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


PYTORCH_CUDA_ALLOC_CONF: expandable_segments:True
CUDA_VISIBLE_DEVICES: 0,1
Using device: cuda
Initial Data Sample:
    Dataset                                              Model  \
0  atcosim  c:\Users\tim3l\OneDrive\Desktop\Local_Wav2Vec\...   
1  atcosim  c:\Users\tim3l\OneDrive\Desktop\Local_Wav2Vec\...   
2  atcosim  c:\Users\tim3l\OneDrive\Desktop\Local_Wav2Vec\...   
3  atcosim  c:\Users\tim3l\OneDrive\Desktop\Local_Wav2Vec\...   
4  atcosim  c:\Users\tim3l\OneDrive\Desktop\Local_Wav2Vec\...   

                                  True Transcription  \
0  lufthansa four three nine three descend to fli...   
1  lufthansa four three nine three descend to fli...   
2  lufthansa four three nine three descend to fli...   
3  lufthansa four three nine three descend to fli...   
4  lufthansa four three nine three descend to fli...   

                             Predicted Transcription  WER (word Error rate)  \
0  loflans are fourd three nine three-decent flig...               0.583333 

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Map: 100%|█████████████████████████████████████████| 18876/18876 [00:05<00:00, 3630.61 examples/s]
Map: 100%|███████████████████████████████████████████| 4720/4720 [00:01<00:00, 3576.11 examples/s]


Sample batch from DataLoader:
input_ids: torch.Size([8, 64])
attention_mask: torch.Size([8, 64])
labels: torch.Size([8, 64])
decoder_input_ids: torch.Size([8, 64])


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
Epoch 0: 100%|████████████████████████████████████| 2360/2360 [03:52<00:00, 10.16it/s, loss=0.673]
Epoch 1: 100%|████████████████████████████████████| 2360/2360 [03:53<00:00, 10.13it/s, loss=0.518]
Epoch 2: 100%|████████████████████████████████████| 2360/2360 [03:52<00:00, 10.15it/s, loss=0.459]


Final Evaluation Loss: 0.5842615766608614
Sample Predictions: ['hapag lloyd six five three good afternoon radar contact fly heading of two one zero call you back with climb  ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha', 'csa six one one ruzyne tower continue approach', 'cleared to land runway three one csa six bravo charlie    cleared cleared cleared cleared cleared cleared cleared cleared cleared cleared cleared cleared cleared cleared cleared cleared cleared cleared cleared cleared cleared cleared cleared cleared cleared cleared cleared cleared cleared cleared cleared cleared cleared cleared cleared cleared cleared cleared cleared cleared cleared cleared', 'tower  one  tower tower tower tower tower tower tower tower tower tower tower tower tower tower tower tower tower tower tower tower tower tower tower tower tower tower tower tower tower tower tower tower tower tower tower tower tower tower tower tower tower tower tower t

/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/usr/bin/ld: cannot find -lcufile: No such file or directory
collect2: error: ld returned 1 exit status


SystemExit: Stop right there!

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
