In [1]:
#  Import all necessary libraries
import torch
import clip
import cv2
import numpy as np
from PIL import Image 
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoTokenizer,AutoModelForCausalLM
from transformers.optimization import get_linear_schedule_with_warmup
from torch.optim import AdamW # Corrected import
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau
import torch.optim as optim
import json
import os
import time
from tqdm import tqdm

In [2]:
from transformers import get_linear_schedule_with_warmup

In [3]:
class CricketCommentaryDataset(Dataset):
    def __init__(self, annotations, clip_model, preprocess, num_frames=16):
        self.annotations = annotations
        self.clip_model = clip_model
        self.preprocess = preprocess
        self.num_frames = num_frames
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

    def __len__(self):
        return len(self.annotations)

    def extract_frames(self, video_path, start_time, end_time):
        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            print("video not opened")
            return torch.zeros(self.num_frames, 3, 224, 224)

        fps = cap.get(cv2.CAP_PROP_FPS)
        start_frame = int(start_time * fps)
        end_frame = int(end_time * fps)

        if start_frame >= end_frame:
            return torch.zeros(self.num_frames, 3, 224, 224)

        stride = max(1, (end_frame - start_frame) // self.num_frames)
        frames = []

        for i in range(start_frame, end_frame, stride):
            cap.set(cv2.CAP_PROP_POS_FRAMES, i)
            ret, frame = cap.read()
            if ret:
                # Action-focused cropping
                h, w, _ = frame.shape
                crop_size = min(h, w) // 2
                y_start = max(0, (h - crop_size) // 2)
                x_start = max(0, (w - crop_size) // 2)
                cropped = frame[y_start:y_start+crop_size, x_start:x_start+crop_size]

                cropped = cv2.cvtColor(cropped, cv2.COLOR_BGR2RGB)
                pil_image = Image.fromarray(cropped)
                frames.append(self.preprocess(pil_image))
            if len(frames) >= self.num_frames:
                break
        
        # Always ensure we return exactly num_frames
        if len(frames) < self.num_frames:
            num_pad = self.num_frames - len(frames)
            frames.extend([torch.zeros(3, 224, 224)] * num_pad)

        cap.release()
        return torch.stack(frames)

    def __getitem__(self, idx):
        ann = self.annotations[idx]
        frames = self.extract_frames(
            ann["video_path"],
            ann["start_time"],
            ann["end_time"]
        )

        # Use the prompt and response directly
        prompt = ann["prompt"]
        response = ann["response"]

        return {
            "frames": frames,
            "prompt": prompt,
            "response": response
        }
    
class TemporalTransformerEncoder(nn.Module):
    def __init__(self, embed_dim, num_heads, num_layers, num_frames, dropout=0.1):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_frames = num_frames

        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        nn.init.trunc_normal_(self.cls_token, std=0.02)

        self.position_embed = nn.Parameter(torch.zeros(1, num_frames + 1, embed_dim))
        nn.init.trunc_normal_(self.position_embed, std=0.02)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=num_heads,
            dim_feedforward=4 * embed_dim,
            dropout=dropout,
            activation='gelu',
            batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

    def forward(self, x):
        B = x.size(0)
        cls_token = self.cls_token.expand(B, 1, -1)
        x = torch.cat([cls_token, x], dim=1)
        x = x + self.position_embed[:, :x.size(1)]
        x = self.transformer(x)
        return {
            "cls": x[:, 0],
            "tokens": x[:, 1:]
        }
class CricketCommentator(nn.Module):
    def __init__(self, train_mode=False, num_frames=16, train_layers=2):
        super().__init__()
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.num_frames = num_frames

        import clip
        self.clip, self.preprocess = clip.load("ViT-B/32", device=self.device)
        self.clip = self.clip.float()

        if train_mode:
            for param in self.clip.parameters():
                param.requires_grad = False

        self.temporal_encoder = TemporalTransformerEncoder(
            embed_dim=512,
            num_heads=8,
            num_layers=3,
            num_frames=num_frames,
            dropout=0.1
        ).to(self.device).float()

        # Updated projection for DeepSeek (2048-dim)
        self.projection = nn.Sequential(
            nn.Linear(512, 2048),
            nn.GELU(),
            nn.LayerNorm(2048),
            nn.Dropout(0.1),
            nn.Linear(2048, 2048),
            nn.Tanh()
        ).to(self.device).float()

        # DeepSeek model and tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/deepseek-coder-1.3b-instruct")
        self.model = AutoModelForCausalLM.from_pretrained("deepseek-ai/deepseek-coder-1.3b-instruct").to(self.device).float()
        self.tokenizer.pad_token = self.tokenizer.eos_token

        # Freeze all parameters initially
        for param in self.model.parameters():
            param.requires_grad = False

        # Unfreeze last N layers if training
        if train_mode and train_layers > 0:
            # Unfreeze last transformer blocks
            for block in self.model.model.layers[-train_layers:]:
                for param in block.parameters():
                    param.requires_grad = True
            
            # Unfreeze final norm and head
            for param in self.model.model.norm.parameters():
                param.requires_grad = True
            for param in self.model.lm_head.parameters():
                param.requires_grad = True

    def forward(self, frames):
        batch_size = frames.shape[0]
        frames = frames.view(-1, 3, 224, 224)
        with torch.no_grad():
            frame_features = self.clip.encode_image(frames.to(self.device))
        frame_features = frame_features.view(batch_size, self.num_frames, -1).float()
        frame_features = F.normalize(frame_features, p=2, dim=-1)

        temporal_out = self.temporal_encoder(frame_features)
        visual_embeds = self.projection(temporal_out["cls"])
        return F.normalize(visual_embeds, p=2, dim=-1).unsqueeze(1)

    def compute_loss(self, batch):
        frames = batch["frames"].to(self.device)
        prompts = batch["prompt"]
        responses = batch["response"]

        visual_embeds = self.forward(frames)  # [batch_size, 1, 2048]

        full_texts = [f"{p} {r}" for p, r in zip(prompts, responses)]
        inputs = self.tokenizer(
            full_texts,
            return_tensors="pt",
            padding='longest',
            truncation=True,
            max_length=128
        ).to(self.device)

        prompt_inputs = self.tokenizer(
            prompts,
            return_tensors="pt",
            padding='longest',
            truncation=True,
            max_length=128
        ).to(self.device)
        prompt_lengths = prompt_inputs.attention_mask.sum(dim=1)

        # Get text embeddings using DeepSeek's embedding layer
        text_embeddings = self.model.model.embed_tokens(inputs.input_ids)
        
        # Concatenate visual and text embeddings
        input_embeddings = torch.cat([visual_embeds, text_embeddings], dim=1)
        
        # Create attention mask for visual part
        visual_mask = torch.ones(visual_embeds.shape[:2]).to(self.device)
        combined_mask = torch.cat([visual_mask, inputs.attention_mask], dim=1)
        
        # Create labels (-100 for visual token and prompt)
        labels = inputs.input_ids.clone()
        extended_labels = torch.cat([
            -100 * torch.ones(labels.size(0), 1, dtype=torch.long).to(self.device),
            labels
        ], dim=1)
        
        # Mask prompt text in labels
        for i, plen in enumerate(prompt_lengths):
            extended_labels[i, 1:1+plen] = -100

        outputs = self.model(
            inputs_embeds=input_embeddings,
            attention_mask=combined_mask,
            labels=extended_labels
        )
        return outputs.loss
def collate_fn(batch):
    
    """Custom collate function to handle frames"""
    frames = [item["frames"] for item in batch]
    prompts = [item["prompt"] for item in batch]
    responses = [item["response"] for item in batch]
    
    # Stack all frames
    frames_tensor = torch.stack(frames)
    
    return {
        "frames": frames_tensor,
        "prompt": prompts,
        "response": responses
    }

def train_model(model, train_loader, val_loader, epochs, lr):
    device = model.device
    scaler = torch.cuda.amp.GradScaler()  # Mixed precision

    # Group parameters for different learning rates
    temporal_params = list(model.temporal_encoder.parameters())
    proj_params = list(model.projection.parameters())
    deepseek_trainable = [p for p in model.model.parameters() if p.requires_grad]

    optimizer = AdamW([
        {'params': temporal_params, 'lr': lr},
        {'params': proj_params, 'lr': lr},
        {'params': deepseek_trainable, 'lr': lr * 0.1}
    ], weight_decay=0.01)

    plateau_scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=3)

    accum_steps = 4  # Gradient accumulation
    total_steps = len(train_loader) * epochs // accum_steps

    scheduler = get_linear_schedule_with_warmup(
        optimizer, 
        num_warmup_steps=int(0.1 * total_steps),
        num_training_steps=total_steps
    )

    best_val_loss = float('inf')
    epochs_no_improve = 0
    patience = 5  # For early stopping

    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()
        total_train_loss = 0.0
        step_count = 0

        for i, batch in enumerate(tqdm(train_loader, desc=f"Epoch {epoch+1}")):
            with torch.cuda.amp.autocast():
                loss = model.compute_loss(batch)

            total_train_loss += loss.item()
            step_count += 1

            scaler.scale(loss).backward()

            if (i + 1) % accum_steps == 0:
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
                scheduler.step()

        avg_train_loss = total_train_loss / step_count

        # Validation loop
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for batch in tqdm(val_loader, desc="Validation"):
                loss = model.compute_loss(batch)
                val_loss += loss.item()

        avg_val_loss = val_loss / len(val_loader)
        plateau_scheduler.step(avg_val_loss)

        print(f"Epoch {epoch+1} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")

        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            epochs_no_improve = 0
            torch.save(model.state_dict(), "best_model.pth")
        else:
            epochs_no_improve += 1
            print(f"No improvement for {epochs_no_improve} epoch(s).")

            if epochs_no_improve >= patience:
                print(f"Early stopping triggered after {epoch+1} epochs.")
                break

    # Load best model
    model.load_state_dict(torch.load("best_model.pth"))
    return model

In [4]:
# üñ•Ô∏è Set the computation device (GPU if available)
# Load annotations
with open("final_data/Data_updated_1.json", "r") as f:
    annotations = json.load(f)

# Split into train and validation (85/15)
split_idx = int(0.85 * len(annotations))
train_annotations = annotations[:split_idx]
val_annotations = annotations[split_idx:]

# Initialize CLIP for dataset
device = "cuda" if torch.cuda.is_available() else "cpu"
clip_model, preprocess = clip.load("ViT-B/32", device=device)

# Create datasets
train_dataset = CricketCommentaryDataset(
    train_annotations,
    clip_model,
    preprocess,
    num_frames=16
)
val_dataset = CricketCommentaryDataset(
    val_annotations,
    clip_model,
    preprocess,
    num_frames=16
)

# Create data loaders
train_loader = DataLoader(
    train_dataset,
    batch_size=2,  # Small batch size due to memory constraints
    shuffle=True,
    collate_fn=collate_fn,
    num_workers=2
)
val_loader = DataLoader(
    val_dataset,
    batch_size=2,
    collate_fn=collate_fn,
    num_workers=2
)

# Initialize model in training mode
model = CricketCommentator(train_mode=True).to(device)

In [5]:
# üöÇ Training loop to fine-tune the model
# Train the model
torch.autograd.set_detect_anomaly(True)
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
trained_model = train_model(
    model,
    train_loader,
    val_loader,
    epochs=30,
    lr=1e-4,

)
  # Save final model
torch.save(trained_model.state_dict(), "cricket_commentator_final.pth")
print("Model saved successfully!")

  scaler = torch.cuda.amp.GradScaler()  # Mixed precision
  with torch.cuda.amp.autocast():
Epoch 1: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 347/347 [17:19<00:00,  3.00s/it]
Validation:   0%|          | 0/62 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Validation: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 62/62 [03:49<00:00,  3.71s/it]


Epoch 1 | Train Loss: 5.1215 | Val Loss: 4.5680


Epoch 2:   0%|          | 0/347 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 2: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 347/347 [16:59<00:00,  2.94s/it]
Validation:   0%|          | 0/62 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_P

Epoch 2 | Train Loss: 4.0203 | Val Loss: 3.8988


Epoch 3:   0%|          | 0/347 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 3: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 347/347 [17:03<00:00,  2.95s/it]
Validation:   0%|          | 0/62 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_P

Epoch 3 | Train Loss: 3.3674 | Val Loss: 3.4583


Epoch 4:   0%|          | 0/347 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 4: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 347/347 [16:59<00:00,  2.94s/it]
Validation:   0%|          | 0/62 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_P

Epoch 4 | Train Loss: 2.9322 | Val Loss: 3.2787


Epoch 5:   0%|          | 0/347 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 5: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 347/347 [17:04<00:00,  2.95s/it]
Validation:   0%|          | 0/62 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_P

Epoch 5 | Train Loss: 2.5950 | Val Loss: 3.1927


Epoch 6:   0%|          | 0/347 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 6: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 347/347 [17:03<00:00,  2.95s/it]
Validation:   0%|          | 0/62 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_P

Epoch 6 | Train Loss: 2.3214 | Val Loss: 3.1988
No improvement for 1 epoch(s).


Epoch 7:   0%|          | 0/347 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 7: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 347/347 [17:00<00:00,  2.94s/it]
Validation:   0%|          | 0/62 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_P

Epoch 7 | Train Loss: 2.0786 | Val Loss: 3.1862


Epoch 8:   0%|          | 0/347 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 8: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 347/347 [16:54<00:00,  2.92s/it]
Validation:   0%|          | 0/62 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_P

Epoch 8 | Train Loss: 1.8447 | Val Loss: 3.2497
No improvement for 1 epoch(s).


Epoch 9:   0%|          | 0/347 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 9: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 347/347 [16:56<00:00,  2.93s/it]
Validation:   0%|          | 0/62 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_P

Epoch 9 | Train Loss: 1.6383 | Val Loss: 3.3047
No improvement for 2 epoch(s).


Epoch 10:   0%|          | 0/347 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 347/347 [16:58<00:00,  2.94s/it]
Validation:   0%|          | 0/62 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS

Epoch 10 | Train Loss: 1.4548 | Val Loss: 3.3247
No improvement for 3 epoch(s).


Epoch 11:   0%|          | 0/347 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 11: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 347/347 [16:56<00:00,  2.93s/it]
Validation:   0%|          | 0/62 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS

Epoch 11 | Train Loss: 1.2955 | Val Loss: 3.4490
No improvement for 4 epoch(s).


Epoch 12:   0%|          | 0/347 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 12: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 347/347 [17:00<00:00,  2.94s/it]
Validation:   0%|          | 0/62 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS

Epoch 12 | Train Loss: 1.1449 | Val Loss: 3.4889
No improvement for 5 epoch(s).
Early stopping triggered after 12 epochs.
Model saved successfully!
