In [2]:
!pip install torch transformers datasets tqdm psutil sentencepiece

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting 

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, ConcatDataset
from datasets import load_dataset
from transformers import T5ForConditionalGeneration, T5Tokenizer
import time
import random
import os
import psutil
from tqdm import tqdm
import gc

# Remove any forced CPU usage to allow GPU utilization if available
# os.environ["CUDA_VISIBLE_DEVICES"] = ""

# Set larger sequence lengths for a better context window
MAX_INPUT_LENGTH = 512   # Increased context window for input
MAX_TARGET_LENGTH = 256  # Increased context window for target

def check_requirements():
    try:
        import sentencepiece
    except ImportError:
        print("Please install required packages:")
        print("pip install sentencepiece transformers tqdm psutil")
        exit(1)
    try:
        import numpy
        if numpy.__version__.startswith('2'):
            print("Please run: pip install numpy==1.24.3")
            exit(1)
    except ImportError:
        pass

class CasualConversationDataset(Dataset):
    def __init__(self, tokenizer, max_samples=1000):
        self.tokenizer = tokenizer

        print("Loading Casual Conversation dataset...")
        dataset = load_dataset("SohamGhadge/casual-conversation")['train']
        all_examples = list(dataset)[:max_samples]

        self.examples = []
        for item in all_examples:
            if item.get('question') and item.get('answer'):
                question = item['question'].strip()
                answer = item['answer'].strip()
                if len(question) < 3 or len(answer) < 3:
                    continue
                self.examples.append({
                    'question': question,
                    'answer': answer
                })
                if len(self.examples) >= max_samples:
                    break

        print(f"Loaded {len(self.examples)} valid samples from casual conversation dataset")

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        item = self.examples[idx]
        input_text = f"question: {item['question']}"
        inputs = self.tokenizer(
            input_text,
            max_length=MAX_INPUT_LENGTH,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        targets = self.tokenizer(
            text=item['answer'],
            max_length=MAX_TARGET_LENGTH,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        labels = targets['input_ids'].squeeze()
        labels[labels == self.tokenizer.pad_token_id] = -100
        return {
            'input_ids': inputs['input_ids'].squeeze(),
            'attention_mask': inputs['attention_mask'].squeeze(),
            'labels': labels
        }

class SQuADDataset(Dataset):
    def __init__(self, tokenizer, max_samples=2000):
        self.tokenizer = tokenizer

        print("Loading SQuAD dataset...")
        squad_dataset = load_dataset("rajpurkar/squad")['train']
        all_examples = list(squad_dataset)[:max_samples]

        self.examples = []
        for item in all_examples:
            if item.get('context') and item.get('question') and item.get('answers') and len(item['answers']['text']) > 0:
                context = item['context'].strip()
                question = item['question'].strip()
                answer = item['answers']['text'][0].strip()
                if len(context) < 10 or len(question) < 3 or len(answer) < 1:
                    continue
                self.examples.append({
                    'context': context,
                    'question': question,
                    'answer': answer
                })
                if len(self.examples) >= max_samples:
                    break

        print(f"Loaded {len(self.examples)} valid samples from SQuAD dataset")

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        item = self.examples[idx]
        input_text = f"question: Given this context: {item['context']} {item['question']}"
        inputs = self.tokenizer(
            input_text,
            max_length=MAX_INPUT_LENGTH,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        targets = self.tokenizer(
            text=item['answer'],
            max_length=MAX_TARGET_LENGTH,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        labels = targets['input_ids'].squeeze()
        labels[labels == self.tokenizer.pad_token_id] = -100
        return {
            'input_ids': inputs['input_ids'].squeeze(),
            'attention_mask': inputs['attention_mask'].squeeze(),
            'labels': labels
        }

class UltraChatDataset(Dataset):
    CACHE_DIR = "dataset_cache"

    def __init__(self, tokenizer, max_samples=1000, stream_loading=True, cache=True):
        self.tokenizer = tokenizer
        self.max_samples = max_samples
        self.cache = cache

        if self.cache and not os.path.exists(self.CACHE_DIR):
            os.makedirs(self.CACHE_DIR)

        cache_file = os.path.join(self.CACHE_DIR, f"ultrachat_{max_samples}.pt")
        if self.cache and os.path.exists(cache_file):
            print(f"Loading cached UltraChat dataset from {cache_file}...")
            self.examples = torch.load(cache_file)
            print(f"Loaded {len(self.examples)} cached samples from UltraChat dataset")
            return

        print("Loading UltraChat dataset...")
        self.examples = []
        if stream_loading:
            dataset = load_dataset("stingning/ultrachat", streaming=True)
            counter = 0
            with tqdm(total=max_samples, desc="Processing UltraChat samples") as pbar:
                for item in dataset['train']:
                    if 'data' in item and isinstance(item['data'], list) and len(item['data']) >= 2:
                        data = item['data']
                        for i in range(0, len(data)-1, 2):
                            if i+1 < len(data):
                                question = data[i].strip()
                                answer = data[i+1].strip()
                                if len(question) < 5 or len(answer) < 5:
                                    continue
                                self.examples.append({
                                    'question': question,
                                    'answer': answer
                                })
                                counter += 1
                                pbar.update(1)
                                if counter >= max_samples:
                                    break
                    if counter >= max_samples:
                        break
                    if counter % 500 == 0:
                        gc.collect()
        else:
            dataset = load_dataset("stingning/ultrachat")['train']
            all_examples = list(dataset)[:max_samples*3]
            for item in tqdm(all_examples, desc="Processing UltraChat samples"):
                if 'data' in item and isinstance(item['data'], list) and len(item['data']) >= 2:
                    data = item['data']
                    for i in range(0, len(data)-1, 2):
                        if i+1 < len(data):
                            question = data[i].strip()
                            answer = data[i+1].strip()
                            if len(question) < 5 or len(answer) < 5:
                                continue
                            self.examples.append({
                                'question': question,
                                'answer': answer
                            })
                            if len(self.examples) >= max_samples:
                                break
                if len(self.examples) >= max_samples:
                    break

        print(f"Loaded {len(self.examples)} valid samples from UltraChat dataset")
        if self.cache:
            print(f"Caching processed UltraChat dataset to {cache_file}...")
            torch.save(self.examples, cache_file)

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        item = self.examples[idx]
        input_text = f"question: {item['question']}"
        inputs = self.tokenizer(
            input_text,
            max_length=MAX_INPUT_LENGTH,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        targets = self.tokenizer(
            text=item['answer'],
            max_length=MAX_TARGET_LENGTH,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        labels = targets['input_ids'].squeeze()
        labels[labels == self.tokenizer.pad_token_id] = -100
        return {
            'input_ids': inputs['input_ids'].squeeze(),
            'attention_mask': inputs['attention_mask'].squeeze(),
            'labels': labels
        }

def custom_collate_fn(batch):
    input_ids = torch.stack([item['input_ids'] for item in batch])
    attention_mask = torch.stack([item['attention_mask'] for item in batch])
    labels = torch.stack([item['labels'] for item in batch])
    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels
    }

def train_model(model, train_loader, num_epochs, learning_rate, device, max_grad_norm=1.0):
    # Set performance mode to maximum: one batch per accumulation (no accumulation)
    batch_accumulation_steps = 1
    sleep_interval = 0  # No sleep for maximum performance

    optimizer = optim.AdamW(model.parameters(), lr=learning_rate)
    total_batches = len(train_loader)

    print("Training with maximum performance mode")

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        print(f"\nEpoch {epoch+1}/{num_epochs}")
        print("=" * 30)

        progress_bar = tqdm(total=total_batches, desc=f"Epoch {epoch+1}/{num_epochs}")

        for batch_idx, batch in enumerate(train_loader):
            batch = {k: v.to(device) for k, v in batch.items()}

            if batch_idx % batch_accumulation_steps == 0:
                optimizer.zero_grad()

            outputs = model(
                input_ids=batch["input_ids"],
                attention_mask=batch["attention_mask"],
                labels=batch["labels"]
            )
            loss = outputs.loss / batch_accumulation_steps
            loss.backward()

            if (batch_idx + 1) % batch_accumulation_steps == 0 or (batch_idx + 1) == total_batches:
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
                optimizer.step()

            total_loss += loss.item() * batch_accumulation_steps

            progress_bar.set_postfix({"Loss": f"{loss.item() * batch_accumulation_steps:.4f}"})
            progress_bar.update(1)

            if sleep_interval > 0:
                time.sleep(sleep_interval)

            if batch_idx % 10 == 0:
                gc.collect()

        progress_bar.close()
        avg_loss = total_loss / total_batches
        print(f"Epoch {epoch+1} Avg Loss: {avg_loss:.4f}")
        gc.collect()

def generate_answer(model, tokenizer, question, device, max_length=100, temperature=0.7, num_beams=5):
    model.eval()
    input_str = f"question: {question}"
    inputs = tokenizer.encode(input_str, return_tensors="pt").to(device)
    outputs = model.generate(
         inputs,
         max_length=max_length,
         num_beams=num_beams,
         do_sample=True,
         temperature=temperature,
         early_stopping=True,
         no_repeat_ngram_size=3,
         top_k=50,
         top_p=0.9
    )
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer

def monitor_system_resources():
    cpu_percent = psutil.cpu_percent(interval=0.1)
    memory_usage = psutil.Process(os.getpid()).memory_info().rss / (1024 ** 3)
    return cpu_percent, memory_usage

if __name__ == "__main__":
    check_requirements()

    # Hardcoded configuration for maximum performance and capacity
    MODEL_SIZE = "base"            # Options: "small", "base", "large"
    CASUAL_SAMPLES = 1000
    SQUAD_SAMPLES = 2000
    ULTRACHAT_SAMPLES = 1000
    BATCH_SIZE = 4
    EPOCHS = 4
    LEARNING_RATE = 0.0005

    # Use the highest performance mode (no batch accumulation)
    PERFORMANCE_MODE = "performance"
    # Do not use smaller model settings
    SMALLER_MODEL = False

    start_time = time.time()

    if SMALLER_MODEL:
        MAX_INPUT_LENGTH = 256
        MAX_TARGET_LENGTH = 128
        print("Using smaller sequence lengths to reduce memory usage")

    print(f"Using model size: t5-{MODEL_SIZE}")
    print("Datasets selected: all")
    print(f"Input sequence length: {MAX_INPUT_LENGTH}, Target sequence length: {MAX_TARGET_LENGTH}")

    print("Initializing tokenizer...")
    tokenizer = T5Tokenizer.from_pretrained(f"t5-{MODEL_SIZE}")

    print("Loading datasets...")
    datasets = []
    # Load all datasets
    casual_dataset = CasualConversationDataset(tokenizer, max_samples=CASUAL_SAMPLES)
    datasets.append(casual_dataset)

    squad_dataset = SQuADDataset(tokenizer, max_samples=SQUAD_SAMPLES)
    datasets.append(squad_dataset)

    print("Loading UltraChat dataset (this may take a while)...")
    try:
        ultrachat_dataset = UltraChatDataset(tokenizer, max_samples=ULTRACHAT_SAMPLES, stream_loading=True, cache=True)
        datasets.append(ultrachat_dataset)
    except KeyboardInterrupt:
        print("\nUltraChat dataset loading interrupted. Continuing with other datasets.")
        gc.collect()

    if not datasets:
        print("Error: No datasets were loaded.")
        exit(1)

    combined_dataset = ConcatDataset(datasets)
    print(f"Combined dataset created with {len(combined_dataset)} total samples")

    effective_batch_size = BATCH_SIZE
    if MODEL_SIZE == "large":
        effective_batch_size = max(1, BATCH_SIZE // 2)
        print(f"Reducing batch size to {effective_batch_size} for large model")

    train_loader = DataLoader(
        combined_dataset,
        batch_size=effective_batch_size,
        shuffle=True,
        num_workers=0,
        pin_memory=False,
        drop_last=False,
        collate_fn=custom_collate_fn
    )

    # Automatically use GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if torch.cuda.is_available():
        print("Using GPU:", torch.cuda.get_device_name(0))
    else:
        print("Using CPU")
    print(f"Using device: {device}")

    cpu_percent, memory_usage = monitor_system_resources()
    print(f"Initial CPU usage: {cpu_percent}%, Memory usage: {memory_usage:.2f} GB")

    print("Initializing model...")
    model = T5ForConditionalGeneration.from_pretrained(f"t5-{MODEL_SIZE}")
    model.to(device)

    model_size_mb = sum(p.numel() for p in model.parameters()) * 4 / 1024 / 1024
    print(f"Model size: {model_size_mb:.2f} MB")

    print("Starting training...")
    try:
        train_model(
            model,
            train_loader,
            num_epochs=EPOCHS,
            learning_rate=LEARNING_RATE,
            device=device
        )
    except KeyboardInterrupt:
        print("\nTraining interrupted. Saving current model state...")
    except RuntimeError as e:
        if "out of memory" in str(e).lower():
            print("\nOut of memory error! Consider using a smaller model or reducing the context window.")
        else:
            print(f"\nRuntime error during training: {e}")
        print("Attempting to save current model state...")

    end_time = time.time()
    training_duration = end_time - start_time
    hours = int(training_duration // 3600)
    minutes = int((training_duration % 3600) // 60)
    seconds = int(training_duration % 60)
    print(f"\nTotal training time: {hours:02d}:{minutes:02d}:{seconds:02d}")

    cpu_percent, memory_usage = monitor_system_resources()
    print(f"Final CPU usage: {cpu_percent}%, Memory usage: {memory_usage:.2f} GB")

    print("Saving model and tokenizer...")
    try:
        model.save_pretrained("saved_model")
        tokenizer.save_pretrained("saved_model")
        print("Model and tokenizer saved to 'saved_model' directory")
    except Exception as e:
        print(f"Error saving model: {e}")

    # Optionally, prepare a test set or interactive Q&A session
    print("\nInteractive Question Answering (type 'exit' to quit):")
    print("You can include context in your question by starting with 'Given this context: [your context]'")

    while True:
        user_input = input("\nYour question (or 'exit'): ")
        if user_input.lower() in ["exit", "quit"]:
            break
        answer = generate_answer(
            model,
            tokenizer,
            user_input,
            device=device,
            max_length=150,
            temperature=0.7
        )
        print(f"A: {answer}")

Using model size: t5-base
Datasets selected: all
Input sequence length: 512, Target sequence length: 256
Initializing tokenizer...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Loading datasets...
Loading Casual Conversation dataset...


dialog.zip:   0%|          | 0.00/69.0k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3725 [00:00<?, ? examples/s]

Loaded 1000 valid samples from casual conversation dataset
Loading SQuAD dataset...


README.md:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

Loaded 2000 valid samples from SQuAD dataset
Loading UltraChat dataset (this may take a while)...
Loading UltraChat dataset...


README.md:   0%|          | 0.00/3.12k [00:00<?, ?B/s]

Processing UltraChat samples: 100%|██████████| 1000/1000 [00:00<00:00, 1627.25it/s]


Loaded 1000 valid samples from UltraChat dataset
Caching processed UltraChat dataset to dataset_cache/ultrachat_1000.pt...
Combined dataset created with 4000 total samples
Using GPU: Tesla T4
Using device: cuda
Initial CPU usage: 5.0%, Memory usage: 1.33 GB
Initializing model...


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Model size: 850.31 MB
Starting training...
Training with maximum performance mode

Epoch 1/4


Epoch 1/4:   0%|          | 0/1000 [00:00<?, ?it/s]Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
Epoch 1/4: 100%|██████████| 1000/1000 [13:46<00:00,  1.21it/s, Loss=0.8169]


Epoch 1 Avg Loss: 2.2395

Epoch 2/4


Epoch 2/4: 100%|██████████| 1000/1000 [13:49<00:00,  1.21it/s, Loss=0.7395]


Epoch 2 Avg Loss: 1.7233

Epoch 3/4


Epoch 3/4: 100%|██████████| 1000/1000 [13:48<00:00,  1.21it/s, Loss=1.0712]


Epoch 3 Avg Loss: 1.4320

Epoch 4/4


Epoch 4/4: 100%|██████████| 1000/1000 [13:48<00:00,  1.21it/s, Loss=1.4525]


Epoch 4 Avg Loss: 1.1910

Total training time: 00:55:47
Final CPU usage: 10.0%, Memory usage: 1.95 GB
Saving model and tokenizer...
Model and tokenizer saved to 'saved_model' directory

Interactive Question Answering (type 'exit' to quit):
You can include context in your question by starting with 'Given this context: [your context]'

Your question (or 'exit'): hi
A: i just got this outfit the other day.

Your question (or 'exit'): who are you
A: i'm really excited.

Your question (or 'exit'): exit


In [5]:
import shutil
shutil.make_archive("saved_model", "zip", "saved_model")

'/content/saved_model.zip'

In [6]:
!ls -l saved_model.zip

-rw-r--r-- 1 root root 827725529 Feb 27 21:18 saved_model.zip


In [7]:
from google.colab import files
files.download("saved_model.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>