In [1]:
# Install latest versions of all required dependencies
!pip install transformers
!pip install bitsandbytes
!pip install accelerate
!pip install datasets
!pip install tqdm
!pip install torch

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-

In [2]:
import json
import torch
from tqdm import tqdm
from transformers import BitsAndBytesConfig, AutoTokenizer, AutoModelForCausalLM, pipeline
from datasets import Dataset
import time
import os
from google.colab import drive

In [3]:
# Mount Google Drive
print("Mounting Google Drive...")
drive.mount('/content/drive')

Mounting Google Drive...
Mounted at /content/drive


In [11]:
# Set paths and parameters
INPUT_PATH = "/content/drive/MyDrive/translation_outputs/test_translated_qa_data.json"
OUTPUT_PATH = "/content/drive/MyDrive/alif_responses/test_qa_alif.json"
MODEL_ID = "large-traversaal/Alif-1.0-8B-Instruct"
BATCH_SIZE = 16  # Number of examples to process before saving
PIPELINE_BATCH_SIZE = 8  # Number of examples to process in parallel on GPU
MAX_LENGTH = 256
MAX_ITEMS = 0  # 0 means all items
SAVE_INTERMEDIATE = True
SAVE_EVERY = 50

In [12]:
# Verify input file exists and create output directory
if not os.path.exists(INPUT_PATH):
    raise FileNotFoundError(f"Input file not found: {INPUT_PATH}")

output_dir = os.path.dirname(OUTPUT_PATH)
if not os.path.exists(output_dir):
    print(f"Creating output directory: {output_dir}")
    os.makedirs(output_dir, exist_ok=True)

In [13]:
# Load dataset
print(f"Loading dataset from {INPUT_PATH}")
with open(INPUT_PATH, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Apply max_items limit if specified
if MAX_ITEMS > 0:
    data = data[:MAX_ITEMS]

print(f"Dataset loaded with {len(data)} examples")

Loading dataset from /content/drive/MyDrive/translation_outputs/50_test_translated_qa_data.json
Dataset loaded with 50 examples


In [14]:
# Create the prompt template function
def create_prompt(example):
    """Create a prompt using the knowledge and question."""
    prompt_template = """You are Urdu Chatbot.
### Instruction:
Below is an instruction that describes a task. Write a response in Urdu that appropriately completes the request. Don't say you don't know unless you really don't.
Please be expressive when needed. Give long and detailed answers.

### Knowledge:
{knowledge}

### Input:
Based on the knowledge provided above, answer the following question in Urdu:
{question}

### Response:
"""
    example['prompt'] = prompt_template.format(
        knowledge=example['knowledge'],
        question=example['question']
    )
    return example

# Convert to HF Dataset and preprocess all prompts at once
dataset = Dataset.from_list(data)
dataset = dataset.map(create_prompt)
print(f"Dataset converted to HF format with {len(dataset)} examples")

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Dataset converted to HF format with 50 examples


In [8]:
# Load model with optimized memory settings
def load_model():
    """Load the Alif model and tokenizer - optimized for Google Colab T4 GPU"""
    print("Loading Alif model and tokenizer...")

    # Clear CUDA cache before loading model
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        print(f"GPU detected: {torch.cuda.get_device_name(0)}")
        print(f"Available GPU memory before loading: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
        print(f"Used GPU memory before loading: {torch.cuda.memory_allocated() / 1e9:.2f} GB")

    # 4-bit quantization configuration
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4"
    )

    # Load tokenizer first
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

    # Load model with optimized settings for T4
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        quantization_config=quantization_config,
        device_map="auto",
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True
    )

    # Report memory usage after loading
    if torch.cuda.is_available():
        print(f"Used GPU memory after loading: {torch.cuda.memory_allocated() / 1e9:.2f} GB")

    return model, tokenizer

# Load model and tokenizer
model, tokenizer = load_model()

Loading Alif model and tokenizer...
GPU detected: Tesla T4
Available GPU memory before loading: 15.83 GB
Used GPU memory before loading: 0.00 GB


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/831 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/947 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/235 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/3.44G [00:00<?, ?B/s]

Used GPU memory after loading: 9.16 GB


In [15]:
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

In [16]:
# Improved batch processing with dataset
def process_dataset_efficiently():
    """Process the dataset efficiently using batched processing"""
    print("Creating text generation pipeline...")

    # Create text generation pipeline
    generator = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=MAX_LENGTH,
        do_sample=True,
        top_p=0.95,
        top_k=50,
        temperature=0.7,
        repetition_penalty=1.2,
        device_map="auto"
    )

    results = []
    start_time = time.time()

    # Use the dataset's batching capabilities
    for i in tqdm(range(0, len(dataset), BATCH_SIZE), desc="Processing batches"):
        # Clear cache between batches
        torch.cuda.empty_cache()

        # Get current batch using dataset's built-in functionality
        end_idx = min(i + BATCH_SIZE, len(dataset))
        current_batch = dataset.select(range(i, end_idx))

        try:
            # Get all prompts at once
            prompts = current_batch["prompt"]

            # Generate text for all prompts using internal batching for parallel processing
            outputs = generator(
                prompts,
                batch_size=PIPELINE_BATCH_SIZE,  # Process this many in parallel
                return_full_text=False
            )

            # Process all results
            batch_results = []
            for j, output in enumerate(outputs):
                # Get original item data
                item = {k: v for k, v in current_batch[j].items() if k != "prompt"}

                # Add generated answer
                item["generated_answer"] = output[0]["generated_text"].strip()
                batch_results.append(item)

            # Add batch results to overall results
            results.extend(batch_results)

            # Log memory usage for debugging
            if torch.cuda.is_available() and i % (5 * BATCH_SIZE) == 0:
                mem_allocated = torch.cuda.memory_allocated() / 1e9
                mem_reserved = torch.cuda.memory_reserved() / 1e9
                print(f"Memory after batch {i}: {mem_allocated:.2f} GB allocated, {mem_reserved:.2f} GB reserved")

        except Exception as e:
            print(f"Error processing batch starting at index {i}: {e}")
            # Add error items
            for j in range(len(current_batch)):
                item = {k: v for k, v in current_batch[j].items() if k != "prompt"}
                item["generated_answer"] = f"ERROR: {str(e)}"
                results.append(item)

        # Save intermediate results
        if SAVE_INTERMEDIATE and end_idx % SAVE_EVERY == 0:
            intermediate_file = f"{OUTPUT_PATH.rsplit('.', 1)[0]}_intermediate_{end_idx}.json"
            print(f"Saving intermediate results to {intermediate_file}")
            with open(intermediate_file, 'w', encoding='utf-8') as f:
                json.dump(results, f, ensure_ascii=False, indent=2)

    # Calculate and report processing statistics
    end_time = time.time()
    total_time = end_time - start_time
    items_processed = len(results)
    avg_time_per_item = total_time / items_processed if items_processed > 0 else 0
    throughput = items_processed / total_time if total_time > 0 else 0

    print(f"\nProcessing statistics:")
    print(f"Total processing time: {total_time:.2f} seconds")
    print(f"Items processed: {items_processed}")
    print(f"Average time per item: {avg_time_per_item:.2f} seconds")
    print(f"Processing throughput: {throughput:.2f} items/second")

    return results

In [17]:
# Run processing
print("Starting batch processing with optimized dataset handling...")
all_results = process_dataset_efficiently()

# Save final results
print(f"Saving final results to {OUTPUT_PATH}")
with open(OUTPUT_PATH, 'w', encoding='utf-8') as f:
    json.dump(all_results, f, ensure_ascii=False, indent=2)

print("Processing complete!")

Device set to use cuda:0


Starting batch processing with optimized dataset handling...
Creating text generation pipeline...


Processing batches:  25%|██▌       | 1/4 [02:32<07:37, 152.61s/it]

Memory after batch 0: 9.17 GB allocated, 11.23 GB reserved


Processing batches: 100%|██████████| 4/4 [07:36<00:00, 114.01s/it]

Saving intermediate results to /content/drive/MyDrive/alif_responses/50_test_qa_alif_intermediate_50.json

Processing statistics:
Total processing time: 456.04 seconds
Items processed: 50
Average time per item: 9.12 seconds
Processing throughput: 0.11 items/second
Saving final results to /content/drive/MyDrive/alif_responses/50_test_qa_alif.json
Processing complete!



