<a href="https://colab.research.google.com/github/zastixx/0docs/blob/main/sankriti_to_english_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pandas regex indic-transliteration scikit-learn
!pip install sentencepiece evaluate
!pip install transformers datasets

Collecting indic-transliteration
  Downloading indic_transliteration-2.3.69-py3-none-any.whl.metadata (1.4 kB)
Collecting backports.functools-lru-cache (from indic-transliteration)
  Downloading backports.functools_lru_cache-2.0.0-py2.py3-none-any.whl.metadata (3.5 kB)
Collecting roman (from indic-transliteration)
  Downloading roman-5.0-py3-none-any.whl.metadata (3.7 kB)
Downloading indic_transliteration-2.3.69-py3-none-any.whl (155 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m155.6/155.6 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading backports.functools_lru_cache-2.0.0-py2.py3-none-any.whl (6.7 kB)
Downloading roman-5.0-py3-none-any.whl (5.5 kB)
Installing collected packages: roman, backports.functools-lru-cache, indic-transliteration
Successfully installed backports.functools-lru-cache-2.0.0 indic-transliteration-2.3.69 roman-5.0
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets>=2.0.0 (from 

In [2]:
import os

folders = ['output', 'tarun_model', 'sentencepiece']

for folder in folders:
    os.makedirs(folder, exist_ok=True)

In [4]:
import pandas as pd
import regex as re
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate
from sklearn.model_selection import train_test_split
import os

# Create output directory if it doesn't exist
OUT_DIR = '/content/output/'
os.makedirs(OUT_DIR, exist_ok=True)

# 4. Load dataset
INPUT_CSV = '/content/BG_Sanskrit_English.csv'
df = pd.read_csv(INPUT_CSV)
print(f"Initial dataframe: {len(df)} rows")

# 5. Drop nulls and duplicates
df.dropna(subset=['verse_in_sanskrit','translation_in_english'], inplace=True)
df.drop_duplicates(inplace=True)
print(f"After dropna & dedup: {len(df)} rows")

# 6. Debug: show sample
print("Sample rows:\n", df.head(2))
print("\nData types:", df.dtypes)
print("\nUnique values counts:")
for col in df.columns:
    print(f"{col}: {df[col].nunique()} unique values")

# 7. Define script-check functions with debugging
def is_sanskrit(text):
    if not isinstance(text, str):
        print(f"Non-string value found in Sanskrit column: {type(text)}, value: {text}")
        return False

    devanagari_match = bool(re.fullmatch(r'[\p{Devanagari}\s।।—…!?.,;\-]+', text))
    iast_match = bool(re.fullmatch(r"[A-Za-zāīūṛṝḷḹṅñṭḍṇśṣḥṃ\s\-\.,;''!?]+", text))
    return devanagari_match or iast_match

def is_english(text):
    if not isinstance(text, str):
        print(f"Non-string value found in English column: {type(text)}, value: {text}")
        return False

    return bool(re.fullmatch(r"[A-Za-z0-9\s\.,;''\"!?-]+", text))

# 8. Count how many pass each filter
sanskrit_ok = df['verse_in_sanskrit'].apply(is_sanskrit)
english_ok = df['translation_in_english'].apply(is_english)
print(f"Sanskrit-passing: {sanskrit_ok.sum()} / {len(df)}")
print(f"English-passing: {english_ok.sum()} / {len(df)}")

# Check for some failing examples
if len(df) > 0 and sanskrit_ok.sum() < len(df):
    print("\nSome Sanskrit examples that failed the filter:")
    for i, (is_ok, text) in enumerate(zip(sanskrit_ok, df['verse_in_sanskrit'])):
        if not is_ok and i < 5:  # Show just first 5 failures
            print(f"FAIL: {repr(text)}")

if len(df) > 0 and english_ok.sum() < len(df):
    print("\nSome English examples that failed the filter:")
    for i, (is_ok, text) in enumerate(zip(english_ok, df['translation_in_english'])):
        if not is_ok and i < 5:  # Show just first 5 failures
            print(f"FAIL: {repr(text)}")

# 9. Apply combined script filter
filtered_df = df[sanskrit_ok & english_ok].reset_index(drop=True)
print(f"After script filter: {len(filtered_df)} rows")

# If we have no data after filtering, relax filters
if len(filtered_df) == 0:
    print("WARNING: All data filtered out by script checks. Relaxing filters...")

    # Define more permissive script-check functions
    def is_sanskrit_relaxed(text):
        if not isinstance(text, str):
            return False
        # Accept anything with at least one Devanagari character or IAST character
        return bool(re.search(r'[\p{Devanagari}]', text)) or \
               bool(re.search(r'[āīūṛṝḷḹṅñṭḍṇśṣḥṃ]', text))

    def is_english_relaxed(text):
        if not isinstance(text, str):
            return False
        # More permissive - allow more punctuation and other characters
        return bool(re.search(r'[A-Za-z]', text))  # Just require some English letters

    sanskrit_ok_relaxed = df['verse_in_sanskrit'].apply(is_sanskrit_relaxed)
    english_ok_relaxed = df['translation_in_english'].apply(is_english_relaxed)

    print(f"With relaxed filters - Sanskrit-passing: {sanskrit_ok_relaxed.sum()} / {len(df)}")
    print(f"With relaxed filters - English-passing: {english_ok_relaxed.sum()} / {len(df)}")

    filtered_df = df[sanskrit_ok_relaxed & english_ok_relaxed].reset_index(drop=True)
    print(f"After relaxed script filter: {len(filtered_df)} rows")

    # If still no data, just use the original data
    if len(filtered_df) == 0:
        print("WARNING: Still no data after relaxed filters. Using original data.")
        filtered_df = df.copy()
else:
    print("Filtered sample:\n", filtered_df.head(2))

# Continue with filtered_df
df = filtered_df

# 10. Normalize whitespace & punctuation
df['verse_in_sanskrit'] = df['verse_in_sanskrit'].astype(str)\
    .str.strip().str.replace(r'\s+', ' ', regex=True)
df['translation_in_english'] = df['translation_in_english'].astype(str)\
    .str.strip().str.replace(r'[\u2018\u2019\u201C\u201D]', "'", regex=True)

# 11. Transliterate Devanagari → IAST
def to_iast(text):
    try:
        if isinstance(text, str) and re.search(r'\p{Devanagari}', text):
            return transliterate(text, sanscript.DEVANAGARI, sanscript.IAST)
        return text
    except Exception as e:
        print(f"Transliteration error for text: {text}")
        print(f"Error: {e}")
        return text

df['verse_iast'] = df['verse_in_sanskrit'].apply(to_iast)

# 12. Length filtering (max 200 tokens)
df['src_len'] = df['verse_iast'].astype(str).str.split().str.len()
df['tgt_len'] = df['translation_in_english'].astype(str).str.split().str.len()

# Print length statistics
print("\nLength statistics before filtering:")
print(f"Source min: {df['src_len'].min()}, max: {df['src_len'].max()}, mean: {df['src_len'].mean():.1f}")
print(f"Target min: {df['tgt_len'].min()}, max: {df['tgt_len'].max()}, mean: {df['tgt_len'].mean():.1f}")

# Count rows that would be filtered out
zero_src = (df['src_len'] == 0).sum()
zero_tgt = (df['tgt_len'] == 0).sum()
long_src = (df['src_len'] > 200).sum()
long_tgt = (df['tgt_len'] > 200).sum()

print(f"Rows with zero source length: {zero_src}")
print(f"Rows with zero target length: {zero_tgt}")
print(f"Rows with source length > 200: {long_src}")
print(f"Rows with target length > 200: {long_tgt}")

length_filtered = df[(df['src_len'] > 0) & (df['tgt_len'] > 0) &
                      (df['src_len'] <= 200) & (df['tgt_len'] <= 200)].reset_index(drop=True)
print(f"After length filter: {len(length_filtered)} rows")

# If we have no data after length filtering, use original data
if len(length_filtered) == 0:
    print("WARNING: All data filtered out by length checks. Using unfiltered data.")
    # Skip length filtering
    length_filtered = df.copy()

df = length_filtered

# 13. Train/dev/test split (only if we have data)
if len(df) >= 3:  # Need at least 3 rows for a meaningful split
    train, temp = train_test_split(df, test_size=0.2, random_state=42)
    dev, test = train_test_split(temp, test_size=0.5, random_state=42)
    print(f"Splits → train: {len(train)}, dev: {len(dev)}, test: {len(test)}")

    # 14. Save splits
    train.to_csv(f'{OUT_DIR}dataset_train.csv', index=False)
    dev.to_csv(f'{OUT_DIR}dataset_dev.csv', index=False)
    test.to_csv(f'{OUT_DIR}dataset_test.csv', index=False)
    print("Saved train/dev/test CSVs.")
else:
    print(f"WARNING: Not enough data ({len(df)} rows) for train/dev/test split. Minimum 3 rows needed.")
    if len(df) > 0:
        df.to_csv(f'{OUT_DIR}dataset_all.csv', index=False)
        print("Saved all data to a single CSV instead.")

Initial dataframe: 701 rows
After dropna & dedup: 640 rows
Sample rows:
                                    verse_in_sanskrit  \
0  धृतराष्ट्र उवाच |धर्मक्षेत्रे कुरुक्षेत्रे समव...   
1  सञ्जय उवाच ।दृष्ट्वा तु पाण्डवानीकं व्यूढं दुर...   

                              translation_in_english  
0  Dhritarashtra said: O Sanjay, after gathering ...  
1  Sanjay said: On observing the Pandava army sta...  

Data types: verse_in_sanskrit         object
translation_in_english    object
dtype: object

Unique values counts:
verse_in_sanskrit: 640 unique values
translation_in_english: 640 unique values
Sanskrit-passing: 0 / 640
English-passing: 462 / 640

Some Sanskrit examples that failed the filter:
FAIL: 'धृतराष्ट्र उवाच |धर्मक्षेत्रे कुरुक्षेत्रे समवेता युयुत्सवः |मामकाः पाण्डवाश्चैव किमकुर्वत सञ्जय ||1||'
FAIL: 'सञ्जय उवाच ।दृष्ट्वा तु पाण्डवानीकं व्यूढं दुर्योधनस्तदा ।आचार्यमुपसङ्गम्य राजा वचनमब्रवीत् ।। 2।।'
FAIL: 'पश्यैतां पाण्डुपुत्राणामाचार्य महतीं चमूम् ।व्यूढां द्रुपदपुत्रेण तव शिष

In [5]:
import sentencepiece as spm
import pandas as pd
import os

# 1. Paths to cleaned text files
TRAIN_CSV = '/content/output/dataset_train.csv'
VOCAB_PREFIX = '/content/sentencepiece_model'  # will create .model and .vocab

# 2. Check if the training CSV exists
if not os.path.exists(TRAIN_CSV):
    raise FileNotFoundError(f"Training CSV not found at {TRAIN_CSV}. Make sure the previous script completed successfully.")

# 3. Extract Sanskrit (IAST) + English into a single training file
df_train = pd.read_csv(TRAIN_CSV)
print(f"Training data: {len(df_train)} rows")

# Check for empty dataframe
if len(df_train) == 0:
    raise ValueError("The training CSV file is empty. Please check your data preparation step.")

# Print column names to verify
print(f"CSV columns: {df_train.columns.tolist()}")

# 4. Check if required columns exist
required_cols = ['verse_iast', 'translation_in_english']
for col in required_cols:
    if col not in df_train.columns:
        raise ValueError(f"Required column '{col}' not found in the CSV. Available columns: {df_train.columns.tolist()}")

# 5. Extract text to a file for training
with open(f'{VOCAB_PREFIX}.txt', 'w', encoding='utf-8') as f:
    for src, tgt in zip(df_train['verse_iast'], df_train['translation_in_english']):
        if isinstance(src, str) and isinstance(tgt, str):
            f.write(src + '\n')
            f.write(tgt + '\n')
        else:
            print(f"Skipping non-string data: src={type(src)}, tgt={type(tgt)}")

# 6. Check the size of the training data
with open(f'{VOCAB_PREFIX}.txt', 'r', encoding='utf-8') as f:
    line_count = sum(1 for _ in f)
print(f"Training file contains {line_count} lines")

# 7. Train SentencePiece model with lower vocab size
# The error indicates 31437 is the maximum possible vocab size for this dataset
vocab_size = 16000  # Using a lower, safe value

print(f"Training SentencePiece with vocab_size={vocab_size}")
spm.SentencePieceTrainer.Train(
    input=f'{VOCAB_PREFIX}.txt',
    model_prefix=VOCAB_PREFIX,
    vocab_size=vocab_size,  # Reduced from 32000
    character_coverage=1.0,
    model_type='bpe',
    pad_id=0,
    unk_id=1,
    bos_id=2,
    eos_id=3,
    user_defined_symbols=[],
    input_sentence_size=100000,  # Process at most this many sentences
    train_extremely_large_corpus=True  # For efficiency with small datasets
)

print("SentencePiece model and vocab saved as:")
print(f"  {VOCAB_PREFIX}.model")
print(f"  {VOCAB_PREFIX}.vocab")

# 8. Load the trained tokenizer
sp = spm.SentencePieceProcessor(model_file=f'{VOCAB_PREFIX}.model')

# 9. Tokenize examples
if len(df_train) > 0:
    # Sanskrit example
    sanskrit_example = df_train['verse_iast'].iloc[0]
    sanskrit_tokens = sp.encode(sanskrit_example, out_type=str)
    print("\nSample Sanskrit tokenization:")
    print(f"Original: {sanskrit_example}")
    print(f"Tokens: {sanskrit_tokens}")

    # English example
    english_example = df_train['translation_in_english'].iloc[0]
    english_tokens = sp.encode(english_example, out_type=str)
    print("\nSample English tokenization:")
    print(f"Original: {english_example}")
    print(f"Tokens: {english_tokens}")

    # Vocabulary statistics
    print(f"\nVocabulary size: {sp.vocab_size()}")
    print(f"SentencePiece model loaded successfully")

Training data: 512 rows
CSV columns: ['verse_in_sanskrit', 'translation_in_english', 'verse_iast', 'src_len', 'tgt_len']
Training file contains 1024 lines
Training SentencePiece with vocab_size=16000
SentencePiece model and vocab saved as:
  /content/sentencepiece_model.model
  /content/sentencepiece_model.vocab

Sample Sanskrit tokenization:
Original: lelihyase grasamāna: samantā-llokānsamagrānvadanairjvaladbhi: |tejobhirāpūrya jagatsamagraṃbhāsastavogrā: pratapanti viṣṇo || 30||
Tokens: ['▁lelihyase', '▁grasamāna', ':', '▁samantā', '-', 'llokānsamagrān', 'vadanair', 'jvaladbhi', ':', '▁|', 'tejobhirāpūrya', '▁jagatsamagraṃ', 'bhāsastavogrā', ':', '▁pratapanti', '▁viṣṇo', '▁||', '▁30||']

Sample English tokenization:
Original: With Your fiery tongues You are licking up the hosts of living beings on all sides and devouring them with Your blazing mouths. O Vishnu, You are scorching the entire universe with the fierce, all-pervading rays of Your effulgence.
Tokens: ['▁With', '▁Your', '▁f

In [7]:
import pandas as pd
import os
import numpy as np
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)
import sentencepiece as spm
from datasets import Dataset, DatasetDict

# Paths & constants
SP_MODEL = '/content/sentencepiece/sentencepiece_model.model'
TRAIN_CSV = '/content/output/dataset_train.csv'
DEV_CSV = '/content/output/dataset_dev.csv'
TEST_CSV = '/content/output/dataset_test.csv'
MODEL_NAME_OR_PATH = 'google/mt5-base'
OUTPUT_DIR = '/content/tarun_model'

# Check if files exist
print("Checking files...")
for filepath in [SP_MODEL, TRAIN_CSV, DEV_CSV, TEST_CSV]:
    if not os.path.exists(filepath):
        raise FileNotFoundError(f"Required file not found: {filepath}")

# Create output directory
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Load tokenizer from mT5
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME_OR_PATH, use_fast=False)

# Load datasets
print("Loading datasets...")
def load_split(path):
    df = pd.read_csv(path)
    # Ensure columns exist
    if 'verse_iast' not in df.columns or 'translation_in_english' not in df.columns:
        raise ValueError(f"Required columns not found in {path}")

    # Select and rename columns
    df = df[['verse_iast', 'translation_in_english']].rename(
        columns={'verse_iast': 'src', 'translation_in_english': 'tgt'}
    )

    # Ensure all values are strings
    df['src'] = df['src'].astype(str)
    df['tgt'] = df['tgt'].astype(str)

    return Dataset.from_pandas(df)

datasets = DatasetDict({
    'train': load_split(TRAIN_CSV),
    'validation': load_split(DEV_CSV),
    'test': load_split(TEST_CSV),
})

print(f"Datasets loaded: train={len(datasets['train'])}, validation={len(datasets['validation'])}, test={len(datasets['test'])}")

# Define preprocessing function
max_input_length = 128
max_target_length = 128

def preprocess_function(examples):
    inputs = examples["src"]
    targets = examples["tgt"]

    # Tokenize inputs
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Tokenize targets
    labels = tokenizer(targets, max_length=max_target_length, truncation=True)

    # Replace tokenizer pad token id by -100 for loss calculation
    model_inputs["labels"] = [
        [(l if l != tokenizer.pad_token_id else -100) for l in label]
        for label in labels["input_ids"]
    ]

    return model_inputs

# Apply preprocessing
print("Tokenizing datasets...")
tokenized_datasets = datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=["src", "tgt"],
)

print(f"Tokenized datasets: {list(tokenized_datasets['train'].features.keys())}")

# Load model
print("Loading model...")
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME_OR_PATH)

# Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Training arguments (basic version)
training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=3e-5,
    weight_decay=0.01,
    num_train_epochs=3,  # Reduced for testing
    save_total_limit=2,
    logging_steps=100,
)

# Initialize Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Start training
print("Starting training...")
trainer.train()

# Save model
print("Saving model...")
trainer.save_model(f"{OUTPUT_DIR}/final")
tokenizer.save_pretrained(f"{OUTPUT_DIR}/final")
print("Training complete!")

# Test translations
print("\nTesting model with sample translations:")
for i, example in enumerate(datasets["test"].select(range(min(3, len(datasets["test"]))))):
    src_text = example["src"]
    reference = example["tgt"]

    # Generate translation
    inputs = tokenizer(src_text, return_tensors="pt", padding=True, truncation=True)
    outputs = model.generate(**inputs, max_length=128)
    translation = tokenizer.decode(outputs[0], skip_special_tokens=True)

    print(f"\nExample {i+1}:")
    print(f"Source: {src_text}")
    print(f"Translation: {translation}")
    print(f"Reference: {reference}")

Checking files...
Loading tokenizer...


tokenizer_config.json:   0%|          | 0.00/376 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/702 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Loading datasets...
Datasets loaded: train=512, validation=64, test=64
Tokenizing datasets...


Map:   0%|          | 0/512 [00:00<?, ? examples/s]

Map:   0%|          | 0/64 [00:00<?, ? examples/s]

Map:   0%|          | 0/64 [00:00<?, ? examples/s]

Tokenized datasets: ['input_ids', 'attention_mask', 'labels']
Loading model...


pytorch_model.bin:   0%|          | 0.00/2.33G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

  trainer = Seq2SeqTrainer(


Starting training...


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mtarunkumaruttam-kiitecell[0m ([33mtarunkumaruttam-kiitecell-kiit-deemed-to-be-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss


Saving model...


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Training complete!

Testing model with sample translations:


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)

In [12]:
import pandas as pd
import os
import numpy as np
import torch
import gc
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)
import sentencepiece as spm
from datasets import Dataset, DatasetDict

# Paths & constants
SP_MODEL = '/content/sentencepiece/sentencepiece_model.model'
TRAIN_CSV = '/content/output/dataset_train.csv'
DEV_CSV = '/content/output/dataset_dev.csv'
TEST_CSV = '/content/output/dataset_test.csv'
MODEL_NAME_OR_PATH = 'google/mt5-base'
OUTPUT_DIR = '/content/tarun_model'

# Check if CUDA is available and set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Check if files exist
print("Checking files...")
for filepath in [SP_MODEL, TRAIN_CSV, DEV_CSV, TEST_CSV]:
    if not os.path.exists(filepath):
        raise FileNotFoundError(f"Required file not found: {filepath}")

# Create output directory
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Load tokenizer from mT5
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME_OR_PATH, use_fast=False)

# Load datasets
print("Loading datasets...")
def load_split(path):
    df = pd.read_csv(path)
    # Ensure columns exist
    if 'verse_iast' not in df.columns or 'translation_in_english' not in df.columns:
        raise ValueError(f"Required columns not found in {path}")

    # Select and rename columns
    df = df[['verse_iast', 'translation_in_english']].rename(
        columns={'verse_iast': 'src', 'translation_in_english': 'tgt'}
    )

    # Ensure all values are strings
    df['src'] = df['src'].astype(str)
    df['tgt'] = df['tgt'].astype(str)

    return Dataset.from_pandas(df)

datasets = DatasetDict({
    'train': load_split(TRAIN_CSV),
    'validation': load_split(DEV_CSV),
    'test': load_split(TEST_CSV),
})

print(f"Datasets loaded: train={len(datasets['train'])}, validation={len(datasets['validation'])}, test={len(datasets['test'])}")

# Define preprocessing function
max_input_length = 128
max_target_length = 128

def preprocess_function(examples):
    inputs = examples["src"]
    targets = examples["tgt"]

    # Tokenize inputs
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Tokenize targets
    labels = tokenizer(targets, max_length=max_target_length, truncation=True)

    # Replace tokenizer pad token id by -100 for loss calculation
    model_inputs["labels"] = [
        [(l if l != tokenizer.pad_token_id else -100) for l in label]
        for label in labels["input_ids"]
    ]

    return model_inputs

# Apply preprocessing
print("Tokenizing datasets...")
tokenized_datasets = datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=["src", "tgt"],
)

print(f"Tokenized datasets: {list(tokenized_datasets['train'].features.keys())}")

# Clear cache before loading the model
print("Clearing GPU cache...")
torch.cuda.empty_cache()
gc.collect()

# Load model with memory optimization and move it to the correct device
print("Loading model...")
model = AutoModelForSeq2SeqLM.from_pretrained(
    MODEL_NAME_OR_PATH,
    low_cpu_mem_usage=True
    # Removed torch_dtype=torch.float16 to avoid FP16 issues
)
model = model.to(device)

# Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Training arguments with memory optimization but without FP16
training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=1,  # Reduced batch size to save memory
    per_device_eval_batch_size=1,   # Reduced batch size to save memory
    gradient_accumulation_steps=16, # Increased to compensate for smaller batch size
    learning_rate=3e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    save_total_limit=1,             # Save fewer checkpoints
    logging_steps=100,
    # Memory optimizations
    gradient_checkpointing=True,    # Use gradient checkpointing to save memory
    # Device settings
    no_cuda=(device.type == 'cpu'),
)

# Initialize Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Start training
print("Starting training...")
trainer.train()

# Save model
print("Saving model...")
trainer.save_model(f"{OUTPUT_DIR}/final")
tokenizer.save_pretrained(f"{OUTPUT_DIR}/final")
print("Training complete!")

# Function to clear GPU memory between operations
def clear_gpu_memory():
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        gc.collect()

# Test translations with memory management
print("\nTesting model with sample translations:")
for i, example in enumerate(datasets["test"].select(range(min(3, len(datasets["test"]))))):
    # Clear memory before processing each example
    clear_gpu_memory()

    src_text = example["src"]
    reference = example["tgt"]

    # Generate translation with proper device handling and memory constraints
    inputs = tokenizer(src_text, return_tensors="pt", padding=True, truncation=True, max_length=96)
    # Move inputs to the same device as the model
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Use more memory-efficient generation settings
    with torch.no_grad():  # Disable gradient calculation during inference
        outputs = model.generate(
            **inputs,
            max_length=96,           # Reduced from 128
            num_beams=2,             # Use fewer beams
            length_penalty=0.6,
            early_stopping=True
        )
    translation = tokenizer.decode(outputs[0], skip_special_tokens=True)

    print(f"\nExample {i+1}:")
    print(f"Source: {src_text}")
    print(f"Translation: {translation}")
    print(f"Reference: {reference}")

    # Free memory
    del inputs, outputs
    clear_gpu_memory()

Using device: cuda
Checking files...
Loading tokenizer...
Loading datasets...
Datasets loaded: train=512, validation=64, test=64
Tokenizing datasets...


Map:   0%|          | 0/512 [00:00<?, ? examples/s]

Map:   0%|          | 0/64 [00:00<?, ? examples/s]

Map:   0%|          | 0/64 [00:00<?, ? examples/s]

Tokenized datasets: ['input_ids', 'attention_mask', 'labels']
Clearing GPU cache...
Loading model...


Error during conversion: ChunkedEncodingError(ProtocolError('Response ended prematurely'))
  trainer = Seq2SeqTrainer(


Starting training...


Step,Training Loss


Saving model...
Training complete!

Testing model with sample translations:





Example 1:
Source: mayi sarvāṇi karmāṇi saṃnyasyādhyātmacetasā |nirāśīrnirmamo bhūtvā yudhyasva vigatajvara: || 30||
Translation: <extra_id_0> <extra_id_50> <extra_id_24>
Reference: Performing all works as an offering unto Me, constantly meditate on Me as the Supreme. Become free from desire and selfishness, and with your mental grief departed, fight!

Example 2:
Source: tasmātpraṇamya praṇidhāya kāyaṃprasādaye tvāmahamīśamīḍyam |piteva putrasya sakheva sakhyu:priya: priyāyārhasi deva soḍhum || 44||
Translation: <extra_id_0>b <extra_id_55>
Reference: Therefore, O adorable Lord, bowing deeply and prostrating before You, I implore You for Your grace. As a father tolerates his son, a friend forgives his friend, and a lover pardons the beloved, please forgive me for my offences.

Example 3:
Source: sañjaya uvāca |ityarjunaṃ vāsudevastathoktvāsvakaṃ rūpaṃ darśayāmāsa bhūya: |āśvāsayāmāsa ca bhītamenaṃbhūtvā puna: saumyavapurmahātmā || 50||
Translation: <extra_id_0>... <extra_id_40>
Referen

In [13]:
import shutil
import os

# Set the folder path to zip (current directory in this case)
folder_to_zip = '/content'

# Name of the output zip file
output_zip = '/content/my_colab_archive.zip'

# Create a ZIP archive
shutil.make_archive(base_name=output_zip.replace('.zip', ''),
                    format='zip',
                    root_dir=folder_to_zip)

print(f"Zip created at: {output_zip}")


Zip created at: /content/my_colab_archive.zip


In [None]:
import torch, gc
gc.collect()
torch.cuda.empty_cache()
