<a href="https://colab.research.google.com/github/tarakantaacharya/Case_study-code/blob/main/odiatts.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
!pip install datasets



In [5]:
from datasets import load_dataset

# Example of loading a TTS dataset
dataset = load_dataset("SPRINGLab/IndicTTS_Odia")

In [6]:
dataset

DatasetDict({
    train: Dataset({
        features: ['audio', 'text', 'gender'],
        num_rows: 11564
    })
})

In [7]:
from datasets import load_dataset
import torch

# Define a function to process each example
def process_example(example):
    audio_array = example['audio']['array']

    audio_tensor = torch.tensor(audio_array)

    return {
        'text': example['text'],
        'audio': audio_tensor,
    }

# Load and process only the first 100 rows
processed_data = [process_example(dataset['train'][i]) for i in range(5)]

In [8]:
from datasets import Dataset, DatasetDict
import torch

data = processed_data

dataset = Dataset.from_dict({
    'text': [d['text'] for d in data],
    'audio': [d['audio'] for d in data]
})

dataset_dict = DatasetDict({
    'train': dataset
})

In [9]:
from transformers import VitsModel, AutoTokenizer
from torch.utils.data import DataLoader
import torch
from datasets import Dataset
from torch.nn.utils.rnn import pad_sequence

# Load the pre-trained model and tokenizer
model = VitsModel.from_pretrained("facebook/mms-tts-ory")
tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-ory")


# Define collate_fn to handle padding for both text and audio
def collate_fn(batch):
    # Tokenize the text within the collate_fn
    inputs = tokenizer([item['text'] for item in batch], return_tensors="pt", padding=True)

    # Now access input_ids and attention_mask from the tokenizer output
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']

    # Convert audio to tensors before padding if it is not already a tensor
    audio = [torch.tensor(item['audio']) if not isinstance(item['audio'], torch.Tensor) else item['audio'] for item in batch]

    max_audio_len = max([len(a) for a in audio])  # Get the maximum audio length
    audio_padded = pad_sequence([torch.cat([a, torch.zeros(max_audio_len - len(a))]) if len(a) < max_audio_len else a for a in audio], batch_first=True)

    return {"input_ids": input_ids, "attention_mask": attention_mask, "audio": audio_padded}

# Define DataLoader
dataloader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)

# Define optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-4)
criterion = torch.nn.L1Loss()

# Training loop
epochs = 2  # Define number of epochs

# Assuming you have a device (GPU or CPU) defined like this:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device) # Move the model to the device


for epoch in range(epochs):
    model.train()
    epoch_loss = 0
    for batch in dataloader:
        optimizer.zero_grad()

        # Move batch data to the device (GPU or CPU)
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        audio = batch["audio"].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        predicted_waveform = outputs.waveform

        # Ensure predicted and target waveforms are of same length
        min_len = min(predicted_waveform.shape[1], audio.shape[1])
        predicted_waveform = predicted_waveform[:, :min_len]
        audio = audio[:, :min_len]

        # Compute loss
        loss = criterion(predicted_waveform, audio)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    avg_loss = epoch_loss / len(dataloader)
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {avg_loss:.4f}")

Epoch 1/2, Loss: 0.0819
Epoch 2/2, Loss: 0.0795


In [9]:
!pip install --upgrade peft



In [10]:
from transformers import VitsModel, AutoTokenizer
import torch
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from torch import nn
from peft import get_peft_model, LoraConfig
from torch.cuda.amp import autocast, GradScaler

# Load the pre-trained model and tokenizer
model = VitsModel.from_pretrained("facebook/mms-tts-ory")
lora_model = model
tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-ory")

lora_config = LoraConfig(
    r=8,  # Rank of LoRA
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules=["k_proj", "q_proj", "v_proj", "out_proj"]  # Apply LoRA to attention layers
)

model = get_peft_model(lora_model, lora_config)

# Define a custom collate function to pad sequences
def collate_fn(batch):
    inputs = tokenizer([item['text'] for item in batch], return_tensors="pt", padding=True)
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']

    # Convert audio to tensors before padding if necessary
    # and ensure requires_grad is True
    audio = [torch.tensor(item['audio'], requires_grad=True) if not isinstance(item['audio'], torch.Tensor) else item['audio'].requires_grad_(True) for item in batch]

    max_audio_len = max([len(a) for a in audio])  # Get the maximum audio length
    audio_padded = pad_sequence([torch.cat([a, torch.zeros(max_audio_len - len(a), requires_grad=True)]) if len(a) < max_audio_len else a for a in audio], batch_first=True)

    return {"input_ids": input_ids, "attention_mask": attention_mask, "audio": audio_padded}

# Define DataLoader
dataloader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)

# Optimizer
optimizer = torch.optim.AdamW(lora_model.parameters(), lr=5e-4)
criterion = torch.nn.L1Loss()

# Mixed Precision setup
scaler = GradScaler()

# Training loop with mixed precision and LoRA
epochs = 2
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(epochs):
    model.train()
    epoch_loss = 0
    for batch in dataloader:
        optimizer.zero_grad()

        # Move batch data to device (GPU or CPU)
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        audio = batch["audio"].to(device)

        # Forward pass
        outputs = lora_model(input_ids=input_ids, attention_mask=attention_mask)
        predicted_waveform = outputs.waveform

        # Ensure predicted and target waveforms are of same length
        min_len = min(predicted_waveform.shape[1], audio.shape[1])
        predicted_waveform = predicted_waveform[:, :min_len]
        audio = audio[:, :min_len]

        # Compute loss
        loss = criterion(predicted_waveform, audio)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    avg_loss = epoch_loss / len(dataloader)
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {avg_loss:.4f}")

  scaler = GradScaler()


Epoch 1/2, Loss: 0.1028
Epoch 2/2, Loss: 0.1074


In [11]:
import torch
from transformers import AutoModelForCausalLM
from peft import PeftModel

# Function to calculate model size
def get_model_size(model):
    param_size = sum(p.numel() * p.element_size() for p in model.parameters())
    return param_size / (1024**2)  # Convert to MB

# Print sizes
print(f"Base Model Size: {get_model_size(model):.2f} MB")
print(f"LoRA Model Size: {get_model_size(lora_model):.2f} MB")

Base Model Size: 138.72 MB
LoRA Model Size: 138.72 MB
