In [None]:
!pip install pandas librosa tensorflow gtts googletrans==4.0.0-rc1
!apt-get install -y ffmpeg


Collecting gtts
  Downloading gTTS-2.5.1-py3-none-any.whl (29 kB)
Collecting googletrans==4.0.0-rc1
  Downloading googletrans-4.0.0rc1.tar.gz (20 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting httpx==0.13.3 (from googletrans==4.0.0-rc1)
  Downloading httpx-0.13.3-py3-none-any.whl (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.1/55.1 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Collecting hstspreload (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading hstspreload-2024.5.1-py3-none-any.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
Collecting chardet==3.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading chardet-3.0.4-py2.py3-none-any.whl (133 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.4/133.4 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting idna==2.* (from httpx==0.13.3->googletrans==4.0.0-rc1)


In [2]:
import os
import pandas as pd
import librosa
import numpy as np
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import nltk
from nltk.corpus import stopwords
from torch.utils.data import DataLoader, Dataset

# Ensure necessary NLTK data is downloaded
nltk.download('punkt')
nltk.download('stopwords')

# Data preparation
file_path = '/content/drive/MyDrive/bruce/dialogue/csv/english_shona.csv'
audio_dir = '/content/drive/MyDrive/bruce/dialogue/'

# Load CSV file
data = pd.read_csv(file_path)

# Extract text and audio file paths
text_data = {
    'eng': data['eng'],
    'shona': data['shona']
}
audio_files = data['audio_files'].tolist()

# Text preprocessing function
def preprocess_text(text):
    tokens = nltk.word_tokenize(text.lower())
    tokens = [token for token in tokens if token.isalpha()]
    tokens = [token for token in tokens if token not in stopwords.words('english')]
    return tokens

# Preprocess the English text data
preprocessed_text = [preprocess_text(text) for text in text_data['eng']]

# Audio preprocessing function
def load_audio(file_path):
    audio, sr = librosa.load(file_path, sr=16000)
    mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13).T
    return mfcc

# Load and preprocess audio files
audio_files = [os.path.join(audio_dir, str(file_name)) for file_name in audio_files if isinstance(file_name, (str, bytes))]
mfcc_data = [load_audio(file_path) for file_path in audio_files]
max_mfcc_length = max(len(mfcc) for mfcc in mfcc_data)
padded_mfcc_data = [np.pad(mfcc, ((0, max_mfcc_length - len(mfcc)), (0, 0)), mode='constant') for mfcc in mfcc_data]

# Initialize model and tokenizer
model_name = 't5-base'
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Prepare input data
input_text = [' '.join(text) for text in preprocessed_text]
encoded_inputs = tokenizer(input_text, return_tensors='pt', padding=True, truncation=True)
input_ids = encoded_inputs['input_ids']
decoder_input_ids = torch.ones_like(input_ids) * tokenizer.eos_token_id

# Ensure all lists have the same length
min_length = min(len(input_ids), len(padded_mfcc_data), len(decoder_input_ids))
input_ids = input_ids[:min_length]
padded_mfcc_data = padded_mfcc_data[:min_length]
decoder_input_ids = decoder_input_ids[:min_length]

# Create a custom dataset
class TextAudioDataset(Dataset):
    def __init__(self, input_ids, audio_data, decoder_input_ids):
        self.input_ids = input_ids
        self.audio_data = audio_data
        self.decoder_input_ids = decoder_input_ids

    def __len__(self):
        return len(self.input_ids)
    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'audio_data': torch.tensor(self.audio_data[idx], dtype=torch.float32),
            'decoder_input_ids': self.decoder_input_ids[idx]
        }

# Create dataloader
dataset = TextAudioDataset(input_ids, padded_mfcc_data, decoder_input_ids)
batch_size = 32
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Training loop
model.train()
num_epochs = 10
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

for epoch in range(num_epochs):
    for batch in dataloader:
        batch_input_ids = batch['input_ids']
        batch_input_audio = batch['audio_data']
        batch_decoder_input_ids = batch['decoder_input_ids']

        # Ensure labels are passed to compute the loss
        outputs = model(input_ids=batch_input_ids, attention_mask=batch_input_ids.ne(tokenizer.pad_token_id), labels=batch_decoder_input_ids)
        loss = outputs.loss

        if loss is not None:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    print(f"Epoch {epoch + 1}/{num_epochs} completed with loss: {loss.item()}")

print("Training completed.")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]



spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Epoch 1/10 completed with loss: 0.13209183514118195
Epoch 2/10 completed with loss: 0.0022484639193862677
Epoch 3/10 completed with loss: 0.02071440778672695
Epoch 4/10 completed with loss: 0.00012819981202483177
Epoch 5/10 completed with loss: 0.00027314943145029247
Epoch 6/10 completed with loss: 0.002102519851177931
Epoch 7/10 completed with loss: 0.000677475705742836
Epoch 8/10 completed with loss: 0.0005559088895097375
Epoch 9/10 completed with loss: 0.00043942881166003644
Epoch 10/10 completed with loss: 3.075513450312428e-05
Training completed.


In [3]:
import torch

# After training the model
model_save_path = '/content/drive/MyDrive/bruce/dialogue/t5_shona_english_model.pth'
tokenizer_save_path = '/content/drive/MyDrive/bruce/dialogue/t5_tokenizer/'

# Save the model
torch.save(model.state_dict(), model_save_path)

# Save the tokenizer
tokenizer.save_pretrained(tokenizer_save_path)

print("Model and tokenizer saved.")


Model and tokenizer saved.
