<a href="https://colab.research.google.com/github/tarakantaacharya/TTS_ODIA/blob/main/Preprocessing_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install datasets
! pip install librosa

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading xx

In [None]:
import json
import numpy as np
import torch
import librosa
from datasets import load_dataset
from transformers import AutoTokenizer, AutoProcessor, AutoModel

# Load DAC processor and model
processor = AutoProcessor.from_pretrained("ylacombe/dac_44khz")
dac_model = AutoModel.from_pretrained("ylacombe/dac_44khz")

# Load text tokenizer
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/IndicBART")

def ensure_mono(audio_waveform):
    """Convert stereo audio to mono."""
    if len(audio_waveform.shape) > 1:  # Check if stereo
        audio_waveform = np.mean(audio_waveform, axis=0)  # Convert to mono
    return audio_waveform

def preprocess_audio(audio_data, sampling_rate, target_sr=44100):
    """Resample and normalize audio."""
    # Ensure mono audio
    audio_data = ensure_mono(audio_data)

    if sampling_rate != target_sr:
        audio_data = librosa.resample(audio_data, orig_sr=sampling_rate, target_sr=target_sr)

    return audio_data, target_sr

def encode_audio_to_dac(audio_waveform, sampling_rate):
    """Convert raw audio waveform into DAC tokens."""
    # Ensure audio is mono
    audio_waveform = ensure_mono(audio_waveform)

    # Resample to 44100 Hz if needed
    if sampling_rate != 44100:
        audio_waveform = librosa.resample(audio_waveform, orig_sr=sampling_rate, target_sr=44100)

    # Convert to PyTorch tensor
    audio_tensor = torch.tensor(audio_waveform, dtype=torch.float32)

    # Process with DAC using the processor, passing correct sampling rate
    with torch.no_grad():
        inputs = processor(audio_tensor, sampling_rate=44100, return_tensors="pt")  # Fixed!
        dac_tokens = dac_model(**inputs).audio_codes  # Extract DAC tokens

    return dac_tokens.squeeze(0).tolist()

def preprocess_text(text, tokenizer):
    """Tokenize text."""
    tokens = tokenizer(text, return_tensors="pt")
    return tokens['input_ids'].squeeze(0).tolist()

def preprocess_data_from_dataset(example, tokenizer):
    """Preprocess a single example of audio & text."""
    audio_data = example["audio"]["array"]
    sampling_rate = example["audio"]["sampling_rate"]

    # Filter only audios of duration 3-5 seconds
    duration_sec = len(audio_data) / sampling_rate
    if not (3.0 <= duration_sec <= 5.0):
        return None  # Skip samples outside this range

    # Preprocess audio & convert to DAC tokens
    norm_audio_data, processed_sr = preprocess_audio(audio_data, sampling_rate)
    dac_audio = encode_audio_to_dac(norm_audio_data, processed_sr)

    # Tokenize text
    text = example["text"]
    tokenized_text = preprocess_text(text, tokenizer)

    return {"audio": dac_audio, "text": tokenized_text}

# Load dataset & filter only 3-5 sec audios
dataset = load_dataset("SPRINGLab/IndicTTS_Odia", split="train")
filtered_dataset = dataset.filter(lambda x: 3.0 <= len(x["audio"]["array"]) / x["audio"]["sampling_rate"] <= 5.0)

processed_data = []

# Process dataset in batches
batch_size = 50
for i in range(0, len(filtered_dataset)//42, batch_size):
    batch = filtered_dataset.select(range(i, min(i + batch_size, len(filtered_dataset))))

    for example in batch:
        processed_item = preprocess_data_from_dataset(example, tokenizer)
        if processed_item:
            processed_data.append(processed_item)

    print(f"Processed {min(i + batch_size, len(filtered_dataset))} rows out of {len(filtered_dataset)}")

# Save processed data as JSON
with open("processed_data.json", "w") as json_file:
    json.dump(processed_data, json_file)

print("✅ Data has been saved to processed_data.json.")

Processed 50 rows out of 4213
Processed 100 rows out of 4213
✅ Data has been saved to processed_data.json.


In [2]:
import pandas as pd
df = pd.read_json('processed_data.json')
len(df)

100

In [4]:
import numpy as np

def pad_audio(audio, target_length):
    """Pad or truncate audio to the target shape."""
    if audio.shape[1] < target_length:
        pad_width = target_length - audio.shape[1]
        padding = np.zeros((audio.shape[0], pad_width))
        return np.concatenate([audio, padding], axis=1)  # Pad
    return audio[:, :target_length]  # Truncate if needed

# Find the max width across all samples
max_length = max(df['audio'].apply(lambda x: np.array(x).shape[1]))  # Extract width

# Apply padding function
df['padded_audio'] = df['audio'].apply(lambda x: pad_audio(np.array(x), max_length))

# Check the count of different padded shapes
df['padded_audio_shape'] = df['padded_audio'].apply(lambda x: x.shape)
print(df['padded_audio_shape'].value_counts())

padded_audio_shape
(9, 431)    100
Name: count, dtype: int64


In [5]:
print(type(df['audio']))

<class 'pandas.core.series.Series'>


In [6]:
print(type(df['text']))

<class 'pandas.core.series.Series'>


In [7]:
import numpy as np

# Function to pad audio to the maximum length
def pad_text(text, target_length):
    # If the audio is shorter than the target length, pad it with zeros
    if len(text) < target_length:
        padding = np.zeros(target_length - len(text))
        return np.concatenate([text, padding])
    else:
        # If the audio is longer, truncate it
        return text[:target_length]

# Find the maximum audio length
max_length = max([len(audio) for audio in df['text']])

# Pad the audio to the maximum length
df['padded_text'] = df['text'].apply(lambda x: pad_text(x, max_length))

# Check the shape of padded audio
print(df['padded_text'].apply(lambda x: len(x)).value_counts())

padded_text
48    100
Name: count, dtype: int64


In [8]:
import numpy as np
import pandas as pd
from collections import Counter

# Function to get the shape of each audio file
def get_audio_shape(audio):
    return str(np.array(audio).shape)  # Convert shape to string for counting

# Apply the function to get shapes
df['audio_shape'] = df['padded_audio'].apply(get_audio_shape)

# Count occurrences of each shape
shape_counts = df['audio_shape'].value_counts()

# Display the results
print("Audio shape counts:")
print(shape_counts)

Audio shape counts:
audio_shape
(9, 431)    100
Name: count, dtype: int64


In [9]:
# Function to check the shape of audio arrays
def check_text_shape(df):
    # Create a dictionary to store the counts of each shape
    shape_counts = {}

    for text in df['padded_text']:
        # Get the shape of the audio (assuming each audio is a numpy array or list)
        shape = len(text)

        # Update the count for the shape
        if shape in shape_counts:
            shape_counts[shape] += 1
        else:
            shape_counts[shape] = 1

    return shape_counts

# Get the shape counts
shape_counts = check_text_shape(df)

# Print the result
for shape, count in shape_counts.items():
    print(f"Text with shape {shape}: {count} files")

Text with shape 48: 100 files


In [10]:
df.columns

Index(['audio', 'text', 'padded_audio', 'padded_audio_shape', 'padded_text',
       'audio_shape'],
      dtype='object')

In [11]:
print(df.head())

                                               audio  \
0  [[568, 568, 568, 568, 568, 568, 568, 568, 568,...   
1  [[568, 568, 568, 568, 568, 568, 568, 568, 568,...   
2  [[568, 568, 568, 568, 568, 568, 568, 568, 568,...   
3  [[568, 568, 568, 568, 568, 568, 568, 568, 568,...   
4  [[568, 568, 568, 568, 568, 568, 568, 568, 568,...   

                                                text  \
0  [2, 41, 1, 62670, 1, 62670, 41, 1, 62827, 6200...   
1  [2, 41, 1, 62670, 62006, 62007, 1, 41, 1, 6200...   
2  [2, 41, 1, 41, 61937, 1, 62008, 62688, 62006, ...   
3  [2, 41, 1, 41, 1, 62007, 41, 1, 62670, 62006, ...   
4  [2, 41, 1, 62007, 61937, 41, 1, 62670, 1, 41, ...   

                                        padded_audio padded_audio_shape  \
0  [[568.0, 568.0, 568.0, 568.0, 568.0, 568.0, 56...           (9, 431)   
1  [[568.0, 568.0, 568.0, 568.0, 568.0, 568.0, 56...           (9, 431)   
2  [[568.0, 568.0, 568.0, 568.0, 568.0, 568.0, 56...           (9, 431)   
3  [[568.0, 568.0, 568.0, 

In [12]:
df = df.drop(columns=['text','audio','audio_shape','padded_audio_shape'])

In [13]:
df.head()

Unnamed: 0,padded_audio,padded_text
0,"[[568.0, 568.0, 568.0, 568.0, 568.0, 568.0, 56...","[2.0, 41.0, 1.0, 62670.0, 1.0, 62670.0, 41.0, ..."
1,"[[568.0, 568.0, 568.0, 568.0, 568.0, 568.0, 56...","[2.0, 41.0, 1.0, 62670.0, 62006.0, 62007.0, 1...."
2,"[[568.0, 568.0, 568.0, 568.0, 568.0, 568.0, 56...","[2.0, 41.0, 1.0, 41.0, 61937.0, 1.0, 62008.0, ..."
3,"[[568.0, 568.0, 568.0, 568.0, 568.0, 568.0, 56...","[2.0, 41.0, 1.0, 41.0, 1.0, 62007.0, 41.0, 1.0..."
4,"[[568.0, 568.0, 568.0, 568.0, 568.0, 568.0, 56...","[2.0, 41.0, 1.0, 62007.0, 61937.0, 41.0, 1.0, ..."


In [14]:
print(df.iloc[0])

padded_audio    [[568.0, 568.0, 568.0, 568.0, 568.0, 568.0, 56...
padded_text     [2.0, 41.0, 1.0, 62670.0, 1.0, 62670.0, 41.0, ...
Name: 0, dtype: object


In [15]:
type(df['padded_audio'].iloc[1])

numpy.ndarray

In [16]:
type(df['padded_text'].iloc[1])

numpy.ndarray

In [17]:
df.to_json('final_dataset.json', orient='records')

---