Checking whether the Hugging Face token is valid

In [None]:
# Verify that the HF_TOKEN Colab secret can authenticate against Hugging Face
from huggingface_hub import login
from google.colab import userdata

try:
    hf_token = userdata.get('HF_TOKEN')
    if not hf_token:
        # The secret could be read but holds no value
        print("Token exists but is empty!")
    else:
        login(hf_token)
        print("Successfully logged in to Hugging Face!")
except Exception as e:
    # Covers both a failed secret lookup and a rejected login
    print(f"Error logging in: {e}")

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful
Successfully logged in to Hugging Face!


Setting up all the project directories in Google Drive

In [None]:
# Mount Google Drive so the project folders below are reachable
import os

from google.colab import drive
drive.mount('/content/drive')

# Define paths (everything lives under one project root on Drive)
BASE_PATH = '/content/drive/My Drive/Sandalwood_ASR_Project'
AUDIO_PATH = f'{BASE_PATH}/raw_data/audio_files'
TRANSCRIPTION_PATH = f'{BASE_PATH}/raw_data/transcriptions'
PROCESSED_PATH = f'{BASE_PATH}/processed_data'
MODEL_PATH = f'{BASE_PATH}/models'

# Create directories if they don't exist.
# os.makedirs is used instead of `!mkdir -p` so the cell is plain Python
# (no IPython shell escape) and the paths never pass through a shell.
for _project_dir in (AUDIO_PATH, TRANSCRIPTION_PATH, PROCESSED_PATH, MODEL_PATH):
    os.makedirs(_project_dir, exist_ok=True)

# Function to check dataset structure
def verify_dataset_setup():
    """Print a report on the project folders and the data files inside them.

    For each project directory, prints whether it exists. Then, for the
    audio and transcription folders (when present), prints how many files
    match the expected extensions together with up to three sample names.
    Nothing is returned; the report is written to stdout only.
    """
    import os

    audio_suffixes = ('.wav', '.mp3', '.m4a')
    transcript_suffixes = ('.txt', '.csv', '.tsv')

    print("Checking dataset structure...")

    # Presence check for every expected directory, in display order
    expected_dirs = [
        ('Project Root', BASE_PATH),
        ('Audio Files', AUDIO_PATH),
        ('Transcriptions', TRANSCRIPTION_PATH),
        ('Processed Data', PROCESSED_PATH),
        ('Models', MODEL_PATH),
    ]
    for label, location in expected_dirs:
        if not os.path.exists(location):
            print(f"❌ {label} directory not found!")
            continue
        print(f"✅ {label} directory found at: {location}")

    # Audio inventory (extension match is case-sensitive)
    if os.path.exists(AUDIO_PATH):
        found_audio = [
            entry for entry in os.listdir(AUDIO_PATH)
            if entry.endswith(audio_suffixes)
        ]
        print(f"\nFound {len(found_audio)} audio files")
        if found_audio:
            print("Sample files:", found_audio[:3])

    # Transcription inventory (extension match is case-sensitive)
    if os.path.exists(TRANSCRIPTION_PATH):
        found_transcripts = [
            entry for entry in os.listdir(TRANSCRIPTION_PATH)
            if entry.endswith(transcript_suffixes)
        ]
        print(f"\nFound {len(found_transcripts)} transcription files")
        if found_transcripts:
            print("Sample files:", found_transcripts[:3])

# Run the verification report for the directories defined above
verify_dataset_setup()

# Print the paths in copy-paste form for use in later cells
print("\nTo use this dataset in your ASR model:")
print(f"AUDIO_DIR = '{AUDIO_PATH}'")
print(f"TRANSCRIPTION_DIR = '{TRANSCRIPTION_PATH}'")
print(f"MODEL_SAVE_DIR = '{MODEL_PATH}'")

Mounted at /content/drive
Checking dataset structure...
✅ Project Root directory found at: /content/drive/My Drive/Sandalwood_ASR_Project
✅ Audio Files directory found at: /content/drive/My Drive/Sandalwood_ASR_Project/raw_data/audio_files
✅ Transcriptions directory found at: /content/drive/My Drive/Sandalwood_ASR_Project/raw_data/transcriptions
✅ Processed Data directory found at: /content/drive/My Drive/Sandalwood_ASR_Project/processed_data
✅ Models directory found at: /content/drive/My Drive/Sandalwood_ASR_Project/models

Found 141 audio files
Sample files: ['SandalWoodNewsStories_200 (1).mp3', 'SandalWoodNewsStories_230 (1).mp3', 'SandalWoodNewsStories_295 (1).mp3']

Found 1 transcription files
Sample files: ['transcriptions_with_translations.csv']

To use this dataset in your ASR model:
AUDIO_DIR = '/content/drive/My Drive/Sandalwood_ASR_Project/raw_data/audio_files'
TRANSCRIPTION_DIR = '/content/drive/My Drive/Sandalwood_ASR_Project/raw_data/transcriptions'
MODEL_SAVE_DIR = '/con

Checking whether the Hugging Face token value is correct

In [None]:
from huggingface_hub import login
from google.colab import userdata

# SECURITY: never paste an access token into the notebook source. Anything
# committed or shared with the notebook exposes it. A token that has appeared
# in source must be revoked in the Hugging Face account settings and replaced.
# The token is read from the Colab secret store instead.
login(token=userdata.get('HF_TOKEN'))

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


For Denoising

In [None]:
import os
import numpy as np
import librosa
import soundfile as sf
import noisereduce as nr
from tqdm import tqdm
import multiprocessing as mp
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
import logging
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

class AudioDenoiser:
    """Batch noise reduction for every audio file in a directory.

    Each file is loaded at ``sample_rate``, cut into fixed-length chunks,
    denoised chunk by chunk with ``noisereduce`` and written to
    ``output_dir`` as ``denoised_<original name>``.
    """

    def __init__(self, input_dir, output_dir, sample_rate=16000, chunk_duration=30):
        """
        Initialize the Audio Denoiser

        Args:
            input_dir (str): Directory containing input audio files
            output_dir (str): Directory for denoised audio files
            sample_rate (int): Target sample rate for processing
            chunk_duration (int): Duration of each chunk in seconds for processing
        """
        self.input_dir = input_dir
        self.output_dir = output_dir
        self.sample_rate = sample_rate
        self.chunk_duration = chunk_duration
        # Chunk length expressed in samples
        self.chunk_size = self.sample_rate * chunk_duration

        # Create output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)

        # Setup logging
        self.setup_logging()

    def setup_logging(self):
        """Send log records to a timestamped file in ``output_dir`` and to the console.

        NOTE: ``logging.basicConfig`` does nothing when the root logger
        already has handlers, so in a session where logging was configured
        earlier the log file may not be created.
        """
        log_file = os.path.join(self.output_dir, f'denoising_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log')
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler(log_file),
                logging.StreamHandler()
            ]
        )

    def process_chunk(self, chunk, noise_clip_duration=1.0):
        """
        Process a single chunk of audio

        Args:
            chunk (np.array): Audio chunk to process
            noise_clip_duration (float): Kept for backward compatibility;
                currently unused. No explicit noise clip is passed to
                ``reduce_noise``.

        Returns:
            np.array: Denoised audio chunk, or the unmodified input chunk
            when noise reduction fails (the failure is logged).
        """
        try:
            # `verbose` is intentionally not passed: the installed
            # noisereduce rejects it ("unexpected keyword argument
            # 'verbose'"), which made every chunk fall back to the raw audio.
            denoised_chunk = nr.reduce_noise(
                y=chunk,
                sr=self.sample_rate,
                prop_decrease=0.95,
                n_fft=1024,
                win_length=1024,
                hop_length=256,
                stationary=True
            )

            return denoised_chunk

        except Exception as e:
            # Best effort: keep the original audio so the file stays complete
            logging.error(f"Error processing chunk: {str(e)}")
            return chunk

    def process_audio_file(self, file_path):
        """
        Process a single audio file

        Files whose output already exists are skipped, so an interrupted
        run can be resumed. Any error is logged and swallowed so that one
        bad file does not stop a batch.

        Args:
            file_path (str): Path to the input audio file
        """
        # Bound before the try block so the error handler can always name
        # the file, even when the path itself is what fails.
        filename = str(file_path)
        try:
            # Generate output file path
            filename = os.path.basename(file_path)
            # NOTE(review): the output keeps the input's extension, so the
            # writer must support that format (e.g. MP3) — confirm.
            output_path = os.path.join(self.output_dir, f"denoised_{filename}")

            if os.path.exists(output_path):
                logging.info(f"Skipping {filename} - already processed")
                return

            # Load audio file
            logging.info(f"Processing: {filename}")
            audio, sr = librosa.load(file_path, sr=self.sample_rate)

            if len(audio) == 0:
                # Nothing to denoise; avoids concatenating an empty chunk list
                logging.warning(f"Skipping {filename} - no audio samples")
                return

            # Process audio in chunks
            chunks = [audio[i:i + self.chunk_size]
                      for i in range(0, len(audio), self.chunk_size)]

            # Process chunks concurrently; executor.map preserves chunk order
            with ThreadPoolExecutor() as executor:
                denoised_chunks = list(tqdm(
                    executor.map(self.process_chunk, chunks),
                    total=len(chunks),
                    desc=f"Processing chunks for {filename}",
                    leave=False
                ))

            # Concatenate chunks
            denoised_audio = np.concatenate(denoised_chunks)

            # Save denoised audio
            sf.write(output_path, denoised_audio, self.sample_rate)
            logging.info(f"Successfully processed {filename}")

        except Exception as e:
            logging.error(f"Error processing {filename}: {str(e)}")

    def process_all_files(self, num_processes=None):
        """
        Process all audio files in the input directory

        Files are handled one after another in sorted order; concurrency
        happens per chunk inside ``process_audio_file``.

        Args:
            num_processes (int): Accepted for backward compatibility.
                Files are currently processed sequentially, so this value
                does not change behaviour.
        """
        if num_processes is None:
            # Never drop to zero workers on a single-core machine
            num_processes = max(1, mp.cpu_count() - 1)

        # Get list of audio files (case-insensitive extension match,
        # sorted so runs are reproducible)
        audio_files = sorted(
            f for f in os.listdir(self.input_dir)
            if f.lower().endswith(('.wav', '.mp3', '.m4a', '.flac'))
        )

        if not audio_files:
            logging.warning("No audio files found in input directory")
            return

        logging.info(f"Found {len(audio_files)} audio files to process")

        # Process files sequentially
        for audio_file in audio_files:
            self.process_audio_file(os.path.join(self.input_dir, audio_file))

        logging.info("Completed processing all files")

# Example usage: denoise every raw audio file of the project
if __name__ == "__main__":
    # Source and destination folders on Google Drive
    INPUT_DIR = "/content/drive/MyDrive/Sandalwood_ASR_Project/raw_data/audio_files"
    OUTPUT_DIR = "/content/drive/MyDrive/Sandalwood_ASR_Project/raw_data/denoised_audio_files"

    # 16 kHz audio, handled in 1-minute chunks
    denoiser = AudioDenoiser(INPUT_DIR, OUTPUT_DIR, sample_rate=16000, chunk_duration=60)

    # Run the whole batch
    denoiser.process_all_files(4)

ERROR:root:Error processing chunk: reduce_noise() got an unexpected keyword argument 'verbose'
ERROR:root:Error processing chunk: reduce_noise() got an unexpected keyword argument 'verbose'
Processing chunks for SandalWoodNewsStories_168 (1).mp3:   0%|          | 0/3 [00:00<?, ?it/s]ERROR:root:Error processing chunk: reduce_noise() got an unexpected keyword argument 'verbose'
ERROR:root:Error processing chunk: reduce_noise() got an unexpected keyword argument 'verbose'
Processing chunks for SandalWoodNewsStories_223 (1).mp3:   0%|          | 0/3 [00:00<?, ?it/s]ERROR:root:Error processing chunk: reduce_noise() got an unexpected keyword argument 'verbose'
ERROR:root:Error processing chunk: reduce_noise() got an unexpected keyword argument 'verbose'
ERROR:root:Error processing chunk: reduce_noise() got an unexpected keyword argument 'verbose'
ERROR:root:Error processing chunk: reduce_noise() got an unexpected keyword argument 'verbose'
ERROR:root:Error processing chunk: reduce_noise() go

KeyboardInterrupt: 

Splitting the audio into small pieces

In [None]:
# Install required packages
!pip install pydub
!pip install speechrecognition
!apt-get install -y ffmpeg

import os
from google.colab import drive
import speech_recognition as sr
from pydub import AudioSegment
from pydub.silence import detect_nonsilent
import math

def mount_drive():
    """Mount Google Drive at /content/drive in the Colab runtime."""
    drive.mount('/content/drive')

def convert_to_wav(input_path, output_path):
    """Load an audio file, write a WAV copy of it, and return the loaded audio.

    Args:
        input_path: Path of the audio file to read.
        output_path: Path the WAV copy is written to.

    Returns:
        The ``AudioSegment`` loaded from ``input_path``.
    """
    loaded_segment = AudioSegment.from_file(input_path)
    loaded_segment.export(output_path, format="wav")
    return loaded_segment

def detect_sentence_boundaries(audio_segment, min_silence_len=500, silence_thresh=-40):
    """Return candidate cut points located where a pause follows speech.

    Args:
        audio_segment: Audio to analyse with ``detect_nonsilent``.
        min_silence_len: Minimum pause length for a boundary; also
            forwarded to ``detect_nonsilent``.
        silence_thresh: Silence threshold forwarded to ``detect_nonsilent``.

    Returns:
        list: The end position of every non-silent range that is followed
        by a gap of at least ``min_silence_len`` before the next range.
    """
    speech_spans = detect_nonsilent(
        audio_segment,
        min_silence_len=min_silence_len,
        silence_thresh=silence_thresh,
    )
    # Walk consecutive (current, following) spans; a boundary sits at the
    # end of `current` when the pause before `following` is long enough.
    return [
        current[1]
        for current, following in zip(speech_spans, speech_spans[1:])
        if following[0] - current[1] >= min_silence_len
    ]

def split_audio(input_file, output_dir, target_length=35000):  # 35 seconds in milliseconds
    """Split audio file into segments at sentence boundaries

    Boundaries come from ``detect_sentence_boundaries``. A cut is made at
    the first boundary that lies at least ``target_length`` after the
    previous cut, so ``target_length`` is a *minimum* segment length.
    Whatever audio remains after the last cut becomes the final segment.
    Segments are written as ``<base name>_segment_NNN.wav``.

    Args:
        input_file (str): Path of the audio file to split.
        output_dir (str): Directory receiving the segment files (created
            if missing).
        target_length (int): Minimum segment length, in the same units as
            the boundary positions (milliseconds, per the default above).

    Returns:
        int: Number of segment files actually written.
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    filename = os.path.basename(input_file)
    base_name = os.path.splitext(filename)[0]
    wav_path = os.path.join(output_dir, f"{base_name}_temp.wav")

    segment_count = 0
    try:
        # Convert to WAV if needed
        audio = convert_to_wav(input_file, wav_path)

        # Detect sentence boundaries
        boundaries = detect_sentence_boundaries(audio)

        # Split audio into segments
        start_time = 0
        for current_boundary in boundaries:
            if current_boundary - start_time >= target_length:
                # Export segment
                segment = audio[start_time:current_boundary]
                output_path = os.path.join(output_dir, f"{base_name}_segment_{segment_count:03d}.wav")
                segment.export(output_path, format="wav")
                segment_count += 1
                start_time = current_boundary

        # Handle the last segment; count it only when it is really written
        if len(audio) - start_time > 0:
            segment = audio[start_time:]
            output_path = os.path.join(output_dir, f"{base_name}_segment_{segment_count:03d}.wav")
            segment.export(output_path, format="wav")
            segment_count += 1
    finally:
        # Clean up the temporary WAV even when something above failed, so
        # it is never left in the output folder and mistaken for a segment.
        if os.path.exists(wav_path):
            os.remove(wav_path)

    return segment_count

def process_all_files(input_dir, output_dir):
    """Split every audio file found directly inside ``input_dir``.

    Mounts Google Drive, makes sure ``output_dir`` exists, runs
    ``split_audio`` on each regular file with a known audio extension
    (case-insensitive) and prints per-file and overall segment counts.
    """
    # Drive has to be mounted before the Drive paths can be listed
    mount_drive()

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    audio_extensions = ('.mp3', '.wav', '.m4a', '.aac', '.wma', '.ogg')

    # Keep regular files only, then filter on the extension
    candidates = []
    for entry in os.listdir(input_dir):
        if not os.path.isfile(os.path.join(input_dir, entry)):
            continue
        if entry.lower().endswith(audio_extensions):
            candidates.append(entry)

    total_segments = 0
    for audio_file in candidates:
        print(f"Processing {audio_file}...")
        segments = split_audio(os.path.join(input_dir, audio_file), output_dir)
        total_segments = total_segments + segments
        print(f"Created {segments} segments for {audio_file}")

    print(f"\nProcessing complete! Total segments created: {total_segments}")

# Example usage: split the raw project audio into segment files on Drive
input_directory = '/content/drive/MyDrive/Sandalwood_ASR_Project/raw_data/audio_files'  # Folder holding the source audio; change to your input folder path
output_directory = '/content/drive/MyDrive/Sandalwood_ASR_Project/raw_data/splitted_audio_files'  # Folder receiving the segments; change to your output folder path

# Run the processing (mounts Drive, then splits every audio file found)
process_all_files(input_directory, output_directory)

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1
Collecting speechrecognition
  Downloading SpeechRecognition-3.11.0-py2.py3-none-any.whl.metadata (28 kB)
Downloading SpeechRecognition-3.11.0-py2.py3-none-any.whl (32.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.8/32.8 MB[0m [31m45.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: speechrecognition
Successfully installed speechrecognition-3.11.0
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 0 not upgraded.
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Processing SandalWoodNewsStories_200 (1)

KeyboardInterrupt: 

Converting the split audio into Kannada text, and then the Kannada text into English text

In [None]:
import torch
import torchaudio
import librosa
import os
import numpy as np
from transformers import (
    WhisperProcessor,
    WhisperForConditionalGeneration,
    MarianMTModel,
    MarianTokenizer
)
import pandas as pd
from tqdm.notebook import tqdm
from huggingface_hub import login
from google.colab import userdata

class KannadaSpeechToEnglishPipeline:
    """Two-stage pipeline: Kannada speech -> Kannada text -> English text.

    Stage one transcribes audio with a Whisper checkpoint, stage two
    translates the transcript with a Marian multilingual-to-English model.
    Each stage method returns ``None`` on failure after printing the error.
    """

    def __init__(self, token=None):
        """Load both models and move them to the GPU when one is available.

        Args:
            token: Hugging Face access token forwarded to every
                ``from_pretrained`` call (may be ``None``).
        """
        # Load ASR model (Whisper for multilingual speech recognition)
        print("Loading ASR model...")
        self.asr_model_name = "openai/whisper-medium"
        self.asr_processor = WhisperProcessor.from_pretrained(
            self.asr_model_name,
            token=token
        )
        self.asr_model = WhisperForConditionalGeneration.from_pretrained(
            self.asr_model_name,
            token=token
        )

        # Load translation model (Kannada text to English)
        print("Loading translation model...")
        self.trans_model_name = "Helsinki-NLP/opus-mt-mul-en"
        self.trans_tokenizer = MarianTokenizer.from_pretrained(
            self.trans_model_name,
            token=token
        )
        self.trans_model = MarianMTModel.from_pretrained(
            self.trans_model_name,
            token=token
        )

        # Move models to GPU if available
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.asr_model = self.asr_model.to(self.device)
        self.trans_model = self.trans_model.to(self.device)

        print(f"Models loaded and running on {self.device}")

    @staticmethod
    def _normalize_peak(speech_array):
        """Scale a waveform so its largest absolute sample is 1.0.

        A silent (all-zero) or empty waveform is returned unchanged instead
        of being divided by zero, which would fill it with NaN.
        """
        if speech_array.size == 0:
            return speech_array
        peak = np.max(np.abs(speech_array))
        if peak > 0:
            return speech_array / peak
        return speech_array

    def process_audio(self, audio_path):
        """Process audio file for ASR

        Loads the file at 16 kHz, peak-normalizes it and runs the Whisper
        processor on it.

        Args:
            audio_path (str): Path of the audio file.

        Returns:
            The processor output moved to ``self.device``, or ``None`` when
            the file cannot be loaded or contains no samples.
        """
        try:
            # Load and resample audio
            speech_array, sampling_rate = librosa.load(audio_path, sr=16000)

            if speech_array.size == 0:
                raise ValueError("audio file contains no samples")

            # Normalize (safe for silent clips)
            speech_array = self._normalize_peak(speech_array)

            # Prepare for model. `padding` is left at the feature
            # extractor's default so short clips are padded to the fixed
            # window Whisper expects; `padding=True` only pads to the
            # longest item and made clips shorter than that window fail.
            # NOTE(review): by default the extractor also truncates to that
            # window, so longer segments are only partially transcribed —
            # confirm the splitter's segment length suits this.
            inputs = self.asr_processor(
                speech_array,
                sampling_rate=16000,
                return_tensors="pt"
            ).to(self.device)

            return inputs

        except Exception as e:
            print(f"Error processing audio file {audio_path}: {str(e)}")
            return None

    def speech_to_kannada(self, audio_path):
        """Convert Kannada speech to Kannada text

        Args:
            audio_path (str): Path of the audio file to transcribe.

        Returns:
            str | None: The decoded transcript, or ``None`` on any failure.
        """
        try:
            inputs = self.process_audio(audio_path)
            if inputs is None:
                return None

            # Get ASR predictions
            with torch.no_grad():
                # Use forced decoder ids for Kannada
                forced_decoder_ids = self.asr_processor.get_decoder_prompt_ids(language="kannada", task="transcribe")
                generated_ids = self.asr_model.generate(
                    inputs.input_features,
                    forced_decoder_ids=forced_decoder_ids,
                    max_length=448,
                )

            kannada_text = self.asr_processor.batch_decode(
                generated_ids,
                skip_special_tokens=True
            )[0]

            return kannada_text

        except Exception as e:
            print(f"Error in speech to Kannada conversion: {str(e)}")
            return None

    def kannada_to_english(self, kannada_text):
        """Translate Kannada text to English

        Args:
            kannada_text (str): Text to translate.

        Returns:
            str | None: The English translation, or ``None`` on any failure.
        """
        try:
            # Prepare for translation. Truncate overly long transcripts so
            # the encoder never receives more tokens than it can handle.
            inputs = self.trans_tokenizer(
                kannada_text,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512
            ).to(self.device)

            # Generate translation
            with torch.no_grad():
                translated_ids = self.trans_model.generate(
                    **inputs,
                    max_length=512,
                    num_beams=5,
                    length_penalty=1.0,
                    early_stopping=True
                )

            english_text = self.trans_tokenizer.batch_decode(
                translated_ids,
                skip_special_tokens=True
            )[0]

            return english_text

        except Exception as e:
            print(f"Error in Kannada to English translation: {str(e)}")
            return None

    def process_directory(self, input_dir, output_file):
        """Process all audio files in a directory

        Files are handled in sorted name order. Files whose transcription
        fails or comes back empty are left out of the results.

        Args:
            input_dir (str): Directory containing the audio files.
            output_file (str): Path of the CSV file to write.

        Returns:
            pandas.DataFrame: One row per transcribed file with the columns
            ``file_name``, ``kannada_text`` and ``english_text``.
        """
        results = []

        # Get all audio files (case-insensitive extension match, sorted so
        # the CSV row order is reproducible)
        audio_files = sorted(
            f for f in os.listdir(input_dir)
            if f.lower().endswith(('.wav', '.mp3', '.m4a'))
        )

        print(f"Found {len(audio_files)} audio files to process")

        for audio_file in tqdm(audio_files, desc="Processing audio files"):
            audio_path = os.path.join(input_dir, audio_file)

            # Process the audio file
            kannada_text = self.speech_to_kannada(audio_path)
            if kannada_text:
                english_text = self.kannada_to_english(kannada_text)

                results.append({
                    'file_name': audio_file,
                    'kannada_text': kannada_text,
                    'english_text': english_text
                })

        # Save results; explicit columns keep the CSV header even when no
        # file could be transcribed
        df = pd.DataFrame(results, columns=['file_name', 'kannada_text', 'english_text'])
        df.to_csv(output_file, index=False, encoding='utf-8')
        print(f"\nResults saved to {output_file}")

        return df

# Mount Google Drive so the /content/drive paths used in main() are reachable
from google.colab import drive
drive.mount('/content/drive')

# Example usage
def main():
    """Transcribe and translate every split audio segment of the project.

    Builds the pipeline with the HF token from Colab secrets, prints a
    sample result for the first entry of the input folder, then processes
    the whole folder into a CSV file and prints the first rows.
    """
    # Drive locations - modify these according to your Drive structure
    INPUT_DIR = '/content/drive/MyDrive/Sandalwood_ASR_Project/raw_data/splitted_audio_files'
    OUTPUT_FILE = '/content/drive/MyDrive/Sandalwood_ASR_Project/raw_data/transcriptions/transcriptions_with_translations.csv'

    # HF token comes from Colab secrets
    token = userdata.get('HF_TOKEN')

    # The CSV's parent folder must exist before results are saved
    output_folder = os.path.dirname(OUTPUT_FILE)
    os.makedirs(output_folder, exist_ok=True)

    pipeline = KannadaSpeechToEnglishPipeline(token=token)

    # Smoke test on the first directory entry
    print("\nTesting with single file...")
    test_files = os.listdir(INPUT_DIR)
    if not test_files:
        print("No audio files found in input directory!")
    else:
        test_file = os.path.join(INPUT_DIR, test_files[0])
        kannada_text = pipeline.speech_to_kannada(test_file)
        if kannada_text:
            english_text = pipeline.kannada_to_english(kannada_text)
            print(f"\nSample results for {os.path.basename(test_file)}:")
            print(f"Kannada: {kannada_text}")
            print(f"English: {english_text}")

    # Full run over the directory
    print("\nProcessing all files...")
    results_df = pipeline.process_directory(INPUT_DIR, OUTPUT_FILE)

    print("\nSample of processed data:")
    print(results_df.head())


if __name__ == "__main__":
    main()


Mounted at /content/drive
Loading ASR model...


preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.99k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.06G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.75k [00:00<?, ?B/s]

Loading translation model...


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/707k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/791k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.42M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/310M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Models loaded and running on cuda

Testing with single file...


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



Sample results for SandalWoodNewsStories_200 (1)_segment_000.wav:
Kannada:  ಮಾಮಾಸ್ಕರಾ ನಾಮಾಸ್ಕರಾ ನಾನನನನ ವಿದು ವಿದು ಮಾಡಿದು ಅದ್ರಿಯಾದ್ಯಾದು ಮಾಡಿದು ಮಾಡಿದು ಮಾಡಿದು ಮಾಡಿದು ಮಾಡಿದು ಮಾಡಿದು ಮಾಡಿದು ಮಾಡಿದು ಮಾಡಿದು ಮಾಡಿದು ಮಾಡಿದು ಮಾಡಿದು ಮಾಡಿದು ಮಾಡಿದು ಮಾಡಿದು ಮಾಡಿದು ಮಾಡಿದು ಮಾಡಿದು ಮಾಡಿದು ಮಾಡಿದು ಮಾಡಿದು ಮಾಡಿದು ಮಾಡಿದು ಮಾಡಿದು ಮಾಡಿದು ಮಾಡಿದು ಮಾಡಿದು ಮಾಡಿದು ಮಾಡಿದ
English: "Mascara, Namascara, I have made my bed, and made myself naked, and made myself naked, and made myself naked, and made myself naked, and made myself naked, and made myself naked, and made myself naked, and made myself naked, and made myself naked.

Processing all files...
Found 286 audio files to process


Processing audio files:   0%|          | 0/286 [00:00<?, ?it/s]


Results saved to /content/drive/MyDrive/Sandalwood_ASR_Project/raw_data/transcriptions/transcriptions_with_translations.csv

Sample of processed data:
                                       file_name  \
0  SandalWoodNewsStories_200 (1)_segment_000.wav   
1  SandalWoodNewsStories_200 (1)_segment_001.wav   
2  SandalWoodNewsStories_200 (1)_segment_002.wav   
3  SandalWoodNewsStories_200 (1)_segment_003.wav   
4  SandalWoodNewsStories_200 (1)_segment_004.wav   

                                        kannada_text  \
0   ಮಾಮಾಸ್ಕರಾ ನಾಮಾಸ್ಕರಾ ನಾನನನನ ವಿದು ವಿದು ಮಾಡಿದು ಅ...   
1   ಇಲ್ಲಿ ನೂಡಿದ್ರೆ ನಾವರಿದಿ ಚಪಾರಿಲಿಲಿಲಿಲಿಲಿಲಿಲಿಲಿಲ...   
2   ಇಂದಿನಾಯಾಗಿದಿಲಾಂತಾ ಇದನಾಗಿದಿರಾದಿ ಅಮಿಲಿ ಇಸ್ಟಾಯಕಿ...   
3   ಇಲ್ಲಿ ಇಲ್ಲಿ ಸಿಪಿದೇಲಾಂದಿದೇಲಿ ನಾನಿನಿನಿನಿನಿನಿನಿನ...   
4   ಇಲ್ಲಿ ಪ್ರತಾಯಾಪಲ್ಲಿ ನಾಡಿದಿವಿನಿನಿನಿನಿನಿನಿನಿನಿನಿ...   

                                        english_text  
0  "Mascara, Namascara, I have made my bed, and m...  
1  For there are hundreds of thousands of people ...  
2  But if it isn't so today, then