In [None]:
from datasets import load_dataset
from tqdm import tqdm

TARGET_SAMPLES = 1500

# Open the `bn_in` configuration of google/fleurs in streaming mode so samples
# are pulled one at a time while iterating instead of materialising the split.
# NOTE(review): trust_remote_code=True allows the dataset's loading script to
# run locally — confirm it is still needed for the installed `datasets` version.
streaming_dataset = load_dataset(
    "google/fleurs",
    "bn_in",
    split="train",
    streaming=True,
    trust_remote_code=True
)

dataset = []
skipped = 0
first_error = None  # first failure seen, kept so skipped samples are not silent

# The context manager closes the progress bar even if the loop raises or the
# cell is interrupted, so no stale bar is left behind in the notebook.
with tqdm(total=TARGET_SAMPLES, desc="Collecting Bengali samples") as pbar:
    for sample in streaming_dataset:
        if len(dataset) >= TARGET_SAMPLES:
            break

        try:
            # Touch the waveform so a sample whose audio cannot be read is
            # rejected here rather than later in the pipeline.
            _ = sample["audio"]["array"]
        except Exception as exc:
            # Best-effort collection: count the failure and keep going, but
            # remember the first error so it can be reported below.
            skipped += 1
            if first_error is None:
                first_error = exc
            continue

        dataset.append(sample)
        pbar.update(1)

print(f"Collected: {len(dataset)} samples")
print(f"Skipped (failed downloads): {skipped}")
if first_error is not None:
    print(f"First failure: {first_error!r}")
if len(dataset) < TARGET_SAMPLES:
    # The stream ran out (or too many samples failed) before the target was met.
    print(f"Warning: only {len(dataset)} of {TARGET_SAMPLES} requested samples were collected")









Collecting Bengali samples: 100%|██████████| 1500/1500 [07:09<00:00,  3.49it/s]

Collected: 1500 samples
Skipped (failed downloads): 0





In [4]:
# Show which fields the first collected sample carries.
dataset[0].keys()


dict_keys(['id', 'num_samples', 'path', 'audio', 'transcription', 'raw_transcription', 'gender', 'lang_id', 'language', 'lang_group_id'])

In [5]:
# Sampling rate recorded alongside the first sample's audio.
dataset[0]["audio"]["sampling_rate"]


16000

In [6]:
import IPython.display as ipd

# Render an inline audio player for the first collected sample, using the
# waveform and sampling rate stored in its "audio" entry.
_first_clip = dataset[0]["audio"]
ipd.Audio(_first_clip["array"], rate=_first_clip["sampling_rate"])


In [7]:
import sys
from pathlib import Path

# Make this cell work on a fresh kernel: look for the directory that contains
# src/preprocess.py, starting at the working directory and walking upwards,
# and put it on sys.path before importing. If no such directory is found the
# import below behaves exactly as it would have without this lookup.
_cwd = Path.cwd()
_project_root = next(
    (
        candidate
        for candidate in (_cwd, *_cwd.parents)
        if (candidate / "src" / "preprocess.py").is_file()
    ),
    None,
)
if _project_root is not None and str(_project_root) not in sys.path:
    sys.path.insert(0, str(_project_root))

import src.preprocess

# Print where the module was loaded from, to confirm the right copy is in use.
print(src.preprocess.__file__)


c:\Users\swastik dasgupta\Desktop\bengali_asr\src\preprocess.py


In [1]:
import sys
import os

# Step 1: get current working directory
# (the project-root derivation below assumes the kernel was started from the
# notebooks/ folder, i.e. one level below the project root)
NOTEBOOK_DIR = os.getcwd()

# Step 2: get parent directory (project root)
PROJECT_ROOT = os.path.dirname(NOTEBOOK_DIR)

print("Notebook working directory:", NOTEBOOK_DIR)
print("Project root:", PROJECT_ROOT)

# Step 3: add project root to Python path so the `src` package is importable
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

# Step 4: now import
from src.preprocess import preprocess

# Step 5: Check if dataset exists, if not, load it
if 'dataset' not in globals() or dataset is None or len(dataset) == 0:
    print("⚠️  Dataset not found. Loading dataset...")
    from datasets import load_dataset
    from tqdm import tqdm

    TARGET_SAMPLES = 1500

    # NOTE(review): trust_remote_code=True allows the dataset's loading script
    # to run locally — confirm it is still needed for the installed version.
    streaming_dataset = load_dataset(
        "google/fleurs",
        "bn_in",
        split="train",
        streaming=True,
        trust_remote_code=True
    )

    dataset = []
    skipped = 0
    first_error = None  # first failure seen, kept so skipped samples are not silent

    # The context manager closes the progress bar even if the loop raises or
    # the cell is interrupted.
    with tqdm(total=TARGET_SAMPLES, desc="Collecting Bengali samples") as pbar:
        for sample in streaming_dataset:
            if len(dataset) >= TARGET_SAMPLES:
                break

            try:
                # Touch the waveform so a sample whose audio cannot be read is
                # rejected here rather than inside preprocess().
                _ = sample["audio"]["array"]
            except Exception as exc:
                # Best-effort collection: count the failure and keep going,
                # but remember the first error so it can be reported below.
                skipped += 1
                if first_error is None:
                    first_error = exc
                continue

            dataset.append(sample)
            pbar.update(1)

    print(f"Collected: {len(dataset)} samples")
    print(f"Skipped (failed downloads): {skipped}")
    if first_error is not None:
        print(f"First failure: {first_error!r}")
    if len(dataset) < TARGET_SAMPLES:
        # The stream ran out (or too many samples failed) before the target was met.
        print(f"Warning: only {len(dataset)} of {TARGET_SAMPLES} requested samples were collected")
else:
    print(f"✅ Using existing dataset with {len(dataset)} samples")

# Step 6: run preprocessing
preprocess(dataset)



Notebook working directory: c:\Users\swastik dasgupta\Desktop\bengali_asr\notebooks
Project root: c:\Users\swastik dasgupta\Desktop\bengali_asr
📁 PROJECT ROOT: c:\Users\swastik dasgupta\Desktop\bengali_asr
⚠️  Dataset not found. Loading dataset...


Collecting Bengali samples: 100%|██████████| 1500/1500 [08:28<00:00,  2.95it/s]


Collected: 1500 samples
Skipped (failed downloads): 0
🔹 Starting preprocessing
Total samples received: 1500
📂 Writing audio to: c:\Users\swastik dasgupta\Desktop\bengali_asr\data\processed\audio
🧾 Writing CSV to: c:\Users\swastik dasgupta\Desktop\bengali_asr\data\processed\train.csv


Saving audio + text: 100%|██████████| 1500/1500 [00:48<00:00, 30.73it/s]


✅ Preprocessing complete
Saved 1500 samples


In [None]:
from pathlib import Path

import pandas as pd

# Build the CSV location from the project root instead of a user-specific
# absolute path, so the notebook also runs on other machines. PROJECT_ROOT is
# reused when an earlier cell has defined it; otherwise the parent of the
# working directory is used (assumes the kernel runs from notebooks/).
_project_root = Path(globals().get("PROJECT_ROOT", Path.cwd().parent))
TRAIN_CSV = _project_root / "data" / "processed" / "train.csv"

df = pd.read_csv(TRAIN_CSV)
len(df)


100

In [None]:
# Preview the first few rows of df.
df.head()

Unnamed: 0,audio_path,text
0,data/processed/audio\sample_0.wav,এক মাইলের কম থেকে শুরু করে বেশি দূরত্ব যা কিনা...
1,data/processed/audio\sample_1.wav,ব্রাজিলের জাতীয় কংগ্রেস সামাজিক বিবাহকে বছরের...
2,data/processed/audio\sample_2.wav,স্থলরেখা থেকে অনেক দূরে থাকায় মার্কিন যুক্তরা...
3,data/processed/audio\sample_3.wav,বর্তমানে অনেক সামি আধুনিক ব্যবসায় কাজ করেসামি...
4,data/processed/audio\sample_4.wav,অশ্বরোহীর পাদান তার পায়ের সাপোর্টের জন্য যা অ...
