# Fine Tuning Whisper small on French Language

Model testing

In [None]:
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
import librosa
import soundfile as sf

# Load model and processor
processor = AutoProcessor.from_pretrained("openai/whisper-small")
model = AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-small")

# Load audio file
audio_file = "D:/Personal-project/whisper-french/whisper-small-french/dataset/lesmis-converted/lesmis_0001_16bit.wav"
audio, sample_rate = sf.read(audio_file)

# Resample audio to 16 kHz
target_sample_rate = 16000
audio_resampled = librosa.resample(audio, orig_sr=sample_rate, target_sr=target_sample_rate)

# Perform transcription (use target_sample_rate in the processor)
inputs = processor(audio_resampled, return_tensors="pt")
print("input")
print(type(inputs))
print(inputs)
print()
# Generate the transcription
generated_ids = model.generate(inputs.input_features)
# Decode transcription
transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)
print(transcription)


It is strongly recommended to pass the `sampling_rate` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


input
<class 'transformers.feature_extraction_utils.BatchFeature'>
{'input_features': tensor([[[-0.6714, -0.4491, -0.3204,  ..., -1.0559, -1.0559, -1.0559],
         [-0.7622, -0.5544, -0.5803,  ..., -1.0559, -1.0559, -1.0559],
         [-0.8359, -0.7048, -0.6068,  ..., -1.0559, -1.0559, -1.0559],
         ...,
         [-1.0559, -1.0559, -1.0559,  ..., -1.0559, -1.0559, -1.0559],
         [-1.0559, -1.0559, -1.0559,  ..., -1.0559, -1.0559, -1.0559],
         [-1.0559, -1.0559, -1.0559,  ..., -1.0559, -1.0559, -1.0559]]])}



AttributeError: 'WhisperProcessor' object has no attribute 'encode'

In [None]:
# pip install transformers torch

import torch
from transformers import pipeline

whisper = pipeline("automatic-speech-recognition", "openai/whisper-small", torch_dtype=torch.float16, device="cuda:0")

transcription = whisper("D:/Personal-project/whisper-french/whisper-small-french/dataset/lesmis-converted/lesmis_0001_16bit.wav")

print(transcription["text"])

  from .autonotebook import tqdm as notebook_tqdm
Device set to use cuda:0
Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.


 Chapètre I. La charie du Faubourg Saint-Antoine et la sida du Faubourg du Temple


Data Preparation

In [None]:
import soundfile as sf

filename = "D:/Personal-project/whisper-french/whisper-small-french/dataset/lesmis/lesmis_0001.wav"
data, samplerate = sf.read(filename)

# Now you can work with the data
print(data.shape)  # Shape of the audio data
print(samplerate)  # Sample rate


LibsndfileError: Error opening 'D:/Personal-project/whsiper-french/dataset/lesmis/lesmis_0001.wav': System error.

In [None]:
import ffmpeg
import os

# Directory containing your audio files
input_dir = "D:/Personal-project/whisper-french/whisper-small-french/dataset/lupincontresholme"
output_dir = "D:/Personal-project/whisper-french/whisper-small-french/dataset/lupincontresholme-converted"

# Create output directory if it doesn't exist
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Loop through all .wav files in the input directory
for filename in os.listdir(input_dir):
    if filename.endswith(".wav"):
        input_file = os.path.join(input_dir, filename)
        output_file = os.path.join(output_dir, filename)

        # Add "_16bit" to the output filename to distinguish converted files
        output_file = os.path.splitext(output_file)[0] + "_16bit.wav"

        try:
            # Convert the file using ffmpeg
            ffmpeg.input(input_file).output(output_file, acodec="pcm_s16le").run()
            print(f"Converted: {filename}")
        except Exception as e:
            print(f"Error converting {filename}: {e}")

print("Conversion completed.")


Converted: lupincontresholme_0001.wav
Converted: lupincontresholme_0002.wav
Converted: lupincontresholme_0003.wav
Converted: lupincontresholme_0004.wav
Converted: lupincontresholme_0005.wav
Converted: lupincontresholme_0006.wav
Converted: lupincontresholme_0007.wav
Converted: lupincontresholme_0008.wav
Converted: lupincontresholme_0009.wav
Converted: lupincontresholme_0010.wav
Converted: lupincontresholme_0011.wav
Converted: lupincontresholme_0012.wav
Converted: lupincontresholme_0013.wav
Converted: lupincontresholme_0014.wav
Converted: lupincontresholme_0015.wav
Converted: lupincontresholme_0016.wav
Converted: lupincontresholme_0017.wav
Converted: lupincontresholme_0018.wav
Converted: lupincontresholme_0019.wav
Converted: lupincontresholme_0020.wav
Converted: lupincontresholme_0021.wav
Converted: lupincontresholme_0022.wav
Converted: lupincontresholme_0023.wav
Converted: lupincontresholme_0024.wav
Converted: lupincontresholme_0025.wav
Converted: lupincontresholme_0026.wav
Converted: l

In [None]:
import wave
import pyaudio

# Load wave file
filename = "D:/Personal-project/whisper-french/whisper-small-french/dataset/lesmis-converted/lesmis_0001_16bit.wav"
wf = wave.open(filename, 'rb')

# Initialize PyAudio
p = pyaudio.PyAudio()

# Open stream
stream = p.open(format=p.get_format_from_width(wf.getsampwidth()),
                channels=wf.getnchannels(),
                rate=wf.getframerate(),
                output=True)

# Read and play audio
chunk = 1024
data = wf.readframes(chunk)
while data:
    stream.write(data)
    data = wf.readframes(chunk)

# Cleanup
stream.close()
p.terminate()
wf.close()


Merge data

In [2]:
import shutil
import os
import glob

all_lesmis = glob.glob("D:/Personal-project/whisper-french/whisper-small-french/dataset/lesmis-converted/*.wav")
for one_file_path_lesmis in all_lesmis:
    shutil.copy(one_file_path_lesmis, one_file_path_lesmis.replace("lesmis-converted", "data"))

all_lupincontreholme = glob.glob("D:/Personal-project/whisper-french/whisper-small-french/dataset/lupincontreholme-converted/*.wav")
for one_file_path_lupincontreholme in all_lupincontreholme:
    shutil.copy(one_file_path_lupincontreholme, one_file_path_lupincontreholme.replace("lupincontreholme-converted", "data"))

In [1]:
import csv

label_mapping = {}
with open("D:/Personal-project/whisper-french/whisper-small-french/dataset/transcript.csv", "r") as old_file:
    reader = csv.reader(old_file, delimiter="|")
    next(reader)
    for one_row in reader:
        file_name_full = one_row[0]
        file_name = file_name_full.split("/")[1]
        file_name_no_extension = file_name.split(".")[0]
        label_mapping[file_name_no_extension + "_16bit.wav"] = one_row[2]

label_mapping

{'lesmis_0001_16bit.wav': 'Chapitre I La Charybde du faubourg Saint-Antoine et la Scylla du faubourg du Temple',
 'lesmis_0002_16bit.wav': "Les deux plus mémorables barricades que l'observateur des maladies sociales puisse mentionner n'appartiennent point à la période où est placée l'action de ce livre.",
 'lesmis_0003_16bit.wav': "Ces deux barricades, symboles toutes les deux, sous deux aspects différents, d'une situation redoutable,",
 'lesmis_0004_16bit.wav': "sortirent de terre lors de la fatale insurrection de juin quarante-huit, la plus grande guerre des rues qu'ait vue l'histoire.",
 'lesmis_0005_16bit.wav': "Il arrive quelquefois que, même contre les principes, même contre la liberté, l'égalité et la fraternité,",
 'lesmis_0006_16bit.wav': 'même contre le vote universel, même contre le gouvernement de tous par tous, du fond de ses angoisses, de ses découragements, de ses dénûments, de ses fièvres, de ses détresses, de ses miasmes, de ses ignorances,',
 'lesmis_0007_16bit.wav': 

In [2]:
from transformers import WhisperTokenizer
from transformers import WhisperFeatureExtractor
from transformers import WhisperProcessor

feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small")
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", langauge="french", task="transcribe")
processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="french", task="transcribe")


In [None]:
# # Save
# import librosa
# import soundfile as sf
# import glob
# import os 

# # Define a function to tokenize the labels
# def preprocess_function(batch):
#     batch["labels"] = tokenizer(batch["label"], padding="longest", truncation=True).input_ids
#     return batch

# def load_audio_files_to_dataset(folder_path):
#     all_files = glob.glob(f"{folder_path}/*.wav")
#     data = []
#     for one_file in all_files:
#         audio, sample_rate = sf.read(one_file)
#         target_sample_rate = 16000
#         audio_resampled = librosa.resample(audio, orig_sr=sample_rate, target_sr=target_sample_rate)
#         audio_resampled = processor(audio_resampled, return_tensors="pt")
#         label = label_mapping[os.path.basename(one_file)]
#         data.append({
#             "audio" : audio_resampled,
#             "label": label,
#         })
#     return data

# data = load_audio_files_to_dataset("D:/Personal-project/whisper-french/whisper-small-french/dataset/data")

It is strongly recommended to pass the `sampling_rate` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
It is strongly recommended to pass the `sampling_rate` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
It is strongly recommended to pass the `sampling_rate` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
It is strongly recommended to pass the `sampling_rate` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
It is strongly recommended to pass the `sampling_rate` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
It is strongly recommended to pass the `sampling_rate` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
It is strongly recommended to pass the `sampling_rate` argument 

In [1]:
import csv

label_mapping = {}
with open("D:/Personal-project/whisper-french/whisper-small-french/dataset/transcript.csv", "r") as old_file:
    reader = csv.reader(old_file, delimiter="|")
    next(reader)
    for one_row in reader:
        file_name_full = one_row[0]
        file_name = file_name_full.split("/")[1]
        file_name_no_extension = file_name.split(".")[0]
        label_mapping[file_name_no_extension + "_16bit.wav"] = one_row[2]

label_mapping

{'lesmis_0001_16bit.wav': 'Chapitre I La Charybde du faubourg Saint-Antoine et la Scylla du faubourg du Temple',
 'lesmis_0002_16bit.wav': "Les deux plus mémorables barricades que l'observateur des maladies sociales puisse mentionner n'appartiennent point à la période où est placée l'action de ce livre.",
 'lesmis_0003_16bit.wav': "Ces deux barricades, symboles toutes les deux, sous deux aspects différents, d'une situation redoutable,",
 'lesmis_0004_16bit.wav': "sortirent de terre lors de la fatale insurrection de juin quarante-huit, la plus grande guerre des rues qu'ait vue l'histoire.",
 'lesmis_0005_16bit.wav': "Il arrive quelquefois que, même contre les principes, même contre la liberté, l'égalité et la fraternité,",
 'lesmis_0006_16bit.wav': 'même contre le vote universel, même contre le gouvernement de tous par tous, du fond de ses angoisses, de ses découragements, de ses dénûments, de ses fièvres, de ses détresses, de ses miasmes, de ses ignorances,',
 'lesmis_0007_16bit.wav': 

In [2]:
import librosa
import soundfile as sf
import glob
import os 
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

# Define a function to tokenize the labels
def preprocess_function(batch):
    batch["labels"] = tokenizer(batch["label"], padding="longest", truncation=True).input_ids
    return batch

def load_audio_files_to_dataset(folder_path, test_size=0.2, validation_size=0.1, random_state=42):
    all_files = glob.glob(f"{folder_path}/*.wav")
    data = []
    label_data = []
    audio_data = []

    for one_file in all_files:
        audio, sample_rate = sf.read(one_file)
        target_sample_rate = 16000
        audio_resampled = librosa.resample(audio, orig_sr=sample_rate, target_sr=target_sample_rate)
        # audio_resampled = processor(audio_resampled, return_tensors="pt")
        label = label_mapping[os.path.basename(one_file)]
        label_data.append(label)
        audio_data.append({
            "audio": audio_resampled,
            "sampling_rate": target_sample_rate
        })
    dataset = Dataset.from_dict({
        'audio': audio_data,
        'label': label_data,
    })
            # Split the dataset into train and test sets
    train_data, temp_test_val = train_test_split(dataset, test_size=(test_size + validation_size), random_state=random_state)
    # Split the temp_test_val into test and validation sets
    test_data, validation_data = train_test_split(temp_test_val, test_size=validation_size / (test_size + validation_size), random_state=random_state)
    # Create Dataset objects from the split data
    train_dataset = Dataset.from_dict(train_data)
    test_dataset = Dataset.from_dict(test_data)
    validation_dataset = Dataset.from_dict(validation_data)
    # Create a DatasetDict
    dataset_dict = DatasetDict({
        'train': train_dataset,
        'test': test_dataset,
        'validation': validation_dataset,
    })
    return dataset_dict
 
def load_audio_files_to_dataset(folder_path, test_size=0.2, validation_size=0.1, random_state=42):
    all_files = glob.glob(f"{folder_path}/*.wav")
    audio_data = []
    label_data = []

    for one_file in all_files:
        audio, sample_rate = sf.read(one_file)
        target_sample_rate = 16000
        audio_resampled = librosa.resample(audio, orig_sr=sample_rate, target_sr=target_sample_rate)
        label = label_mapping[os.path.basename(one_file)]
        label_data.append(label)
        audio_data.append({
            "audio": audio_resampled,
            "sampling_rate": target_sample_rate
        })

    dataset = Dataset.from_dict({
        'audio': audio_data,
        'label': label_data,
    })

    # Convert Dataset to a format compatible with train_test_split
    dataset_indices = list(range(len(dataset)))

    train_indices, temp_test_val_indices = train_test_split(dataset_indices, test_size=(test_size + validation_size), random_state=random_state)
    test_indices, validation_indices = train_test_split(temp_test_val_indices, test_size=validation_size / (test_size + validation_size), random_state=random_state)

    train_dataset = dataset.select(train_indices)
    test_dataset = dataset.select(test_indices)
    validation_dataset = dataset.select(validation_indices)

    dataset_dict = DatasetDict({
        'train': train_dataset,
        'test': test_dataset,
        'validation': validation_dataset,
    })

    return dataset_dict


data_dict = load_audio_files_to_dataset("D:/Personal-project/whisper-french/whisper-small-french/dataset/data")

In [None]:
# data_dict.save_to_disk("dataset_saved")

Saving the dataset (0/8 shards):   0%|          | 0/3846 [00:00<?, ? examples/s]

Saving the dataset (0/3 shards):   0%|          | 0/1099 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/550 [00:00<?, ? examples/s]

In [None]:
# from datasets import Dataset

# def create_dataset_with_nested_audio(audio_data, sampling_rates, label_data):
#     """
#     Creates a Hugging Face Dataset with nested audio data.

#     Args:
#         audio_data (list): List of audio arrays.
#         sampling_rates (list): List of sampling rates.
#         label_data (list): List of labels.

#     Returns:
#         Dataset: A Hugging Face Dataset object.
#     """

#     # Ensure all lists have the same length
#     if not (len(audio_data) == len(sampling_rates) == len(label_data)):
#         raise ValueError("All input lists must have the same length.")

#     # Create the nested audio dictionary for each row
#     audio_dicts = [{'audio': audio_data[i], 'sampling_rate': sampling_rates[i]} for i in range(len(audio_data))]

#     dataset = Dataset.from_dict({
#         'audio': audio_dicts,
#         'label': label_data,
#     })

#     return dataset

# # Example Usage (replace with your actual data):
# # dataset = create_dataset_with_nested_audio(audio_data, sampling_rates, label_data)

In [None]:
# from datasets import Dataset, DatasetDict
# from sklearn.model_selection import train_test_split

# # Example usage (assuming your list is called 'your_data_list'):
# # your_data_list = [{'audio': {'audio': array([...]), 'sampling_rate': 16000}, 'label': '...'}, ...]
# # dataset_dict = create_and_split_dataset(your_data_list)
# # print(dataset_dict)

# def create_and_triple_split_dataset(data_list, test_size=0.2, validation_size=0.1, random_state=42):
#     """
#     Converts a list of audio and label dictionaries to a Hugging Face Dataset and splits it into train, test, and validation sets.

#     Args:
#         data_list (list): List of dictionaries, each containing 'audio' and 'label'.
#         test_size (float): Proportion of the dataset to include in the test split.
#         validation_size (float): Proportion of the dataset to include in the validation split.
#         random_state (int): Random seed for reproducibility.

#     Returns:
#         DatasetDict: A dictionary containing 'train', 'test', and 'validation' datasets.
#     """

#     # Extract audio and label data
#     audio_data = [item['audio']['audio'] for item in data_list]
#     sampling_rates = [item['audio']['sampling_rate'] for item in data_list]
#     label_data = [item['label'] for item in data_list]
#     print(sampling_rates)
#     # Create a Hugging Face Dataset
#     # audio_dicts = [{'audio': audio_data[i], 'sampling_rate': sampling_rates[i]} for i in range(len(audio_data))]

#     # dataset = Dataset.from_dict({
#     #     'audio': audio_dicts,
#     #     'label': label_data,
#     # })
#     dataset =  create_dataset_with_nested_audio(audio_data, sampling_rates, label_data)

#     # Split the dataset into train and test sets
#     train_data, temp_test_val = train_test_split(dataset, test_size=(test_size + validation_size), random_state=random_state)

#     # Split the temp_test_val into test and validation sets
#     test_data, validation_data = train_test_split(temp_test_val, test_size=validation_size / (test_size + validation_size), random_state=random_state)

#     # Create Dataset objects from the split data
#     train_dataset = Dataset.from_dict(train_data)
#     test_dataset = Dataset.from_dict(test_data)
#     validation_dataset = Dataset.from_dict(validation_data)

#     # Create a DatasetDict
#     dataset_dict = DatasetDict({
#         'train': train_dataset,
#         'test': test_dataset,
#         'validation': validation_dataset,
#     })

#     return dataset_dict

# dataset = create_and_triple_split_dataset(data)
# # Example Usage to create train, test and validation sets:
# # dataset_dict = create_and_triple_split_dataset(your_data_list)
# # print(dataset_dict)

[16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000

ArrowMemoryError: realloc of size 8934555648 failed

In [None]:
def prepare_dataset(batch):
    audio = batch["audio"]
    batch["input_features"] = feature_extractor(audio["audio"], sampling_rate=audio["sampling_rate"])

    batch["labels"] = tokenizer(batch["label"]).input_ids
    return batch

In [None]:
from datasets import Dataset, DatasetDict
dataset = Dataset.from

In [5]:
data_dict

DatasetDict({
    train: Dataset({
        features: ['audio', 'label'],
        num_rows: 3846
    })
    test: Dataset({
        features: ['audio', 'label'],
        num_rows: 1099
    })
    validation: Dataset({
        features: ['audio', 'label'],
        num_rows: 550
    })
})

================================= below is my work

In [None]:
# def load_audio_files_to_dataset(folder_path):
#     all_files = glob.glob(f"{folder_path}/*.wav")
#     data = []
#     for one_file in all_files:
#         audio, sample_rate = sf.read(one_file)
#         target_sample_rate = 16000
#         audio_resampled = librosa.resample(audio, orig_sr=sample_rate, target_sr=target_sample_rate)
#         input_features = processor(audio_resampled, return_tensors="pt").input_features
#         label = label_mapping[os.path.basename(one_file)]
#         labels = tokenizer(label, padding="longest", truncation=True, return_tensors="pt").input_ids

#         data.append({
#             "input_features": input_features,
#             "labels": labels.squeeze(0),  # Remove the batch dimension from labels
#         })
#     return data

# data = load_audio_files_to_dataset("D:/Personal-project/whisper-french/whisper-small-french/dataset/data")


It is strongly recommended to pass the `sampling_rate` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
It is strongly recommended to pass the `sampling_rate` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
It is strongly recommended to pass the `sampling_rate` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
It is strongly recommended to pass the `sampling_rate` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
It is strongly recommended to pass the `sampling_rate` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
It is strongly recommended to pass the `sampling_rate` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
It is strongly recommended to pass the `sampling_rate` argument 

In [15]:
data[:5]

[{'input_features': tensor([[[-0.6714, -0.4491, -0.3204,  ..., -1.0559, -1.0559, -1.0559],
           [-0.7622, -0.5544, -0.5803,  ..., -1.0559, -1.0559, -1.0559],
           [-0.8359, -0.7048, -0.6068,  ..., -1.0559, -1.0559, -1.0559],
           ...,
           [-1.0559, -1.0559, -1.0559,  ..., -1.0559, -1.0559, -1.0559],
           [-1.0559, -1.0559, -1.0559,  ..., -1.0559, -1.0559, -1.0559],
           [-1.0559, -1.0559, -1.0559,  ..., -1.0559, -1.0559, -1.0559]]]),
  'labels': tensor([50258, 50363,  6546,   569,   270,   265,   286,  2369,   761,   822,
             65,  1479,  1581,  2050,   836,   396,    70, 12902,    12, 35807,
          44454,  1030,   635,   318,  1344,  3505,  1581,  2050,   836,   396,
             70,  1581, 17642, 50257])},
 {'input_features': tensor([[[-0.1762, -0.2094, -0.0068,  ..., -0.7772, -0.7772, -0.7772],
           [-0.4482, -0.4658, -0.2365,  ..., -0.7772, -0.7772, -0.7772],
           [-0.7772, -0.5929, -0.4200,  ..., -0.7772, -0.7772, -0.7772

In [None]:
# data_train

[{'input_features': array([-4.89159429e-05, -6.57524361e-05, -5.67014540e-05, ...,
          1.89506391e-04,  2.24122545e-04,  0.00000000e+00]),
  'labels': tensor([50258, 50363,  6546,   569,   270,   265,   286,  2369,   761,   822,
             65,  1479,  1581,  2050,   836,   396,    70, 12902,    12, 35807,
          44454,  1030,   635,   318,  1344,  3505,  1581,  2050,   836,   396,
             70,  1581, 17642, 50257])},
 {'input_features': array([ 0.00017453,  0.00023177,  0.00026638, ...,  0.00120823,
         -0.00249736, -0.00194827]),
  'labels': tensor([50258, 50363, 43701,  8208,  1804,   275,  4011,   284,  2965,  2159,
           1341,  2977,   631,   287,     6, 16537,  1978, 15540,   730, 39500,
            530, 29623, 42363,  2152,  1193,   297,     6,  1746,   446, 28977,
            935,  1531,   635, 44703,  9068,   871, 20831,  3856,   287,     6,
           2894,   368,  1769, 24735,    13, 50257])},
 {'input_features': array([-0.00057117, -0.00014227, -0.00

In [None]:
# data_train_and_validation = data[0:int(len(data) * 0.8)]
# data_validation = data_train_and_validation[int(len(data_train_and_validation) * 0.8) + 1 : len(data_train_and_validation)]
# data_train = data_train_and_validation[0:int(len(data_train_and_validation) * 0.8)]
# data_test = data[int(len(data) * 0.8)+1: len(data)]

In [16]:
processed_data_dict["train"]

Dataset({
    features: ['input_features', 'labels'],
    num_rows: 3846
})

In [None]:
processed_data_dict.save_to_disk("processed_dataset_save")

Saving the dataset (0/8 shards):   0%|          | 0/3846 [00:00<?, ? examples/s]

Saving the dataset (0/3 shards):   0%|          | 0/1099 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/550 [00:00<?, ? examples/s]

In [3]:
def postprocess_dataset(data_dict, tokenizer):
    """
    Postprocesses a DatasetDict by renaming 'audio' to 'input_features',
    tokenizing 'label' to 'labels', and removing 'label', and recreating the Dataset.

    Args:
        data_dict (dict): The dictionary containing the train, test, and validation Dataset.
        tokenizer (WhisperTokenizer): The tokenizer to use for tokenizing labels.

    Returns:
        dict: The postprocessed dictionary.
    """

    for split in ["train", "test", "validation"]:
        if split in data_dict:
            processed_items = []
            for item in data_dict[split]:
                item["input_features"] = item.pop("audio")
                item["labels"] = torch.tensor(
                    tokenizer(item["label"], padding="longest", truncation=True).input_ids
                ).squeeze(0)
                item.pop("label")
                processed_items.append(item)
            data_dict[split] = Dataset.from_list(processed_items) #recreate dataset.

    return data_dict

processed_data_dict = postprocess_dataset(data_dict, tokenizer)


NameError: name 'tokenizer' is not defined

In [None]:
# for split in ["train", "test", "validation"]:
#         if split in data_dict:  # Check if the split exists
#             for item in data_dict[split]:
#                 item["input_features"] = item.pop("audio")
#                 item["labels"] = torch.tensor(
#                     tokenizer(item["label"], padding="longest", truncation=True).input_ids
#                 ).squeeze(0) #added squeeze to remove batch dimension
#                 item.pop("label")

In [None]:
# import torch

# for item in data_dict["train"]:
#     item["input_features"] = item.pop("audio")
#     item["labels"] = torch.tensor(
#         tokenizer(item["label"], padding="longest", truncation=True).input_ids
#     )
#     item.pop("label")

# for item in data_dict["test"]:
#     item["input_features"] = item.pop("audio")
#     item["labels"] = torch.tensor(
#         tokenizer(item["label"], padding="longest", truncation=True).input_ids
#     )   
#     item.pop("label")

# for item in data_dict["validation"]:
#     item["input_features"] = item.pop("audio")
#     item["labels"] = torch.tensor(
#         tokenizer(item["label"], padding="longest", truncation=True).input_ids
#     )   
#     item.pop("label")

In [None]:
# from torch.utils.data import DataLoader

# def collate_fn(batch):
#     # Pad and stack audio arrays and labels for each batch
#     audios = [torch.tensor(item["audio"]) for item in batch]
#     labels = [item["labels"] for item in batch]
#     return {
#         "input_features": torch.nn.utils.rnn.pad_sequence(audios, batch_first=True),
#         "labels": torch.nn.utils.rnn.pad_sequence(labels, batch_first=True),
#     }

# # Create the DataLoader
# train_loader = DataLoader(data_train, batch_size=8, shuffle=True, collate_fn=collate_fn)


In [None]:
# from datasets import Dataset
# import os
# import torch

# # Function to preprocess and save in chunks
# def process_and_save_in_chunks(data_list, chunk_size, output_path):
#     os.makedirs(output_path, exist_ok=True)
#     for i in range(0, len(data_list), chunk_size):
#         chunk = data_list[i:i + chunk_size]
#         chunk_dataset = Dataset.from_list(chunk)
#         chunk_path = os.path.join(output_path, f"chunk_{i//chunk_size}.arrow") # changed
#         chunk_dataset.save_to_disk(chunk_path)

# # Process and save chunks
# chunk_size = 1000  # Adjust based on memory
# process_and_save_in_chunks(data_train, chunk_size, "processed_data_train")
# process_and_save_in_chunks(data_test, chunk_size, "processed_data_test")
# process_and_save_in_chunks(data_validation, chunk_size, "processed_data_validation")

Saving the dataset (2/2 shards): 100%|██████████| 1000/1000 [00:00<00:00, 1542.38 examples/s]
Saving the dataset (2/2 shards): 100%|██████████| 1000/1000 [00:00<00:00, 1591.67 examples/s]
Saving the dataset (2/2 shards): 100%|██████████| 1000/1000 [00:00<00:00, 1501.89 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 516/516 [00:00<00:00, 1352.48 examples/s]
Saving the dataset (2/2 shards): 100%|██████████| 1000/1000 [00:00<00:00, 1266.13 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 98/98 [00:00<00:00, 1504.99 examples/s]
Saving the dataset (2/2 shards): 100%|██████████| 879/879 [00:00<00:00, 1108.95 examples/s]


In [18]:
data_dict["test"]

Dataset({
    features: ['input_features', 'labels'],
    num_rows: 1099
})

In [None]:
# from datasets import Dataset, concatenate_datasets

# def load_combined_dataset(path):
#     chunk_files = [os.path.join(path, f) for f in os.listdir(path) if f.endswith(".arrow")]
#     datasets = [Dataset.load_from_disk(chunk_file) for chunk_file in chunk_files]
#     return concatenate_datasets(datasets)

# # Load datasets
# data_train = load_combined_dataset("processed_data_train")
# data_test = load_combined_dataset("processed_data_test")
# data_validation = load_combined_dataset("processed_data_validation")


In [None]:
# data_train

Dataset({
    features: ['input_features', 'labels'],
    num_rows: 3516
})

In [None]:
# import evaluate
 
# metric = evaluate.load("wer")

# def compute_metrics(pred):
#     pred_ids = pred.predictions
#     label_ids = pred.label_ids

#     label_ids[label_ids == -100] == tokenizer.pad_token_id
    
#     pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
#     label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

#     wer = 100 * metric.compute(predictions = pred_str, references=label_str)

#     return {"wer": wer}

Using LoRa

In [20]:
import huggingface_hub
huggingface_hub.login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, WhisperProcessor
# import wandb
# import torch
# from dataclasses import dataclass
# from typing import Any, Dict, List, Union

# # Load the processor instead of just the tokenizer
# processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="french", task="transcribe")


# wandb.init(project="whisper_french", name="whisper_french_lesmis")


# # Custom data collator to handle audio features and labels
# @dataclass
# class DataCollatorSpeechSeq2SeqWithPadding:
#     processor: Any

#     def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
#         # Process audio features
#         input_features = [{"input_features": feature["input_features"]} for feature in features]
#         batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

#         # Process labels
#         label_features = [{"input_ids": feature["labels"]} for feature in features]
#         labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")
#         labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

#         # Remove BOS token if present
#         if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
#             labels = labels[:, 1:]
        
#         batch["labels"] = labels
#         return batch

# data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

# model.config.forced_decoder_ids = False
# model.config.supress_tokens = []

# training_args = Seq2SeqTrainingArguments(
#     output_dir="my_finetuned_model",
#     per_device_train_batch_size=8,
#     per_device_eval_batch_size=8,
#     num_train_epochs=3,
#     evaluation_strategy="epoch",
#     save_strategy="epoch",
#     logging_dir="logs",
#     logging_strategy="steps",
#     logging_steps=500,
#     save_total_limit=3,
#     fp16=True,

#     gradient_checkpointing=True,
#     gradient_accumulation_steps=1,
#     generation_max_length=225,

# )

# # Pass the data collator to the trainer and use processor's tokenizer
# trainer = Seq2SeqTrainer(
#     model=model,
#     args=training_args,
#     train_dataset=data_train,  # Remove 'label' column if exists
#     eval_dataset=data_validation,
#     tokenizer=processor.feature_extractor,
#     data_collator=data_collator,
#     compute_metrics = compute_metrics
# )

# trainer.train()
# model.save_pretrained("my_finetuned_model")
# results = trainer.evaluate(data_test)  # Ensure test data also correct
# wandb.finish()

RuntimeError: Failed to import transformers.trainer_seq2seq because of the following error (look up to see its traceback):
cannot import name 'clear_device_cache' from 'accelerate.utils.memory' (d:\Personal-project\env3.9\lib\site-packages\accelerate\utils\memory.py)

In [None]:
# from dataclasses import dataclass
# from typing import Any, Dict, List, Union

# # Custom data collator to handle audio features and labels
# @dataclass
# processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="french", task="transcribe")
# class DataCollatorSpeechSeq2SeqWithPadding:
#     processor: Any

#     def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
#         # Process audio features
#         input_features = [{"input_features": feature["input_features"]} for feature in features]
#         batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

#         # Process labels
#         label_features = [{"input_ids": feature["labels"]} for feature in features]
#         labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")
#         labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

#         # Remove BOS token if present
#         if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
#             labels = labels[:, 1:]
        
#         batch["labels"] = labels
#         return batch

# data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)


In [None]:
# def tokenize_func(examples):
# 	return tokenizer(examples["input_features"], examples["labels"], truncation=True)  # max_length=512,  padding=True

# encoded_dataset_train = data_train.map(tokenize_func, batched=True)
# encoded_dataset_test = data_test.map(tokenize_func, batched=True)
# encoded_dataset_evaluation = data_validation.map(tokenize_func, batched=True)

Map:   0%|          | 0/3516 [00:00<?, ? examples/s]

=============== From here ====

In [5]:
from datasets import Dataset, DatasetDict
dataset = DatasetDict.load_from_disk("processed_dataset_save")

In [6]:
from transformers import WhisperTokenizer
from transformers import WhisperFeatureExtractor
from transformers import WhisperProcessor

feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small")
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", langauge="french", task="transcribe")
processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="french", task="transcribe")


In [7]:
def prepare_dataset(batch, feature_extractor=None):
    audio = batch["input_features"]

    batch["input_features"] = feature_extractor(audio["audio"], sampling_rate=audio["sampling_rate"]).input_features[0]

    # encode target text to label ids
    batch["labels"] = batch["labels"]
    return batch

dataset = dataset.map(prepare_dataset, fn_kwargs={"feature_extractor": feature_extractor}, num_proc=2)

In [9]:
dataset.save_to_disk("half_processed_dataset_save")

Saving the dataset (0/8 shards):   0%|          | 0/3846 [00:00<?, ? examples/s]

Saving the dataset (0/3 shards):   0%|          | 0/1099 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/550 [00:00<?, ? examples/s]

In [None]:
from datasets import Dataset, DatasetDict
dataset = DatasetDict.load_from_disk("half_processed_dataset_save")

In [None]:
dataset["test"][0]

Dataset({
    features: ['input_features', 'labels'],
    num_rows: 1099
})

In [11]:
dataset["validation"]

Dataset({
    features: ['input_features', 'labels'],
    num_rows: 550
})

In [12]:
import evaluate
 
metric = evaluate.load("wer")

def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    label_ids[label_ids == -100] == tokenizer.pad_token_id
    
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions = pred_str, references=label_str)

    return {"wer": wer}

In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, WhisperProcessor
import wandb
import torch
from dataclasses import dataclass
from typing import Any, Dict, List, Union

# Load the processor instead of just the tokenizer
processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="french", task="transcribe")


wandb.init(project="whisper_french", name="whisper_french_lesmis")


# Custom data collator to handle audio features and labels
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # Process audio features
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # Process labels
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # Remove BOS token if present
        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]
        
        batch["labels"] = labels
        return batch

data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

model.config.forced_decoder_ids = False
model.config.supress_tokens = []

training_args = Seq2SeqTrainingArguments(
    output_dir="my_finetuned_model",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="logs",
    logging_strategy="steps",
    logging_steps=500,
    save_total_limit=3,
    fp16=True,
    report_to="wandb",
    gradient_checkpointing=True,
    gradient_accumulation_steps=1,
    generation_max_length=225,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    tokenizer=processor.feature_extractor,
    data_collator=data_collator,
    compute_metrics = compute_metrics
)

trainer.train()
model.save_pretrained("my_finetuned_model")
results = trainer.evaluate(dataset["test"])  # Ensure test data also correct
wandb.finish()

  trainer = Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss


==================== Done training ====================

In [26]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
import wandb

from transformers import WhisperTokenizer
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small")
import torch
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
import librosa
import soundfile as sf

# Load model and processor
processor = AutoProcessor.from_pretrained("openai/whisper-small")
model = AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-small")
# Load the tokenizer
# tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small")

wandb.init(project="whisper_french", name="whisper_french_lesmis")

training_args = Seq2SeqTrainingArguments(
    output_dir="my_finetuned_model",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="logs",
    logging_strategy="steps",
    label_names=["labels"],
    logging_steps=500,
    save_total_limit=3,
    fp16=True,
)

# Pass the data collator to the trainer and use processor's tokenizer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],  # Remove 'label' column if exists
    eval_dataset=dataset["validation"],
    tokenizer=tokenizer,
)

trainer.train()
model.save_pretrained("my_finetuned_model")
# results = trainer.evaluate(data_test.remove_columns("label"))  # Ensure test data also correct
wandb.finish()

  trainer = Seq2SeqTrainer(


ValueError: You should supply an encoding or a list of encodings to this method that includes input_ids, but you provided ['input_features', 'labels']

In [None]:
from datasets import Dataset, Audio, IterableDataset, Features, Value

whole_dataset = Dataset.from_list(data)
whole_dataset = whole_dataset.cast_column("audio", Audio())


whole_dataset

Dataset({
    features: ['audio', 'label'],
    num_rows: 5
})

In [28]:
len(data)

5495

In [None]:
import wandb
wandb.login(relogin=True)

In [None]:
import wandb
from transformers import TrainingArguments, Trainer


training_arguments = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-4,
    per_device_eval_batch_size=8,
    per_device_train_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.1,
)

trainer = Trainer(
    model=model,
    args = training_arguments,
    train_dataset=data_train,
    eval_dataset=data_validation
)


trainer.train()

model.save_pretrained("my_finetuned_model")

results = trainer.evaluate(data_test)
wandb.finish()



ValueError: too many dimensions 'str'

In [6]:
from datasets import disable_caching
from datasets import Dataset, Audio, IterableDataset, Features, Value

disable_caching()

def data_generator():
    # Replace this with your actual data source (e.g., reading from disk/database)
    count = 0
    for one_sample in data:  # Iterate without loading everything
        count += 1
        if count / 50 == 0:
            print(count)
        yield {
            "audio": one_sample["audio"], 
            "label": one_sample["label"]
        }

features = Features({
    "audio": Audio(),  # Audio feature (will decode on-the-fly)
    "label": Value("string")  # Or "int32"/"int64" for numerical labels
})

# Create an IterableDataset
whole_dataset = IterableDataset.from_generator(data_generator, features=features)
whole_dataset.save_to_disk("preprocessed_dataset")


KeyboardInterrupt: 

In [None]:
whole_dataset.save_to_disk("preprocessed_dataset")

In [19]:
from datasets import Dataset, Audio, IterableDataset, Features, Value

features = Features({
    "audio": Audio(),
    "label": Value("string"),  # Adjust if the label is numeric or categorical
})
streaming_dataset = IterableDataset.from_generator(lambda: data, features=features)

# Cast the "audio" column to Audio() type
# streaming_dataset = streaming_dataset.cast_column("audio", Audio())

# Save to disk in batches
output_path = "preprocessed_dataset"
streaming_dataset.save_to_disk(output_path)



AttributeError: 'IterableDataset' object has no attribute 'save_to_disk'

In [None]:
# loaded_dataset = Dataset.load_from_disk("preprocessed_dataset")

In [21]:
# Materialize the dataset into memory
streaming_dataset


IterableDataset({
    features: ['audio', 'label'],
    num_shards: 1
})

In [None]:
train_data = 

### Training

In [8]:
import torch
print(torch.cuda.is_available())


True


In [None]:

from transformers import TrainingArguments, Trainer

training_arguments = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-4,
    per_device_eval_batch_size=8,
    per_device_train_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.1,
)

trainer = Trainer(
    model=model,
    TrainingArguments = training_arguments,
    train_dataset= 
)

SyntaxError: invalid syntax (2902145941.py, line 6)