In [None]:
# Cell 1: Install necessary packages
!pip install git+https://github.com/huggingface/transformers.git
!pip install datasets
!pip install huggingface-hub
!pip install joblib
!pip install librosa
!pip install pandas

Collecting git+https://github.com/huggingface/transformers.git
  Cloning https://github.com/huggingface/transformers.git to /tmp/pip-req-build-msnzxhyu
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers.git /tmp/pip-req-build-msnzxhyu
  Resolved https://github.com/huggingface/transformers.git to commit 72fb02c47dbbe1999ae105319f24631cad6e2e00
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: transformers
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Created wheel for transformers: filename=transformers-4.43.0.dev0-py3-none-any.whl size=9389929 sha256=86ef2ce46209ff7821c56c37ae2fcecbe8ec6565d28895ba6d5b7f328e8a4e97
  Stored in directory: /tmp/pip-ephem-wheel-cache-8fgwc5rh/wheels/e7/9c/5b/e1a9c8007c343041e61cc484433d512ea9274272e3fcbe7c16
Successfully bu

In [None]:
# Cell 2: Import necessary libraries
import os
import glob
import random
import logging
import numpy as np
import tensorflow as tf
import pandas as pd
import librosa
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
from transformers import AutoFeatureExtractor, TFWav2Vec2Model

from google.colab import files
import zipfile

In [None]:
# Reproducibility: a single seed shared by the random number generators Keras controls
tf.keras.utils.set_random_seed(42)

# Keep TensorFlow's logger quiet: nothing below ERROR is shown
tf.get_logger().setLevel(logging.ERROR)

# --- Audio geometry ---
SAMPLING_RATE = 16000  # rate every clip is resampled to when loaded
MAX_DURATION = 1  # clip length kept per example, as a multiple of SAMPLING_RATE samples
MAX_SEQ_LENGTH = MAX_DURATION * SAMPLING_RATE  # number of samples fed to the model
MAX_FRAMES = 49  # mask length used by mean_pool; presumably the encoder frame count for MAX_SEQ_LENGTH samples - TODO confirm if MAX_DURATION changes

# --- Model ---
MODEL_CHECKPOINT = "facebook/wav2vec2-base"
HIDDEN_DIM = 768  # not referenced by the cells visible in this notebook
NUM_CLASSES = 10  # Update this according to your dataset

# --- Training ---
BATCH_SIZE = 32
MAX_EPOCHS = 2

In [None]:
# Cell 3: Upload and extract dataset
# Upload your zip file containing the dataset
uploaded = files.upload()
if not uploaded:
    # Fail with an actionable message instead of a bare StopIteration below.
    raise ValueError("No file was uploaded; upload the dataset zip archive and re-run this cell.")

# The first uploaded file is taken to be the dataset archive (e.g. 'IIT.zip')
zip_file = next(iter(uploaded))
EXTRACTION_DIR = '/content/IIT'

with zipfile.ZipFile(zip_file, 'r') as zip_ref:
    zip_ref.extractall(EXTRACTION_DIR)

# List all audio files and create labels from file names or directory structure.
# glob returns paths in arbitrary (filesystem-dependent) order; sorting keeps the
# label-to-id mapping and the seeded train/test split identical across runs.
audio_files = sorted(glob.glob(os.path.join(EXTRACTION_DIR, '**/*.wav'), recursive=True))
print(f"Found {len(audio_files)} audio files.")

# The label is read from the file name; adapt extract_label to your own layout
def extract_label(file_path):
    """Return the part of the file's base name that precedes the first underscore.

    Example: '/data/Ses01F_impro06_F028.wav' -> 'Ses01F'. When the base name
    contains no underscore, the whole base name is returned.
    """
    file_name = os.path.basename(file_path)
    prefix, _, _ = file_name.partition('_')
    return prefix

# One row per audio file: its path and the raw (string) label taken from the name
data = {
    'audio': list(audio_files),
    'label': [extract_label(path) for path in audio_files],
}

df = pd.DataFrame(data)
print(df.head())

# Convert labels to numeric values: ids follow the order of first appearance
labels = df['label'].unique().tolist()
label2id = dict(zip(labels, range(len(labels))))
id2label = dict(enumerate(labels))
df['label'] = df['label'].map(label2id)

# Load audio files and extract features
def load_audio(file_path):
    """Read one audio file with librosa at SAMPLING_RATE and return the waveform.

    librosa.load returns a (waveform, sample_rate) pair; only the waveform is kept.
    """
    waveform = librosa.load(file_path, sr=SAMPLING_RATE)[0]
    return waveform

# Replace each path in the 'audio' column by the decoded waveform
df['audio'] = df['audio'].apply(load_audio)

# Split the dataset into train and test sets (half each, stratified by label)
train_df, test_df = train_test_split(
    df,
    test_size=0.5,
    random_state=42,
    stratify=df['label'],
)


Saving IIT.zip to IIT.zip
Found 16 audio files.
                                         audio   label
0     /content/IIT/IIT/Ses01F_impro06_F028.wav  Ses01F
1     /content/IIT/IIT/Ses02F_impro02_M003.wav  Ses02F
2     /content/IIT/IIT/Ses04M_impro01_F017.wav  Ses04M
3  /content/IIT/IIT/Ses01M_script01_1_F014.wav  Ses01M
4  /content/IIT/IIT/Ses04M_script03_1_F013.wav  Ses04M


In [None]:
# Cell 4: Process the dataset
# Feature extractor belonging to the pretrained checkpoint, asked to also
# return an attention mask alongside the input values
feature_extractor = AutoFeatureExtractor.from_pretrained(
    MODEL_CHECKPOINT,
    return_attention_mask=True,
)

def preprocess_function(examples):
    """Run the feature extractor over the 'audio' column of `examples`.

    Args:
        examples: table-like object whose 'audio' column supports `.tolist()`
            (the train/test DataFrames in this notebook), one waveform per row.

    Returns:
        The feature extractor's output for all rows.

    Every clip is truncated to MAX_SEQ_LENGTH samples and shorter clips are
    padded up to MAX_SEQ_LENGTH, so each row has the fixed length the Keras
    inputs are declared with. (`padding=True` would only pad to the longest clip
    of the current call, which can be shorter than MAX_SEQ_LENGTH and differ
    between the train and test sets.)
    """
    audio_arrays = examples['audio'].tolist()
    inputs = feature_extractor(
        audio_arrays,
        sampling_rate=feature_extractor.sampling_rate,
        max_length=MAX_SEQ_LENGTH,
        truncation=True,
        padding="max_length",
    )
    return inputs

train_dataset = preprocess_function(train_df)
test_dataset = preprocess_function(test_df)

train_dataset['label'] = train_df['label'].values
test_dataset['label'] = test_df['label'].values


def _to_tf_dataset(encoded):
    """Wrap encoded features and labels in an unbatched tf.data.Dataset.

    Features are yielded as a dict keyed by the model's input names
    ("input_values", "attention_mask"). A bare (input_values, attention_mask)
    tuple is matched by position against the model's flattened inputs, and
    flattening a dict of inputs orders them by key, so the two tensors could be
    swapped silently; a dict is matched by name instead.
    """
    features = {
        "input_values": encoded['input_values'],
        "attention_mask": encoded['attention_mask'],
    }
    return tf.data.Dataset.from_tensor_slices((features, encoded['label']))


# Convert to TensorFlow format (only the training set is shuffled, so test
# predictions stay aligned with the rows of test_dataset)
train = _to_tf_dataset(train_dataset).shuffle(buffer_size=len(train_df)).batch(BATCH_SIZE)
test = _to_tf_dataset(test_dataset).batch(BATCH_SIZE)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.84k [00:00<?, ?B/s]



In [None]:
# Cell 5: Define the model architecture
def mean_pool(hidden_states, feature_lengths):
    """Average `hidden_states` over axis 1, counting only each row's first `feature_lengths[i]` positions.

    Positions beyond a row's length are zeroed before summing, and the sum is
    divided by the number of kept positions for that row.
    Assumes axis 1 of `hidden_states` has MAX_FRAMES entries (the mask is built
    with that length and broadcast to the shape of `hidden_states`) - TODO confirm.
    """
    # 1 for the first feature_lengths[i] positions of row i, 0 afterwards
    valid_frames = tf.sequence_mask(
        feature_lengths, maxlen=MAX_FRAMES, dtype=tf.dtypes.int64
    )
    # Reversed cumulative sum: non-zero wherever a 1 occurs at or after the position
    suffix_counts = tf.reverse(
        tf.cumsum(tf.reverse(valid_frames, [-1]), -1), [-1]
    )
    padding_mask = tf.cast(suffix_counts, dtype=tf.dtypes.bool)

    # Expand the inverted mask over the last axis and zero the masked-out vectors
    drop_mask = tf.broadcast_to(
        tf.expand_dims(tf.logical_not(padding_mask), -1), tf.shape(hidden_states)
    )
    masked_states = tf.where(drop_mask, 0.0, hidden_states)

    summed_states = tf.math.reduce_sum(masked_states, axis=1)
    kept_counts = tf.math.reduce_sum(
        tf.cast(padding_mask, dtype=tf.dtypes.float32), axis=1
    )
    return summed_states / tf.reshape(kept_counts, [-1, 1])

class TFWav2Vec2ForAudioClassification(layers.Layer):
    """Keras layer: pretrained TF Wav2Vec2 encoder, pooling, dropout and a softmax classifier."""

    def __init__(self, model_checkpoint, num_classes):
        """Load the encoder from `model_checkpoint` and create the head for `num_classes` outputs."""
        super().__init__()
        # Encoder initialised from the PyTorch weights, with spec-augment turned off
        self.wav2vec2 = TFWav2Vec2Model.from_pretrained(
            model_checkpoint, apply_spec_augment=False, from_pt=True
        )
        # Fallback pooling, used only when no tensor attention mask is supplied
        self.pooling = layers.GlobalAveragePooling1D()
        self.intermediate_layer_dropout = layers.Dropout(0.5)
        self.final_layer = layers.Dense(num_classes, activation="softmax")

    def call(self, inputs):
        """Map a dict with "input_values" and "attention_mask" to class probabilities."""
        # First element of the encoder output is taken as the hidden states
        hidden_states = self.wav2vec2(inputs["input_values"])[0]
        attention_mask = inputs["attention_mask"]

        if not tf.is_tensor(attention_mask):
            pooled_state = self.pooling(hidden_states)
        else:
            # Last entry of the running sum = number of unmasked samples per row
            audio_lengths = tf.cumsum(attention_mask, -1)[:, -1]
            # Convert sample counts to encoder output lengths via the encoder's own helper
            feature_lengths = self.wav2vec2.wav2vec2._get_feat_extract_output_lengths(
                audio_lengths
            )
            pooled_state = mean_pool(hidden_states, feature_lengths)

        dropped_state = self.intermediate_layer_dropout(pooled_state)
        return self.final_layer(dropped_state)

In [None]:
def build_model():
    """Create and compile the Keras classification model.

    The model takes a dict of two fixed-length inputs ("input_values" as float32,
    "attention_mask" as int32, both MAX_SEQ_LENGTH long) and outputs the
    probabilities produced by TFWav2Vec2ForAudioClassification.
    """
    input_values = tf.keras.Input(shape=(MAX_SEQ_LENGTH,), dtype="float32")
    attention_mask = tf.keras.Input(shape=(MAX_SEQ_LENGTH,), dtype="int32")
    inputs = {"input_values": input_values, "attention_mask": attention_mask}

    classifier = TFWav2Vec2ForAudioClassification(MODEL_CHECKPOINT, NUM_CLASSES)
    probabilities = classifier(inputs)
    model = tf.keras.Model(inputs, probabilities)

    # The head already applies softmax, hence from_logits=False
    model.compile(
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
        optimizer=keras.optimizers.Adam(learning_rate=1e-5),
        metrics=["accuracy"],
    )
    return model


model = build_model()


TFWav2Vec2Model has backpropagation operations that are NOT supported on CPU. If you wish to train/fine-tune this model, you need a GPU or a TPU
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFWav2Vec2Model: ['quantizer.codevectors', 'quantizer.weight_proj.bias', 'project_q.bias', 'project_q.weight', 'quantizer.weight_proj.weight', 'project_hid.bias', 'project_hid.weight']
- This IS expected if you are initializing TFWav2Vec2Model from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFWav2Vec2Model from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFWav2Vec2Model were initialized from the PyTorch model.
If your task is similar to the task the model of the che

In [None]:
# Cell 6: Train the model
# Fit on the training batches and evaluate on the test batches after each epoch
history = model.fit(
    x=train,
    epochs=MAX_EPOCHS,
    validation_data=test,
)

Epoch 1/2
Epoch 2/2


In [None]:
# Cell 7: Evaluate the model and make predictions
import IPython.display as ipd

# `test` is batched without shuffling, so row i of preds belongs to row i of test_dataset
preds = model.predict(test)

# Pick one test example at random
rand_int = random.randint(0, len(test_dataset['input_values']) - 1)

# An Audio object is only rendered automatically when it is the last expression
# of a cell; display it explicitly so the player actually appears.
ipd.display(
    ipd.Audio(
        data=np.asarray(test_dataset['input_values'][rand_int]),
        autoplay=True,
        rate=SAMPLING_RATE,
    )
)

predicted_id = int(np.argmax(preds[rand_int]))

print("Original Label is ", id2label[test_dataset['label'][rand_int]])
# The classifier has NUM_CLASSES outputs, which can be more than the labels
# found in the data; fall back to a placeholder instead of raising KeyError.
print("Predicted Label is ", id2label.get(predicted_id, f"<unknown class {predicted_id}>"))

Original Label is  Ses04M
Predicted Label is  Ses01F


In [None]:
!pip install huggingface_hub



In [None]:
# Authenticate this notebook session with the Hugging Face Hub
from huggingface_hub import notebook_login

# Renders an interactive login widget in the cell output
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2Tokenizer, Wav2Vec2Processor

# Load the model
model = Wav2Vec2ForSequenceClassification.from_pretrained(MODEL_CHECKPOINT, num_labels=NUM_CLASSES)
tokenizer = Wav2Vec2Tokenizer.from_pretrained(MODEL_CHECKPOINT)
processor = Wav2Vec2Processor.from_pretrained(MODEL_CHECKPOINT) # Define processor

model.push_to_hub("wavv", organization="DYPatil1")
processor.push_to_hub("wavv", organization="DYPatil1") # Now you can push processor
!git clone https://huggingface.co/DYPatil1/wavv

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'Wav2Vec2CTCTokenizer'. 
The class this function is called from is 'Wav2Vec2Tokenizer'.


model.safetensors:   0%|          | 0.00/378M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

Cloning into 'wavv'...
remote: Enumerating objects: 14, done.[K
remote: Counting objects: 100% (11/11), done.[K
remote: Compressing objects: 100% (11/11), done.[K
remote: Total 14 (delta 1), reused 0 (delta 0), pack-reused 3 (from 1)[K
Unpacking objects: 100% (14/14), 6.58 KiB | 1.64 MiB/s, done.
