### Author: Vignesh Srinivasa Naidu Prakash
### GTID: 903809799

### Dataset

In [1]:
from transformers import DataCollatorWithPadding,AutoModelForSequenceClassification, Trainer, TrainingArguments,AutoTokenizer,AutoModel,AutoConfig
from transformers import default_data_collator
from transformers import Wav2Vec2FeatureExtractor, HubertForSequenceClassification
import torch
import torch.nn as nn
from datasets import Dataset, Audio, Value, Features,load_dataset,ClassLabel
from transformers import Wav2Vec2Processor
from transformers.modeling_outputs import SequenceClassifierOutput
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
x = [str(i) for i in range(0,100,1)]
features = Features(
    {
        "id": Value("string"),
        "speaker_id": Value("string"), 
        'path': Value('string'),
        "audio": Audio(sampling_rate=16000),
        "label": ClassLabel(num_classes=100,names=x,names_file=None,id=None)
    }
)


dataset = load_dataset('csv', 
                       data_files={'train': '../data/identification/train_100.csv',
                                    'dev':'../data/identification/dev_100.csv', 
                                    'test': '../data/identification/test_100.csv'},
                       features=features)
dataset = dataset.map(remove_columns=(['path','speaker_id']),num_proc=24)
dataset = dataset.sort("label")
sampling_rate = dataset["train"].features["audio"].sampling_rate
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base",return_attention_mask=True)

Using custom data configuration default-32600726bfa6b6de
Found cached dataset csv (/storage/home/hcocice1/vkotra3/.cache/huggingface/datasets/csv/default-32600726bfa6b6de/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)
100%|██████████| 3/3 [00:00<00:00, 404.76it/s]
Loading cached processed dataset at /storage/home/hcocice1/vkotra3/.cache/huggingface/datasets/csv/default-32600726bfa6b6de/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-2c523a6aa4d437d1.arrow
Loading cached processed dataset at /storage/home/hcocice1/vkotra3/.cache/huggingface/datasets/csv/default-32600726bfa6b6de/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-752369dc0b7f5663.arrow
Loading cached processed dataset at /storage/home/hcocice1/vkotra3/.cache/huggingface/datasets/csv/default-32600726bfa6b6de/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-6649bf103c92c53a.arrow
Loading cached processed dataset at /s

Loading cached processed dataset at /storage/home/hcocice1/vkotra3/.cache/huggingface/datasets/csv/default-32600726bfa6b6de/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-5f59cca9aadb504c.arrow
Loading cached processed dataset at /storage/home/hcocice1/vkotra3/.cache/huggingface/datasets/csv/default-32600726bfa6b6de/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-9e49cfa90a76ff4b.arrow
Loading cached processed dataset at /storage/home/hcocice1/vkotra3/.cache/huggingface/datasets/csv/default-32600726bfa6b6de/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-fa31363967087aba.arrow
Loading cached processed dataset at /storage/home/hcocice1/vkotra3/.cache/huggingface/datasets/csv/default-32600726bfa6b6de/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-f8198b4958d3f6e4.arrow
Loading cached processed dataset at /storage/home/hcocice1/vkotra3/.cache/huggingface/datasets/csv/defau

Loading cached sorted indices for dataset at /storage/home/hcocice1/vkotra3/.cache/huggingface/datasets/csv/default-32600726bfa6b6de/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-25f8df774a41bc33.arrow
Loading cached sorted indices for dataset at /storage/home/hcocice1/vkotra3/.cache/huggingface/datasets/csv/default-32600726bfa6b6de/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-157e63ef843315a3.arrow


In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'audio', 'label'],
        num_rows: 11125
    })
    dev: Dataset({
        features: ['id', 'audio', 'label'],
        num_rows: 533
    })
    test: Dataset({
        features: ['id', 'audio', 'label'],
        num_rows: 632
    })
})

In [4]:
def featurize(batch):
#     audio_arrays = [batch['audio'][i]['array'] for i in range(len(batch))]
    audio_arrays = [batch['audio'][i]['array'] for i in range(len(batch['id']))]
    print(len(audio_arrays))
    inputs = feature_extractor(
        audio_arrays, 
        sampling_rate=16_000, 
        max_length=int(16_000 * 10),  # 10s
        truncation=True, 
        padding='max_length',
    )
    return inputs
dataset = dataset.map(featurize, remove_columns='audio',batched=True,num_proc=20)


Loading cached processed dataset at /storage/home/hcocice1/vkotra3/.cache/huggingface/datasets/csv/default-32600726bfa6b6de/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-5055f45db9aefa06.arrow
Loading cached processed dataset at /storage/home/hcocice1/vkotra3/.cache/huggingface/datasets/csv/default-32600726bfa6b6de/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-a1ba8dec82a007b6.arrow
Loading cached processed dataset at /storage/home/hcocice1/vkotra3/.cache/huggingface/datasets/csv/default-32600726bfa6b6de/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-4fc3e9fb787afaa2.arrow
Loading cached processed dataset at /storage/home/hcocice1/vkotra3/.cache/huggingface/datasets/csv/default-32600726bfa6b6de/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-f10b5955abda65be.arrow
Loading cached processed dataset at /storage/home/hcocice1/vkotra3/.cache/huggingface/datasets/csv/defau

Loading cached processed dataset at /storage/home/hcocice1/vkotra3/.cache/huggingface/datasets/csv/default-32600726bfa6b6de/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-068cca06efa2a9c8.arrow
Loading cached processed dataset at /storage/home/hcocice1/vkotra3/.cache/huggingface/datasets/csv/default-32600726bfa6b6de/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-ac9fc749860da47a.arrow
Loading cached processed dataset at /storage/home/hcocice1/vkotra3/.cache/huggingface/datasets/csv/default-32600726bfa6b6de/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-e39d191d1aa52d27.arrow
Loading cached processed dataset at /storage/home/hcocice1/vkotra3/.cache/huggingface/datasets/csv/default-32600726bfa6b6de/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-11165ea54f9f0840.arrow
Loading cached processed dataset at /storage/home/hcocice1/vkotra3/.cache/huggingface/datasets/csv/defau

In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'label', 'input_values', 'attention_mask'],
        num_rows: 11125
    })
    dev: Dataset({
        features: ['id', 'label', 'input_values', 'attention_mask'],
        num_rows: 533
    })
    test: Dataset({
        features: ['id', 'label', 'input_values', 'attention_mask'],
        num_rows: 632
    })
})

In [33]:
dataset.set_format("torch",columns=["id","input_values", "attention_mask", "label"])
train_data_collator = default_data_collator(dataset['train'])
dev_data_collator = default_data_collator(dataset['dev'])

In [34]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    dataset["train"], shuffle=True, batch_size=32, collate_fn=train_data_collator
)
dev_dataloader = DataLoader(
    dataset["dev"], batch_size=32, collate_fn=dev_data_collator
)

In [70]:
class CustomHuBERTSID(nn.Module):
    def __init__(self,checkpoint,num_labels,inter_layer_num):
        super(CustomHuBERTSID, self).__init__()
#         self.hubert = HubertForSequenceClassification.from_pretrained("superb/hubert-base-superb-sid")
        self.model =AutoModel.from_pretrained(checkpoint,config=AutoConfig.from_pretrained(checkpoint, output_attentions=True,output_hidden_states=True))
        self.num_labels = num_labels
        ### New layers:
        self.linear1 = nn.Linear(768, 1024)
        self.linear2 = nn.Linear(1024, num_labels)
        
        ### Intermediate Layer Number
        self.layer_num = inter_layer_num

    def forward(self, input_values=None, attention_mask=None,labels=None):
        outputs = self.model(input_values=input_values, attention_mask=attention_mask)
        feature = outputs.hidden_states[self.layer_num]
        agg_vec_list = []
        for i in range(len(feature)):
            if torch.nonzero(attention_mask[i] < 0, as_tuple=False).size(0) == 0:
                length = len(feature[i])
            else:
                length = torch.nonzero(attention_mask[i] < 0, as_tuple=False)[0] + 1
            agg_vec=torch.mean(feature[i][:length], dim=0)
            agg_vec_list.append(agg_vec)
        mean = torch.stack(agg_vec_list)
        # sequence_output has the following shape: (batch_size, sequence_length, 768)
        linear1_output = self.linear1(mean) ## extract the 1st token's embeddings
        logits = self.linear2(linear1_output)
        
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        return SequenceClassifierOutput(loss=loss, logits=logits, hidden_states=outputs.hidden_states,attentions=outputs.attentions)    

In [71]:
checkpoint = "facebook/wav2vec2-base"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
inter_layer = 10
model=CustomHuBERTSID(checkpoint=checkpoint,num_labels=100,inter_layer_num=inter_layer).to(device)

loading configuration file config.json from cache at /storage/home/hcocice1/vkotra3/.cache/huggingface/hub/models--facebook--wav2vec2-base/snapshots/0b5b8e868dd84f03fd87d01f9c4ff0f080fecfe8/config.json
Model config Wav2Vec2Config {
  "_name_or_path": "facebook/wav2vec2-base",
  "activation_dropout": 0.0,
  "adapter_kernel_size": 3,
  "adapter_stride": 2,
  "add_adapter": false,
  "apply_spec_augment": true,
  "architectures": [
    "Wav2Vec2ForPreTraining"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 1,
  "classifier_proj_size": 256,
  "codevector_dim": 256,
  "contrastive_logits_temperature": 0.1,
  "conv_bias": false,
  "conv_dim": [
    512,
    512,
    512,
    512,
    512,
    512,
    512
  ],
  "conv_kernel": [
    10,
    3,
    3,
    3,
    3,
    2,
    2
  ],
  "conv_stride": [
    5,
    2,
    2,
    2,
    2,
    2,
    2
  ],
  "ctc_loss_reduction": "sum",
  "ctc_zero_infinity": false,
  "diversity_loss_weight": 0.1,
  "do_stable_layer_norm": false,
  "eos_token

In [72]:
def compute_metrics(eval_pred):

    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)


data_collator = default_data_collator
metric_name="accuracy"
args = TrainingArguments(
    "test-sid-100-w2v-base",
    evaluation_strategy = "steps",
    learning_rate=2e-5,
    per_device_train_batch_size=10,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    logging_dir='logs',
)

trainer = Trainer(
    model,
    args,
    train_dataset = dataset["train"],
    eval_dataset = dataset["dev"],
    data_collator = data_collator,
    compute_metrics = compute_metrics,
)


trainer.train()

using `logging_steps` to initialize `eval_steps` to 500
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set don't have a corresponding argument in `CustomHuBERTSID.forward` and have been ignored: id. If id are not expected by `CustomHuBERTSID.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 11125
  Num Epochs = 3
  Instantaneous batch size per device = 10
  Total train batch size (w. parallel, distributed & accumulation) = 10
  Gradient Accumulation steps = 1
  Total optimization steps = 3339
  Number of trainable parameters = 95261668


Step,Training Loss,Validation Loss


KeyboardInterrupt: 

### Custom Trainer(Work in Progress!! Will do tomorrow)

In [None]:
from transformers import AdamW,get_scheduler

optimizer = AdamW(model.parameters(), lr=5e-5)

num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

In [36]:
from datasets import load_metric
metric = load_metric("accuracy")

In [37]:
from tqdm.auto import tqdm

progress_bar_train = tqdm(range(num_training_steps))
progress_bar_eval = tqdm(range(num_epochs * len(dev_dataloader)))


for epoch in range(num_epochs):
    model.train()
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar_train.update(1)

model.eval()
for batch in dev_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

logits = outputs.logits
predictions = torch.argmax(logits, dim=-1)
metric.add_batch(predictions=predictions, references=batch["labels"])
progress_bar_eval.update(1)

print(metric.compute())



  0%|          | 0/1044 [01:21<?, ?it/s][A[A

  0%|          | 0/51 [01:21<?, ?it/s][A


TypeError: 'dict' object is not callable

In [17]:
model_keys = model_ft.state_dict().keys()

In [26]:
keylist = list(model_keys)

In [27]:
keywords = ['spec_embed','feature_extractor','feature_projection']

for word in keywords:
    for key in keylist:
        if word in key:
            print(key)

model.masked_spec_embed
model.feature_extractor.conv_layers.0.conv.weight
model.feature_extractor.conv_layers.0.layer_norm.weight
model.feature_extractor.conv_layers.0.layer_norm.bias
model.feature_extractor.conv_layers.1.conv.weight
model.feature_extractor.conv_layers.2.conv.weight
model.feature_extractor.conv_layers.3.conv.weight
model.feature_extractor.conv_layers.4.conv.weight
model.feature_extractor.conv_layers.5.conv.weight
model.feature_extractor.conv_layers.6.conv.weight
model.feature_projection.layer_norm.weight
model.feature_projection.layer_norm.bias
model.feature_projection.projection.weight
model.feature_projection.projection.bias


In [24]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
inter_layer = 10
model_ft = CustomHuBERTSID(checkpoint=checkpoint,num_labels=100,inter_layer_num=inter_layer).to(device)
ct = 0
for child in model_ft.children():
    list(child.state_dict().keys())
    ct += 1
    if ct < 7:
        for param in child.parameters():
            param.requires_grad = False

Some weights of the model checkpoint at facebook/wav2vec2-base were not used when initializing Wav2Vec2Model: ['project_hid.bias', 'project_q.bias', 'quantizer.weight_proj.weight', 'project_q.weight', 'project_hid.weight', 'quantizer.codevectors', 'quantizer.weight_proj.bias']
- This IS expected if you are initializing Wav2Vec2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


odict_keys(['masked_spec_embed', 'feature_extractor.conv_layers.0.conv.weight', 'feature_extractor.conv_layers.0.layer_norm.weight', 'feature_extractor.conv_layers.0.layer_norm.bias', 'feature_extractor.conv_layers.1.conv.weight', 'feature_extractor.conv_layers.2.conv.weight', 'feature_extractor.conv_layers.3.conv.weight', 'feature_extractor.conv_layers.4.conv.weight', 'feature_extractor.conv_layers.5.conv.weight', 'feature_extractor.conv_layers.6.conv.weight', 'feature_projection.layer_norm.weight', 'feature_projection.layer_norm.bias', 'feature_projection.projection.weight', 'feature_projection.projection.bias', 'encoder.pos_conv_embed.conv.bias', 'encoder.pos_conv_embed.conv.weight_g', 'encoder.pos_conv_embed.conv.weight_v', 'encoder.layer_norm.weight', 'encoder.layer_norm.bias', 'encoder.layers.0.attention.k_proj.weight', 'encoder.layers.0.attention.k_proj.bias', 'encoder.layers.0.attention.v_proj.weight', 'encoder.layers.0.attention.v_proj.bias', 'encoder.layers.0.attention.q_proj