# HW1: Frame-Level Speech Recognition

In this homework, you will be working with MFCC data consisting of 15 features at each time step/frame. Your model should be able to recognize the phoneme that occurred in that frame.

# Import Libraries

In [None]:
!nvidia-smi

In [None]:
!pip install torchsummaryX wandb --quiet

In [None]:
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from torchsummaryX import summary
from torch.utils.data import Dataset, DataLoader

from sklearn.metrics import accuracy_score
import gc
import zipfile
import pandas as pd
from tqdm.auto import tqdm
import os
import datetime
import wandb
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("Device: ", device)

In [None]:
## If you are using colab, you can import google drive to save model checkpoints in a folder
from google.colab import drive
drive.mount('/content/drive')

In [None]:
### PHONEME LIST
PHONEMES = [
            'SIL',   'AA',    'AE',    'AH',    'AO',    'AW',    'AY',  
            'B',     'CH',    'D',     'DH',    'EH',    'ER',    'EY',
            'F',     'G',     'HH',    'IH',    'IY',    'JH',    'K',
            'L',     'M',     'N',     'NG',    'OW',    'OY',    'P',
            'R',     'S',     'SH',    'T',     'TH',    'UH',    'UW',
            'V',     'W',     'Y',     'Z',     'ZH',    '<sos>', '<eos>']

# Kaggle

This section contains code that helps you install Kaggle's API and create kaggle.json with your username and API key details. Make sure to input those in the given code to ensure you can download data from the competition successfully.

In [None]:
#Install Kaggle API
!pip install --upgrade --force-reinstall --no-deps kaggle==1.5.8
!mkdir /root/.kaggle

with open("/root/.kaggle/kaggle.json", "w+") as f:
    f.write('{"username":"","key":""}') 
    # Put your kaggle username & key here

!chmod 600 /root/.kaggle/kaggle.json

In [None]:
#Download data and unzip
! kaggle competitions download -c 11-785-f22-hw1p2

Downloading 11-785-f22-hw1p2.zip to /content
100% 2.12G/2.13G [00:11<00:00, 241MB/s]
100% 2.13G/2.13G [00:11<00:00, 204MB/s]


In [None]:
#unzip 
! unzip -qo '11-785-f22-hw1p2.zip' -d '/content'

# Dataset

This section covers the dataset/dataloader class for speech data. You will have to spend time writing code to create this class successfully. We have given you a lot of comments guiding you on what code to write at each stage, from top to bottom of the class. Please try and take your time figuring this out, as it will immensely help in creating dataset/dataloader classes for future homeworks.

Before running the following cells, please take some time to analyse the structure of the data. Try loading a single MFCC and its transcript, print out the shapes and print out the values. Do the transcripts look like phonemes?

In [None]:
# Dataset class to load train and validation data
class AudioDataset(torch.utils.data.Dataset):
    """Frame-level MFCC dataset with integer phoneme labels (train / validation).

    Every ``.npy`` file under ``<data_path>/<partition>/mfcc`` is loaded,
    normalised per utterance (zero mean / unit variance for each feature
    column), and concatenated into one (frames x features) array. That array
    is zero-padded with ``context`` rows at both ends. The transcripts under
    ``<data_path>/<partition>/transcript`` have their first and last entries
    (the [SOS]/[EOS] markers) removed, are concatenated, and are mapped to
    their index in ``self.phonemes``.

    Args:
        data_path: Root directory that contains the partition folders.
        context: Number of neighbouring frames taken on each side of a frame.
        offset: Shift added to the index before slicing the padded array.
            Padding inserts ``context`` rows in front of the data, so the
            window is centred on frame ``ind`` only when ``offset == context``.
            The default of 0 is kept for backward compatibility.
        partition: Name of the partition sub-directory.
        limit: Accepted for interface compatibility; currently unused.

    Raises:
        ValueError: If a transcript contains a symbol that is not listed in
            ``self.phonemes``.
    """

    def __init__(self, data_path, context, offset=0, partition= "train-clean-100", limit=-1): # Feel free to add more arguments

        self.context = context # hyperparameter, generally optimal between 0 and 50
        self.offset = offset
        self.data_path = data_path

        # Mel Frequency Cepstral Coefficient (MFCC) and transcript folders
        self.mfcc_dir = self.data_path +'/'+ partition + '/mfcc'
        self.transcript_dir = self.data_path +'/'+ partition + '/transcript'

        # Sorting both listings is what pairs each MFCC with its transcript.
        mfcc_names = sorted(os.listdir(self.mfcc_dir))
        transcript_names = sorted(os.listdir(self.transcript_dir))
        assert len(mfcc_names) == len(transcript_names)

        # These are the available phonemes in the transcript
        self.phonemes = [
            'SIL',   'AA',    'AE',    'AH',    'AO',    'AW',    'AY',
            'B',     'CH',    'D',     'DH',    'EH',    'ER',    'EY',
            'F',     'G',     'HH',    'IH',    'IY',    'JH',    'K',
            'L',     'M',     'N',     'NG',    'OW',    'OY',    'P',
            'R',     'S',     'SH',    'T',     'TH',    'UH',    'UW',
            'V',     'W',     'Y',     'Z',     'ZH',    '<sos>', '<eos>']
        # Dict lookup is O(1) per frame; list.index() rescans the list every time.
        phoneme_to_index = {symbol: idx for idx, symbol in enumerate(self.phonemes)}

        mfccs, transcripts = [], []
        for mfcc_name, transcript_name in zip(mfcc_names, transcript_names):
            mfcc = np.load(self.mfcc_dir + '/' + mfcc_name)
            mfccs.append(self._normalize(mfcc))
            # Drop the first and last entries ([SOS] and [EOS]).
            transcripts.append(np.load(self.transcript_dir + '/' + transcript_name)[1:-1])

        # (T1 + T2 + ...) x features, then `context` zero rows on top and bottom.
        self.mfccs = np.concatenate(mfccs, axis = 0)
        self.mfccs = np.pad(self.mfccs, [(self.context, self.context), (0,0)], 'constant', constant_values=(0, 0))

        # One label per frame. The network cannot predict strings, so each
        # phoneme is replaced by its index in self.phonemes (0 means 'SIL').
        labels = np.concatenate(transcripts).tolist()
        try:
            self.transcripts = [phoneme_to_index[symbol] for symbol in labels]
        except KeyError as err:
            # Keep the exception type that list.index() used to raise.
            raise ValueError("{!r} is not in list".format(err.args[0])) from None

        # Length of the dataset is the number of labelled frames
        self.length = len(self.transcripts)

    @staticmethod
    def _normalize(mfcc):
        """Standardise each feature column of one utterance.

        A constant column has zero standard deviation; its divisor is replaced
        by 1 so the column becomes all zeros instead of NaN/inf.
        """
        mean = np.mean(mfcc, axis = 0)
        sigma = np.std(mfcc, axis = 0)
        sigma[sigma == 0] = 1  # in-place so the array dtype is preserved
        return (mfcc - mean)/sigma

    def __len__(self):
        return self.length

    def __getitem__(self, ind):
        """Return ``(frames, phoneme)`` for frame ``ind``.

        ``frames`` is the flattened window of ``2*context + 1`` rows of the
        padded array starting at ``ind + offset - context``; ``phoneme`` is
        the integer label of frame ``ind``.
        """
        start = ind + self.offset - self.context
        window = self.mfccs[start: start + 2 * self.context + 1]
        # The MLP needs 1d input, so the (2*context+1) x features window is flattened.
        frames = torch.FloatTensor(window.flatten())
        phoneme = torch.tensor(self.transcripts[ind])

        return frames, phoneme
        

In [None]:
class AudioTestDataset(torch.utils.data.Dataset):
    """Frame-level MFCC dataset without labels, used for test-time inference.

    Every ``.npy`` file under ``<data_path>/<partition>/mfcc`` is loaded in
    sorted filename order, normalised per utterance (zero mean / unit variance
    for each feature column), concatenated into one (frames x features) array,
    and zero-padded with ``context`` rows at both ends.

    Args:
        data_path: Root directory that contains the partition folders.
        context: Number of neighbouring frames taken on each side of a frame.
        offset: Shift added to the index before slicing the padded array.
            Padding inserts ``context`` rows in front of the data, so the
            window is centred on frame ``ind`` only when ``offset == context``.
        partition: Name of the partition sub-directory.
        limit: Accepted for interface compatibility; currently unused.
    """

    def __init__(self, data_path, context, offset=0, partition= "test-clean", limit=-1):
        self.context = context # hyperparameter, generally optimal between 0 and 50
        self.offset = offset
        self.data_path = data_path
        # Mel Frequency Cepstral Coefficient (MFCC)
        self.mfcc_dir = self.data_path +'/'+ partition + '/mfcc'
        mfcc_names = sorted(os.listdir(self.mfcc_dir))

        mfccs = []
        for mfcc_name in mfcc_names:
            mfcc = np.load(self.mfcc_dir + '/' + mfcc_name)
            mfccs.append(self._normalize(mfcc))

        self.mfccs = np.concatenate(mfccs, axis = 0)
        self.mfccs = np.pad(self.mfccs, [(self.context, self.context), (0,0)], 'constant', constant_values=(0, 0))
        self.phonemes = [
            'SIL',   'AA',    'AE',    'AH',    'AO',    'AW',    'AY',
            'B',     'CH',    'D',     'DH',    'EH',    'ER',    'EY',
            'F',     'G',     'HH',    'IH',    'IY',    'JH',    'K',
            'L',     'M',     'N',     'NG',    'OW',    'OY',    'P',
            'R',     'S',     'SH',    'T',     'TH',    'UH',    'UW',
            'V',     'W',     'Y',     'Z',     'ZH',    '<sos>', '<eos>']

        # Number of real frames: padded length minus the two padding bands.
        self.length = len(self.mfccs) - 2*self.context

    @staticmethod
    def _normalize(mfcc):
        """Standardise each feature column of one utterance.

        A constant column has zero standard deviation; its divisor is replaced
        by 1 so the column becomes all zeros instead of NaN/inf.
        """
        mean = np.mean(mfcc, axis = 0)
        sigma = np.std(mfcc, axis = 0)
        sigma[sigma == 0] = 1  # in-place so the array dtype is preserved
        return (mfcc - mean)/sigma

    def __len__(self):
        return self.length

    def __getitem__(self, ind):
        """Return the flattened window of ``2*context + 1`` rows for frame ``ind``."""
        start = ind + self.offset - self.context
        window = self.mfccs[start: start + 2 * self.context + 1]
        frames = torch.FloatTensor(window.flatten()) # Convert to tensors

        return frames

# Create Train Dataset and Validation Dataset

In [None]:
def bulid_dataclass(context):
    """Create the training and validation datasets for one context size.

    Both datasets are read from '/content' and use ``offset=context``, which
    centres each window on the requested frame of the padded MFCC array.

    Returns:
        ``(train_data, val_data)`` built from the "train-clean-100" and
        "dev-clean" partitions respectively.
    """
    partitions = ("train-clean-100", "dev-clean")
    # The generator is consumed in order, so the training set is built first.
    train_data, val_data = (
        AudioDataset('/content', context, offset=context, partition=name, limit=-1)
        for name in partitions
    )
    return train_data, val_data


Get a subset of the data for tuning hyperparameters (faster)

In [None]:
def get_subset(train_data, val_data):
    """Wrap both datasets in a seeded, randomly permuted ``Subset``.

    The global torch RNG is re-seeded with 0, then one permutation is drawn
    for the training set followed by one for the validation set. The slice
    bound currently equals the dataset length, so every sample is kept;
    lower the bound to obtain a genuinely smaller subset.

    Returns:
        ``(train_subset, val_subset)``.
    """
    torch.manual_seed(0)
    n_train = len(train_data)
    n_val = len(val_data)
    # Draw order matters for reproducibility: train permutation first, then val.
    train_order = torch.randperm(n_train)[:n_train]
    val_order = torch.randperm(n_val)[:n_val]
    return (
        torch.utils.data.Subset(train_data, train_order),
        torch.utils.data.Subset(val_data, val_order),
    )

Build data loaders

In [None]:
def build_data(batch_size, train_data, val_data):
    """Create the training and validation ``DataLoader`` objects.

    The training loader shuffles and uses 4 workers; the validation loader
    keeps dataset order and uses 2 workers. Both pin memory and share the
    same batch size.

    Returns:
        ``(train_loader, val_loader)``.
    """
    shared_options = dict(batch_size=batch_size, pin_memory=True)
    train_loader = torch.utils.data.DataLoader(
        train_data, num_workers=4, shuffle=True, **shared_options)
    val_loader = torch.utils.data.DataLoader(
        val_data, num_workers=2, shuffle=False, **shared_options)
    return train_loader, val_loader

# Wandb


In [None]:
wandb.login(key="") #API Key is in your wandb account, under settings (wandb.ai/settings)

# Helper function
    evaluation function:
        eval(model, dataloader)
    train function for 1 epoch:
        train(model, optimizer, criterion, dataloader, scaler, scheduler)
    train function:
        train1(model, train_loader, val_loader, optimizer, criterion, scheduler, scaler)

In [None]:
def eval(model, dataloader):
    """Compute the frame-level accuracy of ``model`` over ``dataloader``.

    Note: this function shadows the builtin ``eval``; the name is kept
    because the rest of the notebook calls it.

    Args:
        model: Network mapping a batch of frames to per-class logits.
        dataloader: Iterable yielding ``(frames, phonemes)`` batches.

    Returns:
        Accuracy in percent (0-100).
    """
    model.eval() # set model in evaluation mode
    phone_true_list = []
    phone_pred_list = []
    # No gradients are needed for evaluation, so the whole loop runs in inference mode.
    with torch.inference_mode():
        for frames, phonemes in dataloader:
            ### Only the inputs go to the device; labels are just collected on the host
            logits = model(frames.to(device))
            ### Get Predictions
            predicted_phonemes = torch.argmax(logits, dim=1)
            ### Store Pred and True Labels
            phone_pred_list.extend(predicted_phonemes.tolist())
            phone_true_list.extend(phonemes.tolist())
    # Release cached GPU blocks once, instead of after every batch.
    torch.cuda.empty_cache()
    ### Calculate Accuracy (scikit-learn's signature is (y_true, y_pred))
    accuracy = accuracy_score(phone_true_list, phone_pred_list)
    return accuracy*100

In [None]:
def train(model, optimizer, criterion, dataloader, scaler, scheduler):
    """Train ``model`` for one epoch with mixed precision.

    Args:
        model: Network to optimise.
        optimizer: Optimizer updating ``model``'s parameters.
        criterion: Loss taking ``(logits, phonemes)``.
        dataloader: Iterable yielding ``(mfccs, phonemes)`` batches.
        scaler: Gradient scaler used for the scaled backward pass and step.
        scheduler: Learning-rate scheduler; stepped once per batch.

    Returns:
        Mean training loss over the batches (0.0 for an empty loader).
    """
    model.train()
    train_loss = 0.0 # Monitoring Loss
    # Size the bar from the loader actually passed in, not a notebook global.
    batch_bar = tqdm(total=len(dataloader), dynamic_ncols=True, leave=False, position=0, desc='Train')
    for step, (mfccs, phonemes) in enumerate(dataloader):
        ### Move Data to Device (Ideally GPU)
        mfccs = mfccs.to(device)
        phonemes = phonemes.to(device)

        ### Clear gradients left over from the previous batch
        optimizer.zero_grad()

        with torch.autocast(device):
            ### Forward Propagation
            logits = model(mfccs)
            ### Loss Calculation
            loss = criterion(logits, phonemes)

        train_loss += loss.item()
        batch_bar.set_postfix(
              loss="{:.04f}".format(float(train_loss / (step + 1))),
              lr="{:.04f}".format(float(optimizer.param_groups[0]['lr'])))

        ### Backward Propagation on the scaled loss, then optimizer step
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        batch_bar.update()
        # The schedule advances per batch, not per epoch.
        scheduler.step()

    batch_bar.close()
    # Guard the average against an empty loader.
    train_loss /= max(len(dataloader), 1)

    return train_loss

In [None]:
def train1(model, train_loader, val_loader, optimizer, criterion, scheduler, scaler):
    """Run the full training loop for ``config['epochs']`` epochs.

    Each epoch trains once over ``train_loader``, evaluates on ``val_loader``,
    prints a summary line, and logs the metrics to wandb. Whenever validation
    accuracy improves, the model and optimizer state are written to the file
    "Model" and uploaded as a wandb artifact named ``config['model']``.

    Reads the notebook globals ``config`` and ``run``. The wandb run is left
    open; the caller is responsible for ``wandb.finish()``.
    """
    torch.cuda.empty_cache()
    num_epochs = config['epochs']
    epoch_report = "Epoch {}/{}:  Validation Accuracy {:.04f}%, Train Loss {:.04f}, Learning Rate {:.04f}"
    best_acc = 0

    for epoch_number in range(1, num_epochs + 1):
        train_loss = train(model, optimizer, criterion, train_loader, scaler, scheduler)
        val_acc = eval(model, val_loader)
        lr = float(optimizer.param_groups[0]['lr'])

        print(epoch_report.format(epoch_number, num_epochs, val_acc, train_loss, lr))

        # Log to run
        wandb.log({"train_loss": train_loss, 'val_acc': val_acc, 'lr': lr})

        if val_acc > best_acc:
            best_acc = val_acc
            # Saving the model and optimizer states
            checkpoint = {
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
            }
            torch.save(checkpoint, "Model")

            # Create the artifact, attach the checkpoint file, upload it
            model_artifact = wandb.Artifact(config['model'], type='model')
            model_artifact.add_file("Model")
            run.log_artifact(model_artifact)

# Network Architecture


Define the MLP structure

In [None]:
class Net1(nn.Module):
    """MLP with six hidden blocks followed by a 40-way linear classifier.

    Each hidden block is Linear (no bias) -> BatchNorm1d -> ReLU -> Dropout.
    The input width is ``(2*context + 1) * 15``.

    Args:
        context: Number of context frames on each side of the centre frame.
        size1..size6: Widths of the six hidden layers, in order.
        drop: Dropout probability used after every hidden block.
        initial: When truthy, re-initialise the weight of every Linear layer.
        initial_choice: One of 'xavier_uniform', 'uniform', 'normal',
            'kaiming_uniform', 'kaiming_normal'. Any other value leaves the
            default initialisation in place.
    """

    def __init__(self, context, size1,size2,size3,size4,size5,size6,drop,initial,initial_choice):
        super().__init__()

        widths = [(2 * context + 1) * 15, size1, size2, size3, size4, size5, size6]
        num_classes = 40

        # Build the hidden blocks pairwise from consecutive widths; the layer
        # order (and therefore the state_dict keys) matches a hand-written stack.
        stack = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack.extend([
                nn.Linear(fan_in, fan_out, bias=False),
                nn.BatchNorm1d(fan_out),
                nn.ReLU(),
                nn.Dropout(drop),
            ])
        stack.append(nn.Linear(widths[-1], num_classes))
        self.model = nn.Sequential(*stack)

        # Optionally overwrite the default Linear weight initialisation
        if initial:
            initialisers = {
                'xavier_uniform': torch.nn.init.xavier_uniform_,
                'uniform': torch.nn.init.uniform_,
                'normal': torch.nn.init.normal_,
                'kaiming_uniform': torch.nn.init.kaiming_uniform_,
                'kaiming_normal': torch.nn.init.kaiming_normal_,
            }
            init_fn = initialisers.get(initial_choice)
            if init_fn is not None:
                for module in self.modules():
                    if isinstance(module, nn.Linear):
                        init_fn(module.weight)

    def forward(self, x):
        """Return the class logits for a batch of flattened frame windows."""
        return self.model(x)

## Train

In [None]:
torch.cuda.empty_cache()
gc.collect()

425

Parameter Setting

In [None]:
config = {
    'epochs': 40,
    'batch_size' : 1024,
    'context' : 32,
    'learning_rate' : 0.001,
    'lr_schedule': 'CosineAnnealing',#'CosineAnnealing', #'StepLR', 'ReduceLROnPlateau', 'Exponential', 'CosineAnnealing'
    'optimizer': 'Adamw',#'Nesterov',#'Adam', #'SGD', 'RMSProp'
    'weight_decay': 0.02,
    'model': "try4",
    'drop':0.2,
    'initial':'kaiming_uniform'
}

Load Data

In [None]:
train_data, val_data = bulid_dataclass(config['context'])
#train_data, val_data = get_subset(train_data, val_data)
train_loader, val_loader = build_data(config['batch_size'], train_data, val_data)
print("Batch size: ", config['batch_size'])
print("Context: ", config['context'])
print("Input size: ", (2*config['context']+1)*15)
print("Output symbols: ", len(PHONEMES))
print("Train dataset samples = {}, batches = {}".format(train_data.__len__(), len(train_loader)))
print("Validation dataset samples = {}, batches = {}".format(val_data.__len__(), len(val_loader)))
batch_number = len(train_loader) # for CosineAnnealing period

Wandb

In [None]:
# Create your wandb run
run = wandb.init(
    name = config['model'], ### Wandb creates random run names if you skip this field, we recommend you give useful names
    reinit= True, ### Allows reinitalizing runs when you re-run this cell
    project ="hw1", ### Project should be created in your wandb account 
    config=config,### Wandb Config for your run
)

Initial Model

    In this setting, the model has 19.864616M parameters.
    It has 7 linear layers: input --> 2048 --> 2048 --> 2048 --> 2048 --> 2048 --> 512 --> output
    Dropout = 0.2
    Weight initialization: kaiming_uniform

In [None]:
model = Net1(config['context'], 2048,2048,2048,2048,2048,512,config['drop'], True, config['initial']).to(device)
# Check number of parameters of your network 
# - Remember, you are limited to 20 million parameters for HW1 (including ensembles)
# Pull one real batch for the summary's forward pass; `frames` is otherwise
# undefined on a fresh kernel (Restart & Run All would fail here).
frames, _ = next(iter(train_loader))
summary(model, frames.to(device))

In [None]:
### Save your model architecture as a string with str(model) 
model_arch = str(model)

### Save it in a txt file, followed by the hyperparameters of this run.
### The context manager closes the file even if a write fails.
with open("model_arch.txt", "w") as arch_file:
    arch_file.write(model_arch)
    arch_file.write("\n")
    arch_file.write("parameter setting:\n")
    for key, value in config.items():
        arch_file.write('%s:%s\n' % (key, value))

### log it in your wandb run with wandb.save()
wandb.save('model_arch.txt')

['/content/wandb/run-20220926_154053-nbnvwa3q/files/model_arch.txt']

Define loss function, optimizer and learning rate scheduler. 

In [None]:
# Defining Optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=config['learning_rate'], weight_decay=config['weight_decay'])
# Define scheduler with inial lr = 0.001
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, batch_number,
                                                          eta_min=1e-4, last_epoch=- 1, verbose=False)
# Defining Loss function 
criterion = nn.CrossEntropyLoss()
scaler = torch.cuda.amp.GradScaler()

## Train

    Due to Colab usage limits, the model cannot be trained for more than 40 epochs in one session.
    Reload the previous model and train for the next 40 epochs.

In [None]:
train1(model, train_loader, val_loader, optimizer, criterion,scheduler,scaler)

Save model

In [None]:
# Save the end-of-training state of `model` and its optimizer to the file "Model".
# NOTE(review): train1 writes its best-validation checkpoint to the same "Model"
# path, so this overwrites it with the last-epoch weights — confirm that is intended.
torch.save({'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict()
              }, "Model")

# Creating Artifact
model_artifact = wandb.Artifact(config['model'], type='model')
# Adding model file to Artifact
model_artifact.add_file("Model")
# Saving Artifact to WandB
run.log_artifact(model_artifact)
# Close the wandb run started for this training session
wandb.finish()

# Retrain

    Load previous model and change hyper-parameter to retrain
    


## First Re-Train
    1. Let the CosineAnnealing learning rate vary between 1e-3 and a minimum of 1e-5
    2. Let the CosineAnnealing learning rate vary between 1e-4 and a minimum of 1e-5

Define new hyperparameter


In [None]:
config = {
    'epochs': 40,
    'batch_size' : 1024,
    'context' : 32,
    'learning_rate' : 0.001,
    'lr_schedule': 'CosineAnnealing',
    'optimizer': 'Adamw',
    'weight_decay': 0.01,
    'model': "try4",
    'drop':0.2,
    'initial':'kaiming_uniform'
}

In [None]:
import wandb
run =  wandb.init(
    name = config['model'], ### Wandb creates random run names if you skip this field, we recommend you give useful names
    reinit= True, ### Allows reinitalizing runs when you re-run this cell
    project ="hw1", ### Project should be created in your wandb account 
    config=config,### Wandb Config for your run
    entity=""# add file name
)
artifact = run.use_artifact('', type='model') # add name
artifact_dir = artifact.download()

In [None]:
torch.cuda.empty_cache()
gc.collect()

Load model 

In [None]:
# Define model
model1 = Net1(config['context'], 2048,2048,2048,2048,2048,512, config['drop'], True, config['initial']).to(device)
# Load model state from the checkpoint file inside the downloaded artifact.
# The training loop stores the checkpoint in the artifact under the name "Model";
# map_location lets a GPU-saved checkpoint load on whatever device is active.
model1.load_state_dict(torch.load(os.path.join(artifact_dir, 'Model'), map_location=device)['model_state_dict'])

In [None]:
# Defining Optimizer
optimizer = torch.optim.AdamW(model1.parameters(), lr=config['learning_rate'], weight_decay=config['weight_decay'])
# Load optimizer state (moment estimates) from the checkpoint inside the downloaded artifact
optimizer.load_state_dict(torch.load(os.path.join(artifact_dir, 'Model'), map_location=device)['optimizer_state_dict'])
# load_state_dict also restores the *saved* param-group hyperparameters, which
# would silently discard the values in the new config. Re-apply them, and drop
# the stale 'initial_lr' so a newly created scheduler starts from the new lr.
for group in optimizer.param_groups:
    group['lr'] = config['learning_rate']
    group['weight_decay'] = config['weight_decay']
    group.pop('initial_lr', None)

Change the CosineAnnealingLR minimum learning rate to 1e-5

In [None]:
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max = batch_number,
                                                           eta_min=1e-5, last_epoch=- 1, verbose=False)
# Defining Loss function 
criterion = nn.CrossEntropyLoss()
scaler = torch.cuda.amp.GradScaler()

In [None]:
train1(model1, train_loader, val_loader, optimizer, criterion, scheduler, scaler)

Save model

In [None]:
# Save the retrained network. This session trained `model1`, so its weights
# (not the earlier `model`) must be paired with this optimizer's state.
torch.save({'model_state_dict': model1.state_dict(),
            'optimizer_state_dict': optimizer.state_dict()
            }, "Model")

# Creating Artifact
model_artifact = wandb.Artifact(config['model'], type='model')
# Adding model file to Artifact
model_artifact.add_file("Model")
# Saving Artifact to WandB
run.log_artifact(model_artifact)
wandb.finish()

## Second Re-Train
    
    change dropout from 0.2 to 0.15

Define new hyperparameter


In [None]:
config = {
    'epochs': 40,
    'batch_size' : 1024,
    'context' : 32,
    'learning_rate' : 0.0001,
    'lr_schedule': 'CosineAnnealing',
    'optimizer': 'Adamw',
    'weight_decay': 0.01,
    'model': "try4",
    'drop':0.15,
    'initial':'kaiming_uniform'
}

In [None]:
import wandb
run =  wandb.init(
    name = config['model'], ### Wandb creates random run names if you skip this field, we recommend you give useful names
    reinit= True, ### Allows reinitalizing runs when you re-run this cell
    project ="hw1", ### Project should be created in your wandb account 
    config=config,### Wandb Config for your run
    entity="11785chong"
)
artifact = run.use_artifact('11785chong/hw1/try4:v0', type='model')
artifact_dir = artifact.download()

In [None]:
torch.cuda.empty_cache()
gc.collect()

Load model 

In [None]:
# Define model
model1 = Net1(config['context'], 2048,2048,2048,2048,2048,512, config['drop'], True, config['initial']).to(device)
# Load model state from the directory returned by artifact.download() rather
# than a hard-coded absolute path; map_location lets a GPU-saved checkpoint
# load on whatever device is active.
model1.load_state_dict(torch.load(os.path.join(artifact_dir, 'Model'), map_location=device)['model_state_dict'])

In [None]:
# Defining Optimizer
optimizer = torch.optim.AdamW(model1.parameters(), lr=config['learning_rate'], weight_decay=config['weight_decay'])
# Load optimizer state (moment estimates) from the downloaded artifact
optimizer.load_state_dict(torch.load(os.path.join(artifact_dir, 'Model'), map_location=device)['optimizer_state_dict'])
# load_state_dict also restores the *saved* param-group hyperparameters, which
# would silently discard the lower learning rate in the new config. Re-apply
# the config values, and drop the stale 'initial_lr' so a newly created
# scheduler starts from the new lr.
for group in optimizer.param_groups:
    group['lr'] = config['learning_rate']
    group['weight_decay'] = config['weight_decay']
    group.pop('initial_lr', None)

Change the CosineAnnealingLR minimum learning rate to 1e-5

In [None]:
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max = batch_number,
                                                           eta_min=1e-5, last_epoch=- 1, verbose=False)
# Defining Loss function 
criterion = nn.CrossEntropyLoss()
scaler = torch.cuda.amp.GradScaler()

In [None]:
train1(model1, train_loader, val_loader, optimizer, criterion, scheduler, scaler)

Save model

In [None]:
# Save the retrained network. This session trained `model1`, so its weights
# (not the earlier `model`) must be paired with this optimizer's state.
torch.save({'model_state_dict': model1.state_dict(),
            'optimizer_state_dict': optimizer.state_dict()
            }, "Model")

# Creating Artifact
model_artifact = wandb.Artifact(config['model'], type='model')
# Adding model file to Artifact
model_artifact.add_file("Model")
# Saving Artifact to WandB
run.log_artifact(model_artifact)
wandb.finish()

# Testing and submission to Kaggle

In [None]:
test_data = AudioTestDataset('/content', config['context'], offset=config['context'], partition= "test-clean", limit=-1) 
test_loader = torch.utils.data.DataLoader(test_data, batch_size=config['batch_size'], shuffle=False)

In [None]:
def test(model, test_loader):
    """Predict a phoneme index for every frame yielded by ``test_loader``.

    Args:
        model: Network mapping a batch of frames to per-class logits.
        test_loader: Iterable yielding batches of frames (no labels).

    Returns:
        A flat list of predicted class indices, in loader order.
    """
    model.eval()  # inference behaviour for dropout / batch norm
    test_predictions = []

    # Gradients are not needed at inference time
    with torch.inference_mode():
        for frames in tqdm(test_loader):
            output = model(frames.float().to(device))
            # The most likely phoneme is the argmax over the class dimension
            test_predictions += torch.argmax(output, dim=1).tolist()

    return test_predictions

In [None]:
predictions = test(model1, test_loader)

  0%|          | 0/1898 [00:00<?, ?it/s]

In [None]:
### Create the submission CSV: a header line followed by one "id,label" row per frame
with open("./submission.csv", "w+") as f:
    f.write("id,label\n")
    for frame_id, label in enumerate(predictions):
        f.write("{},{}\n".format(frame_id, label))

In [None]:
## Submit to kaggle competition using kaggle API
!kaggle competitions submit -c 11-785-f22-hw1p2 -f ./submission.csv -m "Test Submission"

100% 18.6M/18.6M [00:01<00:00, 10.1MB/s]
Successfully submitted to Frame-Level Speech Recognition