# Imports

In [3]:
import os
# import os
os.environ['TRANSFORMERS_CACHE'] = '/data/cg46773/transformer'
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="7" 
from tqdm.notebook import tqdm


In [4]:
# import os
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import torchaudio
import torchaudio.transforms as transforms
import argparse
import torch
from torch import nn, optim
from torch.nn import functional as F
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from torch.utils.data.sampler import Sampler, BatchSampler, SubsetRandomSampler
import pickle
import numpy as np
import python_speech_features 
import scipy.io.wavfile as wav




In [14]:
import math

In [5]:
from ast import literal_eval

In [6]:
parser = argparse.ArgumentParser()
parser.add_argument('--batch-size', type=int, default=1, metavar='N',
                    help='input batch size for training (default: 10)')
parser.add_argument('--epochs', type=int, default=10, metavar='N',
                    help='number of epochs to train (default: 10)')
parser.add_argument('--no-cuda', action='store_true', default=False,
                    help='disables CUDA training')
parser.add_argument('--no-mps', action='store_true', default=False,
                    help='disables macOS GPU training')
parser.add_argument('--seed', type=int, default=1, metavar='S',
                    help='random seed (default: 1)')
parser.add_argument('--log-interval', type=int, default=300, metavar='N',
                    help='how many batches to wait before logging training status')
parser.add_argument('--silence', type=int, default=1, choices=[0,1],metavar='N',
                    help='If silence==1 then _ char is put in begin/end of phm_seq, if ')
parser.add_argument('--hidden_size', type=int, default=10, metavar='N',
                    help='hidden size of the LSTM')
parser.add_argument('--num_layers', type=int, default=1, metavar='N',
                    help='num layers of  LSTM')
parser.add_argument('--lr', type=float, default=1e-3,
                    help='learning rate')
args = parser.parse_args(args=[])
args.cuda = not args.no_cuda and torch.cuda.is_available()
# args.mps = not args.no_mps and torch.backends.mps.is_available()

if args.cuda:
    device = torch.device("cuda")
elif args.mps:
    device = torch.device("mps")
else:
    device = torch.device("cpu")


In [15]:
args.num_layers,args.silence

(1, 1)

# Initialize Locale, Phone, Phoneme, Mapping

In [8]:
allophone_data = pickle.load(open("allophone_mappings_new.pkl","rb"))
list(allophone_data.keys())
num_universal_phones = len(allophone_data['id to universal_phone'])
locale_to_id={locale:idx for idx,locale in enumerate(allophone_data['language_locale to (language-local)id to phoneme'].keys())}
id_to_locale={idx:locale for locale,idx in locale_to_id.items()}
# maps locale to len of phonemes
locale_to_num_phonemes={locale:len(allophone_data['language_locale to (language-local)id to phoneme'][locale]) for locale in locale_to_id}
lang_to_phoneme_to_id = allophone_data['language_locale to phoneme to (language-local)id']

In [9]:
num_universal_phones

642

In [10]:
list(allophone_data.keys())

['id to universal_phone',
 'universal_phone to id',
 'locale to langcode mapping',
 'langcode to locale mapping',
 'language_locale to (language-local)id to phoneme',
 'language_locale to phoneme to (language-local)id',
 '(Allophone mappings) language_locale to (language-local)id to universal_phone_ids']

In [11]:
'''
    This cell initializes the signature matrix
'''
locale_to_allophone_mappings_id = allophone_data['(Allophone mappings) language_locale to (language-local)id to universal_phone_ids']
#locale_to_allophone_mappings_array = {loc:np.zeros((locale_to_num_phonemes[loc], num_universal_phones)) for loc in locale_to_id}
signature_matrix = {loc:np.zeros((locale_to_num_phonemes[loc], num_universal_phones)) for loc in locale_to_id}

for lang in signature_matrix:
    signature_array = signature_matrix[lang]
    # print(locale_to_allophone_mappings_id[lang])
    for q, phones in locale_to_allophone_mappings_id[lang].items():
        for p in phones:
            # q (Phoneme) x P (phone)
            signature_array[q, p] = 1


# Initialize Dataset

In [10]:
#https://pytorch.org/audio/stable/generated/torchaudio.transforms.MFCC.html#torchaudio.transforms.MFCC
# def MFCC(filename="sample.wav"):
def MFCC(filename="output1.wav"):
    waveform, sample_rate = torchaudio.load(filename, normalize=True)
    transform = transforms.MFCC(
        sample_rate=sample_rate,
        n_mfcc=40,
        #https://pytorch.org/audio/stable/generated/torchaudio.transforms.MelSpectrogram.html#torchaudio.transforms.MelSpectrogram
        melkwargs={"n_fft": 400, #"hop_length": 160,
                   "n_mels": 40, "center": False}
                  # "win_length": 0.025},
    )
    mfcc = transform(waveform)
    return mfcc

In [11]:
import scipy.signal

In [12]:
(rate,sig) = wav.read('data/hin/audio/hin-008-001.wav')
# mfcc = torch.Tensor(
# print(len(sig))
signal_resampled=scipy.signal.resample(sig,(len(sig)*16000)//rate)
# python_speech_features.mfcc(signal_resampled,16000,numcep=40, nfilt=80)
# )


In [13]:
rate

44100

In [14]:
class MozillaDataset(Dataset):
    def __init__(self, args, csv_file, type_,pt_file,audio_dir):
        data = pd.read_csv(csv_file,keep_default_na=False, na_values=[''])
        data=data[data["type"]==type_].reset_index(drop=True)
        self.length = len(data)
        audio_paths=audio_dir + '/' + data["path"]
        if not data["path"].iloc[0].endswith(".wav"):
            audio_paths=audio_paths + ".wav"
        self.x_mfcc = []
        #JORDI-- torch-mfcc computation\n",
        # for audio_path in audio_paths:
        #     mfcc = MFCC(audio_path)
        #     #so that the final dim is: 1 x time x n_mfcc\n",
        #     mfcc = mfcc.permute(0,2,1)
        #     mfcc = torch.squeeze(mfcc)
        #     self.x_mfcc.append(mfcc)
        
        #Chitrank-- via python-speech-features",
        want_to_load_from_pt_file=True
        if want_to_load_from_pt_file==True:
            self.x_mfcc=torch.load(pt_file)
        else:    
            for audio_path in tqdm(audio_paths):
                (rate,sig) = wav.read(audio_path)
                if rate>16000:
                    sig=scipy.signal.resample(sig,(len(sig)*16000)//rate)
                    rate=16000
                mfcc = torch.Tensor(python_speech_features.mfcc(sig,rate,numcep=40, nfilt=80))
                # mfcc size is already T x F(=40)
                self.x_mfcc.append(mfcc)
            torch.save(self.x_mfcc,f=pt_file)
        

        self.lang = [locale_to_id[i] for i in data["langcode"].values]
        self.start_pos_data = [0]
        for idx in range(1,len(self.lang)):
            if self.lang[idx] != self.lang[idx-1]:
                # append when the language changes 
                self.start_pos_data.append(idx)

        if args.silence==1:
            phm_seq_raw = [['_']+literal_eval(i)+['_'] for i in data["phoneme_sequence"].values]
        else:
            phm_seq_raw = [literal_eval(i) for i in data["phoneme_sequence"].values]
        phm_seq_raw= [ torch.Tensor([lang_to_phoneme_to_id[id_to_locale[self.lang[idx]]][phm] for phm in phm_seq])
                      for idx,phm_seq in enumerate(phm_seq_raw)]
        # self.raw = phm_seq_raw

        self.phm_seq=phm_seq_raw
#         for i in range(self.length):
#             sequence_ = []
#             lang_ = self.lang[i]
#
#             # some phonemes occupies two character space, we have to account for that
#             j = 0
#             special_phm = ["ɕ"]
#             while j < len(phm_seq_raw[i]):
#                 phm = phm_seq_raw[i][j]
#                 if phm == "ɕ" and lang_ == "ru":
#                     sequence_.append(lang_to_phoneme_to_id[lang_]["ɕː"])
#                     j += 2
#                 elif j < len(phm_seq_raw[i]) - 1 and phm_seq_raw[i][j+1] not in lang_to_phoneme_to_id[lang_] and phm_seq_raw[i][j+1] not in special_phm:
#                     next_phm = phm_seq_raw[i][j+1]
#                     new_phm = phm + next_phm
#                     sequence_.append(lang_to_phoneme_to_id[lang_][new_phm])
#                     j += 2
#                 else:
#                     sequence_.append(lang_to_phoneme_to_id[lang_][phm])
#                     j += 1
#             self.phm_seq.append(torch.Tensor(sequence_))
        #self.phm_seq = torch.Tensor(self.phm_seq)

    def __len__(self):
        return self.length

    def __getitem__(self, idx):
        return {"x_mfcc": self.x_mfcc[idx], "lang": self.lang[idx], "phm_seq": self.phm_seq[idx],}
                # "raw":self.raw[idx]}

mozilla_dataset_train = MozillaDataset(args,csv_file='validated_1000.csv',
                                       type_="train",
                                       pt_file="train_x_mfcc_1000.pt",
                                       audio_dir='mozilla/wav_1000_100_1000/')
mozilla_dataset_dev = MozillaDataset(args,csv_file='validated_1000.csv',
                                     type_="dev",
                                     pt_file="dev_x_mfcc_1000.pt",
                                     audio_dir='mozilla/wav_1000_100_1000/')
mozilla_dataset_test = MozillaDataset(args,csv_file='validated_1000.csv',
                                      type_="test",
                                      pt_file="test_x_mfcc_1000.pt",
                                      audio_dir='mozilla/wav_1000_100_1000/')
ucla_dataset_test = MozillaDataset(args,csv_file='ucla_utterances.csv',
                                      type_="test",
                                      pt_file="ucla_test_x_mfcc_1000.pt",
                                      audio_dir='data/')
# print("length of dataset: ", len(mozilla_dataset))

In [15]:
def collate_fn(batch):
    # print("printing all the raw phonemes in every batch: ")
    # for d in batch:
    #     print(d["raw"])
    x_mfcc_list = [d['x_mfcc'] for d in batch]
    # print("shape of first in x_mfcc_list", x_mfcc_list[0].shape)
    # print("shape of first in x_mfcc_list", x_mfcc_list[1].shape)
    # print("shape of first in x_mfcc_list", x_mfcc_list[2].shape)
    # print("shape of first in x_mfcc_list", x_mfcc_list[3].shape)
    x_mfcc_tensor = pad_sequence(x_mfcc_list, batch_first=True, padding_value=0)
    #x_mfcc_lengths = sorted([len(x) for x in x_mfcc_list], reverse=True)
    x_mfcc_lengths = [len(x) for x in x_mfcc_list]
    #print("x_mfcc_lengths: ", x_mfcc_lengths)
    x_mfcc_tensor = pack_padded_sequence(x_mfcc_tensor, lengths= x_mfcc_lengths, batch_first=True, enforce_sorted = False)
    lang_list = [d['lang'] for d in batch]
    #lang_list = torch.tensor(lang_list)
    phm_seq_list = [d['phm_seq'] for d in batch]
    #print("phm_seq_list", phm_seq_list)
    phm_seq_tensor = pad_sequence(phm_seq_list, batch_first=True, padding_value=0).to(int)
    phm_seq_lengths = [len(x) for x in phm_seq_list]
    # print("phm_seq_lengths: ", phm_seq_lengths)
    #phm_seq_pack_padded_sequence = pack_padded_sequence(phm_seq_tensor, lengths= phm_seq_lengths, batch_first=True, enforce_sorted = False)
    # print("packed padded sequence: ", x_mfcc_tensor)
    return {
        "x_mfcc": x_mfcc_tensor, # packed padded tensor
        "lang": lang_list, # python list
        "phm_seq": phm_seq_tensor, # padded tensor
        "x_mfcc_lengths": torch.Tensor(x_mfcc_lengths).to(int), # sorted in decreasing order
        "phm_seq_lengths": torch.Tensor(phm_seq_lengths).to(int) # sorted in decreasing order
    }

In [16]:
# for i in range(len(mozilla_dataset)):
#     sample = mozilla_dataset[i]
#     print(i, sample['x_mfcc'].shape, sample['lang'], sample["phm_seq"].shape)

#     if i == 1:
#         break

In [17]:
class BucketSampler(Sampler):
    '''
        creates a bucket sampler where we sample randomly from the same language without replacement
    '''
    def __init__(self, dataset, batch_size, generator=None) -> None:
        self.dataset = dataset
        self.batch_size = batch_size
        self.generator = generator
        start_pos_data = self.dataset.start_pos_data
        start_end_indices = []
        for i in range(len(start_pos_data) - 1):
            start_end_indices.append((start_pos_data[i], start_pos_data[i+1]))
        start_end_indices.append((start_pos_data[-1], len(self.dataset)))
        ranges  = [range(start, end) for start, end in start_end_indices]
        subset_samplers = [SubsetRandomSampler(range_, generator=generator) for range_ in ranges]
        self.samplers = [
            BatchSampler(subset_sampler, batch_size, drop_last=False) for subset_sampler in subset_samplers
        ]
        self._len = 0
        for sampler in self.samplers:
            self._len += len(sampler)
        
    def __iter__(self):
        iterators = [iter(sampler) for sampler in self.samplers]
        while iterators:
            randint = torch.randint(0, len(iterators),size=(1,), generator=self.generator)[0]
            try:
                yield next(iterators[randint])
            except StopIteration:
                iterators.pop(randint)
    def __len__(self):
        return self._len
bucketSampler = BucketSampler(mozilla_dataset_train, batch_size = 4)
dataloader = DataLoader(mozilla_dataset_train, batch_sampler = bucketSampler, collate_fn=collate_fn)#, num_workers=4)
for i in dataloader:
    # print("mfcc: ", i["x_mfcc"].data.shape)
    # print("language: ",i["lang"])
    # print("phm_seq: ", i["phm_seq"])
    break

In [18]:
np.sum(signature_matrix["daa"],axis=1)

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 0., 0., 0., 1., 0.])

# Allosaurus

In [19]:
device,args.hidden_size

(device(type='cuda'), 10)

In [20]:
all_langs=set(mozilla_dataset_train.lang)
all_langs=all_langs.union(set(mozilla_dataset_dev.lang))
all_langs=all_langs.union(set(mozilla_dataset_test.lang))
all_langs=all_langs.union(set(ucla_dataset_test.lang))
# all_langs=set(mozilla_dataset_train.langs)


In [1]:
len(all_langs)

NameError: name 'all_langs' is not defined

In [12]:
w = torch.empty(3, 5)
nn.init.xavier_normal_(w)

tensor([[-0.3832,  0.4862,  0.7070,  0.2622, -0.3949],
        [ 0.3594,  0.3629,  0.1502, -0.4447,  0.4887],
        [-0.6370, -0.1855, -0.5306,  0.5091,  1.3228]])

In [None]:
1-- [B x T x 2*H] [2*H x A] [A x P] 
# 2-- [B x T x H] [P x 2*H]


In [13]:
class Heirachical(nn.Module):
    def __init__(self, args, num_attrs, num_attrtypes, attrs2attrtype):
        """
            attrs2attrtype-- 1-0 matrix of shape [A x AT]
        """
        super().__init__()
        self.allosaurus_layer_matrix[lang_id]=torch.nn.Parameter(torch.Tensor(signature_matrix[locale_]).to(device),requires_grad=True)
        if args.original_heir==1:
            final_phone_emb_size=2*args.hidden_size
            self.attr_embeds=torch.nn.empty(num_attrs, final_phone_emb_size) # It's a (A x 2*H) embeddings matrix 
        else:
            final_phone_emb_size=2*args.hidden_size
            attr_emb_size=math.ceil(final_phone_emb_size/num_attrtypes)
            self.attr_embeds=torch.nn.empty(num_attrs, attr_emb_size) # It's a (A x [2*H/AT]) embeddings matrix 
        self.attrs_combination
        self.allosaurus_model=Allosaurus()
    def forward(self, X_mfcc, lang_id):
        """
            
        """
        packed_padded_sequence_output, (h_n, c_n) = self.encoder(X_mfcc) # returns (B x T x h)
        seq_unpacked, lens_unpacked = pad_packed_sequence(packed_padded_sequence_output, batch_first=True)
        # the hidden space is times 2 because it is bidirectional
        # print("shape of padded sequence in hidden space (Batch x Time x (hidden x2 because bi directional)): ", seq_unpacked.shape, flush=True)
        # print("lens_unpacked: ", lens_unpacked, flush=True)
        # returns (B x T x N)
        phone_distribution = self.phone_distribution_transf_matrix(seq_unpacked)  
        # print("shape of phone_distribution (Batch x Time x num_universal_phones): ",phone_distribution.shape ,flush=True)
        phoneme_distrib = self.compute_phoneme_distribution(phone_distribution, lang_id)
        # print("shape of phoneme after amax (Batch x Time x ?): ",phoneme_distrib.shape ,flush=True)
        #[compute_phoneme_distribution(allosaurus_layer_matrix[i],langs[i])
         #                for i in range(len(langs))] # (return )
        return phoneme_distrib
.
# lang -> phoneme id mapping        
        
        

class Allosaurus(nn.Module):
    def __init__(self, args,num_universal_phones, signature_matrix):
        '''
            signature_matrix: {lang (str): Mi x N}; locale_to_allophone_mappings_array
        '''
        super().__init__()
        self.num_universal_phones = num_universal_phones  # In short N
  
        self.encoder=Encoder(n_layers=args.num_layers,hidden_dim=args.hidden_size,input_size=40) # In short lstm has (n_lay, h, #mfcc) 
        self.phone_distribution_transf_matrix=nn.Linear(2 * args.hidden_size, self.num_universal_phones) # It's a (h x N) transformation matrix 
        self.allosaurus_layer_matrix = {}
        self.allosaurus_layer_matrix_orig = {}
        for lang_id in all_langs:
            # every such matrix is of shape (M_i x N)
            locale_=id_to_locale[lang_id]
            self.allosaurus_layer_matrix[lang_id]=torch.nn.Parameter(torch.Tensor(signature_matrix[locale_]).to(device),requires_grad=True)
            self.allosaurus_layer_matrix_orig[lang_id]=torch.nn.Parameter(torch.Tensor(signature_matrix[locale_]).to(device),requires_grad=False)

        self.loss_fn = torch.nn.CTCLoss(blank = 0, reduction='mean', zero_infinity=True)
    def compute_phoneme_distribution(self, phone_distrib,lang_id):
        # phone_distrib-- ( B x T x N), 
        # lang-- language index
        # retrieve sign_matrix-- (M_i x N)
        phoneme_phone_distrib = torch.amax(self.allosaurus_layer_matrix[lang_id]*(phone_distrib.unsqueeze(2)),dim=-1)

        # print("Final phoneme_phone_distrib matrix(B x T x phonemes):", phoneme_phone_distrib.shape)
        return torch.nn.functional.log_softmax(phoneme_phone_distrib, dim=-1)
    
    def forward(self, X_mfcc, lang_id):
        '''
            frames is (B x T x F) matrix where 
            B is batch_size
            T= time_dimension 
            F- numFeatures in every unit time 
            Langs is (B) matrix that tell us which lang_index does each batch_elemnet corresponds to.
            --  in our special case it is a single value
        '''
        packed_padded_sequence_output, (h_n, c_n) = self.encoder(X_mfcc) # returns (B x T x h)
        seq_unpacked, lens_unpacked = pad_packed_sequence(packed_padded_sequence_output, batch_first=True)
        # the hidden space is times 2 because it is bidirectional
        # print("shape of padded sequence in hidden space (Batch x Time x (hidden x2 because bi directional)): ", seq_unpacked.shape, flush=True)
        # print("lens_unpacked: ", lens_unpacked, flush=True)
        # returns (B x T x N)
        phone_distribution = self.phone_distribution_transf_matrix(seq_unpacked)  
        # print("shape of phone_distribution (Batch x Time x num_universal_phones): ",phone_distribution.shape ,flush=True)
        phoneme_distrib = self.compute_phoneme_distribution(phone_distribution, lang_id)
        # print("shape of phoneme after amax (Batch x Time x ?): ",phoneme_distrib.shape ,flush=True)
        #[compute_phoneme_distribution(allosaurus_layer_matrix[i],langs[i])
         #                for i in range(len(langs))] # (return )
        return phoneme_distrib
             
        
    def loss(self, y_ref, y_scores, src_length, tgt_length, lang_id, alpha=10):
        # y_ref is of shape (B x T_o) where T_o= time_stamps in output space. 
        # y_predicted is of shape [T_inp x M_o_1, T_inp x M_o_2, ..., T_inp x M_o_B] where B is the Batch_size
        #   where T_i= time_stamps in input space. Note T_o!=T_i 

        # ctc_loss= torch.mean([self.loss_fn(y_scores[i], y_ref[i]) for i in range(len(y_ref))])
        ctc_loss= torch.mean(self.loss_fn(y_scores.transpose(0,1), y_ref, src_length, tgt_length))
        # return ctc_loss+alpha*torch.mean([torc.norm((allosaurus_layer_matrix[i]-signature_matrix[i])) 
        #                    for i in range(len(languages_in_current_batch))])

        weights = self.allosaurus_layer_matrix[lang_id]
        weights_orig = self.allosaurus_layer_matrix_orig[lang_id]
        a=ctc_loss
        b=alpha * torch.square(torch.linalg.matrix_norm((weights-weights_orig)+1e-8))
        return a+b
        
class Encoder(nn.Module):
    # def __init__(self, n_layers=6,hidden_dim=1024,input_size=40):
    def __init__(self, n_layers=1,hidden_dim=10,input_size=40):
        super().__init__()

        self.bi_LSTM=torch.nn.LSTM(input_size=input_size,num_layers=n_layers,
                         hidden_size=hidden_dim,batch_first=True,
                            bidirectional=True)
    def forward(self, x):
        # x is (B x T x F=40), output (B x T x lstm_hidden_size)
        return self.bi_LSTM(x)
args.batch_size=64
# args.num_layers=6
args.num_layers=4
# args.hidden_size=1024
args.hidden_size=256
args.lr=5e-3
universal_phone_num = len(allophone_data['id to universal_phone'])
model = Allosaurus(args, universal_phone_num, signature_matrix).to(device)
# optimizer = optim.Adam(model.parameters(), lr=args.lr)
optimizer = optim.SGD(model.parameters(), lr=args.lr)
torch.autograd.set_detect_anomaly(True)

SyntaxError: invalid syntax (1509511437.py, line 9)

In [23]:
model.encoder.bi_LSTM.num_layers,model.encoder.bi_LSTM.hidden_size

(4, 256)

In [24]:
class bcolors:
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKCYAN = '\033[96m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

class EarlyStopping:
    """Early stops the training if validation loss doesn't improve after a given patience."""
    def __init__(self, log_file,patience=7, verbose=False, delta=0, path='allosrs/model/checkpoint.pt', trace_func=print):
        """
        Args:
            patience (int): How long to wait after last time validation loss improved.
                            Default: 7
            verbose (bool): If True, prints a message for each validation loss improvement. 
                            Default: False
            delta (float): Minimum change in the monitored quantity to qualify as an improvement.
                            Default: 0
            path (str): Path for the checkpoint to be saved to.
                            Default: 'checkpoint.pt'
            trace_func (function): trace print function.
                            Default: print            
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_loss_min = np.Inf
        self.delta = delta
        self.path = path
        self.trace_func = trace_func
        self.log_file=log_file
    def __call__(self, val_loss, model):

        score = -val_loss

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            self.counter += 1
            with open(self.log_file,mode="a") as wfile:
                print(f'EarlyStopping counter: {self.counter} out of {self.patience}\n',file=wfile)
            self.trace_func(f'{bcolors.FAIL}EarlyStopping counter: {self.counter} out of {self.patience}{bcolors.ENDC}\n')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        '''Saves model when validation loss decrease.'''
        if self.verbose:
            with open(self.log_file,mode="a") as wfile:
                print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...\n',file=wfile)
            
            self.trace_func(f'{bcolors.OKGREEN}Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...{bcolors.ENDC}\n')
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss


In [25]:
#TODO: 1. write the validation loss loop while using the new data csv file -- change locale to language
#TODO: 2. early stopping
#TODO: 3. Phoneme error 

In [26]:
def train(epoch,dl,log_file):
# training loop
    model.train()
    train_loss = 0
    # "x_mfcc": x_mfcc_tensor, # packed padded sequence
    # "lang": lang_list, # python list
    # "phm_seq": phm_seq_tensor, # packed padded tensor
    # "x_mfcc_lengths": x_mfcc_lengths, # sorted in decreasing order
    # "phm_seq_lengths": phm_seq_lengths # sorted in decreasing order
    batch_idx=0
    predictions=[]
    for data in tqdm(dl):
        # break
        batch_idx+=1
        #X_mfcc = data["x_mfcc"].to(device)
        X_mfcc = data["x_mfcc"]
        lang = data["lang"]
        phm_seq = data["phm_seq"]
        x_mfcc_lengths = data["x_mfcc_lengths"]
        phm_seq_lengths = data["phm_seq_lengths"]
        batch_size=len(lang)

        optimizer.zero_grad()
        # print("the language of this batch is: ", lang[0], flush=True)
        # print("batch_idx", batch_idx, flush=True)
        # we are assuming one language per batch here
        y_predicted = model(X_mfcc.to(device), lang[0])
        loss = model.loss(phm_seq.to(device), y_predicted, x_mfcc_lengths.to(device), phm_seq_lengths.to(device), lang[0])
        loss.backward()
        optimizer.step()
        train_loss += batch_size*loss.detach().cpu().item()
        if batch_idx % args.log_interval == 0:
            # print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
            #     epoch, batch_idx * len(lang), len(train_loader.dataset),
            #     100. * batch_idx / len(train_loader),
            #     loss.item() / len(data)))
            print('\t\tTrain Epoch: {} iter:{} \tLoss: {:.6f}'.format(
                epoch, batch_idx,loss.item() / len(data)))
            with open(log_file,mode="a") as wfile:
                print('\t\tTrain Epoch: {} iter:{} \tLoss: {:.6f}'.format(
                    epoch, batch_idx,loss.item() / len(data)),file=wfile)
    if epoch%1==0:
        print('Train Epoch: {} ====> Average loss: {:.4f}'.format(
          epoch, train_loss / len(dl.dataset)))
        with open(log_file,mode="a") as wfile:
            print('Train Epoch: {} ====> Average loss: {:.4f}'.format(
              epoch, train_loss / len(dl.dataset)),file=wfile)
            
    return train_loss

In [27]:
# test_loader = DataLoader(mozilla_dataset, batch_sampler = bucketSampler, collate_fn=collate_fn)
def evaluate(epoch,dl,log_file):
    model.eval()
    test_loss = 0
    with torch.no_grad():
        for data in tqdm(dl):
            # break
            X_mfcc = data["x_mfcc"]
            # print(X_mfcc.data.shape)
            lang = data["lang"]
            phm_seq = data["phm_seq"]
            x_mfcc_lengths = data["x_mfcc_lengths"]
            phm_seq_lengths = data["phm_seq_lengths"]
            batch_size=len(lang)
            y_predicted = model(X_mfcc.to(device), lang[0])
            loss = model.loss(phm_seq.to(device), y_predicted, x_mfcc_lengths.to(device), phm_seq_lengths.to(device), lang[0])
            test_loss += batch_size*loss.detach().cpu().item()
    test_loss /= len(dl.dataset)
    with open(log_file,mode="a") as wfile:
        print('Validation/Test Epoch {}====> average loss: {:.4f}'.format(epoch,test_loss),file=wfile)
    print('Validation/Test Epoch {}====> average loss: {:.4f}'.format(epoch,test_loss))
    return test_loss

In [None]:
args.batch_size=20
args.epochs=1000
bucketSampler_train = BucketSampler(mozilla_dataset_train, batch_size = args.batch_size)
bucketSampler_dev = BucketSampler(mozilla_dataset_dev, batch_size = 20)
# bucketSampler_test = BucketSampler(mozilla_dataset_test, batch_size = 64)
train_dataloader = DataLoader(mozilla_dataset_train, batch_sampler = bucketSampler_train, collate_fn=collate_fn)#, num_workers=4)
dev_dataloader = DataLoader(mozilla_dataset_dev, batch_sampler = bucketSampler_dev, collate_fn=collate_fn)#, num_workers=4)
# test_dataloader = DataLoader(mozilla_dataset_test, batch_sampler = bucketSampler_test, collate_fn=collate_fn)#, num_workers=4)

model_filename=f"allosrs/model/lstm_nl{model.encoder.bi_LSTM.num_layers}_hd{model.encoder.bi_LSTM.hidden_size}_lr{args.lr}_bs{args.batch_size}.pt"
# model_filename=f"allosrs/model/dummy.pt"
print("MODEL SAVE NAME---",model_filename)

log_filename=f"allosrs/log/lstm_nl{model.encoder.bi_LSTM.num_layers}_hd{model.encoder.bi_LSTM.hidden_size}_lr{args.lr}_bs{args.batch_size}.log"
with open(log_filename,mode="w") as wfile:
    print("",end="",file=wfile)


es=EarlyStopping(log_file=log_filename,verbose=True,path=model_filename)
for epoch in range(1, args.epochs + 1):
# for epoch in range(1, 5+ 1):
    dev_loss=evaluate(epoch-1,dev_dataloader,log_filename)
    # dev_loss=(epoch-5)**2
    es(dev_loss,model)
    if es.early_stop==True:
        break
    train_loss=train(epoch,train_dataloader,log_filename)


MODEL SAVE NAME--- allosrs/model/lstm_nl4_hd256_lr0.005_bs20.pt


  0%|          | 0/45 [00:00<?, ?it/s]

Validation/Test Epoch 0====> average loss: 57.8042
[92mValidation loss decreased (inf --> 57.804213).  Saving model ...[0m



  0%|          | 0/450 [00:00<?, ?it/s]

		Train Epoch: 1 iter:300 	Loss: 0.763395
Train Epoch: 1 ====> Average loss: 4.8542


  0%|          | 0/45 [00:00<?, ?it/s]

Validation/Test Epoch 1====> average loss: 3.5306
[92mValidation loss decreased (57.804213 --> 3.530649).  Saving model ...[0m



  0%|          | 0/450 [00:00<?, ?it/s]

		Train Epoch: 2 iter:300 	Loss: 0.709110
Train Epoch: 2 ====> Average loss: 3.4837


  0%|          | 0/45 [00:00<?, ?it/s]

Validation/Test Epoch 2====> average loss: 3.4441
[92mValidation loss decreased (3.530649 --> 3.444104).  Saving model ...[0m



  0%|          | 0/450 [00:00<?, ?it/s]

		Train Epoch: 3 iter:300 	Loss: 0.791222
Train Epoch: 3 ====> Average loss: 3.3984


  0%|          | 0/45 [00:00<?, ?it/s]

Validation/Test Epoch 3====> average loss: 3.3203
[92mValidation loss decreased (3.444104 --> 3.320281).  Saving model ...[0m



  0%|          | 0/450 [00:00<?, ?it/s]

		Train Epoch: 4 iter:300 	Loss: 0.726778
Train Epoch: 4 ====> Average loss: 3.3054


  0%|          | 0/45 [00:00<?, ?it/s]

Validation/Test Epoch 4====> average loss: 3.2379
[92mValidation loss decreased (3.320281 --> 3.237922).  Saving model ...[0m



  0%|          | 0/450 [00:00<?, ?it/s]

		Train Epoch: 5 iter:300 	Loss: 0.698719
Train Epoch: 5 ====> Average loss: 3.2250


  0%|          | 0/45 [00:00<?, ?it/s]

Validation/Test Epoch 5====> average loss: 3.1611
[92mValidation loss decreased (3.237922 --> 3.161111).  Saving model ...[0m



  0%|          | 0/450 [00:00<?, ?it/s]

		Train Epoch: 6 iter:300 	Loss: 0.761653
Train Epoch: 6 ====> Average loss: 3.1674


  0%|          | 0/45 [00:00<?, ?it/s]

Validation/Test Epoch 6====> average loss: 3.1315
[92mValidation loss decreased (3.161111 --> 3.131536).  Saving model ...[0m



  0%|          | 0/450 [00:00<?, ?it/s]

		Train Epoch: 7 iter:300 	Loss: 0.757591
Train Epoch: 7 ====> Average loss: 3.1309


  0%|          | 0/45 [00:00<?, ?it/s]

Validation/Test Epoch 7====> average loss: 3.0869
[92mValidation loss decreased (3.131536 --> 3.086874).  Saving model ...[0m



  0%|          | 0/450 [00:00<?, ?it/s]

		Train Epoch: 8 iter:300 	Loss: 0.676661
Train Epoch: 8 ====> Average loss: 3.1108


  0%|          | 0/45 [00:00<?, ?it/s]

Validation/Test Epoch 8====> average loss: 3.0665
[92mValidation loss decreased (3.086874 --> 3.066508).  Saving model ...[0m



  0%|          | 0/450 [00:00<?, ?it/s]

		Train Epoch: 9 iter:300 	Loss: 0.575552
Train Epoch: 9 ====> Average loss: 3.0994


  0%|          | 0/45 [00:00<?, ?it/s]

Validation/Test Epoch 9====> average loss: 3.0571
[92mValidation loss decreased (3.066508 --> 3.057070).  Saving model ...[0m



  0%|          | 0/450 [00:00<?, ?it/s]

		Train Epoch: 10 iter:300 	Loss: 0.551283
Train Epoch: 10 ====> Average loss: 3.0817


  0%|          | 0/45 [00:00<?, ?it/s]

Validation/Test Epoch 10====> average loss: 3.0366
[92mValidation loss decreased (3.057070 --> 3.036558).  Saving model ...[0m



  0%|          | 0/450 [00:00<?, ?it/s]

		Train Epoch: 11 iter:300 	Loss: 0.684115
Train Epoch: 11 ====> Average loss: 3.0710


  0%|          | 0/45 [00:00<?, ?it/s]

Validation/Test Epoch 11====> average loss: 3.0479
[91mEarlyStopping counter: 1 out of 7[0m



  0%|          | 0/450 [00:00<?, ?it/s]

		Train Epoch: 12 iter:300 	Loss: 0.687688
Train Epoch: 12 ====> Average loss: 3.0650


  0%|          | 0/45 [00:00<?, ?it/s]

Validation/Test Epoch 12====> average loss: 3.0304
[92mValidation loss decreased (3.036558 --> 3.030394).  Saving model ...[0m



  0%|          | 0/450 [00:00<?, ?it/s]

		Train Epoch: 13 iter:300 	Loss: 0.564369
Train Epoch: 13 ====> Average loss: 3.0583


  0%|          | 0/45 [00:00<?, ?it/s]

Validation/Test Epoch 13====> average loss: 3.0531
[91mEarlyStopping counter: 1 out of 7[0m



  0%|          | 0/450 [00:00<?, ?it/s]

		Train Epoch: 14 iter:300 	Loss: 0.496156
Train Epoch: 14 ====> Average loss: 3.0476


  0%|          | 0/45 [00:00<?, ?it/s]

Validation/Test Epoch 14====> average loss: 3.0246
[92mValidation loss decreased (3.030394 --> 3.024609).  Saving model ...[0m



  0%|          | 0/450 [00:00<?, ?it/s]

		Train Epoch: 15 iter:300 	Loss: 0.630353
Train Epoch: 15 ====> Average loss: 3.0529


  0%|          | 0/45 [00:00<?, ?it/s]

Validation/Test Epoch 15====> average loss: 3.0182
[92mValidation loss decreased (3.024609 --> 3.018205).  Saving model ...[0m



  0%|          | 0/450 [00:00<?, ?it/s]

		Train Epoch: 16 iter:300 	Loss: 0.512763
Train Epoch: 16 ====> Average loss: 3.0451


  0%|          | 0/45 [00:00<?, ?it/s]

Validation/Test Epoch 16====> average loss: 3.0264
[91mEarlyStopping counter: 1 out of 7[0m



  0%|          | 0/450 [00:00<?, ?it/s]

		Train Epoch: 17 iter:300 	Loss: 0.663007
Train Epoch: 17 ====> Average loss: 3.0487


  0%|          | 0/45 [00:00<?, ?it/s]

Validation/Test Epoch 17====> average loss: 3.0225
[91mEarlyStopping counter: 2 out of 7[0m



  0%|          | 0/450 [00:00<?, ?it/s]

		Train Epoch: 18 iter:300 	Loss: 0.572800
Train Epoch: 18 ====> Average loss: 3.0107


  0%|          | 0/45 [00:00<?, ?it/s]

Validation/Test Epoch 18====> average loss: 2.9761
[92mValidation loss decreased (3.018205 --> 2.976138).  Saving model ...[0m



  0%|          | 0/450 [00:00<?, ?it/s]

		Train Epoch: 19 iter:300 	Loss: 0.647305
Train Epoch: 19 ====> Average loss: 3.0026


  0%|          | 0/45 [00:00<?, ?it/s]

Validation/Test Epoch 19====> average loss: 2.9822
[91mEarlyStopping counter: 1 out of 7[0m



  0%|          | 0/450 [00:00<?, ?it/s]

		Train Epoch: 20 iter:300 	Loss: 0.598766
Train Epoch: 20 ====> Average loss: 2.9974


  0%|          | 0/45 [00:00<?, ?it/s]

Validation/Test Epoch 20====> average loss: 3.0193
[91mEarlyStopping counter: 2 out of 7[0m



  0%|          | 0/450 [00:00<?, ?it/s]

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



		Train Epoch: 21 iter:300 	Loss: 0.455180
Train Epoch: 21 ====> Average loss: 2.9943


  0%|          | 0/45 [00:00<?, ?it/s]

Validation/Test Epoch 21====> average loss: 3.0211
[91mEarlyStopping counter: 3 out of 7[0m



  0%|          | 0/450 [00:00<?, ?it/s]

		Train Epoch: 22 iter:300 	Loss: 0.604088
Train Epoch: 22 ====> Average loss: 3.0001


  0%|          | 0/45 [00:00<?, ?it/s]

Validation/Test Epoch 22====> average loss: 2.9829
[91mEarlyStopping counter: 4 out of 7[0m



  0%|          | 0/450 [00:00<?, ?it/s]

		Train Epoch: 23 iter:300 	Loss: 0.544923
Train Epoch: 23 ====> Average loss: 3.0026


  0%|          | 0/45 [00:00<?, ?it/s]

Validation/Test Epoch 23====> average loss: 2.9914
[91mEarlyStopping counter: 5 out of 7[0m



  0%|          | 0/450 [00:00<?, ?it/s]

		Train Epoch: 24 iter:300 	Loss: 0.508604
Train Epoch: 24 ====> Average loss: 2.9938


  0%|          | 0/45 [00:00<?, ?it/s]

Validation/Test Epoch 24====> average loss: 2.9634
[92mValidation loss decreased (2.976138 --> 2.963357).  Saving model ...[0m



  0%|          | 0/450 [00:00<?, ?it/s]

		Train Epoch: 25 iter:300 	Loss: 0.430538
Train Epoch: 25 ====> Average loss: 2.9900


  0%|          | 0/45 [00:00<?, ?it/s]

Validation/Test Epoch 25====> average loss: 3.0432
[91mEarlyStopping counter: 1 out of 7[0m



  0%|          | 0/450 [00:00<?, ?it/s]

		Train Epoch: 26 iter:300 	Loss: 0.698824
Train Epoch: 26 ====> Average loss: 3.0011


  0%|          | 0/45 [00:00<?, ?it/s]

Validation/Test Epoch 26====> average loss: 2.9745
[91mEarlyStopping counter: 2 out of 7[0m



  0%|          | 0/450 [00:00<?, ?it/s]

		Train Epoch: 27 iter:300 	Loss: 0.534325
Train Epoch: 27 ====> Average loss: 2.9870


  0%|          | 0/45 [00:00<?, ?it/s]

Validation/Test Epoch 27====> average loss: 2.9987
[91mEarlyStopping counter: 3 out of 7[0m



  0%|          | 0/450 [00:00<?, ?it/s]

		Train Epoch: 28 iter:300 	Loss: 0.565815
Train Epoch: 28 ====> Average loss: 2.9870


  0%|          | 0/45 [00:00<?, ?it/s]

Validation/Test Epoch 28====> average loss: 2.9622
[92mValidation loss decreased (2.963357 --> 2.962219).  Saving model ...[0m



  0%|          | 0/450 [00:00<?, ?it/s]

		Train Epoch: 29 iter:300 	Loss: 0.693060
Train Epoch: 29 ====> Average loss: 2.9812


  0%|          | 0/45 [00:00<?, ?it/s]

Validation/Test Epoch 29====> average loss: 2.9618
[92mValidation loss decreased (2.962219 --> 2.961824).  Saving model ...[0m



  0%|          | 0/450 [00:00<?, ?it/s]

		Train Epoch: 30 iter:300 	Loss: 0.702064
Train Epoch: 30 ====> Average loss: 2.9924


  0%|          | 0/45 [00:00<?, ?it/s]

Validation/Test Epoch 30====> average loss: 2.9974
[91mEarlyStopping counter: 1 out of 7[0m



  0%|          | 0/450 [00:00<?, ?it/s]

		Train Epoch: 31 iter:300 	Loss: 0.667559
Train Epoch: 31 ====> Average loss: 2.9894


  0%|          | 0/45 [00:00<?, ?it/s]

Validation/Test Epoch 31====> average loss: 3.0005
[91mEarlyStopping counter: 2 out of 7[0m



  0%|          | 0/450 [00:00<?, ?it/s]

		Train Epoch: 32 iter:300 	Loss: 0.512055
Train Epoch: 32 ====> Average loss: 2.9861


  0%|          | 0/45 [00:00<?, ?it/s]

Validation/Test Epoch 32====> average loss: 2.9644
[91mEarlyStopping counter: 3 out of 7[0m



  0%|          | 0/450 [00:00<?, ?it/s]

		Train Epoch: 33 iter:300 	Loss: 0.545099
Train Epoch: 33 ====> Average loss: 2.9810


  0%|          | 0/45 [00:00<?, ?it/s]

Validation/Test Epoch 33====> average loss: 2.9549
[92mValidation loss decreased (2.961824 --> 2.954854).  Saving model ...[0m



  0%|          | 0/450 [00:00<?, ?it/s]

		Train Epoch: 34 iter:300 	Loss: 0.677112
Train Epoch: 34 ====> Average loss: 2.9809


  0%|          | 0/45 [00:00<?, ?it/s]

Validation/Test Epoch 34====> average loss: 2.9530
[92mValidation loss decreased (2.954854 --> 2.952971).  Saving model ...[0m



  0%|          | 0/450 [00:00<?, ?it/s]

		Train Epoch: 35 iter:300 	Loss: 0.560534
Train Epoch: 35 ====> Average loss: 2.9830


  0%|          | 0/45 [00:00<?, ?it/s]

Validation/Test Epoch 35====> average loss: 2.9597
[91mEarlyStopping counter: 1 out of 7[0m



  0%|          | 0/450 [00:00<?, ?it/s]

		Train Epoch: 36 iter:300 	Loss: 0.629809
Train Epoch: 36 ====> Average loss: 2.9992


  0%|          | 0/45 [00:00<?, ?it/s]

Validation/Test Epoch 36====> average loss: 2.9823
[91mEarlyStopping counter: 2 out of 7[0m



  0%|          | 0/450 [00:00<?, ?it/s]

In [None]:
a=torch.nn.Parameter(torch.zeros(3,3))
torch.linalg.matrix_norm(a-a+1)

In [None]:
set(mozilla_dataset_train.lang)

In [None]:
signature_matrix