In [33]:
import argparse
import json
import os
import random
import time, math
import torch
import torch.nn as nn
from torch.utils.tensorboard import SummaryWriter
from tqdm.auto import tqdm
import numpy as np
from warpctc_pytorch import CTCLoss
from collections import OrderedDict
import pandas as pd

from data.data_loader import AudioDataLoader, SpectrogramDataset, BucketingSampler
from data.data_loader import get_accents
from decoder import GreedyDecoder
from model import DeepSpeech, supported_rnns, ForgetNet, Encoder, Decoder, DiscimnateNet
from utils import reduce_tensor, check_loss, Decoder_loss

import easydict

In [69]:
# Experiment configuration, kept as attribute-style options (args.<name>) so the
# notebook cells can read settings the same way an argparse-driven script would.
args = easydict.EasyDict({
    # data / labels
    'train_manifest' : './data/csvs/train_sorted_EN_US.csv',
    'val_manifest' : './data/csvs/test_sorted_EN_US.csv',
    'sample_rate' : 16000,
    'labels_path' : 'labels.json',
    'window_size' : .02, 'window_stride' : .01, 'window' : 'hamming',
    # acoustic model / optimisation
    'hidden_size' : 1024, 'hidden_layers' : 5, 'rnn_type' : 'gru',
    'epochs' : 50, 'batch_size' : 32, 'num_workers' : 4,
    'patience' : 10,
    'cuda' : True,
    'lr' : 0.001, 'momentum' : 0.9, 'max_norm' : 400, 'learning_anneal' : 1.1,
    # logging / checkpointing
    'silent' : True,
    'checkpoint' : True, 'checkpoint_per_batch' : 5000,
    'visdom' : True, 'tensorboard' : True,
    'log_dir' : './visualize/deepspeech_final', 'log_params' : True,
    'id' : 'Deepspeech training',
    'continue_from' : '', 'finetune' : True,
    # augmentation
    'augment' : True,
    'noise_dir' : None, 'noise_prob' : 0.4, 'noise_min' : 0.0, 'noise_max' : 0.5,
    'no_shuffle' : True,
    'no_sorta_grad' : True,
    'bidirectional' : True,
    'spec_augment' : True,
    # distributed setup (fixed: the host previously had a stray trailing dot, '127.0.0.1.')
    'dist_url' : 'tcp://127.0.0.1:1550', 'dist_backend' : 'nccl',
    'world_size' : 1,
    'rank' : 0,
    # encoder / discriminator / forget-net sizes
    'enco_modules' : 2, 'enco_res' : True,
    'disc_modules' : 1, 'disc_res' : False,
    'forg_modules' : 2, 'forg_res' : True,
    'gpu_rank' : 0,
    'seed' : 123456,
    'opt_level' : '',
    'keep_batchnorm_fp32' : None,
    'loss_scale' : None,
    'weights' : ' ',
    'update_rule' : 1,
    'train_asr' : False,
    'dummy' : True,
    'num_epochs' : 1,
    'mw_alpha' : 0.1, 'mw_beta' : 0.2, 'mw_gamma' : 0.6 ,
    'exp_name' : './exp/1224/'

})

# Seed every RNG the notebook imports so a Restart & Run All is repeatable;
# 'seed' was configured above but never applied.
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)

device = torch.device("cuda" if args.cuda else "cpu")
if args.cuda:
    # Only touch the CUDA runtime when a GPU run was requested; calling
    # set_device unconditionally breaks CPU-only runs (cuda=False).
    torch.cuda.set_device(int(args.gpu_rank))
    torch.cuda.manual_seed_all(args.seed)
eps = 1e-10 # epsilon value; keeps the class-weight division away from zero

In [4]:
# Build the accent lookup from the configured training manifest, so it always
# matches the manifest the discriminator's class weights are counted from.
# NOTE(review): downstream code indexes tensors with the values, so this is
# presumably {accent name: class index} — confirm against get_accents.
accent_dict = get_accents(args.train_manifest)

updating accents


In [5]:
# Flatten the lookup's values into a list; its length is used as the number of
# discriminator classes.
accent = [*accent_dict.values()]

In [6]:
# Load the output alphabet: the JSON file holds a sequence of label strings,
# concatenated here into a single string. The path comes from the config cell
# rather than a second hard-coded copy; JSON is read explicitly as UTF-8.
with open(args.labels_path, encoding='utf-8') as label_file:
    labels = ''.join(json.load(label_file))

In [7]:
# Display the loaded label alphabet for a quick visual check.
labels

"_'ABCDEFGHIJKLMNOPQRSTUVWXYZ "

In [13]:
# Audio front-end settings handed to the acoustic model. Every value is taken
# from the config cell so the two can never drift apart (they were previously
# duplicated here as literals with the same values).
audio_conf = dict(sample_rate=args.sample_rate,
                  window_size=args.window_size,
                  window_stride=args.window_stride,
                  window=args.window,
                  noise_dir=args.noise_dir,
                  noise_prob=args.noise_prob,
                  noise_levels=(args.noise_min, args.noise_max))

In [29]:
# ASR
# Acoustic model sized from the config cell instead of repeated literals
# (hidden_size=1024, hidden_layers=5, bidirectional=True give the same network).
# NOTE(review): args.rnn_type is 'gru'; `supported_rnns` (imported from model)
# presumably maps that name to nn.GRU — confirm before replacing the literal.
# NOTE(review): the `labels` string loaded earlier is not passed here — confirm
# that DeepSpeech's default label set is the intended one.
asr = DeepSpeech(rnn_hidden_size=args.hidden_size,
          nb_layers = args.hidden_layers,
          rnn_type =nn.GRU,
          audio_conf = audio_conf,
          bidirectional = args.bidirectional)
asr = asr.to(device)

In [None]:
# python train.py --train-manifest data/csvs/train_sorted_EN_US.csv --val-manifest data/csvs/dev_sorted_EN_US.csv --cuda --rnn-type gru --hidden-layers 5 --hidden-size 1024 --epochs 50 --lr 0.001 --batch-size 32 --gpu-rank 0 --update-rule 1 --exp-name ./exp/1224/ --mw-alpha 0.1 --mw-beta 0.2 --mw-gamma 0.6 --enco-modules 2 --enco-res --forg-modules 2 --forg-res --num-epochs 1 --checkpoint-per-batch 5000 

In [74]:
# Registry of every network in the experiment: name -> [model, loss, optimizer].
models = dict()

In [75]:
# ASR
# CTC objective plus a dedicated Adam optimizer for the acoustic model; both are
# registered alongside the network under the 'predictor' key.
criterion = CTCLoss()
asr_optimizer = torch.optim.Adam(
    asr.parameters(),
    lr=args.lr,
    weight_decay=1e-4,
    amsgrad=True,
)
models['predictor'] = [asr, criterion, asr_optimizer]

In [76]:
# ENCODER & Decoder

# The encoder is registered without its own loss/optimizer: its weights are
# updated by the optimizer stored with the decoder below.
encoder = Encoder(num_modules=args.enco_modules, residual_bool=args.enco_res).to(device)
models['encoder'] = [encoder, None, None]

# Decoder with an MSE criterion wrapped by Decoder_loss.
decoder = Decoder().to(device)
dec_loss = Decoder_loss(nn.MSELoss())

# A single Adam instance steps encoder and decoder parameters together.
ed_optimizer = torch.optim.Adam(
    [*encoder.parameters(), *decoder.parameters()],
    lr=args.lr,
    weight_decay=1e-4,
    amsgrad=True,
)
models['decoder'] = [decoder, dec_loss, ed_optimizer]

In [77]:
# Discriminator
# Built only when the run is not ASR-only: an accent classifier with its own
# Adam optimizer and a class-weighted cross-entropy loss.
if not args.train_asr:
    discriminator = DiscimnateNet(classes = len(accent), num_modules = args.disc_modules, residual_bool = args.disc_res)
    discriminator = discriminator.to(device)
    discriminator_optimizer = torch.optim.Adam(discriminator.parameters(), lr=args.lr,weight_decay=1e-4,amsgrad=True)

    # Occurrences of each value in the last manifest column, as {value: count}.
    # Series.value_counts() replaces the deprecated top-level pd.value_counts
    # and yields a flat mapping, so no nested-dict unpacking is needed.
    accent_counts = pd.read_csv(args.train_manifest, header=None).iloc[:, -1].value_counts().to_dict()

    # Start every class at eps so classes absent from the manifest do not cause
    # a division by zero below.
    # assumes accent_dict values are class indices in [0, len(accent)) — TODO confirm
    disc_loss_weights = torch.zeros(len(accent)) + eps
    for accent_name, accent_count in accent_counts.items():
        if accent_name in accent_dict:
            disc_loss_weights[accent_dict[accent_name]] += accent_count

    # Inverse-frequency weighting: total count divided by per-class count, so
    # rarer accents receive proportionally larger loss weights.
    disc_loss_weights = torch.sum(disc_loss_weights) / disc_loss_weights
    dis_loss = nn.CrossEntropyLoss(weight = disc_loss_weights.to(device))
    models['discriminator'] = [discriminator, dis_loss, discriminator_optimizer]

In [79]:
# Preview the registered networks: wrap the first element of each registry
# entry (the model itself) in a Sequential purely to get one combined repr.
nn.Sequential(OrderedDict((name, entry[0]) for name, entry in models.items()))

Sequential(
  (predictor): DeepSpeech(
    (rnns): Sequential(
      (0): BatchRNN(
        (rnn): GRU(1312, 1024, bidirectional=True)
      )
      (1): BatchRNN(
        (batch_norm): SequenceWise (
        BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True))
        (rnn): GRU(1024, 1024, bidirectional=True)
      )
      (2): BatchRNN(
        (batch_norm): SequenceWise (
        BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True))
        (rnn): GRU(1024, 1024, bidirectional=True)
      )
      (3): BatchRNN(
        (batch_norm): SequenceWise (
        BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True))
        (rnn): GRU(1024, 1024, bidirectional=True)
      )
      (4): BatchRNN(
        (batch_norm): SequenceWise (
        BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True))
        (rnn): GRU(1024, 1024, bidirectional=True)
      )
    )
    (fc): Sequentia

In [None]:
# creating dataset


In [None]:
# NOTE(review): bare name in a never-executed cell; it only echoes the class
# object and looks like leftover scratch — consider deleting this cell.
OrderedDict