In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from sklearn.decomposition import PCA
import torch.utils.data
import pickle
from tqdm import tqdm
import time
import gc
import collections

In [2]:
'''
set parameters
'''
base_path = '.'
train_path = base_path + '/train.npy'
test_path = base_path + '/test.npy'
train_labels_path = base_path + '/train_labels.npy'
dev_labels_path = base_path +  '/dev_labels.npy'
dev_path = base_path + '/dev.npy'

pca_available = True
padding_method = 'self' 
# padding_method = 'zero' 

device = torch.device('cuda:0')
n_labels = 138
n_features = 40
n_epoch = 10
context_num = 18

In [3]:
def load_train_data():
    t0 = time.time()
    print("Start loading training data...")
    train = np.load(train_path, allow_pickle=True)
    train_labels = np.load(train_labels_path, allow_pickle=True)
    t1 = time.time()
    elapsed_time = t1 - t0
    print("Done loading training data in {0} minutes...".format(elapsed_time/60))
    
    return train, train_labels

In [4]:
def load_validation_data():
    t0 = time.time()
    print("Start loading validation data...")
    val = np.load(dev_path, allow_pickle=True)
    val_labels = np.load(dev_labels_path, allow_pickle=True)
    t1 = time.time()
    elapsed_time = t1 - t0
    print("Done loading validation data in {0} minutes...".format(elapsed_time/60))
    
    return val, val_labels

In [5]:
def load_test_data():
    t0 = time.time()
    print("Start loading test data...")
    test = np.load(test_path, allow_pickle=True)
    t1 = time.time()
    elapsed_time = t1 - t0
    print("Done loading test data in {0} minutes...".format(elapsed_time/60))
    return test

In [6]:
def load_and_process_data(features, labels, pca, context_num):
    '''
    use the first&last frame of one utterance to pad the empty frame
    '''
    t0 = time.time()
    padding_features = np.concatenate([np.concatenate(( \
                                                 np.ones((context_num, pca.n_components))*pca.transform(features[i][[0]]), \
                                                 pca.transform(features[i]), \
                                                 np.ones((context_num, pca.n_components))*pca.transform(features[i][[-1]]))) \
                                                 for i in range(len(features))])
    padding_features = torch.Tensor(padding_features)
    del features
    
    '''
    corresponding label for padding frames
    '''
    false_labels = np.array([-1]*context_num)
    padding_labels = np.concatenate([np.concatenate(( \
                                                  false_labels, \
                                                  labels[i], \
                                                  false_labels)) \
                                                  for i in range(len(labels))])
    padding_labels = torch.Tensor(padding_labels)
    del labels
    
    gc.collect()
    
    return padding_features, padding_labels

In [79]:
class ContextDataset(Dataset):
    
    def __init__(self, context_num, features, targets):
        
        self.context_num = context_num
        self.features = features
        self.targets = targets
    
    def __len__(self):
        
        return len(self.targets)
    
    def __getitem__(self, index):
        
        if index-self.context_num >= 0 and index+self.context_num+1 <= len(self.targets)-1:
            '''
            no need for padding
            '''
            X = self.features[index-self.context_num: index+self.context_num+1].reshape(-1)
            Y = self.targets[index].long()
        elif index-self.context_num < 0:
            '''
            padding for pre frames, actually doesnt matter since we drop this 'false' frame
            '''
            X = torch.cat((torch.zeros(self.context_num-index, self.features.shape[1]), self.features[:index+self.context_num+1]), 0).reshape(-1)
            Y = self.targets[index].long()
        else:
            '''
            padding for post frames, same as before
            '''
            X = torch.cat((self.features[index-self.context_num:], torch.zeros(index+self.context_num+1-len(self.targets), self.features.shape[1])),0).reshape(-1)
            Y = self.targets[index].long()
        
        return index, X, Y

In [8]:
class SpeechDataset(Dataset):
    
    def __init__(self, speechdataset):
        
        self.features = [speechdataset[i][1] for i in range(len(speechdataset)) if speechdataset[i][2] != torch.Tensor([-1])[0]]
        self.targets = [speechdataset[i][2] for i in range(len(speechdataset)) if speechdataset[i][2] != torch.Tensor([-1])[0]]
    
    def __len__(self):
        
        return len(self.targets)
    
    def __getitem__(self, index):
        
        return index, self.features[index], self.targets[index]

In [9]:
#1000,2048,1024,512,256+2,138
class SpeechNet(nn.Module):
    
    def __init__(self, context_num):
        
        super(SpeechNet, self).__init__()
        self.relu1 = nn.ReLU()
        self.relu2 = nn.ReLU()
        self.relu3 = nn.ReLU()
        self.relu4 = nn.ReLU()
        self.relu5 = nn.ReLU()
        self.relu6 = nn.ReLU()
        self.relu7 = nn.ReLU()
        self.relu8 = nn.ReLU()
        
        self.linear1 = nn.Linear((2*context_num+1)*pca.n_components, 2048)
        self.linear2 = nn.Linear(2048, 1024)
        self.linear3 = nn.Linear(1024, 810)
        self.linear4 = nn.Linear(810, 720)
        self.linear5 = nn.Linear(720, 512)
        self.linear6 = nn.Linear(512, 428)
        self.linear7 = nn.Linear(428, 300)
        self.linear8 = nn.Linear(300, 256)
        self.out = nn.Linear(256+2, 138)
        
        self.batchnorm1 = nn.BatchNorm1d(2048)
        self.batchnorm2 = nn.BatchNorm1d(1024)
        self.batchnorm3 = nn.BatchNorm1d(810)
        self.batchnorm4 = nn.BatchNorm1d(720)
        self.batchnorm5 = nn.BatchNorm1d(512)
        self.batchnorm6 = nn.BatchNorm1d(428)
        self.batchnorm7 = nn.BatchNorm1d(300)
        self.batchnorm8 = nn.BatchNorm1d(256+2)
        
        
        self.dropout1 = nn.Dropout(0.1)
        self.dropout2 = nn.Dropout(0.05)
        self.dropout3 = nn.Dropout(0.1)
        self.dropout4 = nn.Dropout(0.05)
        self.dropout5 = nn.Dropout(0.05)
        self.dropout6 = nn.Dropout(0.05)
        self.dropout7 = nn.Dropout(0.05)
    
    def forward(self, x):
        
        x = self.linear1(x)
        x = self.batchnorm1(x)
        x = self.relu1(x)
        #2048
        x = self.dropout1(x)
        x = self.linear2(x)
        x = self.batchnorm2(x)
        x = self.relu2(x)
        #1024
        x = self.dropout2(x)
        x = self.linear3(x)
        x = self.batchnorm3(x)
        x = self.relu3(x)
        #810
        x = self.dropout3(x)
        x = self.linear4(x)
        x = self.batchnorm4(x)
        x = self.relu4(x)
        
        #512
        x = self.dropout4(x)
        x = self.linear5(x)
        x = self.batchnorm5(x)
        x = self.relu5(x)
        
        #512
        x = self.dropout5(x)
        x = self.linear6(x)
        x = self.batchnorm6(x)
        x = self.relu6(x)
        #428
        x = self.dropout6(x)
        x = self.linear7(x)
        x = self.batchnorm7(x)
        x = self.relu7(x)
        #300 
        x = self.dropout7(x)
        x = self.linear8(x)
        #300
        
        avg_pool1 = torch.mean(x, 1, keepdims = True)
        max_pool1,_ = torch.max(x, 1, keepdims = True)
        
        conc = torch.cat((x, avg_pool1, max_pool1), 1)
        conc = self.batchnorm8(conc)
        output = self.out(conc)

        return output

In [10]:
def generate_dataset(context_num, features, labels):
    t0 = time.time()
    print("It may takes 20 minutes to generate train dataset...")
    context_dataset = ContextDataset(context_num, features, labels)
    dataset = SpeechDataset(context_dataset)
    t1 = time.time()
    print("Dataset generated. Elapsed time: {0}".format((t1-t0)/60))
    return dataset

In [11]:
def weights_init(m):
    if isinstance(m, nn.Conv2d):
        xavier(m.weight.data)
        xavier(m.bias.data)

In [12]:
def scale_cos(x):
    start = 5e-3
    end = 1e-5
    return start + (1 + np.cos(np.pi * (1 - x))) * (end - start) / 2

In [13]:
def second_scale_cos(x):
    start = 1e-4
    end = 1e-8
    return start + (1 + np.cos(np.pi * (1 - x))) * (end - start) / 2

In [14]:
class ParamScheduler:
    
    def __init__(self, optimizer, scale_fn, total_steps):
        
        self.optimizer = optimizer
        self.scale_fn = scale_fn
        self.total_steps = total_steps
        self.current_iteration = 0
        
    def batch_step(self):
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = self.scale_fn(self.current_iteration/self.total_steps)
        
        self.current_iteration += 1

In [94]:
def train_model(train_dataloader, val_dataloader, n_epochs = 15):
    
    model = SpeechNet(context_num).to(device)
    model.apply(weights_init)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr = 0.001)
    
    '''
    set scheduler for decaying learning rate
    '''
    parameter_scheduler = ParamScheduler(optimizer, scale_cos, n_epoch*len(train_dataloader))
    candidate_model = 1
    print('Start training...')
    for i in range(n_epochs):
        
        t0 = time.time()
        
        avg_loss_1000_batch = 0
        val_correct = 0
        val_predicted = 0
        model.train()
        
        for index, (idx, features, labels) in enumerate(train_dataloader):
            
            optimizer.zero_grad()
            mask = [i for i in range(len(labels)) if labels[i] != torch.Tensor([-1]).long()[0]]
            features = features[mask].cuda()
            labels = labels[mask].cuda()
            '''
            forward and backward
            '''
            output = model(features)
            loss = criterion(output, labels.long())
            avg_loss_1000_batch += loss.item()
            loss.backward()
            
            parameter_scheduler.batch_step()
            optimizer.step()
            
            if index % 2000 == 0 and index != 0:
                
                predictions = torch.max(output.data, 1)[1]
                predicted = len(features)
                correct = int(sum(predictions == labels.to(device)).cpu())
                print("Epoch: {0}/{1} Train batch:{2}/{3}   acc: {4}  loss: {5}".format(i+1, \
                                                                              n_epochs, \
                                                                              index, \
                                                                              len(train_dataloader), \
                                                                              correct/predicted, avg_loss_1000_batch/512))
                avg_loss_1000_batch = 0
                
        for index, (idx, val_features, val_labels) in enumerate(val_dataloader):
            
            mask = [i for i in range(len(val_labels)) if val_labels[i] != torch.Tensor([-1])[0]]
            val_features = val_features[mask].to(device)
            val_labels = val_labels[mask].to(device)
            model.eval()
            outputs = model(val_features)
            predictions = torch.max(outputs.data, 1)[1]
            val_predicted += len(val_features)
            val_correct += sum(predictions == val_labels.to(device))
            
        epoch_acc = int(val_correct.cpu())/val_predicted
        if epoch_acc >= 0.70:
            pickle.dump(model, open("candidate_model_{0}.pkl".format(candidate_model), 'wb'))
            print("Save one candidate model.")
            candidate_model += 1
            
        t1 = time.time()
        print("Validation Accuracy: {0}. Cost time: {1} minutes".format(int(val_correct.cpu())/val_predicted, (t1-t0)/60))
        print("===================================================")
        
    
    return model

In [16]:
if __name__ == '__main__':
    
    train_features, train_labels = load_train_data()
    val_features, val_labels = load_validation_data()
    test_features = load_test_data()
    
    if pca_available:
        '''
        load local pca file
        '''
        pca = pickle.load(open('pca.pkl', 'rb'))
    else:
        '''
        10 features will be enough
        '''
        pca = PCA(10).fit(np.concatenate(train_features))
        pickle.dump(pca, open('pca_{0}_features.pkl'.format(pca.n_components), 'wb'))
        
    train_features, train_labels = load_and_process_data(train_features, train_labels, pca, context_num)
    val_features, val_labels = load_and_process_data(val_features, val_labels, pca, context_num)
    
    train_context_dataset = ContextDataset(context_num, train_features, train_labels)
    val_context_dataset = ContextDataset(context_num, val_features, val_labels)
    
    '''
    It may takes more than 20 minutes to process since it loops over all 15 millions frames
    But it could speed up later dataloader process
    And I could save this data by pickle
    The drawback is it's not flexible to feature engineering, like change context number
    '''
    '''    
    train_dataset = SpeechDataset(train_context_dataset)
    val_dataset = SpeechDataset(val_context_dataset)
    '''
    '''
    train_mask = (train_dataset.targets.numpy() != -1)*1
    train_sampler = WeightedRandomSampler(weights=train_mask, num_samples=int(train_mask.sum()), replacement=False)
    val_mask = (val_dataset.targets.numpy() != -1)*1
    val_sampler = WeightedRandomSampler(weights=val_mask, num_samples=int(val_mask.sum()), replacement=False)
    '''
    '''    
    train_dataloader = DataLoader(train_dataset,
                              shuffle = True,
                              batch_size = 512,
                              num_workers = 0,
                              pin_memory = True)
    
    val_dataloader = DataLoader(val_dataset,
                              shuffle = True,
                              batch_size = 512,
                              num_workers = 0,
                              pin_memory = True)
    '''
    '''
    Start train model.
    15 epochs by default.
    Cost about 1 hour.
    '''
    
    train_dataloader = DataLoader(train_context_dataset,
                              shuffle = True,
                              batch_size = 512,
                              num_workers = 0,
                              pin_memory = True)
    
    val_dataloader = DataLoader(val_context_dataset,
                                  shuffle = True,
                                  batch_size = 512,
                                  num_workers = 0,
                                  pin_memory = True)

    model = train_model(train_dataloader, val_dataloader)
    
    pickle.dump(model, open("submission_model_68.pkl", "wb"))
    #make_submission(model, test_features)

Start loading training data...
Done loading training data in 0.7778591275215149 minutes...
Start loading validation data...
Done loading validation data in 0.04492399295171102 minutes...
Start loading test data...
Done loading test data in 0.014317631721496582 minutes...
Start training...
Epoch: 1/15 Train batch:2000/30057   acc: 0.435546875  loss: 9.695134555222467
Epoch: 1/15 Train batch:4000/30057   acc: 0.525390625  loss: 8.058894955553114
Epoch: 1/15 Train batch:6000/30057   acc: 0.50390625  loss: 7.55276261921972
Epoch: 1/15 Train batch:8000/30057   acc: 0.51171875  loss: 7.251677069114521
Epoch: 1/15 Train batch:10000/30057   acc: 0.552734375  loss: 6.99999346002005
Epoch: 1/15 Train batch:12000/30057   acc: 0.568359375  loss: 6.822190935024992
Epoch: 1/15 Train batch:14000/30057   acc: 0.578125  loss: 6.700407363474369
Epoch: 1/15 Train batch:16000/30057   acc: 0.529296875  loss: 6.569748811190948
Epoch: 1/15 Train batch:18000/30057   acc: 0.57421875  loss: 6.4731716946698725


KeyboardInterrupt: 

In [87]:
train_context_dataset = ContextDataset(context_num, train_features, train_labels)
val_context_dataset = ContextDataset(context_num, val_features, val_labels)

In [88]:
train_dataloader = DataLoader(train_context_dataset,
                              shuffle = True,
                              batch_size = 512,
                              num_workers = 0,
                              pin_memory = True)
    
val_dataloader = DataLoader(val_context_dataset,
                              shuffle = True,
                              batch_size = 512,
                              num_workers = 0,
                              pin_memory = True)

In [89]:
def train_model(n_epochs = 15):
    
    model = SpeechNet(context_num).to(device)
    model.apply(weights_init)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr = 0.001)
    
    '''
    set scheduler for decaying learning rate
    '''
    parameter_scheduler = ParamScheduler(optimizer, scale_cos, n_epoch*len(train_dataloader))
    candidate_model = 1
    print('Start training...')
    for i in range(n_epochs):
        
        t0 = time.time()
        
        avg_loss_1000_batch = 0
        val_correct = 0
        val_predicted = 0
        model.train()
        
        for index, (idx, features, labels) in enumerate(train_dataloader):
            
            optimizer.zero_grad()
            mask = [i for i in range(len(labels)) if labels[i] != torch.Tensor([-1]).long()[0]]
            features = features[mask].cuda()
            labels = labels[mask].cuda()
            '''
            forward and backward
            '''
            output = model(features)
            loss = criterion(output, labels.long())
            avg_loss_1000_batch += loss.item()
            loss.backward()
            
            parameter_scheduler.batch_step()
            optimizer.step()
            
            if index % 2000 == 0 and index != 0:
                
                predictions = torch.max(output.data, 1)[1]
                predicted = len(features)
                correct = int(sum(predictions == labels.to(device)).cpu())
                print("Epoch: {0}/{1} Train batch:{2}/{3}   acc: {4}  loss: {5}".format(i+1, \
                                                                              n_epochs, \
                                                                              index, \
                                                                              len(train_dataloader), \
                                                                              correct/predicted, avg_loss_1000_batch/512))
                avg_loss_1000_batch = 0
                
        for index, (idx, val_features, val_labels) in enumerate(val_dataloader):
            
            mask = [i for i in range(len(val_labels)) if val_labels[i] != torch.Tensor([-1])[0]]
            val_features = val_features[mask].to(device)
            val_labels = val_labels[mask].to(device)
            model.eval()
            outputs = model(val_features)
            predictions = torch.max(outputs.data, 1)[1]
            val_predicted += len(val_features)
            val_correct += sum(predictions == val_labels.to(device))
            
        epoch_acc = int(val_correct.cpu())/val_predicted
        if epoch_acc >= 0.70:
            pickle.dump(model, open("candidate_model_{0}.pkl".format(candidate_model), 'wb'))
            print("Save one candidate model.")
            candidate_model += 1
            
        t1 = time.time()
        print("Validation Accuracy: {0}. Cost time: {1} minutes".format(int(val_correct.cpu())/val_predicted, (t1-t0)/60))
        print("===================================================")
        
    
    return model

In [92]:
model = train_model_1(n_epochs = 15)

Start training...


KeyboardInterrupt: 