# **Homework 2: Phoneme Classification**


In [1]:
!pip install wandb -qU

# Log in to your W&B account
import wandb
wandb.login()


/bin/bash: /opt/conda/lib/libtinfo.so.6: no version information available (required by /bin/bash)
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
allennlp 2.10.1 requires wandb<0.13.0,>=0.10.0, but you have wandb 0.15.3 which is incompatible.[0m[31m
[0m

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [2]:
import gc
gc.collect()

1263

In [4]:
# Lstm + wandb sweep
concat_nframes = 1              # the number of frames to concat with, n must be odd (total 2k+1 = n frames)
train_ratio = 0.75               # the ratio of data used for training, the rest will be used for validation

# training parameters
seed = 1213                        # random seed
batch_size = 8# batch size
num_epoch = 40                   # the number of training epoch
learning_rate = 2e-3         # learning rate
model_path = './model.ckpt'     # the path where the checkpoint will be saved

# model parameters
input_dim = 39 * concat_nframes # the input dim of the model, you should not change the value
hidden_layers = 8               # the number of hidden layers
hidden_dim = 256              # the hidden dim

import torch
from torch.utils.data import Dataset

class LibriDataset(Dataset):
    def __init__(self, X, y=None):
        self.data = X
        if y is not None:
            # self.label = torch.LongTensor(y)
            self.label = y
        else:
            self.label = None

    def __getitem__(self, idx):
        if self.label is not None:
            return self.data[idx], self.label[idx]
        else:
            return self.data[idx]

    def __len__(self):
        return len(self.data)

    def totalSeqLen(self):
        # return self.data.shape[0] * self.data.shape[1]
        x_seq_len_list = [s.shape[0] for s in self.data]
        return sum(x_seq_len_list)

import numpy as np
import torch
import random
import os
import torch
from tqdm import tqdm
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pack_sequence, pad_packed_sequence
import gc


def same_seeds(seed):
    random.seed(seed) 
    np.random.seed(seed)  
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed) 
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


def load_feat(path):
    feat = torch.load(path)
    return feat

def shift(x, n):
    if n < 0:
        left = x[0].repeat(-n, 1)
        right = x[:n]
    elif n > 0:
        right = x[-1].repeat(n, 1)
        left = x[n:]
    else:
        return x

    return torch.cat((left, right), dim=0)

def concat_feat(x, concat_n):
    assert concat_n % 2 == 1 # n must be odd
    if concat_n < 2:
        return x
    seq_len, feature_dim = x.size(0), x.size(1)
    x = x.repeat(1, concat_n) 
    x = x.view(seq_len, concat_n, feature_dim).permute(1, 0, 2) # concat_n, seq_len, feature_dim
    mid = (concat_n // 2)
    for r_idx in range(1, mid+1):
        x[mid + r_idx, :] = shift(x[mid + r_idx], r_idx)
        x[mid - r_idx, :] = shift(x[mid - r_idx], -r_idx)

    return x.permute(1, 0, 2).view(seq_len, concat_n * feature_dim)

def preprocess_data(split, feat_dir, phone_path, concat_nframes, train_ratio=0.8, random_seed=1213):
    class_num = 41 # NOTE: pre-computed, should not need change

    if split == 'train' or split == 'val':
        mode = 'train'
    elif split == 'test':
        mode = 'test'
    else:
        raise ValueError('Invalid \'split\' argument for dataset: PhoneDataset!')

    label_dict = {}
    if mode == 'train':
        for line in open(os.path.join(phone_path, f'{mode}_labels.txt')).readlines():
            line = line.strip('\n').split(' ')
            label_dict[line[0]] = [int(p) for p in line[1:]]
        
        # split training and validation data
        usage_list = open(os.path.join(phone_path, 'train_split.txt')).readlines()
        random.seed(random_seed)
        random.shuffle(usage_list)
        train_len = int(len(usage_list) * train_ratio)
        usage_list = usage_list[:train_len] if split == 'train' else usage_list[train_len:]

    elif mode == 'test':
        usage_list = open(os.path.join(phone_path, 'test_split.txt')).readlines()

    usage_list = [line.strip('\n') for line in usage_list]
    print('[Dataset] - # phone classes: ' + str(class_num) + ', number of utterances for ' + split + ': ' + str(len(usage_list)))

    x_tensor_list = []
    if mode == 'train':
        y_tensor_list = []

    idx = 0
    for i, fname in tqdm(enumerate(usage_list)):
        feat = load_feat(os.path.join(feat_dir, mode, f'{fname}.pt'))
        cur_len = len(feat)
        feat = concat_feat(feat, concat_nframes)
        if mode == 'train':
          label = torch.LongTensor(label_dict[fname])

        x_tensor_list.append(feat)
        if mode == 'train':
            y_tensor_list.append(label)
          

    # X = torch.nn.utils.rnn.pad_sequence(x_tensor_list, batch_first=True)
    # X = torch.stack(x_tensor_list, dim=0)
    X = x_tensor_list



    if mode == 'train':
        # y = torch.nn.utils.rnn.pad_sequence(y_tensor_list, batch_first=True)
        # y = torch.stack(y_tensor_list, dim=0)
        y = y_tensor_list

    print(f'[INFO] {split} set')
    if mode == 'train':
        # print(f'[INFO] x shape: {X.shape} y shape: {y.shape}')
        return X, y
    else:
        return X


def collate_fn(data):
    return data


    # data.sort(key= lambda data: len(data[0]), reverse=True) 

    # x_seq_list = [dataItem[0] for dataItem in data] 
    # y_seq_list = [dataItem[1] for dataItem in data]
    # seq_len = [s.shape[0] for s in x_seq_list]
    # x_pad_seq = pad_sequence(x_seq_list, batch_first=True) 
    # x_seq_pack = pack_padded_sequence(x_pad_seq, seq_len, batch_first=True)
    # print(x_pad_seq.data)
    

    # print(isinstance(data, list)) 
    # features, labels = data 

    # # if(isinstance(data[0], tuple)){
    # #     xD

    # # }
    
    # # x.sort(key=lambda x: len(x), reverse=True)
    # # seq_len = [x.size(0) for x,y in data] # 获取数据真实的长度
    # # data = pad_sequence(data, batch_first=True)
    # # data = pack_padded_sequence(data, seq_len, batch_first=True)
    # return x_pad_seq, y_seq_list

import torch.nn as nn
import torch
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence

class BasicBlock(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(BasicBlock, self).__init__()

        self.block = nn.Sequential(
            nn.Linear(input_dim, output_dim),
            nn.ReLU(),
        )

    def forward(self, x):
        x = self.block(x)
        return x


class LstmClassifier(nn.Module):
    def __init__(self, input_dim, output_dim=41, hidden_layers=4, hidden_dim=256, batch_size = 8, dropout = 0.4):
        super(LstmClassifier, self).__init__()
        self.input_dim = input_dim
        self.batch_size = batch_size
        self.hidden_dim = hidden_dim
        self.hidden_layers = hidden_layers
        self.fc =  nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU()
        )
        self.layer_norm = nn.LayerNorm(hidden_dim)
        self.lstm = nn.LSTM(hidden_dim, hidden_dim, hidden_layers, dropout=0.35, batch_first = True)
        self.bc =  nn.Sequential(
            nn.Linear(hidden_dim, output_dim)
        )


    def forward(self, x, seq_lenght_list):
        h0 = torch.randn(self.hidden_layers, x.shape[0], self.hidden_dim).to('cuda')
        c0 = torch.randn(self.hidden_layers, x.shape[0], self.hidden_dim).to('cuda')
        x = self.fc(x)
        x_pad =pack_padded_sequence(x, batch_first=True, lengths=seq_lenght_list).to('cuda')
        out, (hn, cn) = self.lstm(x_pad,(h0,c0))
        out = pad_packed_sequence(out,batch_first=True)
        out = self.layer_norm(out[0])
        out = self.bc(out)
        return out

import numpy as np
import torch
import torch.nn as nn
import math
from torch.optim.lr_scheduler import LambdaLR

import random
import os
import torch
from tqdm import tqdm


def train(config=None):

    with wandb.init(config=config):
    # Copy your config 
        config = wandb.config

        same_seeds(seed)
        device = 'cuda'
        print(f'DEVICE: {device}')

        # preprocess data
        train_X, train_y = preprocess_data(split='train', feat_dir='/kaggle/input/ml2023spring-hw2/libriphone/feat', phone_path='/kaggle/input/ml2023spring-hw2/libriphone', concat_nframes=concat_nframes, train_ratio=train_ratio, random_seed=seed)
        val_X, val_y = preprocess_data(split='val', feat_dir='/kaggle/input/ml2023spring-hw2/libriphone/feat', phone_path='/kaggle/input/ml2023spring-hw2/libriphone', concat_nframes=concat_nframes, train_ratio=train_ratio, random_seed=seed)

        # get dataset
        train_set = LibriDataset(train_X, train_y)
        val_set = LibriDataset(val_X, val_y)
        # remove raw feature to save memory
        del train_X, train_y, val_X, val_y
        gc.collect()

        # get dataloader
        train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
        val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
        n_steps_per_epoch = math.ceil(len(train_set) / batch_size)

        model = LstmClassifier(input_dim=input_dim, hidden_layers=config.hidden_layers, hidden_dim=hidden_dim, dropout = config.dropout).to(device)
        #if(os.path.exists(model_path)):
        #    model.load_state_dict(torch.load(model_path))
        criterion = nn.CrossEntropyLoss() 
        optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=config.weight_decay)
        # 定义一个自定义函数来计算学习率
        def lr_lambda(epoch):
            if epoch < 5:
                return epoch / 5
            elif epoch < 20:
                return 1
            else:
                return 0.5 * (1 + math.cos(math.pi * (epoch - 20) / (num_epoch-20)))

        scheduler = LambdaLR(optimizer, lr_lambda)


        best_acc = 0
        example_ct = 0
        for epoch in range(num_epoch):
            train_acc = 0.0
            train_loss = 0.0
            val_acc = 0.0
            val_loss = 0.0

            # training
            model.train() # set the model to training mode
            for step, batch in enumerate(tqdm(train_loader)):
                batch.sort(key= lambda batch: len(batch[0]), reverse=True) 
                x_seq_list = [dataItem[0] for dataItem in batch] 
                y_seq_list = [dataItem[1] for dataItem in batch]

                x_seq_len_list = [s.shape[0] for s in x_seq_list]
                x_pad_seq = pad_sequence(x_seq_list, batch_first=True).to(device) 

                optimizer.zero_grad() 
                outputs = model(x_pad_seq, x_seq_len_list) 
                result = torch.cat([outputs[i][:x_seq_len_list[i]] for i in range(outputs.size(0))], dim=0)
                y_seq_tensor = torch.cat(y_seq_list, dim=0).to(device)
                # print(f"reslut shape {result.shape} , y_seq_tensor shape {y_seq_tensor.shape}")


                loss = criterion(result, y_seq_tensor)
                loss.backward() 
                optimizer.step() 

                _, train_pred = torch.max(result, 1) # get the index of the class with the highest probability
                # print(f"train_pred.shape: {train_pred.shape}")
                # print(f"y_seq_tensor.shape: {y_seq_tensor.shape}")

                train_acc_batch = (train_pred.detach() == y_seq_tensor.detach()).sum().item()
                train_acc += train_acc_batch
                train_loss += loss.item()

                example_ct += len(x_pad_seq)
                metrics = {"train/train_loss": loss.item(), 
                           "train/acc": train_acc_batch/y_seq_tensor.shape[0],
                           "train/epoch": (step + 1 + (n_steps_per_epoch * epoch)) / n_steps_per_epoch, 
                           "train/example_ct": example_ct,
                           "train/learning_rate": optimizer.state_dict()['param_groups'][0]['lr']}
                if step % 100 == 0:
                    print(f'Train Acc: {train_acc_batch/y_seq_tensor.shape[0]} Loss: {loss.item()}')
            scheduler.step()
            # validation
            model.eval() # set the model to evaluation mode
            with torch.no_grad():
                for i, batch in enumerate(tqdm(val_loader)):
                    
                    batch.sort(key= lambda batch: len(batch[0]), reverse=True) 
                    x_seq_list = [dataItem[0] for dataItem in batch] 
                    y_seq_list = [dataItem[1] for dataItem in batch]

                    x_seq_len_list = [s.shape[0] for s in x_seq_list]
                    x_pad_seq = pad_sequence(x_seq_list, batch_first=True).to(device) 

                    outputs = model(x_pad_seq, x_seq_len_list)

                    result = torch.cat([outputs[i][:x_seq_len_list[i]] for i in range(outputs.size(0))], dim=0)
                    y_seq_tensor = torch.cat(y_seq_list, dim=0).to(device)
                    #print(f"reslut shape {result.shape} , y_seq_tensor shape {y_seq_tensor.shape}")

                    loss = criterion(result, y_seq_tensor) 

                    _, val_pred = torch.max(result, 1) 
                    val_acc += (val_pred.cpu() == y_seq_tensor.cpu()).sum().item() # get the index of the class with the highest probability
                    val_loss += loss.item()
                    
                print(f'[{epoch+1:03d}/{num_epoch:03d}] Train Acc: {train_acc/train_set.totalSeqLen():3.5f} Loss: {train_loss/len(train_loader):3.5f} | Val Acc: {val_acc/val_set.totalSeqLen():3.5f} loss: {val_loss/len(val_loader):3.5f}')

                # if the model improves, save a checkpoint at this epoch
                val_metrics = {"val/val_loss": val_loss/len(val_loader), 
                "val/val_accuracy": val_acc/val_set.totalSeqLen()}
                wandb.log({**metrics, **val_metrics})

            if val_acc > best_acc:
                best_acc = val_acc
                torch.save(model.state_dict(), model_path)
                print(f'saving model with acc {best_acc/val_set.totalSeqLen():.5f}')

        wandb.finish()
if __name__ == '__main__':
    
    '''
    parameters_dict = {
        'weight_decay': {
          'values': [0.03, 0.04, 0.05]
        },
        'dropout': {
          'values': [0.3, 0.4, 0.5]
        },
        "hidden_layers": {
          'values': [7, 8, 9]
        }
    }

    sweep_config = {
        'method': 'random',
        'metric':{
            'name': "val/val_accuracy",
            'goal': "maximize"
        },
        'parameters': parameters_dict,
        'early_terminate':{
            'type': "hyperband",
            'min_iter': 3,
            'eta': 1
        }
    }
    parameters_dict.update({
    'epochs': {
        'value': num_epoch
    },
    'learning_rate': {
        'value': learning_rate
      },
    'batch_size': {
        'value': batch_size
      }
    })

    import pprint
    pprint.pprint(sweep_config)
    sweep_id = wandb.sweep(sweep_config, project="lstm-for-phoneme-recognition")
    wandb.agent("liweixin2021/lstm-for-phoneme-recognition/lc1wsklf", train, count=36)  

'''



In [3]:
# BiLstm + wandb log
concat_nframes = 1              # the number of frames to concat with, n must be odd (total 2k+1 = n frames)
train_ratio = 0.75               # the ratio of data used for training, the rest will be used for validation

# training parameters
seed = 1213                        # random seed
batch_size = 8# batch size
num_epoch = 30                   # the number of training epoch
learning_rate = 2e-3         # learning rate
model_path = './model.ckpt'     # the path where the checkpoint will be saved

# model parameters
input_dim = 39 * concat_nframes # the input dim of the model, you should not change the value
hidden_layers = 7               # the number of hidden layers
hidden_dim = 256              # the hidden dim
dropout = 0.35
weight_decay = 0.05

import torch
from torch.utils.data import Dataset

class LibriDataset(Dataset):
    def __init__(self, X, y=None):
        self.data = X
        if y is not None:
            # self.label = torch.LongTensor(y)
            self.label = y
        else:
            self.label = None

    def __getitem__(self, idx):
        if self.label is not None:
            return self.data[idx], self.label[idx]
        else:
            return self.data[idx]

    def __len__(self):
        return len(self.data)

    def totalSeqLen(self):
        # return self.data.shape[0] * self.data.shape[1]
        x_seq_len_list = [s.shape[0] for s in self.data]
        return sum(x_seq_len_list)

import numpy as np
import torch
import random
import os
import torch
from tqdm import tqdm
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pack_sequence, pad_packed_sequence
import gc


def same_seeds(seed):
    random.seed(seed) 
    np.random.seed(seed)  
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed) 
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


def load_feat(path):
    feat = torch.load(path)
    return feat

def shift(x, n):
    if n < 0:
        left = x[0].repeat(-n, 1)
        right = x[:n]
    elif n > 0:
        right = x[-1].repeat(n, 1)
        left = x[n:]
    else:
        return x

    return torch.cat((left, right), dim=0)

def concat_feat(x, concat_n):
    assert concat_n % 2 == 1 # n must be odd
    if concat_n < 2:
        return x
    seq_len, feature_dim = x.size(0), x.size(1)
    x = x.repeat(1, concat_n) 
    x = x.view(seq_len, concat_n, feature_dim).permute(1, 0, 2) # concat_n, seq_len, feature_dim
    mid = (concat_n // 2)
    for r_idx in range(1, mid+1):
        x[mid + r_idx, :] = shift(x[mid + r_idx], r_idx)
        x[mid - r_idx, :] = shift(x[mid - r_idx], -r_idx)

    return x.permute(1, 0, 2).view(seq_len, concat_n * feature_dim)

def preprocess_data(split, feat_dir, phone_path, concat_nframes, train_ratio=0.8, random_seed=1213):
    class_num = 41 # NOTE: pre-computed, should not need change

    if split == 'train' or split == 'val':
        mode = 'train'
    elif split == 'test':
        mode = 'test'
    else:
        raise ValueError('Invalid \'split\' argument for dataset: PhoneDataset!')

    label_dict = {}
    if mode == 'train':
        for line in open(os.path.join(phone_path, f'{mode}_labels.txt')).readlines():
            line = line.strip('\n').split(' ')
            label_dict[line[0]] = [int(p) for p in line[1:]]
        
        # split training and validation data
        usage_list = open(os.path.join(phone_path, 'train_split.txt')).readlines()
        random.seed(random_seed)
        random.shuffle(usage_list)
        train_len = int(len(usage_list) * train_ratio)
        usage_list = usage_list[:train_len] if split == 'train' else usage_list[train_len:]

    elif mode == 'test':
        usage_list = open(os.path.join(phone_path, 'test_split.txt')).readlines()

    usage_list = [line.strip('\n') for line in usage_list]
    print('[Dataset] - # phone classes: ' + str(class_num) + ', number of utterances for ' + split + ': ' + str(len(usage_list)))

    x_tensor_list = []
    if mode == 'train':
        y_tensor_list = []

    idx = 0
    for i, fname in tqdm(enumerate(usage_list)):
        feat = load_feat(os.path.join(feat_dir, mode, f'{fname}.pt'))
        cur_len = len(feat)
        feat = concat_feat(feat, concat_nframes)
        if mode == 'train':
          label = torch.LongTensor(label_dict[fname])

        x_tensor_list.append(feat)
        if mode == 'train':
            y_tensor_list.append(label)
          

    # X = torch.nn.utils.rnn.pad_sequence(x_tensor_list, batch_first=True)
    # X = torch.stack(x_tensor_list, dim=0)
    X = x_tensor_list



    if mode == 'train':
        # y = torch.nn.utils.rnn.pad_sequence(y_tensor_list, batch_first=True)
        # y = torch.stack(y_tensor_list, dim=0)
        y = y_tensor_list

    print(f'[INFO] {split} set')
    if mode == 'train':
        # print(f'[INFO] x shape: {X.shape} y shape: {y.shape}')
        return X, y
    else:
        return X


def collate_fn(data):
    return data


import torch.nn as nn
import torch
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence

class BasicBlock(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(BasicBlock, self).__init__()

        self.block = nn.Sequential(
            nn.Linear(input_dim, output_dim),
            nn.ReLU(),
        )

    def forward(self, x):
        x = self.block(x)
        return x


class LstmClassifier(nn.Module):
    def __init__(self, input_dim, output_dim=41, hidden_layers=4, hidden_dim=256, batch_size = 8):
        super(LstmClassifier, self).__init__()
        self.input_dim = input_dim
        self.batch_size = batch_size
        self.hidden_dim = hidden_dim
        self.hidden_layers = hidden_layers
        self.fc =  nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout (dropout)
        )
        self.layer_norm = nn.LayerNorm(hidden_dim * 2 )
        self.lstm = nn.LSTM(hidden_dim, hidden_dim, hidden_layers, dropout=dropout, bidirectional = True, batch_first = True)
        self.bc =  nn.Sequential(
            nn.Dropout (dropout),
            nn.Linear(hidden_dim *2, output_dim)
        )


    def forward(self, x, seq_lenght_list):
        h0 = torch.randn(self.hidden_layers*2, x.shape[0], self.hidden_dim).to('cuda')
        c0 = torch.randn(self.hidden_layers*2, x.shape[0], self.hidden_dim).to('cuda')
        x = self.fc(x)
        x_pad =pack_padded_sequence(x, batch_first=True, lengths=seq_lenght_list).to('cuda')
        out, (hn, cn) = self.lstm(x_pad,(h0,c0))
        out = pad_packed_sequence(out,batch_first=True)
        out = self.layer_norm(out[0])
        out = self.bc(out)
        return out

import numpy as np
import torch
import torch.nn as nn
import math
from torch.optim.lr_scheduler import LambdaLR

import random
import os
import torch
from tqdm import tqdm



def train():
    wandb.init(
        project="lstm-for-phoneme-recognition",
        config={
            "epochs": num_epoch,
            "batch_size": batch_size,
            "lr": learning_rate,
            "dropout": dropout,
            "weight_decay": weight_decay
            }
    )
    # Copy your config 
    config = wandb.config
    same_seeds(seed)
    device = 'cuda'
    print(f'DEVICE: {device}')

    # preprocess data
    train_X, train_y = preprocess_data(split='train', feat_dir='/kaggle/input/ml2023spring-hw2/libriphone/feat', phone_path='/kaggle/input/ml2023spring-hw2/libriphone', concat_nframes=concat_nframes, train_ratio=train_ratio, random_seed=seed)
    val_X, val_y = preprocess_data(split='val', feat_dir='/kaggle/input/ml2023spring-hw2/libriphone/feat', phone_path='/kaggle/input/ml2023spring-hw2/libriphone', concat_nframes=concat_nframes, train_ratio=train_ratio, random_seed=seed)

    # get dataset
    train_set = LibriDataset(train_X, train_y)
    val_set = LibriDataset(val_X, val_y)
    # remove raw feature to save memory
    del train_X, train_y, val_X, val_y
    gc.collect()

    # get dataloader
    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
    n_steps_per_epoch = math.ceil(len(train_set) / batch_size)

    model = LstmClassifier(input_dim=input_dim, hidden_layers=hidden_layers, hidden_dim=hidden_dim).to(device)
    #if(os.path.exists(model_path)):
    #    model.load_state_dict(torch.load(model_path))
    criterion = nn.CrossEntropyLoss() 
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    # 定义一个自定义函数来计算学习率
    def lr_lambda(epoch):
        if epoch < 5:
            return epoch / 5
        elif epoch < 20:
            return 1
        else:
            return 0.5 * (1 + math.cos(math.pi * (epoch - 20) / (num_epoch-20)))

    scheduler = LambdaLR(optimizer, lr_lambda)


    best_acc = 0
    example_ct = 0
    for epoch in range(num_epoch):
        print(optimizer.state_dict()['param_groups'][0]['lr'])
        train_acc = 0.0
        train_loss = 0.0
        val_acc = 0.0
        val_loss = 0.0

        # training
        model.train() # set the model to training mode
        for step, batch in enumerate(tqdm(train_loader)):
            batch.sort(key= lambda batch: len(batch[0]), reverse=True) 
            x_seq_list = [dataItem[0] for dataItem in batch] 
            y_seq_list = [dataItem[1] for dataItem in batch]

            x_seq_len_list = [s.shape[0] for s in x_seq_list]
            x_pad_seq = pad_sequence(x_seq_list, batch_first=True).to(device) 

            optimizer.zero_grad() 
            outputs = model(x_pad_seq, x_seq_len_list) 
            result = torch.cat([outputs[i][:x_seq_len_list[i]] for i in range(outputs.size(0))], dim=0)
            y_seq_tensor = torch.cat(y_seq_list, dim=0).to(device)
            # print(f"reslut shape {result.shape} , y_seq_tensor shape {y_seq_tensor.shape}")


            loss = criterion(result, y_seq_tensor)
            loss.backward() 
            optimizer.step() 

            _, train_pred = torch.max(result, 1) # get the index of the class with the highest probability
            # print(f"train_pred.shape: {train_pred.shape}")
            # print(f"y_seq_tensor.shape: {y_seq_tensor.shape}")

            train_acc_batch = (train_pred.detach() == y_seq_tensor.detach()).sum().item()
            train_acc += train_acc_batch
            train_loss += loss.item()

            example_ct += len(x_pad_seq)
            metrics = {"train/train_loss": loss.item(), 
                       "train/acc": train_acc_batch/y_seq_tensor.shape[0],
                       "train/epoch": (step + 1 + (n_steps_per_epoch * epoch)) / n_steps_per_epoch, 
                       "train/example_ct": example_ct,
                        "train/lr": optimizer.state_dict()['param_groups'][0]['lr']
                      }
            if step % 100 == 0:
                print(f'Train Acc: {train_acc_batch/y_seq_tensor.shape[0]} Loss: {loss.item()}')
        scheduler.step()
        # validation
        model.eval() # set the model to evaluation mode
        with torch.no_grad():
            for i, batch in enumerate(tqdm(val_loader)):
                
                batch.sort(key= lambda batch: len(batch[0]), reverse=True) 
                x_seq_list = [dataItem[0] for dataItem in batch] 
                y_seq_list = [dataItem[1] for dataItem in batch]

                x_seq_len_list = [s.shape[0] for s in x_seq_list]
                x_pad_seq = pad_sequence(x_seq_list, batch_first=True).to(device) 

                outputs = model(x_pad_seq, x_seq_len_list)

                result = torch.cat([outputs[i][:x_seq_len_list[i]] for i in range(outputs.size(0))], dim=0)
                y_seq_tensor = torch.cat(y_seq_list, dim=0).to(device)
                #print(f"reslut shape {result.shape} , y_seq_tensor shape {y_seq_tensor.shape}")

                loss = criterion(result, y_seq_tensor) 

                _, val_pred = torch.max(result, 1) 
                val_acc += (val_pred.cpu() == y_seq_tensor.cpu()).sum().item() # get the index of the class with the highest probability
                val_loss += loss.item()
                
            print(f'[{epoch+1:03d}/{num_epoch:03d}] Train Acc: {train_acc/train_set.totalSeqLen():3.5f} Loss: {train_loss/len(train_loader):3.5f} | Val Acc: {val_acc/val_set.totalSeqLen():3.5f} loss: {val_loss/len(val_loader):3.5f}')

            # if the model improves, save a checkpoint at this epoch
            val_metrics = {"val/val_loss": val_loss/len(val_loader), 
            "val/val_accuracy": val_acc/val_set.totalSeqLen()}
            wandb.log({**metrics, **val_metrics})

        if val_acc > best_acc:
            best_acc = val_acc
            torch.save(model.state_dict(), model_path)
            print(f'saving model with acc {best_acc/val_set.totalSeqLen():.5f}')

    wandb.finish()
if __name__ == '__main__':
    train()

    
  

[34m[1mwandb[0m: Currently logged in as: [33mliweixin2021[0m. Use [1m`wandb login --relogin`[0m to force relogin


DEVICE: cuda
[Dataset] - # phone classes: 41, number of utterances for train: 2571


2571it [00:13, 187.01it/s]


[INFO] train set
[Dataset] - # phone classes: 41, number of utterances for val: 858


858it [00:04, 181.61it/s]


[INFO] val set
0.0


  0%|          | 1/322 [00:04<23:49,  4.45s/it]

Train Acc: 0.042 Loss: 3.851104974746704


 31%|███▏      | 101/322 [01:20<02:56,  1.25it/s]

Train Acc: 0.041337590320954465 Loss: 3.8776588439941406


 62%|██████▏   | 201/322 [02:33<01:25,  1.42it/s]

Train Acc: 0.04041237113402062 Loss: 3.847459316253662


 76%|███████▌  | 245/322 [03:07<00:58,  1.31it/s]


KeyboardInterrupt: 

In [None]:
  
import os
import numpy as np
import torch
import torch.nn as nn

import random
import os
import torch
from tqdm import tqdm


from torchsummary import summary
# load data
test_X = preprocess_data(split='test', feat_dir='./libriphone/feat', phone_path='./libriphone', concat_nframes=concat_nframes)
test_set = LibriDataset(test_X, None)
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False,collate_fn=collate_fn)
model = LstmClassifier(input_dim=input_dim, hidden_layers=hidden_layers, hidden_dim=hidden_dim).to("cuda")
#model = LstmClassifier(input_dim=input_dim).to(device)
model.load_state_dict(torch.load(model_path))

pred = np.array([], dtype=np.int32)

model.eval()
with torch.no_grad():
    for i, batch in enumerate(tqdm(test_loader)):
        x_seq_list = batch 

        seq_len = [s.shape[0] for s in x_seq_list]
        # print(sum(seq_len))
        x_pad_seq = pad_sequence(x_seq_list, batch_first=True).to('cuda') 


        outputs = model(x_pad_seq)
        
        result = torch.cat([outputs[i][:seq_len[i]] for i in range(outputs.size(0))], dim=0)

        # for i in range(len(outputs)):
        #     print(outputs[i].shape)
        #     outputs[i] = outputs[i][:seq_len[i]]
        # outputs = torch.reshape(outputs, (outputs.shape[0] * outputs.shape[1] , outputs.shape[2]))

        # print(outputs.shape)
        

        _, test_pred = torch.max(result, 1) # get the index of the class with the highest probability
        print(test_pred.shape)
        pred = np.concatenate((pred, test_pred.cpu().numpy()), axis=0)


with open('prediction.csv', 'w') as f:
    f.write('Id,Class\n')
    for i, y in enumerate(pred):
        f.write('{},{}\n'.format(i, y))

In [None]:
!nvidia-smi

In [None]:
!pip install git+https://github.com/huggingface/accelerate

In [None]:
#import os
#from accelerate.utils import write_basic_config
#write_basic_config() # Write a config file
#os._exit(0)

In [None]:
from torch.utils.data import Dataset
import torch.nn as nn
#from torch.optim.lr_scheduler import LambdaLR
#import random
from torch.utils.data import DataLoader

import numpy as np
#import math
import os
from tqdm import tqdm
import datetime
import gc

# Accelerate parts
from accelerate import Accelerator, notebook_launcher # main interface, distributed launcher
from accelerate.utils import set_seed # reproducability across devices

In [None]:
#!accelerate env

In [None]:
#!accelerate config

In [None]:
class LibriDataset(Dataset):
    def __init__(self, X, y=None):
        self.data = X
        if y is not None:
            # self.label = torch.LongTensor(y)
            self.label = y
        else:
            self.label = None

    def __getitem__(self, idx):
        if self.label is not None:
            return self.data[idx], self.label[idx]
        else:
            return self.data[idx]

    def __len__(self):
        return len(self.data)

    def totalSeqLen(self):
        x_seq_len_list = [s.shape[0] for s in self.data]
        return sum(x_seq_len_list)

In [None]:

def load_feat(path):
    import torch
    feat = torch.load(path)
    return feat

def preprocess_data(split, feat_dir, phone_path, train_ratio=0.8, random_seed=1213):

    if split == 'train' or split == 'val':
        mode = 'train'
    elif split == 'test':
        mode = 'test'
    else:
        raise ValueError('Invalid \'split\' argument for dataset: PhoneDataset!')

    label_dict = {}
    if mode == 'train':
        for line in open(os.path.join(phone_path, f'{mode}_labels.txt')).readlines():
            line = line.strip('\n').split(' ')
            label_dict[line[0]] = [int(p) for p in line[1:]]
        
        # split training and validation data
        usage_list = open(os.path.join(phone_path, 'train_split.txt')).readlines()
        train_len = int(len(usage_list) * train_ratio)
        usage_list = usage_list[:train_len] if split == 'train' else usage_list[train_len:]

    elif mode == 'test':
        usage_list = open(os.path.join(phone_path, 'test_split.txt')).readlines()

    usage_list = [line.strip('\n') for line in usage_list]

    x_tensor_list = []
    if mode == 'train':
        y_tensor_list = []

    idx = 0
    for i, fname in enumerate(usage_list):
        feat = load_feat(os.path.join(feat_dir, mode, f'{fname}.pt'))
        cur_len = len(feat)
        if mode == 'train':
            label = label_dict[fname]

        x_tensor_list.append(feat)
        if mode == 'train':
            y_tensor_list.append(label)
          

    X = x_tensor_list



    if mode == 'train':
        y = y_tensor_list

    if mode == 'train':
        return X, y
    else:
        return X


def collate_fn(data):
    return data


    # data.sort(key= lambda data: len(data[0]), reverse=True) 

    # x_seq_list = [dataItem[0] for dataItem in data] 
    # y_seq_list = [dataItem[1] for dataItem in data]
    # seq_len = [s.shape[0] for s in x_seq_list]
    # x_pad_seq = pad_sequence(x_seq_list, batch_first=True) 
    # x_seq_pack = pack_padded_sequence(x_pad_seq, seq_len, batch_first=True)
    # print(x_pad_seq.data)
    

    # print(isinstance(data, list)) 
    # features, labels = data 

    # # if(isinstance(data[0], tuple)){
    # #     xD

    # # }
    
    # # x.sort(key=lambda x: len(x), reverse=True)
    # # seq_len = [x.size(0) for x,y in data] # 获取数据真实的长度
    # # data = pad_sequence(data, batch_first=True)
    # # data = pack_padded_sequence(data, seq_len, batch_first=True)
    # return x_pad_seq, y_seq_list 

In [None]:

#print(torch.cuda.is_initialized())

In [None]:

class BasicBlock(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(BasicBlock, self).__init__()

        self.block = nn.Sequential(
            nn.Linear(input_dim, output_dim),
            nn.ReLU(),
        )

    def forward(self, x):
        x = self.block(x)
        return x


class LstmClassifier(nn.Module):
    def __init__(self, input_dim, output_dim=41, hidden_layers=4, hidden_dim=256, batch_size = 8):
        super(LstmClassifier, self).__init__()
        self.input_dim = input_dim
        self.batch_size = batch_size
        self.hidden_dim = hidden_dim
        self.hidden_layers = hidden_layers
        self.fc =  nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU()
        )
        self.layer_norm = nn.LayerNorm(hidden_dim)
        self.bc =  nn.Sequential(
            nn.Linear(hidden_dim, output_dim)
        )



    def forward(self, x, seq_lenght_list , device):
        #h0 = torch.randn(self.hidden_layers, x.shape[0], self.hidden_dim).to(device)
        #c0 = torch.randn(self.hidden_layers, x.shape[0], self.hidden_dim).to(device)
        x = self.fc(x)
        out = self.layer_norm(out[0])
        out = self.bc(out)
        return out


In [None]:

#print(torch.cuda.is_initialized())

In [None]:



def training_loop(
        mixed_precision:str="no",     
        train_ratio = 0.75, 
        seed = 1213, 
        batch_size = 8, 
        input_dim = 39,
        hidden_layers = 7,
        hidden_dim = 256 ):
    # initialize accelerator and auto move data/model to accelerator.device

    set_seed(42)
    accelerator = Accelerator(mixed_precision=mixed_precision)
    
  


    # preprocess data
    train_X, train_y = preprocess_data(split='train', feat_dir='/kaggle/input/ml2023spring-hw2/libriphone/feat', phone_path='/kaggle/input/ml2023spring-hw2/libriphone', train_ratio=train_ratio, random_seed=seed)
    val_X, val_y = preprocess_data(split='val', feat_dir='/kaggle/input/ml2023spring-hw2/libriphone/feat', phone_path='/kaggle/input/ml2023spring-hw2/libriphone',  train_ratio=train_ratio, random_seed=seed)

    # get dataset
    train_set = LibriDataset(train_X, train_y)
    val_set = LibriDataset(val_X, val_y)
    # remove raw feature to save memory
    del train_X, train_y, val_X, val_y
    gc.collect()


    
 #======================================================================

    # get dataloader
    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
    model = LstmClassifier(input_dim=input_dim, hidden_layers=hidden_layers, hidden_dim=hidden_dim)
    
    #if(os.path.exists(model_path)):       
    #    model.load_state_dict(torch.load(model_path))
    criterion = nn.CrossEntropyLoss() 
    optimizer = torch.optim.Adam(model.parameters(), lr=2e-3)
    # 定义一个自定义函数来计算学习率
    #def lr_lambda(epoch):
    #    if epoch < 10:
    #        return epoch / 10
    #    else:
    #        return 0.5 * (1 + math.cos(math.pi * (epoch - 10) / 20))
    #scheduler = LambdaLR(optimizer, lr_lambda)
    

    
    model, optimizer, train_loader, val_loader = accelerator.prepare(
        model, optimizer, train_loader, val_loader)
    
    accelerator.print(f'device {str(accelerator.device)} is used!')
    
    accelerator.print(mixed_precision, train_ratio, seed, batch_size,learning_rate, model_path , input_dim, hidden_layers ,hidden_dim)
    same_seeds(seed)
    #======================================================================

    best_acc = 0
    for epoch in range(40):
        accelerator.print(optimizer.state_dict()['param_groups'][0]['lr'])
        train_acc = 0.0
        train_loss = 0.0
        val_acc = 0.0
        val_loss = 0.0

        # training
        model.train() # set the model to training mode
        for i, batch in enumerate(tqdm(train_loader)):
            batch.sort(key= lambda batch: len(batch[0]), reverse=True) 
            x_seq_list = [dataItem[0] for dataItem in batch] 
            y_seq_list = [dataItem[1] for dataItem in batch]

            x_seq_len_list = [s.shape[0] for s in x_seq_list]
            x_pad_seq = pad_sequence(x_seq_list, batch_first=True) 

            optimizer.zero_grad() 
            outputs = model(x_pad_seq, x_seq_len_list, device = accelerator.device) 
            result = torch.cat([outputs[i][:x_seq_len_list[i]] for i in range(outputs.size(0))], dim=0)
            y_seq_tensor = torch.cat(y_seq_list, dim=0)
            # print(f"reslut shape {result.shape} , y_seq_tensor shape {y_seq_tensor.shape}")


            loss = criterion(result, y_seq_tensor)
            
            #======================================================================
            #attention here! 
            #loss.backward() 
            accelerator.backward(loss) #loss.backward()
            #======================================================================


            _, train_pred = torch.max(result, 1) # get the index of the class with the highest probability
            # print(f"train_pred.shape: {train_pred.shape}")
            # print(f"y_seq_tensor.shape: {y_seq_tensor.shape}")

            train_acc_batch = (train_pred.detach() == y_seq_tensor.detach()).sum().item()
            train_acc += train_acc_batch
            train_loss += loss.item()
            if i % 100 == 0:
                accelerator.print(f'Train Acc: {train_acc_batch/y_seq_tensor.shape[0]} Loss: {loss.item()}')
        # validation
        model.eval() # set the model to evaluation mode
        with torch.no_grad():
            for i, batch in enumerate(tqdm(val_loader)):
                
                batch.sort(key= lambda batch: len(batch[0]), reverse=True) 
                x_seq_list = [dataItem[0] for dataItem in batch] 
                y_seq_list = [dataItem[1] for dataItem in batch]

                x_seq_len_list = [s.shape[0] for s in x_seq_list]
                x_pad_seq = pad_sequence(x_seq_list, batch_first=True)

                outputs = model(x_pad_seq, x_seq_len_list, device = accelerator.device)

                result = torch.cat([outputs[i][:x_seq_len_list[i]] for i in range(outputs.size(0))], dim=0)
                y_seq_tensor = torch.cat(y_seq_list, dim=0)
       
                loss = criterion(result, y_seq_tensor) 

                _, val_pred = torch.max(result, 1) 
            
                #======================================================================
                #gather data from multi-gpus (used when in ddp mode)
                val_pred = accelerator.gather(val_pred)
                y_seq_tensor = accelerator.gather(y_seq_tensor)
                #======================================================================

    
                val_acc += (val_pred.cpu() == y_seq_tensor.cpu()).sum().item() # get the index of the class with the highest probability
                val_loss += loss.item()
                
            accelerator.print(f'[{epoch+1:03d}/{num_epoch:03d}] Train Acc: {train_acc/train_set.totalSeqLen():3.5f} Loss: {train_loss/len(train_loader):3.5f} | Val Acc: {val_acc/val_set.totalSeqLen():3.5f} loss: {val_loss/len(val_loader):3.5f}')

        #======================================================================
        #print logs and save ckpt  
        model_path="model.ckpt"
        accelerator.wait_for_everyone()
        nowtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        accelerator.print(f"epoch【{epoch}】@{nowtime} --> val_acc= {100 * val_acc:.2f}%")
        unwrapped_net = accelerator.unwrap_model(model)
        accelerator.save(unwrapped_net.state_dict(),model_path+"_"+str(epoch))
        #======================================================================

        '''
        # if the model improves, save a checkpoint at this epoch
        if val_acc > best_acc:
            best_acc = val_acc
            torch.save(model.state_dict(), model_path)
            print(f'saving model with acc {best_acc/val_set.totalSeqLen():.5f}')
        '''

In [None]:
#import torch
#print(torch.cuda.is_initialized())

In [None]:
notebook_launcher(training_loop, ("no",  0.75, 1213, 8, 39, 7,256 ) ,num_processes=2)