In [None]:
# numerical operation
import numpy as np
import math
import random

# data I/O
import csv
import pandas as pd
import os

# garbage collect
import gc

# progress bar
from tqdm import tqdm

# PyTorch
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split

# plotting learning curve
from torch.utils.tensorboard import SummaryWriter

# run time
from time import time

In [None]:
# Main link
!wget -O libriphone.zip "https://github.com/xraychen/shiny-robot/releases/download/v1.0/libriphone.zip"

# Backup link 0
# !pip install --upgrade gdown
# !gdown --id '1o6Ag-G3qItSmYhTheX6DYiuyNzWyHyTc' --output libriphone.zip

# Backup link 1
# !pip install --upgrade gdown
# !gdown --id '1R1uQYi4QpX0tBfUWt2mbZcncdBsJkxeW' --output libriphone.zip

# Backup link 2
# !wget -O libriphone.zip "https://www.dropbox.com/s/wqww8c5dbrl2ka9/libriphone.zip?dl=1"

# Backup link 3
# !wget -O libriphone.zip "https://www.dropbox.com/s/p2ljbtb2bam13in/libriphone.zip?dl=1"

!unzip -q libriphone.zip
!ls libriphone

--2024-07-25 03:31:23--  https://github.com/xraychen/shiny-robot/releases/download/v1.0/libriphone.zip
Resolving github.com (github.com)... 140.82.112.4
Connecting to github.com (github.com)|140.82.112.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/463868124/343908dd-b2e4-4b8e-b7d6-7f0f040179ce?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=releaseassetproduction%2F20240725%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20240725T033123Z&X-Amz-Expires=300&X-Amz-Signature=05c736a7f1f5c2d3d4b7ce81233e88e05f9a6f73e5e93f04c940eb9a950be37c&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=463868124&response-content-disposition=attachment%3B%20filename%3Dlibriphone.zip&response-content-type=application%2Foctet-stream [following]
--2024-07-25 03:31:23--  https://objects.githubusercontent.com/github-production-release-asset-2e65be/463868124/343908dd-b2e4-4b8e-b7d6-7f0f040179ce?X-Amz-

In [None]:
"""fix random seed"""
def same_seed(seed):
    """Make runs repeatable by seeding every random number generator in use.

    Seeds Python's ``random``, NumPy and PyTorch (CPU, plus every CUDA device
    when CUDA is available) with ``seed``, and switches cuDNN to its
    deterministic, non-benchmarking mode.
    """
    # Same seed for the three libraries that draw random numbers here.
    for seed_fn in (np.random.seed, torch.manual_seed, random.seed):
        seed_fn(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

    # Disable cuDNN autotuning and ask for deterministic kernels.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

In [None]:
"""Hyperparameters"""
# Run on the GPU when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Settings shared by the data pipeline and the training loop.
config = dict(
    concat_feat_num=19,              # frames concatenated per sample (must be odd)
    train_ratio=0.8,                 # fraction of training utterances kept for training
    seed=5201314,                    # value passed to same_seed
    batch_size=2048,                 # batch size of all three data loaders
    lr=1e-4,                         # initial Adam learning rate
    epochs=50,                       # maximum number of training epochs
    save_path='./model/model.pth',   # where the best checkpoint is written
    early_stop=10,                   # epochs without improvement before halting
)

In [None]:
"""Dataset"""
class hw2_Dataset(Dataset):
    """Map-style dataset over feature rows with optional integer labels.

    ``x`` is stored as given and indexed directly. ``y``, when provided, is
    converted to a ``torch.LongTensor``; when omitted, items are the bare
    feature rows instead of ``(feature, label)`` pairs.
    """

    def __init__(self, x, y=None):
        self.x = x
        self.y = None if y is None else torch.LongTensor(y)

    def __len__(self):
        # The number of samples is driven by the features alone.
        return len(self.x)

    def __getitem__(self, index):
        sample = self.x[index]
        if self.y is not None:
            return sample, self.y[index]
        return sample

In [None]:
"""Pretreatment data"""
# Seed all RNGs before the utterance list is shuffled and split further down.
same_seed(config['seed'])


def shift(x, n):
    """Shift the rows of ``x`` by ``n`` positions, padding with the edge row.

    Row ``i`` of the result is row ``i + n`` of ``x``. Positions that would
    fall before the first row reuse the first row, and positions past the last
    row reuse the last row, so the result always has as many rows as ``x``.

    Args:
        x: tensor whose first dimension is the sequence (row) axis.
        n: number of rows to shift by. Positive values look ahead (row ``i``
            receives a later row), negative values look back.

    Returns:
        ``x`` itself when ``n == 0`` or ``x`` has no rows, otherwise a new
        tensor with the same shape as ``x``.
    """
    if n == 0:
        return x
    seq_len = x.size(0)
    if seq_len == 0:
        # Nothing to shift, and there is no edge row to pad with.
        return x
    # Clamping the gathered indices keeps the output length equal to seq_len
    # even when |n| >= seq_len (the slice-and-concatenate form returned |n|
    # rows in that case, which broke callers on very short sequences).
    idx = torch.arange(seq_len, device=x.device).add_(n).clamp_(0, seq_len - 1)
    return x[idx]


def concat_feat(x, concat_n):
    """Attach neighbouring frames to every frame of a 2-D feature matrix.

    For each row, the rows at relative offsets ``-concat_n // 2 .. +concat_n // 2``
    (edge rows repeated at the boundaries, via ``shift``) are laid side by side,
    earliest offset first.

    Args:
        x: tensor of shape ``(seq_len, feature_dim)``.
        concat_n: total window size; must be odd.

    Returns:
        ``x`` unchanged when ``concat_n < 2``, otherwise a new tensor of shape
        ``(seq_len, concat_n * feature_dim)``.
    """
    assert concat_n % 2 == 1  # n must be odd
    if concat_n < 2:
        return x
    seq_len, feature_dim = x.size(0), x.size(1)
    half = concat_n // 2
    # Slot k of the window holds the frames shifted by k - half, so the centre
    # slot is the unshifted input.
    window = torch.stack([shift(x, offset) for offset in range(-half, half + 1)], dim=1)
    return window.view(seq_len, concat_n * feature_dim)


def pretreatment_feat(data, label_dict, concat_num, mode='train', max_len=3000000, feat_dim=39):
    """Load, context-expand and pack the features of a list of utterances.

    Each utterance is read from ``<path>/feat/<mode>/<fname>.pt`` (``path`` is
    the module-level dataset root), expanded with ``concat_feat`` and copied
    into one pre-allocated buffer.

    Args:
        data: iterable of utterance names (file stems).
        label_dict: mapping from utterance name to its per-frame labels; only
            read when ``mode == 'train'``.
        concat_num: window size handed to ``concat_feat`` (odd).
        mode: sub-directory to read from. ``'train'`` also collects labels;
            any other value returns features only.
        max_len: capacity of the pre-allocated buffer, in frames.
        feat_dim: number of features per frame in the stored files.

    Returns:
        ``(x, y)`` when ``mode == 'train'``, otherwise ``x``. ``x`` has shape
        ``(total_frames, feat_dim * concat_num)`` and ``y`` is a long tensor of
        shape ``(total_frames,)``. Both are views into the ``max_len`` buffers.

    Raises:
        ValueError: if the utterances do not fit into ``max_len`` frames, or
            if an utterance's label count differs from its frame count.
    """
    with_labels = mode == 'train'
    # Size the buffer from the concat_num argument (not the global config) so
    # it always matches what concat_feat produces for this call.
    x = torch.empty(max_len, feat_dim * concat_num)
    y = torch.empty(max_len, dtype=torch.long) if with_labels else None
    idx = 0
    for fname in data:
        # NOTE(review): torch.load unpickles the file; only use trusted data.
        feat = torch.load(os.path.join(path, 'feat', mode, f'{fname}.pt'))
        cur_len = len(feat)
        if idx + cur_len > max_len:
            raise ValueError(
                f'feature buffer overflow: more than max_len={max_len} frames '
                f'(while adding {fname!r})')
        x[idx:idx + cur_len, :] = concat_feat(feat, concat_num)
        if with_labels:
            label = torch.LongTensor(label_dict[fname])
            if len(label) != cur_len:
                # Without this check a single label would be broadcast silently.
                raise ValueError(
                    f'{fname!r} has {cur_len} frames but {len(label)} labels')
            y[idx:idx + cur_len] = label
        idx += cur_len

    # Drop the unused tail of the buffers.
    x = x[:idx, :]
    if with_labels:
        return x, y[:idx]
    return x


path = './libriphone'
# train_label_dict: every line is "<utterance> <label> <label> ...", one
# integer label per frame; files are opened with `with` so they get closed.
with open(os.path.join(path, 'train_labels.txt')) as label_file:
    train_label = label_file.readlines()
label_dict = {}
for line in train_label:
    line = line.strip('\n').split(' ')
    label_dict[line[0]] = [int(p) for p in line[1:]]

# train_data_list
with open(os.path.join(path, 'train_split.txt')) as split_file:
    train_data = [line.strip('\n') for line in split_file]
random.shuffle(train_data)

# split train data to train set and valid set (by utterance, not by frame)
split_idx = int(config['train_ratio'] * len(train_data))
x_train_data = train_data[:split_idx]
x_valid_data = train_data[split_idx:]

# pretreatment (validation utterances also live in the 'train' feature folder)
x_train, y_train = pretreatment_feat(x_train_data, label_dict, config['concat_feat_num'])
x_valid, y_valid = pretreatment_feat(x_valid_data, label_dict, config['concat_feat_num'])

# test data, same operation as train data
with open(os.path.join(path, 'test_split.txt')) as split_file:
    test_data = [line.strip('\n') for line in split_file]
x_test = pretreatment_feat(test_data, label_dict, config['concat_feat_num'], mode='test')

print(f"""train data size: {len(x_train)},
valid data size: {len(x_valid)},
test data size: {len(x_test)}""")

train_dataset, valid_dataset, test_dataset = hw2_Dataset(x_train, y_train), hw2_Dataset(x_valid, y_valid), \
    hw2_Dataset(x_test)

# Only the training loader needs shuffling; validation accuracy does not
# depend on batch order and the test order must match the submission ids.
train_loader = DataLoader(dataset=train_dataset, batch_size=config['batch_size'], shuffle=True, pin_memory=True)
valid_loader = DataLoader(dataset=valid_dataset, batch_size=config['batch_size'], shuffle=False, pin_memory=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=config['batch_size'], shuffle=False, pin_memory=True)

# Drop the notebook-level names that are no longer needed; tensors still
# referenced by the datasets stay alive.
del train_data, x_train_data, label_dict, x_valid_data, train_label, test_data, x_train, \
    y_train, x_valid, y_valid, x_test
gc.collect()

train data size: 2112269,
valid data size: 531889,
test data size: 646268


188

In [None]:
"""Model"""
class hw2_model(nn.Module):
    """Fully connected classifier over concatenated frame features.

    The network is ``hidden_layers`` repetitions of
    ``Linear -> ReLU -> BatchNorm1d -> Dropout`` followed by a final ``Linear``
    that produces one logit per class. With the default arguments the layer
    layout (and therefore the ``state_dict`` keys) is four 1024-wide hidden
    blocks with dropout 0.35.

    Args:
        input_dim: size of each input vector.
        output_dim: number of output classes (logits).
        hidden_dim: width of every hidden layer.
        hidden_layers: number of hidden blocks; ``0`` gives a single linear layer.
        dropout: dropout probability used after every hidden block.
    """

    def __init__(self, input_dim, output_dim=41, hidden_dim=1024, hidden_layers=4, dropout=0.35):
        super().__init__()
        blocks = []
        in_features = input_dim
        # Modules are created in forward order so that parameter
        # initialisation consumes the RNG in the same sequence every time.
        for _ in range(hidden_layers):
            blocks += [
                nn.Linear(in_features, hidden_dim),
                nn.ReLU(),
                nn.BatchNorm1d(hidden_dim),
                nn.Dropout(dropout),
            ]
            in_features = hidden_dim
        blocks.append(nn.Linear(in_features, output_dim))
        self.layers = nn.Sequential(*blocks)

    def forward(self, x):
        """Return the class logits for a batch ``x`` of shape ``(batch, input_dim)``."""
        return self.layers(x)

In [None]:
"""training loop"""
def trainer(train_loader, valid_loader, model, config, device, train_len, valid_len):
    """Train ``model`` with early stopping and keep the best checkpoint.

    Uses cross-entropy loss, Adam and a cosine-annealing-with-warm-restarts
    schedule that is stepped once per epoch. After every epoch the model is
    evaluated on ``valid_loader``; whenever the number of correct validation
    predictions improves, the weights are saved to ``config['save_path']``.
    Accuracy and mean batch loss for both splits are written to TensorBoard.

    Args:
        train_loader: iterable of ``(x, y)`` training batches.
        valid_loader: iterable of ``(x, y)`` validation batches.
        model: the network to optimise (already on ``device``).
        config: dict providing ``'lr'``, ``'epochs'``, ``'save_path'`` and
            ``'early_stop'``.
        device: device the batches are moved to.
        train_len: number of training samples (accuracy denominator).
        valid_len: number of validation samples (accuracy denominator).

    Returns:
        None. Training stops after ``config['epochs']`` epochs, or earlier when
        validation accuracy has not improved for ``config['early_stop']``
        consecutive epochs.
    """
    criterion = nn.CrossEntropyLoss().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'])
    scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
        optimizer, T_0=10, T_mult=2, eta_min=config['lr'] / 2)

    # Create the directory the checkpoint is actually written to, instead of
    # assuming it is always './model'.
    save_dir = os.path.dirname(config['save_path'])
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    n_epochs, best_acc, step, early_stop_count = config['epochs'], -math.inf, 0, 0

    writer = SummaryWriter()
    try:
        for epoch in range(n_epochs):
            model.train()
            train_acc, train_loss, train_batches = 0, 0.0, 0
            train_pbar = tqdm(train_loader, position=0, leave=True)

            for x, y in train_pbar:
                optimizer.zero_grad()
                x, y = x.to(device), y.to(device)
                outputs = model(x)
                loss = criterion(outputs, y)
                loss.backward()
                optimizer.step()

                _, train_pred = torch.max(outputs, 1)  # get index
                train_acc += (train_pred.detach() == y.detach()).sum().item()
                batch_loss = loss.item()
                train_loss += batch_loss
                train_batches += 1

                step += 1
                train_pbar.set_description(f'Epoch [{epoch + 1}/{n_epochs}]')
                train_pbar.set_postfix({'loss': batch_loss})

            train_acc_rate = train_acc / train_len
            writer.add_scalar('acc_rate/train', train_acc_rate, step)
            writer.add_scalar('loss/train', train_loss / max(train_batches, 1), step)

            model.eval()
            valid_acc, valid_loss, valid_batches = 0, 0.0, 0
            with torch.no_grad():
                for x, y in valid_loader:
                    x, y = x.to(device), y.to(device)
                    outputs = model(x)
                    # .item() keeps the running sum a plain float rather than
                    # a tensor that lives on the device.
                    valid_loss += criterion(outputs, y).item()
                    _, valid_pred = torch.max(outputs, 1)
                    valid_acc += (valid_pred == y).sum().item()
                    valid_batches += 1

            valid_acc_rate = valid_acc / valid_len
            writer.add_scalar('acc_rate/valid', valid_acc_rate, step)
            writer.add_scalar('loss/valid', valid_loss / max(valid_batches, 1), step)

            print(f'Epoch [{epoch + 1}/{n_epochs}]: Train acc: {train_acc_rate}, Valid acc: {valid_acc_rate}')

            if valid_acc > best_acc:
                best_acc = valid_acc
                torch.save(model.state_dict(), config['save_path'])
                print('save model with acc: {:.3f}'.format(valid_acc_rate))
                early_stop_count = 0
            else:
                early_stop_count += 1

            if early_stop_count >= config['early_stop']:
                print('\n model is not improving, so we halt the training')
                return
            # Stepped per epoch, so T_0=10 means the first cosine cycle lasts
            # ten epochs.
            scheduler.step()
    finally:
        # Flush and release the TensorBoard event file on every exit path,
        # including the early-stop return above.
        writer.close()


"""start training"""
# Wall-clock timing of the whole training run, in seconds.
st_time = time()
# Input width is 39 features per frame times the number of concatenated
# frames; the second argument is the number of output classes.
model = hw2_model(39 * config['concat_feat_num'], 41).to(device)
trainer(train_loader, valid_loader, model, config, device, len(train_dataset), len(valid_dataset))
end_time = time()
print(f'Total train time: {end_time - st_time}')

Epoch [1/50]: 100%|██████████| 1032/1032 [00:49<00:00, 20.84it/s, loss=1.38]


Epoch [1/50]: Train acc: 0.5293942201490435, Valid acc: 0.6275031068512416
save model with acc: 0.628


Epoch [2/50]: 100%|██████████| 1032/1032 [00:48<00:00, 21.25it/s, loss=1.27]


Epoch [2/50]: Train acc: 0.6055762784001469, Valid acc: 0.657765060003121
save model with acc: 0.658


Epoch [3/50]: 100%|██████████| 1032/1032 [00:48<00:00, 21.21it/s, loss=1.22]


Epoch [3/50]: Train acc: 0.630153640469088, Valid acc: 0.6742252612857194
save model with acc: 0.674


Epoch [4/50]: 100%|██████████| 1032/1032 [00:47<00:00, 21.53it/s, loss=1.11]


Epoch [4/50]: Train acc: 0.645745404586253, Valid acc: 0.6849756246134062
save model with acc: 0.685


Epoch [5/50]: 100%|██████████| 1032/1032 [00:48<00:00, 21.47it/s, loss=1.14]


Epoch [5/50]: Train acc: 0.6559846307454211, Valid acc: 0.6929885746838156
save model with acc: 0.693


Epoch [6/50]: 100%|██████████| 1032/1032 [00:48<00:00, 21.42it/s, loss=1.01]


Epoch [6/50]: Train acc: 0.6643505159617454, Valid acc: 0.699858429108329
save model with acc: 0.700


Epoch [7/50]: 100%|██████████| 1032/1032 [00:47<00:00, 21.64it/s, loss=1.05]


Epoch [7/50]: Train acc: 0.6709192815877144, Valid acc: 0.7034606844661198
save model with acc: 0.703


Epoch [8/50]: 100%|██████████| 1032/1032 [00:48<00:00, 21.27it/s, loss=1.11]


Epoch [8/50]: Train acc: 0.6763134809060778, Valid acc: 0.7080894697953897
save model with acc: 0.708


Epoch [9/50]: 100%|██████████| 1032/1032 [00:48<00:00, 21.33it/s, loss=1.06]


Epoch [9/50]: Train acc: 0.6808242700148514, Valid acc: 0.71069715673759
save model with acc: 0.711


Epoch [10/50]: 100%|██████████| 1032/1032 [00:49<00:00, 20.97it/s, loss=1.03]


Epoch [10/50]: Train acc: 0.6847446987102495, Valid acc: 0.7136977828080671
save model with acc: 0.714


Epoch [11/50]: 100%|██████████| 1032/1032 [00:49<00:00, 20.80it/s, loss=0.964]


Epoch [11/50]: Train acc: 0.6849014022361736, Valid acc: 0.7145682651831491
save model with acc: 0.715


Epoch [12/50]: 100%|██████████| 1032/1032 [00:50<00:00, 20.51it/s, loss=0.984]


Epoch [12/50]: Train acc: 0.6890907360757555, Valid acc: 0.7176140134501747
save model with acc: 0.718


Epoch [13/50]: 100%|██████████| 1032/1032 [00:50<00:00, 20.45it/s, loss=0.885]


Epoch [13/50]: Train acc: 0.6937563350122546, Valid acc: 0.7218649003833507
save model with acc: 0.722


Epoch [14/50]: 100%|██████████| 1032/1032 [00:49<00:00, 20.68it/s, loss=0.918]


Epoch [14/50]: Train acc: 0.697693806991439, Valid acc: 0.724134170851437
save model with acc: 0.724


Epoch [15/50]: 100%|██████████| 1032/1032 [00:49<00:00, 20.99it/s, loss=0.887]


Epoch [15/50]: Train acc: 0.701236442896241, Valid acc: 0.7265030861702347
save model with acc: 0.727


Epoch [16/50]: 100%|██████████| 1032/1032 [00:49<00:00, 20.94it/s, loss=0.969]


Epoch [16/50]: Train acc: 0.7046735051264778, Valid acc: 0.7281274852459818
save model with acc: 0.728


Epoch [17/50]: 100%|██████████| 1032/1032 [00:48<00:00, 21.28it/s, loss=0.921]


Epoch [17/50]: Train acc: 0.7080674857226992, Valid acc: 0.7306524481611765
save model with acc: 0.731


Epoch [18/50]: 100%|██████████| 1032/1032 [00:48<00:00, 21.50it/s, loss=0.92]


Epoch [18/50]: Train acc: 0.7108029327704, Valid acc: 0.7329743611919028
save model with acc: 0.733


Epoch [19/50]: 100%|██████████| 1032/1032 [00:49<00:00, 21.06it/s, loss=0.897]


Epoch [19/50]: Train acc: 0.7133944587550165, Valid acc: 0.7340704545497275
save model with acc: 0.734


Epoch [20/50]: 100%|██████████| 1032/1032 [00:47<00:00, 21.55it/s, loss=0.898]


Epoch [20/50]: Train acc: 0.7159954532306254, Valid acc: 0.7357813378355258
save model with acc: 0.736


Epoch [21/50]: 100%|██████████| 1032/1032 [00:48<00:00, 21.27it/s, loss=0.896]


Epoch [21/50]: Train acc: 0.7183251754393025, Valid acc: 0.7367571053358878
save model with acc: 0.737


Epoch [22/50]: 100%|██████████| 1032/1032 [00:48<00:00, 21.21it/s, loss=0.909]


Epoch [22/50]: Train acc: 0.7206118160139642, Valid acc: 0.7382047758084863
save model with acc: 0.738


Epoch [23/50]: 100%|██████████| 1032/1032 [00:51<00:00, 20.20it/s, loss=0.841]


Epoch [23/50]: Train acc: 0.7228094527732973, Valid acc: 0.7396317652743336
save model with acc: 0.740


Epoch [24/50]: 100%|██████████| 1032/1032 [00:50<00:00, 20.43it/s, loss=0.88]


Epoch [24/50]: Train acc: 0.724108529737453, Valid acc: 0.7409572297979465
save model with acc: 0.741


Epoch [25/50]: 100%|██████████| 1032/1032 [00:49<00:00, 21.02it/s, loss=0.921]


Epoch [25/50]: Train acc: 0.7256315365135785, Valid acc: 0.7419160764746028
save model with acc: 0.742


Epoch [26/50]: 100%|██████████| 1032/1032 [00:49<00:00, 20.69it/s, loss=0.851]


Epoch [26/50]: Train acc: 0.7279925047425304, Valid acc: 0.7428091199479591
save model with acc: 0.743


Epoch [27/50]: 100%|██████████| 1032/1032 [00:49<00:00, 20.67it/s, loss=0.807]


Epoch [27/50]: Train acc: 0.7291339313316628, Valid acc: 0.7435273149096898
save model with acc: 0.744


Epoch [28/50]: 100%|██████████| 1032/1032 [00:49<00:00, 20.90it/s, loss=0.814]


Epoch [28/50]: Train acc: 0.7304046028228413, Valid acc: 0.7439127336718752
save model with acc: 0.744


Epoch [29/50]: 100%|██████████| 1032/1032 [00:51<00:00, 19.88it/s, loss=0.788]


Epoch [29/50]: Train acc: 0.7314574990211947, Valid acc: 0.7449731052907655
save model with acc: 0.745


Epoch [30/50]: 100%|██████████| 1032/1032 [00:50<00:00, 20.45it/s, loss=0.867]


Epoch [30/50]: Train acc: 0.7325075546722506, Valid acc: 0.7449167025450799


Epoch [31/50]: 100%|██████████| 1032/1032 [00:50<00:00, 20.47it/s, loss=0.818]


Epoch [31/50]: Train acc: 0.7291618633800904, Valid acc: 0.743096773950956


Epoch [32/50]: 100%|██████████| 1032/1032 [00:50<00:00, 20.57it/s, loss=0.87]


Epoch [32/50]: Train acc: 0.7299619508689471, Valid acc: 0.7441928673087805


Epoch [33/50]: 100%|██████████| 1032/1032 [00:50<00:00, 20.49it/s, loss=0.779]


Epoch [33/50]: Train acc: 0.7308325786156972, Valid acc: 0.7446610100979716


Epoch [34/50]: 100%|██████████| 1032/1032 [00:50<00:00, 20.45it/s, loss=0.858]


Epoch [34/50]: Train acc: 0.7323016149931662, Valid acc: 0.7464132553972728
save model with acc: 0.746


Epoch [35/50]: 100%|██████████| 1032/1032 [00:49<00:00, 20.70it/s, loss=0.869]


Epoch [35/50]: Train acc: 0.7332782898390309, Valid acc: 0.7460729588316359


Epoch [36/50]: 100%|██████████| 1032/1032 [00:50<00:00, 20.42it/s, loss=0.898]


Epoch [36/50]: Train acc: 0.7351279595543939, Valid acc: 0.7469716425795607
save model with acc: 0.747


Epoch [37/50]: 100%|██████████| 1032/1032 [00:49<00:00, 20.76it/s, loss=0.784]


Epoch [37/50]: Train acc: 0.7360113697639836, Valid acc: 0.747242375758852
save model with acc: 0.747


Epoch [38/50]: 100%|██████████| 1032/1032 [00:49<00:00, 20.83it/s, loss=0.776]


Epoch [38/50]: Train acc: 0.7368677947742451, Valid acc: 0.7476541158023573
save model with acc: 0.748


Epoch [39/50]: 100%|██████████| 1032/1032 [00:50<00:00, 20.52it/s, loss=0.847]


Epoch [39/50]: Train acc: 0.7381039062733014, Valid acc: 0.7485509194587593
save model with acc: 0.749


Epoch [40/50]: 100%|██████████| 1032/1032 [00:50<00:00, 20.39it/s, loss=0.786]


Epoch [40/50]: Train acc: 0.7388968923939139, Valid acc: 0.7488573743769846
save model with acc: 0.749


Epoch [41/50]: 100%|██████████| 1032/1032 [00:50<00:00, 20.53it/s, loss=0.835]


Epoch [41/50]: Train acc: 0.7404113775281462, Valid acc: 0.7492841551526729
save model with acc: 0.749


Epoch [42/50]: 100%|██████████| 1032/1032 [00:50<00:00, 20.58it/s, loss=0.786]


Epoch [42/50]: Train acc: 0.7413700622411256, Valid acc: 0.7499553478263322
save model with acc: 0.750


Epoch [43/50]: 100%|██████████| 1032/1032 [00:51<00:00, 19.99it/s, loss=0.824]


Epoch [43/50]: Train acc: 0.741962789777249, Valid acc: 0.7501640379853691
save model with acc: 0.750


Epoch [44/50]: 100%|██████████| 1032/1032 [00:50<00:00, 20.55it/s, loss=0.904]


Epoch [44/50]: Train acc: 0.7430829122616485, Valid acc: 0.7512375702449196
save model with acc: 0.751


Epoch [45/50]: 100%|██████████| 1032/1032 [00:50<00:00, 20.31it/s, loss=0.832]


Epoch [45/50]: Train acc: 0.7445893491785374, Valid acc: 0.7522114576537586
save model with acc: 0.752


Epoch [46/50]: 100%|██████████| 1032/1032 [00:50<00:00, 20.42it/s, loss=0.811]


Epoch [46/50]: Train acc: 0.7457426113814102, Valid acc: 0.7516643510206077


Epoch [47/50]: 100%|██████████| 1032/1032 [00:49<00:00, 20.86it/s, loss=0.806]


Epoch [47/50]: Train acc: 0.7459925795436093, Valid acc: 0.7532680690896033
save model with acc: 0.753


Epoch [48/50]: 100%|██████████| 1032/1032 [00:49<00:00, 20.75it/s, loss=0.747]


Epoch [48/50]: Train acc: 0.7471269047644973, Valid acc: 0.7530161368255407


Epoch [49/50]: 100%|██████████| 1032/1032 [00:50<00:00, 20.54it/s, loss=0.743]


Epoch [49/50]: Train acc: 0.7480496092117055, Valid acc: 0.7534485578757973
save model with acc: 0.753


Epoch [50/50]: 100%|██████████| 1032/1032 [00:50<00:00, 20.62it/s, loss=0.777]


Epoch [50/50]: Train acc: 0.7488435421814172, Valid acc: 0.7535538430010773
save model with acc: 0.754
Total train time: 2883.774839401245


In [None]:
"""get model"""
# Rebuild the architecture and restore the best weights saved by trainer.
model = hw2_model(39 * config['concat_feat_num'], 41).to(device)
# map_location lets a checkpoint written on a GPU be restored on whatever
# device this session uses (e.g. a CPU-only runtime).
model.load_state_dict(torch.load(config['save_path'], map_location=device))

"""predict test and save result"""


def predict(test_loader, model, device):
    """Return the predicted class index for every sample in ``test_loader``.

    The model is put in eval mode and run without gradient tracking. Each
    batch yielded by the loader is the input tensor itself (no labels). The
    per-batch predictions are concatenated in loader order and returned as a
    NumPy array.
    """
    model.eval()
    batch_preds = []
    with torch.no_grad():
        for batch in tqdm(test_loader):
            logits = model(batch.to(device))
            # torch.max over dim 1 returns (values, indices); keep the indices.
            batch_preds.append(torch.max(logits, 1)[1].detach().cpu())
    return torch.cat(batch_preds, dim=0).numpy()


def save_pred(preds, file):
    """Write predictions to ``file`` as a two-column CSV.

    The first row is the header ``Id,Class``; every following row holds the
    zero-based position of a prediction and the prediction itself.

    Args:
        preds: iterable of predicted class values, in submission order.
        file: path of the CSV file to create (overwritten if it exists).
    """
    # The csv module requires newline='' on the file object; without it the
    # writer's '\r\n' row endings are translated again on Windows, producing
    # a blank line after every row.
    with open(file, 'w', newline='') as fp:
        writer = csv.writer(fp)
        writer.writerow(['Id', 'Class'])
        writer.writerows(enumerate(preds))


# Predict the test set with the restored model and write the Id/Class CSV.
preds = predict(test_loader, model, device)
save_pred(preds, 'pred.csv')

100%|██████████| 316/316 [00:04<00:00, 68.91it/s]
