In [1]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from torch.utils.data import Dataset
import torchaudio
from torch.utils.data import DataLoader
import os
import numpy as np
import torch.nn.functional as F
from konlpy.tag import Okt

In [2]:
df = pd.read_csv("text/part1.csv")

In [3]:
import math

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(0.1)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0,d_model,2).float() * (-math.log(100000.0) / d_model))
        pe[:, 0::2] = torch.sin(position*div_term)
        pe[:, 1::2] = torch.cos(position*div_term)
        pe = pe.unsqueeze(0).transpose(0,1)
        self.register_buffer('pe', pe)
    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        x = self.dropout(x)
        return x

class SelfAttentionPooling(nn.Module):
    def __init__(self, input_dim):
        super(SelfAttentionPooling, self).__init__()
        self.W = nn.Linear(input_dim, 1)
    def forward(self, batch_rep):
        softmax = nn.functional.softmax
        att_w = softmax(self.W(batch_rep).squeeze(-1)).unsqueeze(-1)
        utter_rep = torch.sum(batch_rep * att_w, dim=1)
        return utter_rep

class myRNN(torch.nn.Module):
    def __init__(self,out_size, max_len):
        
        super().__init__()
        
        self.max_len = max_len
        self.pos_encoder = PositionalEncoding(512, 0.2)
        self.self_att_pool = SelfAttentionPooling(512)
        self.embedding = torch.nn.Embedding(max_len, 512)
                
        self.features = torch.nn.GRU(input_size=512,
                          hidden_size=512,
                          num_layers=2,
                          batch_first=True,
                          bidirectional=True,
                          dropout=0.3)
        
        encoder_layer = torch.nn.TransformerEncoderLayer(d_model=512,
                                                        nhead=4,
                                                        dim_feedforward=256,
                                                        dropout=0.2,
                                                        activation="relu")
        self.encoder = torch.nn.TransformerEncoder(encoder_layer,
                                                   num_layers=2)

        self.classifier = torch.nn.Sequential(
            torch.nn.Linear(443232,512),
            torch.nn.Dropout(0.3),
            torch.nn.Linear(512, 512),
            torch.nn.Dropout(0.3),
        )
        self.hidden = torch.nn.Linear(512*4, 1024)
        
        
        self.output = torch.nn.Sequential(
            torch.nn.Linear(1024, 512),
            torch.nn.ReLU(),
            torch.nn.Dropout(0.3),
            torch.nn.Linear(512, output_size),
        )
        
        self.output_prob = torch.nn.Sequential(
            torch.nn.Conv2d(1,29,3,1,1),
            torch.nn.ReLU(),
            torch.nn.Conv2d(29,29,3,1,1),
            torch.nn.Softmax(),
        )
        
    def init_weights(self):
        torch.nn.init.xavier_normal_(self.classifier.weight.data, gain=1.0)
        torch.nn.init.xavier_normal_(self.encoder.weight.data, gain=1.0)
        torch.nn.init.xavier_uniform_(self.features.weight.data, gain=1.0)

    def forward(self, x, length):
        idx = torch.randint(0,self.max_len,(1,length)).to(device)
        #print("inital x shape : ", x.shape)
        x = self.classifier(x)
#         print("conv : ",x.shape)
        x = self.pos_encoder(x) ## [batch, embedding dim, sequence]
#         print("pos_encoder : ",x.shape)
        x = self.encoder(x)
#         print("encoder : ", x.shape)
        x = x.permute(1,0,2)
        x = self.self_att_pool(x)
#         print("att_pool : ",x.shape)
        emb = self.embedding(idx)
        emb = self.pos_encoder(emb)
        emb = self.self_att_pool(emb)
        
        x = torch.add(x,emb)        
        features, h = self.features(x)
        h = self.hidden(h.flatten()).unsqueeze(0)
        features = torch.add(features, h)
#         print("GRU : ", features.shape)
        output = self.output(features)
#         print("output : ", output.shape)
        output = output.unsqueeze(1)
#         print(output.shape)
        output = self.output_prob(output)
        output = output.permute(1,0,2)
#         print(output.shape)
        return output

In [4]:
df = df[:1000]
df = df.dropna()
df = df[df['sex'] == '여성']
df = df.reset_index()
okt = Okt()
df['token']=df['form'].apply(okt.morphs)

In [5]:
bin_list = []
for i in range(len(df.index)):
    if df["token"][i] not in bin_list:
        bin_list.append(df["token"][i])
bin_list = sum(bin_list,[])

In [6]:
bin_list2 = []
for i in range(len(bin_list)):
    temp = bin_list[i]
    if temp not in bin_list2:
        bin_list2.append(temp)

In [7]:
lookup = {tkn: i+4 for i, tkn in enumerate(bin_list2)}
lookup["unk"] = 0
lookup["pad"] = 1
lookup["bos"] = 2
lookup["eos"] = 3

In [8]:
max = 0
for i in range(len(df)):
    temp = len(df["token"][i])
    if max < temp:
        max = temp

In [9]:
class CustomDataset(Dataset):
    def __init__(self, dataframe, wav_dir, bin_list):
        self.data = dataframe
        self.wav_dir = wav_dir

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        file_id = self.data.iloc[index,1] ##id
        wav_path = os.path.join(self.wav_dir, f'{file_id}.wav')
        audio, _ = torchaudio.load(wav_path)
        text = self.data.iloc[index,-1]
        
        token_indices = [lookup["bos"]]
        for i in range(len(text)):
            token_indices.append(lookup[text[i]])
        token_indices.append(lookup["eos"])
        desired_length = max
        l = len(token_indices)
        
        token_indices += [1] * (desired_length - len(token_indices))
        token_indices = torch.tensor(token_indices).type(torch.LongTensor)

        return audio, text, token_indices, l

In [10]:
a,b,c = int(len(df)*7/10), int(len(df)*2/10), int(len(df)*1/10)
train_df = df[:a]
train_df.reset_index(drop=True, inplace=True)
valid_df = df[a:a+b]
valid_df.reset_index(drop=True, inplace=True)
test_df = df[a+b:]
test_df.reset_index(drop=True, inplace=True)
train_df.shape, valid_df.shape, test_df.shape

((700, 10), (200, 10), (100, 10))

In [11]:
# 데이터셋 생성
batch_size = 1
v_batch_size = 1
num_classes = len(df)  
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

wav_dir = './wav_fixed_10mil'
dataset = CustomDataset(train_df, wav_dir,bin_list2)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=8)

vdataset = CustomDataset(valid_df, wav_dir,bin_list2)
vdataloader = DataLoader(vdataset, batch_size=v_batch_size, shuffle=False, num_workers=8)

tsdataset = CustomDataset(test_df, wav_dir,bin_list2)
tsdataloader = DataLoader(tsdataset, batch_size=1, shuffle=False)

In [12]:
audio, text, token, l = next(iter(dataloader))
audio.shape, len(text), token.shape, l

(torch.Size([1, 2, 100000]), 12, torch.Size([1, 29]), tensor([14]))

In [13]:
input_size = audio.shape[-1]
hidden_size = 512
output_size = max

In [14]:
model = myRNN(output_size, len(lookup)).to(device)
# criterion = nn.CrossEntropyLoss()
criterion = nn.CrossEntropyLoss(reduction="mean")
optimizer = optim.AdamW(model.parameters(),
                        lr=1e-03,
                        betas=(0.9,0.98))
scheduler = optim.lr_scheduler.StepLR(optimizer,
                                      step_size=30.0,
                                      gamma=0.95)
total_step = len(dataloader)

In [15]:
!nvidia-smi

Sat Jun 10 19:29:04 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA RTX A4500    Off  | 00000000:73:00.0 Off |                  Off |
| 30%   39C    P2    54W / 200W |  18847MiB / 20470MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

In [17]:
num_epochs = 512
model.train()
for epoch in range(num_epochs):
    total_loss = 0.0
    total_correct = 0
    vtotal_loss = 0
    vtotal_correct = 0
    if (epoch + 1) % 2 == 0:
        torch.save(model.state_dict(), "./model/Speech2Text.pt")

    for i, (audio, token, text, l) in enumerate(dataloader):
        audio = audio.to(device)
        audio = torch.reshape(audio, (1, 2*221616))
        text = text.to(device)

        outputs = model(audio, l)
        print(outputs.shape, text.shape)
#         print(outputs.dtype, text.dtype)
        loss = criterion(outputs, text)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        text = text.view(-1, 1)
        total_correct += (predicted == text).sum().item()
        if (i + 1) % 10 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{total_step}], Loss: {loss.item():.4f}')
            
        if epoch%32==0 and i==0:
            model.eval()
            for i, (audio, token,text,l) in enumerate(vdataloader):
                audio,text = audio.to(device), text.to(device)
                outputs = model(audio, l)
                loss = criterion(outputs, text)
                vtotal_loss += loss.item()
                _, predicted = torch.max(outputs, 1)
                text = text.view(-1,1)
                vtotal_correct += (predicted == text).sum().item()
                
                
                epoch_loss = vtotal_loss / total_step
                epoch_acc = vtotal_correct / (batch_size * total_step)
                print("---"*10)
                print("Validation")
                print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}, Accuracy: {epoch_acc:.4f}')
                print("---"*10)

    epoch_loss = total_loss / total_step
    epoch_acc = total_correct / (batch_size * total_step)

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}, Accuracy: {epoch_acc:.4f}')
    scheduler.step()

RuntimeError: CUDA error: device-side assert triggered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [27]:
model.train()
for epoch in range(10000):
    total_loss = 0.0
    total_correct = 0
    
    if (epoch + 1) % 2 == 0:
        torch.save(model, "./model/Speech2Text.pt")

    for i, (audio, text) in enumerate(dataloader):
        audio = audio.to(device)
        #print(audio.shape)
        audio = torch.reshape(audio, (1, 2*221616))
        text = text.to(device)

        outputs = model(audio)
#         print(outputs)
#         print(text.shape)

        loss = criterion(outputs, text)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        text = text.view(-1, 1)
        total_correct += (predicted == text).sum().item()
        if (i + 1) % 10 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{total_step}], Loss: {loss.item():.4f}')
#             print(f"Transcription : {[reverse[int(i*len(lookup))] for i in outputs[0]]}")
#             print(f"Text : {[reverse[int(i*len(lookup))] for i in text]}")

    epoch_loss = total_loss / total_step
    epoch_acc = total_correct / (batch_size * total_step)

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}, Accuracy: {epoch_acc:.4f}')
    scheduler.step()

In [20]:
def predict(model, input_file):
    model.eval()  # 모델을 평가 모드로 설정
    
    audio, _ = torchaudio.load(input_file)  
    audio = audio.unsqueeze(0).to(device) 

    output = model(audio) 

    return output.int().detach()

In [21]:
output = predict(model,'./wav_all_stereo(fixed_length)/SDRW2000000414.1.1.18.wav')

In [22]:
output[0,1]

In [24]:
reverse = dict(map(reversed,lookup.items()))

In [25]:
out = []
for i in range(len(output[0])):
    out.append(reverse[output[0,i].item()])
out