In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from torch.utils.data import Dataset
import torchaudio
from torch.utils.data import DataLoader
import os
import numpy as np
import torch.nn.functional as F
from konlpy.tag import Okt

In [2]:
# Load the utterance-level transcript table (one row per utterance segment).
df = pd.read_csv(filepath_or_buffer="text/part1.csv")

In [3]:
# Display the loaded frame; pandas renders a truncated head/tail view for
# a frame of this size.
df

Unnamed: 0,id,form,original_form,speaker_id,start,end,age,sex
0,SDRW2000000319.1.1.1,병역 특례를 받아,병역 특례를 받아,SD2001645,4.04903,5.83905,10대,여성
1,SDRW2000000319.1.1.2,법정 봉사활동 기 시간을 채워야 하는,법정 봉사활동 기 시간을 채워야 하는,SD2001645,5.84901,8.89405,10대,여성
2,SDRW2000000319.1.1.3,예술,예술,SD2001645,8.90407,9.52506,10대,여성
3,SDRW2000000319.1.1.4,또는 체육 요원의 절반가량이,또는 체육 요원의 절반가량이,SD2001645,9.53506,12.05203,10대,여성
4,SDRW2000000319.1.1.5,허위 자료를 내거나,허위 자료를 내거나,SD2001645,12.06204,13.79504,10대,여성
...,...,...,...,...,...,...,...,...
213188,SDRW2000000418.1.1.326,우선,우선,SD2000552,908.12707,909.98106,10대,여성
213189,SDRW2000000418.1.1.327,맛있는 음식들 먹으면서,맛있는 음식들 먹으면서,SD2000552,909.99104,912.25405,10대,여성
213190,SDRW2000000418.1.1.328,겝,겝,SD2000552,912.26403,913.64807,10대,여성
213191,SDRW2000000418.1.1.329,먹으면서 저도 같이 맛있어 보이는 느낌이라서,먹으면서 저도 같이 맛있어 보이는 느낌이라서,SD2000552,913.65802,917.87305,10대,여성


In [4]:
class RNNModel(nn.Module):
    """Single-layer RNN followed by a linear head and a ReLU.

    Args:
        input_size: Number of features per time step fed to the RNN.
        hidden_size: Width of the RNN hidden state.
        output_size: Number of values produced by the linear head.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        # batch_first=True: inputs are laid out as (batch, seq, feature).
        self.rnn = nn.RNN(
            input_size=input_size,
            hidden_size=hidden_size,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return the ReLU-activated projection of the last time step's RNN output."""
        sequence_out, _ = self.rnn(x)
        # Only the final position along the sequence axis feeds the head.
        last_step = sequence_out[:, -1, :]
        return self.relu(self.fc(last_step))

In [5]:
# Count missing values per column.
df.isna().sum()

id                0
form             39
original_form     0
speaker_id        0
start             0
end               0
age               0
sex               0
dtype: int64

In [6]:
# Drop every row that has at least one missing value.
df = df.dropna(axis=0, how="any")

In [7]:
# Distribution of the speaker-sex column.
df.loc[:, "sex"].value_counts()

sex
여성    149045
남성     64109
Name: count, dtype: int64

In [8]:
# Keep only the rows whose `sex` column is '여성' (female).
df = df.loc[df["sex"].eq("여성")]

In [9]:
# Renumber rows 0..n-1 after filtering; the previous index is retained as
# an `index` column (drop=False is the default, spelled out here).
df = df.reset_index(drop=False)

In [10]:
# Sanity check: count the `sex` values remaining after the filter.
df.loc[:, "sex"].value_counts()

sex
여성    149045
Name: count, dtype: int64

In [11]:
# Split each transcript in `form` into morphemes with KoNLPy's Okt tagger
# and store the resulting list of strings per row in a new `token` column.
okt = Okt()
df["token"] = [okt.morphs(sentence) for sentence in df["form"]]

In [12]:
# Display the frame again to inspect the newly added `token` column.
df

Unnamed: 0,index,id,form,original_form,speaker_id,start,end,age,sex,token
0,0,SDRW2000000319.1.1.1,병역 특례를 받아,병역 특례를 받아,SD2001645,4.04903,5.83905,10대,여성,"[병역, 특례, 를, 받아]"
1,1,SDRW2000000319.1.1.2,법정 봉사활동 기 시간을 채워야 하는,법정 봉사활동 기 시간을 채워야 하는,SD2001645,5.84901,8.89405,10대,여성,"[법정, 봉사활동, 기, 시간, 을, 채워야, 하는]"
2,2,SDRW2000000319.1.1.3,예술,예술,SD2001645,8.90407,9.52506,10대,여성,[예술]
3,3,SDRW2000000319.1.1.4,또는 체육 요원의 절반가량이,또는 체육 요원의 절반가량이,SD2001645,9.53506,12.05203,10대,여성,"[또는, 체육, 요원, 의, 절반, 가량, 이]"
4,4,SDRW2000000319.1.1.5,허위 자료를 내거나,허위 자료를 내거나,SD2001645,12.06204,13.79504,10대,여성,"[허위, 자료, 를, 내, 거나]"
...,...,...,...,...,...,...,...,...,...,...
149040,213188,SDRW2000000418.1.1.326,우선,우선,SD2000552,908.12707,909.98106,10대,여성,[우선]
149041,213189,SDRW2000000418.1.1.327,맛있는 음식들 먹으면서,맛있는 음식들 먹으면서,SD2000552,909.99104,912.25405,10대,여성,"[맛있는, 음식, 들, 먹으면서]"
149042,213190,SDRW2000000418.1.1.328,겝,겝,SD2000552,912.26403,913.64807,10대,여성,[겝]
149043,213191,SDRW2000000418.1.1.329,먹으면서 저도 같이 맛있어 보이는 느낌이라서,먹으면서 저도 같이 맛있어 보이는 느낌이라서,SD2000552,913.65802,917.87305,10대,여성,"[먹으면서, 저, 도, 같이, 맛있어, 보이는, 느낌, 이라서]"


In [13]:
# Flatten the per-utterance token lists into one list holding every token
# occurrence, in row order. A single comprehension is linear in the number
# of tokens; `sum(list_of_lists, [])` copies the accumulator on every
# addition (quadratic), and iterating the column directly avoids label-based
# `df["token"][i]` lookups.
bin_list = [token for tokens in df["token"] for token in tokens]

In [14]:
# Unique tokens in order of first appearance. dict keys preserve insertion
# order and membership checks are O(1), so this replaces the quadratic
# `if temp not in bin_list2` scan while yielding the same list.
bin_list2 = list(dict.fromkeys(bin_list))

In [15]:
# Token -> integer id table. Ids 0 and 1 are reserved for the special
# "unk" and "pad" entries, so vocabulary tokens are numbered from 2 upward.
lookup = dict(zip(bin_list2, range(2, len(bin_list2) + 2)))
lookup.update({"unk": 0, "pad": 1})

In [16]:
# lookup

In [17]:
class CustomDataset(Dataset):
    """Pairs each utterance's waveform with its padded token-id sequence.

    Args:
        dataframe: Frame with an `id` column (wav file stem) and a `token`
            column (list of token strings) per row.
        wav_dir: Directory containing `<id>.wav` files.
        bin_list: Accepted for backward compatibility; not used.
        vocab: Optional token -> id mapping. When omitted, the module-level
            `lookup` table is resolved at encoding time.
        max_length: Fixed length of every returned token-id tensor.
    """

    def __init__(self, dataframe, wav_dir, bin_list, vocab=None, max_length=100):
        self.data = dataframe
        self.wav_dir = wav_dir
        self.vocab = vocab
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the backing frame."""
        return len(self.data)

    def _encode(self, tokens):
        """Map tokens to ids, then truncate/pad to exactly `max_length` values."""
        vocab = self.vocab if self.vocab is not None else lookup
        unk_id = vocab.get("unk", 0)
        pad_id = vocab.get("pad", 1)

        # Unseen tokens fall back to the "unk" id instead of raising KeyError.
        ids = [vocab.get(token, unk_id) for token in tokens]
        # Truncate first: without it, long utterances yield tensors of varying
        # length and the default DataLoader collate cannot stack them.
        ids = ids[: self.max_length]
        ids.extend([pad_id] * (self.max_length - len(ids)))
        # float32 because the ids are used directly as regression targets.
        return torch.tensor(ids).type(torch.float32)

    def __getitem__(self, index):
        """Return `(audio, token_indices)` for the row at position `index`."""
        row = self.data.iloc[index]
        wav_path = os.path.join(self.wav_dir, f"{row['id']}.wav")
        # torchaudio.load returns (waveform, sample_rate); the rate is unused.
        audio, _ = torchaudio.load(wav_path)
        return audio, self._encode(row['token'])

In [18]:
# Training hyperparameters.
num_epochs, batch_size = 100, 100
learning_rate = 2e-3
# NOTE(review): this is the number of rows in `df`, not a class count, and
# no visible cell reads it - confirm whether it is still needed.
num_classes = len(df)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [19]:
# Build the dataset over the wav directory and wrap it in a shuffling loader.
wav_dir = './wav_all_stereo(fixed_length)'
dataset = CustomDataset(dataframe=df, wav_dir=wav_dir, bin_list=bin_list2)
dataloader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True)

In [20]:
# Fetch the first sample to eyeball what the dataset returns: the loaded
# waveform tensor and the padded token-id tensor.
audio, text = dataset[0]
print("Audio:", audio)
print("Text:", text)

Audio: tensor([[-9.1553e-05,  3.0518e-05,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [-9.1553e-05,  3.0518e-05,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00]])
Text: tensor([2., 3., 4., 5., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])


In [21]:
# Model dimensions.
# NOTE(review): 221616 is presumably the per-channel sample count of the
# fixed-length wav files - confirm against the audio data.
# output_size matches the 100-id token sequence produced by the dataset.
input_size, hidden_size, output_size = 221616, 100, 100

In [22]:
# Instantiate the network on the target device, together with the Huber
# regression loss and the NAdam optimizer.
model = RNNModel(
    input_size=input_size,
    hidden_size=hidden_size,
    output_size=output_size,
)
model = model.to(device)
criterion = nn.HuberLoss()
optimizer = optim.NAdam(params=model.parameters(), lr=learning_rate)
# Number of batches per epoch; used for progress messages and loss averaging.
total_step = len(dataloader)

In [23]:
# Persist the freshly initialised (untrained) model. torch.save does not
# create missing parent directories, so make sure ./model exists first.
os.makedirs("./model", exist_ok=True)
torch.save(model, "./model/speech2text.pt")

In [24]:
# Train the model, logging the batch loss every 10 steps and the mean loss
# and accuracy once per epoch. A checkpoint is written after every second
# epoch, so the weights of the final epoch are saved as well.
model.train()
for epoch in range(num_epochs):
    total_loss = 0.0
    total_correct = 0
    total_targets = 0

    for i, (audio, text) in enumerate(dataloader):
        audio = audio.to(device)
        text = text.to(device)

        outputs = model(audio)
        loss = criterion(outputs, text)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        # The model regresses token ids, so a position counts as correct when
        # the rounded output equals the target id at the same position.
        predicted = outputs.detach().round()
        total_correct += (predicted == text).sum().item()
        # Count actual targets: the last batch may be smaller than batch_size.
        total_targets += text.numel()

        if (i + 1) % 10 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{total_step}], Loss: {loss.item():.4f}')

    epoch_loss = total_loss / max(total_step, 1)
    epoch_acc = total_correct / max(total_targets, 1)

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}, Accuracy: {epoch_acc:.4f}')

    if (epoch + 1) % 2 == 0:
        # torch.save does not create missing parent directories.
        os.makedirs("./model", exist_ok=True)
        torch.save(model, "./model/Speech2Text.pt")


Epoch [1/100], Step [10/1491], Loss: 113.6651
Epoch [1/100], Step [20/1491], Loss: 132.6153
Epoch [1/100], Step [30/1491], Loss: 126.5062
Epoch [1/100], Step [40/1491], Loss: 157.0523
Epoch [1/100], Step [50/1491], Loss: 108.9767
Epoch [1/100], Step [60/1491], Loss: 146.2619
Epoch [1/100], Step [70/1491], Loss: 151.1003
Epoch [1/100], Step [80/1491], Loss: 152.7087
Epoch [1/100], Step [90/1491], Loss: 104.4436
Epoch [1/100], Step [100/1491], Loss: 154.5409
Epoch [1/100], Step [110/1491], Loss: 124.9002
Epoch [1/100], Step [120/1491], Loss: 129.5215
Epoch [1/100], Step [130/1491], Loss: 133.3344
Epoch [1/100], Step [140/1491], Loss: 122.3453
Epoch [1/100], Step [150/1491], Loss: 125.1144
Epoch [1/100], Step [160/1491], Loss: 89.4354
Epoch [1/100], Step [170/1491], Loss: 134.0513
Epoch [1/100], Step [180/1491], Loss: 99.1525
Epoch [1/100], Step [190/1491], Loss: 119.6255
Epoch [1/100], Step [200/1491], Loss: 118.1781
Epoch [1/100], Step [210/1491], Loss: 133.0045
Epoch [1/100], Step [220

Epoch [2/100], Step [250/1491], Loss: 122.6092
Epoch [2/100], Step [260/1491], Loss: 132.4609
Epoch [2/100], Step [270/1491], Loss: 108.2197
Epoch [2/100], Step [280/1491], Loss: 120.4148
Epoch [2/100], Step [290/1491], Loss: 134.2198
Epoch [2/100], Step [300/1491], Loss: 141.4025
Epoch [2/100], Step [310/1491], Loss: 114.5742
Epoch [2/100], Step [320/1491], Loss: 172.1622
Epoch [2/100], Step [330/1491], Loss: 138.3727
Epoch [2/100], Step [340/1491], Loss: 141.2471
Epoch [2/100], Step [350/1491], Loss: 135.9932
Epoch [2/100], Step [360/1491], Loss: 143.6316
Epoch [2/100], Step [370/1491], Loss: 140.0350
Epoch [2/100], Step [380/1491], Loss: 103.4232
Epoch [2/100], Step [390/1491], Loss: 144.6303
Epoch [2/100], Step [430/1491], Loss: 128.0889
Epoch [2/100], Step [440/1491], Loss: 151.8863
Epoch [2/100], Step [450/1491], Loss: 108.5473
Epoch [2/100], Step [460/1491], Loss: 126.8167
Epoch [2/100], Step [470/1491], Loss: 135.9303
Epoch [2/100], Step [480/1491], Loss: 125.4854
Epoch [2/100]

KeyboardInterrupt: 

In [None]:
def predict(model, input_file):
    """Run the model on one audio file and return the predicted token ids.

    Args:
        model: Trained network; it is switched to evaluation mode.
        input_file: Path to the audio file to transcribe.

    Returns:
        An integer tensor of predicted ids with a leading batch axis of 1.
    """
    model.eval()  # put the model in evaluation mode

    audio, _ = torchaudio.load(input_file)
    # Add a leading batch axis so a single file looks like a batch of one.
    audio = audio.unsqueeze(0).to(device)

    # Inference needs no autograd graph.
    with torch.no_grad():
        output = model(audio)

    # Round to the nearest id; a bare .int() would truncate (4.9 -> 4).
    return output.round().int()

In [None]:
# Predict the token ids for one sample recording.
output = predict(
    model=model,
    input_file='./wav_all_stereo(fixed_length)/SDRW2000000414.1.1.18.wav',
)

In [None]:
# Peek at the second predicted id of the first (only) batch entry.
output[0][1]

In [None]:
# Invert the vocabulary: integer id -> token string.
reverse = {token_id: token for token, token_id in lookup.items()}

In [None]:
# Convert the predicted ids of the first batch entry back to tokens. Ids that
# are not in the vocabulary (the regression output is unbounded) decode to
# "unk" instead of raising KeyError.
out = [reverse.get(token_id, "unk") for token_id in output[0].tolist()]
out