In [1]:
import os
from random import random
from torch.utils.data import Dataset,DataLoader
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

root_path = 'data/transcriptions'
data_path = os.path.join(root_path,'train.csv')

In [None]:
train_data = []
test_data = []
characters = set()
transcripts = set()

with open(data_path) as file:
    rows = file.readlines()
    for i,row in enumerate(rows[1:]):
        splitted_data = row.split(',')
        data = splitted_data[1].strip()
        target = splitted_data[2].strip().replace(' ','')
        
        for character in data:
            characters.add(character)
        for transcript in target:
            transcripts.add(transcript)
        full_data = {'data':data,'target':target}
                
        if i > 80000:
            test_data.append(full_data)
            continue
        train_data.append(full_data)

In [None]:
int2char_characters = dict(enumerate(characters,start=1))
char2int_characters = {ch:ind for ind,ch in int2char_characters.items()}
int2char_transcripts = dict(enumerate(transcripts,start=1))
char2int_transcripts = {ch:ind for ind,ch in int2char_transcripts.items()}

In [None]:
encoded_train_data_x = []
encoded_train_data_y = []
for word in train_data:
    encoded_train_data_x.append([char2int_characters[ch] for ch in word['data']])
    encoded_train_data_y.append([char2int_transcripts[ch] for ch in word['target']])    

In [None]:
def get_batches(arr_x,arr_y, batch_size):
   
    batch = {'x':[],'y':[]}
    for i in range(batch_size):
        rand = int(random()*len(arr_x))
        batch['x'].append(arr_x[rand])
        batch['y'].append(arr_y[rand])
        
    return batch

def normalize_batch(x,y):
    x_lengths = []
    y_lengths = []
    x = sorted(x,key=len,reverse=True)
    y = sorted(y,key=len,reverse=True)
    max1 = len(x[0])
    max2 = len(y[0])
    for i in range(len(x)):
        x_lengths.append(len(x[i]))
        y_lengths.append(len(y[i]))
        
        for _ in range(max1-len(x[i])):
            x[i].append(0)
        for _ in range(max2-len(y[i])):
            y[i].append(0)
    return x,y,x_lengths,y_lengths

In [None]:
import pandas as pd
file_path = 'data/transcriptions/train.csv'
characters = pd.read_csv(file_path,'r')

In [None]:
len(characters)

In [2]:
class Vocab:
    def __init__(self, counter,for_encoder=False, min_freq=None, max_freq=None):
        self.sos = "<sos>"
        self.eos = "<eos>"
        self.pad = "<pad>"
        self.unk = "<unk>"
        
        self.pad_idx = 0
        self.unk_idx = 1
        self.sos_idx = 2
        self.eos_idx = 3
        
        if for_encoder:
            self._token2idx = {
                self.pad:self.pad_idx,
                self.unk:self.unk_idx,
            }
        else:
            self._token2idx = {
                self.sos: self.sos_idx,
                self.eos: self.eos_idx,
                self.pad: self.pad_idx,
                self.unk: self.unk_idx,
            }
        self._idx2token = {idx:token for token, idx in self._token2idx.items()}
        
        
        idx = len(self._token2idx)
        min_freq = 0 if min_freq is None else min_freq
        max_freq = len(counter) if max_freq is None else max_freq
        
        for token, count in counter.items():
            if count > min_freq and count < max_freq:
                self._token2idx[token] = idx
                self._idx2token[idx]   = token
                idx += 1
        
        self.vocab_size = len(self._token2idx)
        self.tokens     = list(self._token2idx.keys())
    
    def token2idx(self, token):
        return self._token2idx.get(token, self.pad_idx)
    
    def idx2token(self, idx):
        return self._idx2token.get(idx, self.pad)
    
    def sent2idx(self, sent):
        return [self.token2idx(i) for i in sent]
    
    def idx2sent(self, idx):
        return [self.idx2token(i) for i in idx]
    
    def __len__(self):
        return len(self._token2idx)
    
    def __repr__(self):
        
        return '{}'.format(self._token2idx)

In [111]:
class CharactersDataset(Dataset):
    
    def __init__(self,csv_file_path,transform = None):
        self.file = pd.read_csv(csv_file_path,'r')
        self.transform = transform
        self.data = []
        self.characters_vocab = None
        self.transcripts_vocab = None
        self.make_dataset()
       
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self,idx):
        
        x = self.data[idx]['x']
        y = self.data[idx]['y']
        data = {'x':x,'y':y}
        if self.transform:
            data = self.transform(data)
        
        return data
    
    def make_dataset(self):
        characters = set()
        transcripts = set()
        non_needed_symbold = '\'#$?\\_({)}-:\";!%.1234567890'
        for idx in range(len(self.file)):
            item = str(self.file.iloc[idx][0]).split(',')
            
            x = item[1].strip()
            for symbol in non_needed_symbold:
                x = x.replace(symbol,'')
            y = item[2].replace(' ','')
            self.data.append({'x':x,'y':y})
            for character in x:
                characters.add(character)
            for transcript in y:
                transcripts.add(transcript)
        
        self.characters_vocab = Vocab({v:k for k,v in dict(enumerate(characters)).items()},for_encoder=True)
        self.transcripts_vocab = Vocab({v:k for k,v in dict(enumerate(transcripts)).items()})
        
            
    def collate_fn(self, batch): 
        x_values = []
        y_values_in = []
        x_lengths = []
        y_lengths = []
        for item in batch:
            
            x_values.append([self.characters_vocab.token2idx(ch) for ch in item['x']])
            y_values_in.append([self.transcripts_vocab.token2idx(tr) for tr in item['y']])
        
        x_values = sorted(x_values,key=len,reverse=True)
        y_values_in = sorted(y_values_in,key=len,reverse=True)
        
        max_x = len(x_values[0])
        max_y = len(y_values_in[0])
        
        for word_index in range(len(x_values)):
            
            x_lengths.append(len(x_values[word_index]))
            y_lengths.append(len(y_values_in[word_index]))
            
            for _ in range(max_x - len(x_values[word_index])):
                x_values[word_index].append(0)
            for _ in range(max_y - len(y_values_in[word_index])):
                y_values_in[word_index].append(0)
            
            y_values_in[word_index].insert(0,2)
            
        x_values = torch.tensor(x_values)
        y_values_in_tensor = torch.tensor(y_values_in)
        
        y_values_out = y_values_in        
        for arr_index in range(len(y_values_out)):
            y_values_out[arr_index] = y_values_out[arr_index][1:] + [3]
        y_values_out_tensor = torch.tensor(y_values_out)
        
        return x_values,y_values_in_tensor,y_values_out_tensor

In [112]:
import pandas as pd

dataset = CharactersDataset(data_path)


In [113]:
dataloader = DataLoader(dataset,batch_size=32,shuffle=True,num_workers=2,collate_fn = dataset.collate_fn)

In [115]:
for batch in dataloader:
    kek = batch
    break

In [116]:
x_values,y_values = [],[]
for item in kek:
    x_values.append()

[{'x': 'GRIN', 'y': 'GRIHN'},
 {'x': 'MARANISS', 'y': 'MERAENIHS'},
 {'x': 'LOFFREDO', 'y': 'LOWFREYDOW'},
 {'x': 'FINDLING', 'y': 'FIHNDAHLIHNG'},
 {'x': 'ISAKSEN', 'y': 'IHSAHKSAHN'},
 {'x': 'OVERRATE', 'y': 'OWVERREYT'},
 {'x': 'ATHEARN', 'y': 'EYTHERN'},
 {'x': 'PRECEDING', 'y': 'PRIYSIYDIHNG'},
 {'x': 'BRUTALITY', 'y': 'BRUWTAELIHTIY'},
 {'x': 'CULLOM', 'y': 'KAHLAHM'},
 {'x': 'CECILE', 'y': 'SIHSIYL'},
 {'x': 'HOPES', 'y': 'HHOWPS'},
 {'x': 'KAYA', 'y': 'KAAYAH'},
 {'x': 'CABLES', 'y': 'KEYBAHLZ'},
 {'x': 'REGRETTING', 'y': 'RIHGREHTIHNG'},
 {'x': 'CHAZOV', 'y': 'CHAEZAAV'},
 {'x': 'SUPERVISE', 'y': 'SUWPERVAYZ'},
 {'x': 'STANBERY', 'y': 'STAENBERIY'},
 {'x': 'BAGMAN', 'y': 'BAEGMAHN'},
 {'x': 'ANGER', 'y': 'AENGGER'},
 {'x': 'BANKROLL', 'y': 'BAENGKROWL'},
 {'x': 'BRACELET', 'y': 'BREYSLAHT'},
 {'x': 'GAHLI', 'y': 'GAALIY'},
 {'x': 'SPORTSCHANNEL', 'y': 'SPAORTSCHAENAHL'},
 {'x': 'JIMMERSON', 'y': 'JHIHMERSAHN'},
 {'x': 'ECOLOGISTS', 'y': 'IYKAALAHJHIHSTS'},
 {'x': 'EGE', 'y': '

In [None]:
for item in kek:
    print(item['x'])
    print(item['y'])
    x_values.append([dataset.characters_vocab.token2idx(ch) for ch in item['x']])
    y_values.append([dataset.transcripts_vocab.token2idx(tr) for tr in item['y']])    

In [124]:
dataset.characters_vocab.token2idx(6234)

0

In [6]:
from torch.utils.data.sampler import SubsetRandomSampler
dataset_size = len(dataset)
indices = list(range(dataset_size))
split = int(np.floor(0.2 * dataset_size))

np.random.shuffle(indices)
train_indices, val_indices = indices[split:], indices[:split]

# Creating PT data samplers and loaders:
train_sampler = SubsetRandomSampler(train_indices)
valid_sampler = SubsetRandomSampler(val_indices)

train_loader = torch.utils.data.DataLoader(dataset, batch_size=32, 
                                           sampler=train_sampler,collate_fn = dataset.collate_fn)
validation_loader = torch.utils.data.DataLoader(dataset, batch_size=32,
                                                sampler=valid_sampler,collate_fn = dataset.collate_fn)

In [7]:
class EncoderLSTM(nn.Module):
    def __init__(self,dict_len,hidden_size,n_layers = 1,dropout=0):
        super(EncoderLSTM,self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        
        self.embedding = nn.Embedding(dict_len,self.hidden_size,padding_idx = 0)
        self.LSTM = nn.LSTM(hidden_size,hidden_size,n_layers,dropout=(0 if n_layers == 1 else dropout),batch_first=True)
    
    def forward(self,input_seq,hidden=None):
        embedded = self.embedding(input_seq)
        #packed = nn.utils.rnn.pack_padded_sequence(embedded,input_lengths)
        outputs,(hidden,cell) = self.LSTM(embedded)
        #outputs,_ = nn.utils.rnn.pad_packed_sequence(outputs)
        return hidden,cell

In [8]:
class DecoderLSTM(nn.Module):
    def __init__(self,hidden_size,output_size,n_layers=1,dropout=0):
        super(DecoderLSTM,self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout
        
        #layers
        self.embedding = nn.Embedding(output_size,hidden_size, padding_idx = 0)
        self.LSTM = nn.LSTM(hidden_size,hidden_size,n_layers,dropout = (0 if n_layers == 1 else self.dropout),batch_first=True)
        self.out = nn.Linear(hidden_size,output_size)
        self.softmax = nn.LogSoftmax(dim=1)
        
        
    def forward(self,input_step,last_hidden, last_cell):
        #input_step (batch_size,seq_len)
        embedded = self.embedding(input_step)
        #embedded(batch_size,seq_len,hidden_dim)
        output,(hidden,cell) = self.LSTM(embedded,(last_hidden, last_cell))
        #output(batch_size,seq_len,hidden_dim)
        prediction = self.out(output)
        #prediction(batch_size,seq_len,output_dim)
        return prediction,hidden,cell

In [9]:
encoder = EncoderLSTM(len(dataset.characters_vocab),128)

In [10]:
decoder = DecoderLSTM(128,len(dataset.transcripts_vocab))



In [11]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [37]:
class seq2seq(nn.Module):
    def __init__(self,encoder,decoder,device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
    
    def forward(self,x,y,teacher_forcing_ratio = 0.5):
        
        
        hidden,cell = self.encoder(x)

        output,hidden,cell = self.decoder(y,hidden,cell)
        
        return output
    
    def predict(self,x):

        #batch_size = 1!
        hidden,cell = self.encoder(x)
        out_input = torch.LongTensor([[2]]).to(device)
        preds = []
        while True:
            output, hidden, cell = self.decoder(out_input, hidden, cell)
            output = torch.argmax(output)
            our_value = output.item()
            if our_value == 3:
                break
            preds.append(our_value)
            out_input = output.unsqueeze(0).unsqueeze(0)
        return preds

In [22]:
model = seq2seq(encoder,decoder,device).to(device)

In [23]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)
        
model.apply(init_weights)

seq2seq(
  (encoder): EncoderLSTM(
    (embedding): Embedding(27, 128, padding_idx=0)
    (LSTM): LSTM(128, 128, batch_first=True)
  )
  (decoder): DecoderLSTM(
    (embedding): Embedding(27, 128, padding_idx=0)
    (LSTM): LSTM(128, 128, batch_first=True)
    (out): Linear(in_features=128, out_features=27, bias=True)
    (softmax): LogSoftmax()
  )
)

In [24]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 274,587 trainable parameters


In [25]:
optimizer = optim.Adam(model.parameters())

In [26]:
for batch in train_loader:
    kek = batch
    break


In [27]:
criterion = nn.CrossEntropyLoss(ignore_index = 0)


In [28]:
def train(model, iterator, optimizer, criterion, clip):
    
    model.train()
    
    epoch_loss = 0
    
    for i, batch in enumerate(iterator):
        
        x = batch[0].to(device)
        y_in = batch[1].to(device)
        y_out = batch[2].to(device)
        
        optimizer.zero_grad()
        
        output = model(x, y_in)
        #output dim (y_seq_len,batch_size,output_dim)
        output = output.view(output.shape[0]*output.shape[1],-1)
        y_out = y_out.view(-1)
        loss = criterion(output, y_out)
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [29]:
def evaluate(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):

            x = batch[0].to(device)
            y_in = batch[1].to(device)
            y_out = batch[2].to(device)
            
            output = model(x,y_in) #turn off teacher forcing
            
            output = output.view(output.shape[0]*output.shape[1],-1)
            y_out = y_out.view(-1)

            loss = criterion(output, y_out)
            
            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)



In [30]:
import math

N_EPOCHS = 10
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    print(N_EPOCHS)
    
    train_loss = train(model, train_loader, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, validation_loader, criterion)
    
    
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')
    
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')

10
	Train Loss: 1.667 | Train PPL:   5.296
10
	Train Loss: 1.509 | Train PPL:   4.522
10
	Train Loss: 1.471 | Train PPL:   4.354
10
	Train Loss: 1.446 | Train PPL:   4.248
10
	Train Loss: 1.428 | Train PPL:   4.170
10
	Train Loss: 1.413 | Train PPL:   4.109
10
	Train Loss: 1.401 | Train PPL:   4.061
10
	Train Loss: 1.391 | Train PPL:   4.018
10
	Train Loss: 1.382 | Train PPL:   3.984
10
	Train Loss: 1.374 | Train PPL:   3.950


In [None]:
for i,batch in enumerate(validation_loader):
    kek = batch
    break

In [35]:
torch.save(encoder.state_dict(),'encoder_weights')
torch.save(decoder.state_dict(),'decoder_weights')
torch.save(model.state_dict(),'model_weights')

In [40]:
encoder = EncoderLSTM(len(dataset.characters_vocab),128).to(device)
decoder = DecoderLSTM(128,len(dataset.transcripts_vocab)).to(device)


In [41]:
encoder.load_state_dict(torch.load('encoder_weights'))
decoder.load_state_dict(torch.load('decoder_weights'))
model = seq2seq(encoder,decoder,device)
model.load_state_dict(torch.load('model_weights'))

In [74]:
for batch in validation_loader:
    kek = batch
    break

In [79]:
x = kek[0][0][1:].unsqueeze(0).to(device)
y_pred = model.predict(x)

In [81]:
dataset.characters_vocab,dataset.transcripts_vocab

({'<pad>': 0, '<unk>': 1, 'P': 2, 'N': 3, 'X': 4, 'J': 5, 'W': 6, 'L': 7, 'T': 8, 'Z': 9, 'K': 10, 'B': 11, 'E': 12, 'C': 13, 'V': 14, 'H': 15, 'O': 16, 'S': 17, 'D': 18, 'G': 19, 'R': 20, 'Y': 21, 'M': 22, 'U': 23, 'I': 24, 'F': 25, 'Q': 26},
 {'<sos>': 2, '<eos>': 3, '<pad>': 0, '<unk>': 1, 'P': 4, 'N': 5, 'J': 6, 'W': 7, 'L': 8, 'T': 9, 'Z': 10, 'K': 11, 'B': 12, 'E': 13, 'C': 14, 'V': 15, 'H': 16, 'O': 17, 'S': 18, 'D': 19, 'G': 20, 'R': 21, 'Y': 22, 'M': 23, 'U': 24, 'I': 25, 'F': 26})

In [82]:
output_token = ''
for pred in y_pred:
    output_token += dataset.transcripts_vocab.idx2token(pred)

In [83]:
input_token = ''
for x_val in x.squeeze(0):
    input_token += dataset.characters_vocab.idx2token(x_val.item())


In [85]:
x

tensor([[18, 22, 24,  3, 24, 17,  8, 20,  0,  8, 24, 14, 12,  7, 21]],
       device='cuda:0')

In [None]:
for k in x.squeeze(0):
    print(k)