# Phoneme to Grapheme Conversion with a Recurrent Generative Model 
This project will discuss...

In [88]:
import torch
import torch.nn as nn
import random
import torch.optim as optim
from torch.utils.data import Dataset
import pandas as pd
from torch.utils.tensorboard import SummaryWriter

# find phoneme vocabulary
data = pd.read_csv("phonemes-words.csv")
phonemes_col = data["phonemes"]
phonemes = ['0', '1']

for word in phonemes_col:
    for phoneme in word:
        if phoneme not in phonemes:
            phonemes.append(phoneme)
print(phonemes)

['0', '1', 's', 'ə', 'b', 't', 'ō', 'ᵊ', 'l', 'ī', 'r', 'i', 'v', 'm', 'd', 'ä', 'h', 'y', 'ü', 'a', 'ē', 'p', 'n', 'e', 'k', 'j', 'ŋ', 'g', 'z', 'u', '̇', 'c', 'ȯ', 'w', 'f', 'ā', 'ḵ', '͟']


In [20]:


# known phonemes/graphemes
# phonemes = ['0', '1', 'ᵊ', 'ə̇', 'ȯ', 'ē', 'ī', 'ō', 'a', 'ä', 'ɑ', 'ɒ', 'æ', 'b', 'ḇ', 'β', 'c', 'č', 'ɔ', 'ɕ', 'ç', 'd', 'ḏ', 'ḍ', 'ð', 'e', 'ə', 'ɚ', 'ɛ', 'ɝ', 'f', 'g', 'ḡ', 'h', 'ʰ', 'ḥ', 'ḫ', 'ẖ', 'i', 'ɪ', 'ỉ', 'ɨ', 'j', 'ʲ', 'ǰ', 'k', 'ḳ', 'ḵ', 'l', 'ḷ', 'ɬ', 'ɫ', 'm', 'n', 'ŋ', 'ṇ', 'ɲ', 'ɴ', 'o', 'ŏ', 'ɸ', 'θ', 'p', 'p', '̅', 'þ', 'q', 'r', 'ɹ', 'ɾ', 'ʀ', 'ʁ', 'ṛ', 's', 'š', 'ś', 'ṣ', 'ʃ', 't', 'ṭ', 'ṯ', 'ʨ', 't', 'ʂ', 'u', 'ʊ', 'ŭ', 'ü', 'v', 'ʌ', 'ɣ', 'w', 'ʍ', 'x', 'χ', 'y', 'ʸ', 'ʎ', 'z', 'ẓ', 'ž', 'ʒ', 'ʔ', 'ʕ']

graphemes = ['0', '1', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']

# one hot encodes the word: returns an array of one hot encoded characters
def nemes_to_1_hot_seq(string, nemes="phonemes"):
    string = '0' + string + '1'
    l = phonemes if nemes == "phonemes" else graphemes
    seq = []
    for i in string:
        vec = [0] * len(l)
        vec[l.index(i)] = 1
        seq.append(vec)

    return torch.FloatTensor([seq])

def one_hot_to_nemes(arr, nemes="phonemes"):
    seq = []
    l = phonemes if nemes == "phonemes" else graphemes
    for hot in arr:
        x = torch.argmax(hot)
        seq.append(l[x])
    return seq

class P2GDataset(Dataset):
    def __init__(self, phoneme_file, device):
        self.data = pd.read_csv(phoneme_file)
        self.device = device

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        p, g = self.data.iloc[idx]
        return nemes_to_1_hot_seq(p, nemes = "phonemes").to(self.device), nemes_to_1_hot_seq(g, nemes = "graphemes").long()


In [128]:
# define model architecture
class Encoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = nn.GRU(len(phonemes), 1024, 1, batch_first=True, bidirectional=False, dropout=0.1)
        
    def forward(self, x):
        # push vector through encoder
        out, h_n = self.encoder(x)

        # return context vector
        return h_n

class Decoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.decoder = nn.GRU(len(graphemes), 1024, 1, batch_first=True, bidirectional=False, dropout=0.1)
        self.fc = nn.Sequential(
            nn.Linear(1024, len(graphemes))
        )
        

    def forward(self, input, hidden_layer):
        """
        Since this function gets called once at a time rather than taking in
        a sequence of vectors, we need to pass it the last output. This will be just
        a vector of numbers that can be converted to the embedding representing that last output
        """
        out, h_n = self.decoder(input, hidden_layer)
        # print("H")
        return self.fc(h_n), h_n

class seq2seq(nn.Module):
    def __init__(self, device):
        super().__init__()
        self.encoder = Encoder()
        self.decoder = Decoder()
        self.device = device
    
    def forward(self, in_seq, out_seq, tf_ratio=0.5):
        out_len = out_seq.shape[1]
        # storing the outputs of the sequence
        outputs = torch.zeros(out_len, 1, len(graphemes)).to(self.device)

        hidden = self.encoder(in_seq)

        out_seq = out_seq.squeeze(0)

        input = out_seq[0].unsqueeze(0).unsqueeze(0).float().to(device)
        
        for i in range(1, out_len):
            out, hidden = self.decoder(input, hidden)
            outputs[i] = out

            if random.random() > tf_ratio:
                # teacher forcing (make next input what the current output token should be)
                input = out_seq[i].unsqueeze(0).unsqueeze(0).float().to(device)
            else:
                x = input.argmax(1)[0]
                input = torch.zeros(1, 1, len(graphemes)).to(self.device)
                input[0][0][x] = 1
                
        return outputs

    def pred_new(self, in_seq):
        hidden = self.encoder(in_seq)
        input = torch.zeros(1, 1, len(graphemes)).to(device)
        outs = []
        while True:
            out, hidden = self.decoder(input, hidden)
            outs.append(out)
            x = input.argmax(1)[0]
            input = torch.zeros(1, 1, len(graphemes)).to(device)
            input[0][0][x] = 1
            if one_hot_to_nemes(out) == ['1']:
                break
        return outs

In [129]:
"""training"""
from torch.utils.data import random_split
from torch.utils.data import DataLoader
device = "cuda" if torch.cuda.is_available() else "cpu"
EPOCHS = 100
model = seq2seq(device).to(device)
# what a beautiful architecture
print(model)
optimizer = optim.Adam(model.parameters(), lr=0.0001)
loss_func = nn.CrossEntropyLoss()
dataset = P2GDataset("phonemes-words.csv", device)
train, test = random_split(dataset, [100, len(dataset)-100])
dataloader = DataLoader(dataset=train, batch_size=1)
print(len(train))


seq2seq(
  (encoder): Encoder(
    (encoder): GRU(38, 1024, batch_first=True, dropout=0.1)
  )
  (decoder): Decoder(
    (decoder): GRU(28, 1024, batch_first=True, dropout=0.1)
    (fc): Sequential(
      (0): Linear(in_features=1024, out_features=28, bias=True)
    )
  )
)
100




In [135]:
avg_losses = []
writer = SummaryWriter("tensorboard_data")

for epoch in range(50):
    tot_loss = 0
    for (in_seq, out_seq) in dataloader:
        in_seq = in_seq.squeeze(0)
        out_seq = out_seq.squeeze(0)
        model_output = model(in_seq, out_seq)
        model_output = model_output[1:]
        model_output = model_output.squeeze(1)
        out_seq = out_seq.squeeze(0)[1:]
        loss = loss_func(model_output, out_seq.argmax(1).to(device))
        tot_loss+=loss.detach().item()
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    tot_loss/=len(dataset)
    writer.add_scalar("tensorboard_data", loss.detach().item(), epoch)
    # print(tot_loss)
    avg_losses.append(tot_loss)



In [142]:
dataset = P2GDataset("data.csv", "cuda")
p, g = dataset[10]

print(one_hot_to_nemes(p[0], "phonemes"))
p.shape



['0', 'i', 'r', 'ō', 'd', '1']


torch.Size([1, 6, 38])

In [143]:
def get_0_1_accuracy(test_set, model):
    correct = 0
    dataloader = DataLoader(dataset=test_set, batch_size=1)
    for (in_seq, out_seq) in dataloader:
        # print(out_seq.shape)
        # break
        prediction = model.pred_new(in_seq[0])
        # print(in_seq[0].shape)
        print(one_hot_to_nemes(out_seq[0][0], "graphemes"))
        print(one_hot_to_nemes(prediction, "graphemes"))
        
        if prediction.insert(0, '0') == out_seq:
            correct+= 1
    if correct == 0:
        return correct
    return correct/len(test_set)

def print_preds(path):
    global p
    print(one_hot_to_nemes(p[0], "phonemes"))
    s = model.pred_new(p)
    
    print(one_hot_to_nemes(s, "graphemes"))

# print_preds("data.csv")
print(get_0_1_accuracy(dataset, model))
# 36 great for train set
# print(test[0])
# print(one_hot_to_graphemes(torch.FloatTensor([[3,2,1],[0,0,1],[0,0,1]])))

['0', 'v', 'i', 'n', 'd', 'i', 'c', 'a', 't', 'e', 'd', '1']
['v', 'i', 'n', 'd', 'i', 'c', 'a', 't', 'e', 'd', '1']
['0', 'w', 'a', 'l', 'r', 'u', 's', '1']
['w', 'a', 'l', 'r', 'u', 's', '1']
['0', 's', 'u', 'b', 'l', 'e', 't', '1']
['s', 'u', 'b', 'l', 'e', 't', '1']
['0', 'o', 'r', 'e', 'o', 'c', 'a', 'r', 'y', 'a', '1']
['o', 'r', 'e', 'o', 'c', 'a', 'r', 'y', 'a', '1']
['0', 'f', 'e', 'i', 'r', 'i', 'e', '1']
['f', 'e', 'i', 'r', 'i', 'e', '1']
['0', 'c', 'u', 'p', 'l', 'i', 'k', 'e', '1']
['c', 'u', 'p', 'l', 'i', 'k', 'e', '1']
['0', 'w', 'a', 'l', 'l', 'f', 'l', 'o', 'w', 'e', 'r', '1']
['w', 'a', 'l', 'l', 'f', 'l', 'o', 'w', 'e', 'r', '1']
['0', 'k', 'r', 'e', 'm', 'l', 'i', 'n', 's', '1']
['k', 'r', 'e', 'c', 'l', 'i', 'n', 'n', '1']
['0', 'i', 'n', 'v', 'e', 'r', 's', 'i', 'o', 'n', '1']
['i', 'n', 'v', 'e', 'u', 's', 'i', 'o', 'n', '1']
['0', 's', 'u', 'b', 'c', 'o', 'm', 'p', 'o', 'n', 'e', 'n', 't', 's', '1']
['s', 'u', 'p', 'c', 'o', 'v', 'p', 'o', 'n', 'e', 'n', 't', 

In [None]:
model.encoder.encoder.weight_ih_l0

Parameter containing:
tensor([[ 0.0025,  0.0370, -0.0175,  ...,  0.0274, -0.0038,  0.0100],
        [-0.0024,  0.0330, -0.0179,  ..., -0.0662,  0.0178,  0.0120],
        [-0.0210,  0.0213, -0.0293,  ..., -0.0202,  0.0141,  0.0164],
        ...,
        [ 0.0003, -0.0465, -0.0308,  ...,  0.0141, -0.0076, -0.0158],
        [-0.0091, -0.0290,  0.0079,  ..., -0.0585,  0.0197, -0.0206],
        [ 0.0105,  0.0423,  0.0480,  ..., -0.0146,  0.0046,  0.0034]],
       device='cuda:0', requires_grad=True)