# Phoneme to Grapheme Conversion with a Recurrent Generative Model 
This project will discuss...

In [68]:
import torch
import torch.nn as nn
import random
import torch.optim as optim
from torch.utils.data import Dataset
import pandas as pd
from torch.utils.tensorboard import SummaryWriter

# find phoneme vocabulary
data = pd.read_csv("phonemes-words.csv")
phonemes_col = data["phonemes"]
graphemes_col = data["graphemes"]
phonemes = ['0', '1']
graphemes = ['0', '1']

for word in phonemes_col:
    for phoneme in word:
        if phoneme not in phonemes:
            phonemes.append(phoneme)
for word in graphemes_col:
    for grapheme in word:
        if grapheme not in graphemes:
            graphemes.append(grapheme)
print(phonemes)

['0', '1', 'd', 'a', 'ɪ', 'n', 'æ', 's', 't', 'k', 'ə', 'ɛ', 'm', 'p', 'b', 'l', 'z', 'i', 'r', 'ʃ', 'ŋ', 'e', 'ʊ', 'v', 'f', 'ɒ', 'ʌ', 'o', 'θ', 'ɜ', 'ɑ', 'ʒ', 'w', 'y', 'u', 'g', 'ː', 'ɔ', 'h', 'x', 'j', '̃', 'ð', 'ʰ', 'ɡ', 'c', 'œ', 'S', 'F', 'Y', 'G', 'ü', '3', 'I']


In [69]:

# one hot encodes the word: returns an array of one hot encoded characters
def nemes_to_1_hot_seq(string, nemes="phonemes"):
    string = '0' + string + '1'
    l = phonemes if nemes == "phonemes" else graphemes
    seq = []
    for i in string:
        vec = [0] * len(l)
        vec[l.index(i)] = 1
        seq.append(vec)

    return torch.FloatTensor([seq])

def one_hot_to_nemes(arr, nemes="phonemes"):
    seq = []
    l = phonemes if nemes == "phonemes" else graphemes
    for hot in arr:
        x = torch.argmax(hot)
        seq.append(l[x])
    return seq

class P2GDataset(Dataset):
    def __init__(self, phoneme_file, device):
        df = pd.read_csv(phoneme_file)
        self.data = df.drop(df[df["phonemes"].map(len) > 8].index)

        self.device = device

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        p, g = self.data.iloc[idx]
        return nemes_to_1_hot_seq(p, nemes = "phonemes").to(self.device), nemes_to_1_hot_seq(g, nemes = "graphemes").long()


In [70]:
# define model architecture
class Encoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = nn.GRU(len(phonemes), 2048, 1, batch_first=True, bidirectional=False, dropout=0.1)
        
    def forward(self, x):
        # push vector through encoder
        out, h_n = self.encoder(x)

        # return context vector
        return h_n

class Decoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.decoder = nn.GRU(len(graphemes), 2048, 1, batch_first=True, bidirectional=False, dropout=0.1)
        self.fc = nn.Sequential(
            nn.Linear(2048, len(graphemes))
        )
        

    def forward(self, input, hidden_layer):
        """
        Since this function gets called once at a time rather than taking in
        a sequence of vectors, we need to pass it the last output. This will be just
        a vector of numbers that can be converted to the embedding representing that last output
        """
        out, h_n = self.decoder(input, hidden_layer)
        # print("H")
        return self.fc(h_n), h_n

class seq2seq(nn.Module):
    def __init__(self, device):
        super().__init__()
        self.encoder = Encoder()
        self.decoder = Decoder()
        self.device = device
    
    def forward(self, in_seq, out_seq, tf_ratio=0.5):
        out_len = out_seq.shape[1]
        # storing the outputs of the sequence
        outputs = torch.zeros(out_len, 1, len(graphemes)).to(self.device)

        hidden = self.encoder(in_seq)

        out_seq = out_seq.squeeze(0)

        input = out_seq[0].unsqueeze(0).unsqueeze(0).float().to(device)
        
        for i in range(1, out_len):
            out, hidden = self.decoder(input, hidden)
            outputs[i] = out

            if random.random() > tf_ratio:
                # teacher forcing (make next input what the current output token should be)
                input = out_seq[i].unsqueeze(0).unsqueeze(0).float().to(device)
            else:
                x = input.argmax(1)[0]
                input = torch.zeros(1, 1, len(graphemes)).to(self.device)
                input[0][0][x] = 1
                
        return outputs

    def pred_new(self, in_seq):
        hidden = self.encoder(in_seq)
        input = torch.zeros(1, 1, len(graphemes)).to(device)
        outs = []
        while True:
            out, hidden = self.decoder(input, hidden)
            outs.append(out)
            x = input.argmax(1)[0]
            input = torch.zeros(1, 1, len(graphemes)).to(device)
            input[0][0][x] = 1
            if one_hot_to_nemes(out) == ['1']:
                break
        return outs

In [75]:
"""training"""
from torch.utils.data import random_split
from torch.utils.data import DataLoader
device = "cuda" if torch.cuda.is_available() else "cpu"
EPOCHS = 100
model = seq2seq(device).to(device)
# what a beautiful architecture
print(model)
optimizer = optim.Adam(model.parameters(), lr=0.0001)
loss_func = nn.CrossEntropyLoss()
dataset = P2GDataset("phonemes-words.csv", device)
train, test = random_split(dataset, [15000, len(dataset)-15000])
dataloader = DataLoader(dataset=train, batch_size=1)
print(len(test))


seq2seq(
  (encoder): Encoder(
    (encoder): GRU(54, 2048, batch_first=True, dropout=0.1)
  )
  (decoder): Decoder(
    (decoder): GRU(28, 2048, batch_first=True, dropout=0.1)
    (fc): Sequential(
      (0): Linear(in_features=2048, out_features=28, bias=True)
    )
  )
)
3967




In [76]:
avg_losses = []
writer = SummaryWriter("tensorboard_data")

# 15 quite good
for epoch in range(10):
    tot_loss = 0
    for (in_seq, out_seq) in dataloader:
        in_seq = in_seq.squeeze(0)
        out_seq = out_seq.squeeze(0)
        model_output = model(in_seq, out_seq)
        model_output = model_output[1:]
        model_output = model_output.squeeze(1)
        out_seq = out_seq.squeeze(0)[1:]
        loss = loss_func(model_output, out_seq.argmax(1).to(device))
        tot_loss+=loss.detach().item()
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    tot_loss/=len(dataset)
    writer.add_scalar("tensorboard_data", loss.detach().item(), epoch)
    avg_losses.append(tot_loss)



KeyboardInterrupt: 

In [192]:
dataset = P2GDataset("data.csv", "cuda")
p, g = dataset[0]

print(one_hot_to_nemes(p[0], "phonemes"))
p.shape



KeyError: 'phonemes'

In [77]:
def get_0_1_accuracy(test_set, model):
    correct = 0
    dataloader = DataLoader(dataset=test_set, batch_size=1)
    for (in_seq, out_seq) in dataloader:
        # print(out_seq.shape)
        # break
        prediction = model.pred_new(in_seq[0])
        # print(in_seq[0].shape)
        print(one_hot_to_nemes(out_seq[0][0], "graphemes"))
        print(one_hot_to_nemes(prediction, "graphemes"))
        
        if prediction.insert(0, '0') == out_seq:
            correct+= 1
    if correct == 0:
        return correct
    return correct/len(test_set)

def print_preds(path):
    global p
    print(one_hot_to_nemes(p[0], "phonemes"))
    s = model.pred_new(p)
    
    print(one_hot_to_nemes(s, "graphemes"))

# print_preds("data.csv")
print(get_0_1_accuracy(test, model))
# 36 great for train set
# print(test[0])
# print(one_hot_to_graphemes(torch.FloatTensor([[3,2,1],[0,0,1],[0,0,1]])))

['0', 'm', 'a', 'c', 'h', 'a', 'n', '1']
['m', 'a', 't', 'c', 'h', 'a', 'n', '1']
['0', 'd', 'e', 'n', 'd', 'r', 'o', 'i', 'd', '1']
['d', 'e', 'n', 'd', 'r', 'o', 'i', 'd', 'd', '1']
['0', 'e', 'f', 'f', '1']
['e', 'f', 'f', '1']
['0', 'v', 'a', 'n', 'i', 's', 'h', '1']
['v', 'a', 'n', 'i', 's', 's', '1']
['0', 'u', 'n', 'c', 'u', 'r', 'l', '1']
['u', 'n', 'c', 'u', 'e', 'l', '1']
['0', 'p', 'o', 's', 'i', 't', 'i', 'v', 'e', '1']
['p', 'o', 's', 'i', 't', 'i', 'v', 'e', '1']
['0', 'b', 'o', 'u', 'g', 'h', 'p', 'o', 't', '1']
['b', 'o', 'u', 'g', 'p', 'p', 'o', 't', '1']
['0', 'd', 'r', 'i', 'p', 'p', 'a', 'g', 'e', '1']
['d', 'r', 'i', 'p', 'p', 'a', 'g', 'e', '1']
['0', 'q', 'u', 'e', 'e', 'n', 'i', 'n', 'g', '1']
['q', 'u', 'e', 'e', 'n', 'i', 'n', 'g', '1']
['0', 'p', 'e', 'c', 'k', 'y', '1']
['p', 'e', 'c', 'k', 'y', '1']
['0', 'p', 'a', 'r', 'k', 'y', '1']
['p', 'a', 'r', 'k', 'y', '1']
['0', 'e', 'n', 'v', 'y', '1']
['e', 'n', 'v', 'y', '1']
['0', 'p', 'o', 'm', 'p', 'o', 'u', 

KeyboardInterrupt: 

In [162]:
model.encoder.encoder.weight_ih_l0

Parameter containing:
tensor([[ 0.0039,  0.1068,  0.0548,  ...,  0.0142, -0.0038, -0.0277],
        [-0.0430,  0.0747,  0.0009,  ...,  0.0160,  0.0036,  0.0260],
        [-0.0219,  0.0604, -0.0052,  ..., -0.0307,  0.0280,  0.0013],
        ...,
        [-0.0097,  0.0188,  0.0462,  ...,  0.0221, -0.0120,  0.0051],
        [ 0.0131,  0.0074,  0.0188,  ...,  0.0239,  0.0245, -0.0284],
        [-0.0040,  0.0412,  0.0591,  ...,  0.0262, -0.0278, -0.0228]],
       device='cuda:0', requires_grad=True)