# Phoneme to Grapheme Conversion with a Recurrent Generative Model 
This project will discuss...

In [113]:
import torch
import torch.nn as nn
import random
import torch.optim as optim
from torch.utils.data import Dataset
import pandas as pd
from torch.utils.tensorboard import SummaryWriter

# find phoneme vocabulary
data = pd.read_csv("phonemes-words.csv")
phonemes_col = data["phonemes"]
graphemes_col = data["graphemes"]
phonemes = ['0', '1']
graphemes = ['0', '1']

for word in phonemes_col:
    for phoneme in word:
        if phoneme not in phonemes:
            phonemes.append(phoneme)
for word in graphemes_col:
    # print(word)
    for grapheme in word:
        if grapheme not in graphemes:
            graphemes.append(grapheme)
print(phonemes)

['0', '1', 'd', 'a', 'ɪ', 'n', 'æ', 's', 't', 'k', 'ə', 'ɛ', 'm', 'p', 'b', 'l', 'z', 'i', 'r', 'ʃ', 'ŋ', 'e', 'ʊ', 'v', 'f', 'ɒ', 'ʌ', 'o', 'θ', 'ɜ', 'ɑ', 'ʒ', 'w', 'y', 'u', 'g', 'ː', 'ɔ', 'h', 'x', 'j', '̃', 'ð', 'ʰ', 'ɡ', 'c', 'œ', 'S', 'F', 'Y', 'G', 'ü', 'ø', 'ĩ', 'ɘ']


In [118]:

# one hot encodes the word: returns an array of one hot encoded characters
def nemes_to_1_hot_seq(string, nemes="phonemes"):
    string = '0' + string + '1'
    l = phonemes if nemes == "phonemes" else graphemes
    seq = []
    for i in string:
        vec = [0] * len(l)
        vec[l.index(i)] = 1
        seq.append(vec)

    return torch.FloatTensor([seq])

def one_hot_to_nemes(arr, nemes="phonemes"):
    seq = []
    l = phonemes if nemes == "phonemes" else graphemes
    for hot in arr:
        x = torch.argmax(hot)
        seq.append(l[x])
    return seq

class P2GDataset(Dataset):
    def __init__(self, phoneme_file, device):
        df = pd.read_csv(phoneme_file)
        self.data = df.drop(df[df["phonemes"].map(len) > 7].index)

        self.device = device

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        p, g = self.data.iloc[idx]
        return nemes_to_1_hot_seq(p, nemes = "phonemes").to(self.device), nemes_to_1_hot_seq(g, nemes = "graphemes").long()


In [119]:
# define model architecture
class Encoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = nn.LSTM(len(phonemes), 512, 2, batch_first=True, bidirectional=False, dropout=0.5)
        
    def forward(self, x):
        # push vector through encoder
        out, (hidden, cell) = self.encoder(x)

        # return context vector
        return hidden, cell

class Decoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.decoder = nn.LSTM(len(graphemes), 512, 2, batch_first=True, bidirectional=False, dropout=0.5)
        self.fc = nn.Sequential(
            nn.Linear(512*2, len(graphemes))
        )
        
    def forward(self, input, hidden_layer, cell_state):
        """
        Since this function gets called once at a time rather than taking in
        a sequence of vectors, we need to pass it the last output. This will be just
        a vector of numbers that can be converted to the embedding representing that last output
        """
        # print(hidden_layer.shape)
        # print(cell_state)
        out, (hidden, cell) = self.decoder(input, (hidden_layer, cell_state))
        # print("H")
        return self.fc(hidden.reshape(1,1,2*512)), hidden, cell

class seq2seq(nn.Module):
    def __init__(self, device):
        super().__init__()
        self.encoder = Encoder()
        self.decoder = Decoder()
        self.device = device
    
    def forward(self, in_seq, out_seq, tf_ratio=0.5):
        out_len = out_seq.shape[1]
        # storing the outputs of the sequence
        outputs = torch.zeros(out_len, 1, len(graphemes)).to(self.device)

        hidden, cell = self.encoder(in_seq)

        out_seq = out_seq.squeeze(0)

        input = out_seq[0].unsqueeze(0).unsqueeze(0).float().to(device)
        
        for i in range(1, out_len):
            out, hidden, cell = self.decoder(input, hidden, cell)
            outputs[i] = out

            if random.random() > tf_ratio:
                # teacher forcing (make next input what the current output token should be)
                input = out_seq[i].unsqueeze(0).unsqueeze(0).float().to(device)
            else:
                x = input.argmax(1)[0]
                input = torch.zeros(1, 1, len(graphemes)).to(self.device)
                input[0][0][x] = 1
                
        return outputs

    def pred_new(self, in_seq):
        hidden, cell = self.encoder(in_seq)
        input = torch.zeros(1, 1, len(graphemes)).to(self.device)
        outs = []
        while True:
            out, hidden, cell = self.decoder(input, hidden, cell)
            outs.append(out)
            x = input.argmax(1)[0]
            input = torch.zeros(1, 1, len(graphemes)).to(self.device)
            input[0][0][x] = 1
            if one_hot_to_nemes(out) == ['1']:
                break
        return outs

In [123]:
"""training"""
from torch.utils.data import random_split
from torch.utils.data import DataLoader
device = "cuda" if torch.cuda.is_available() else "cpu"
EPOCHS = 100
model = seq2seq(device).to(device)
# what a beautiful architecture
print(model)
optimizer = optim.Adam(model.parameters(), lr=0.0001)
loss_func = nn.CrossEntropyLoss()
dataset = P2GDataset("phonemes-words.csv", device)
train, test = random_split(dataset, [40000, len(dataset)-40000])
dataloader = DataLoader(dataset=train, batch_size=1)
print(len(test))


seq2seq(
  (encoder): Encoder(
    (encoder): LSTM(55, 512, num_layers=2, batch_first=True, dropout=0.5)
  )
  (decoder): Decoder(
    (decoder): LSTM(28, 512, num_layers=2, batch_first=True, dropout=0.5)
    (fc): Sequential(
      (0): Linear(in_features=1024, out_features=28, bias=True)
    )
  )
)
7810


In [124]:
avg_losses = []
writer = SummaryWriter("tensorboard_data")

# 15 quite good
for epoch in range(15):
    tot_loss = 0
    for (in_seq, out_seq) in dataloader:
        in_seq = in_seq.squeeze(0)
        out_seq = out_seq.squeeze(0)
        model_output = model(in_seq, out_seq)
        model_output = model_output[1:]
        model_output = model_output.squeeze(1)
        out_seq = out_seq.squeeze(0)[1:]
        loss = loss_func(model_output, out_seq.argmax(1).to(device))
        tot_loss+=loss.detach().item()
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    tot_loss/=len(dataset)
    writer.add_scalar("tensorboard_data", loss.detach().item(), epoch)
    avg_losses.append(tot_loss)



In [None]:
dataset = P2GDataset("data.csv", "cuda")
p, g = dataset[0]

print(one_hot_to_nemes(p[0], "phonemes"))
p.shape



In [125]:
model.eval()
def get_0_1_accuracy(test_set, model):
    correct = 0
    dataloader = DataLoader(dataset=test_set, batch_size=1)
    for (in_seq, out_seq) in dataloader:
        # print(out_seq.shape)
        # break
        prediction = model.pred_new(in_seq[0])
        # print(in_seq[0].shape)
        true = "".join(one_hot_to_nemes(out_seq[0][0], "graphemes"))[1:-1]
        print(true)
        # print(one_hot_to_nemes(prediction, "graphemes"))
        pred = "".join(one_hot_to_nemes(prediction, "graphemes"))[0:-1]
        print(pred)
        
        if true == pred:
            correct+= 1
    if correct == 0:
        return correct
    return correct/len(test_set)

def print_preds(path):
    global p
    print(one_hot_to_nemes(p[0], "phonemes"))
    s = model.pred_new(p)
    
    print(one_hot_to_nemes(s, "graphemes"))

# print_preds("data.csv")
print(get_0_1_accuracy(train, model))
# 36 great for train set
# print(test[0])
# print(one_hot_to_graphemes(torch.FloatTensor([[3,2,1],[0,0,1],[0,0,1]])))

sucrose
sucroose
viscous
viscous
maxilla
maxilla
chromic
chromic
siskin
siskin
redeeming
redeeming
pierhead
peerhead
selden
selden
evzone
evzone
dogfish
dogfish
catalo
cataloo
amesace
amesase
outrank
outrank
cordis
coodic
threepenny
thrippenny
jubbah
jubbah
lesser
lesser
barm
barm
feel
feal
dipody
dipody
officer
offacer
lavish
lavish
bakery
bacery
halide
hallee
ob
aub
gooney
goony
demission
demission
sapphira
sufire
benison
benison
submit
submit
shellfish
shellfshh
unpicked
unpicked
lingam
lingum
lmma
imm
caplet
caplle
cadenced
cadenced
jewess
jewis
houseless
houseless
brahms
braam
braless
brallss
inherit
inherit
twicer
twisee
france
france
purge
purge
darning
darning
trona
trona
baffies
baffie
debar
debar
finitude
finitude
lotion
locisnan
inert
inert
aulos
allo
cutline
cutlinn
zaptiah
zuptia
sound
sound
aggrade
aggaid
laud
lawd
evermore
evermore
brandy
brandy
coalfish
coalfish
fibula
fibula
koradji
coraaje
cession
session
heave
heave
matzoh
matzah
sothic
soethc
nouveau
nevev
cretan
cr

In [92]:
model.encoder.encoder.weight_ih_l0

Parameter containing:
tensor([[-0.0545,  0.0437,  0.0922,  ..., -0.0272,  0.0518,  0.0321],
        [-0.2908,  0.0712, -0.0298,  ...,  0.0417, -0.0094,  0.0039],
        [-0.0850,  0.1052, -0.1394,  ...,  0.0355,  0.0076,  0.0039],
        ...,
        [-0.1207,  0.1131,  0.0215,  ..., -0.0365,  0.0794,  0.0161],
        [-0.1674,  0.0671,  0.0768,  ..., -0.0378,  0.0006,  0.0227],
        [-0.1742,  0.0136, -0.0371,  ...,  0.0183,  0.0078,  0.0222]],
       device='cuda:0', requires_grad=True)