# Phoneme to Grapheme Conversion with a Recurrent Generative Model 
This project will discuss...

In [140]:
import torch
import torch.nn as nn
import random
import torch.optim as optim
from torch.utils.data import Dataset
import pandas as pd
from torch.utils.tensorboard import SummaryWriter

# find phoneme vocabulary
data = pd.read_csv("phonemes-words.csv")
phonemes_col = data["phonemes"]
graphemes_col = data["graphemes"]
phonemes = ['0', '1']
graphemes = ['0', '1']

for word in phonemes_col:
    # print(word)
    for phoneme in word:
        if phoneme not in phonemes:
            phonemes.append(phoneme)
for word in graphemes_col:
    # print(word)
    for grapheme in word:
        if grapheme not in graphemes:
            graphemes.append(grapheme)
print(phonemes)

['0', '1', 'k', 'y', 'ʊ', 'r', 'ɑ', 'a', 'ɪ', 'z', 'æ', 't', 'ə', 'b', 'n', 'u', 'l', 'm', 'ɛ', 'd', 'e', 'ʃ', 's', 'ʌ', 'g', 'f', 'o', 'θ', 'ɒ', 'i', 'ʒ', 'p', 'ɔ', 'v', 'ː', 'h', 'ŋ', 'w', '̃', 'ɡ', 'j', 'ɜ', 'ð', 'ʰ', 'x', 'c', 'œ', 'ü', 'ɘ', 'ø', 'ĩ']


In [141]:

# one hot encodes the word: returns an array of one hot encoded characters
def nemes_to_1_hot_seq(string, nemes="phonemes"):
    string = '0' + string + '1'
    l = phonemes if nemes == "phonemes" else graphemes
    seq = []
    for i in string:
        vec = [0] * len(l)
        vec[l.index(i)] = 1
        seq.append(vec)

    return torch.FloatTensor([seq])

def one_hot_to_nemes(arr, nemes="phonemes"):
    seq = []
    l = phonemes if nemes == "phonemes" else graphemes
    for hot in arr:
        x = torch.argmax(hot)
        seq.append(l[x])
    return seq

class P2GDataset(Dataset):
    def __init__(self, phoneme_file, device):
        df = pd.read_csv(phoneme_file)
        self.data = df.drop(df[df["phonemes"].map(len) > 7].index)

        self.device = device

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        p, g = self.data.iloc[idx]
        return nemes_to_1_hot_seq(p, nemes = "phonemes").to(self.device), nemes_to_1_hot_seq(g, nemes = "graphemes").long()


In [142]:
# define model architecture
class Encoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = nn.LSTM(len(phonemes), 512, 2, batch_first=True, bidirectional=False, dropout=0.5)
        
    def forward(self, x):
        # push vector through encoder
        out, (hidden, cell) = self.encoder(x)

        # return context vector
        return hidden, cell

class Decoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.decoder = nn.LSTM(len(graphemes), 512, 2, batch_first=True, bidirectional=False, dropout=0.5)
        self.fc = nn.Sequential(
            nn.Linear(512*2, len(graphemes))
        )
        
    def forward(self, input, hidden_layer, cell_state):
        """
        Since this function gets called once at a time rather than taking in
        a sequence of vectors, we need to pass it the last output. This will be just
        a vector of numbers that can be converted to the embedding representing that last output
        """
        # print(hidden_layer.shape)
        # print(cell_state)
        out, (hidden, cell) = self.decoder(input, (hidden_layer, cell_state))
        # print("H")
        return self.fc(hidden.reshape(1,1,2*512)), hidden, cell

class seq2seq(nn.Module):
    def __init__(self, device):
        super().__init__()
        self.encoder = Encoder()
        self.decoder = Decoder()
        self.device = device
    
    def forward(self, in_seq, out_seq, tf_ratio=0.5):
        out_len = out_seq.shape[1]
        # storing the outputs of the sequence
        outputs = torch.zeros(out_len, 1, len(graphemes)).to(self.device)

        hidden, cell = self.encoder(in_seq)

        out_seq = out_seq.squeeze(0)

        input = out_seq[0].unsqueeze(0).unsqueeze(0).float().to(device)
        
        for i in range(1, out_len):
            out, hidden, cell = self.decoder(input, hidden, cell)
            outputs[i] = out

            if random.random() > tf_ratio:
                # teacher forcing (make next input what the current output token should be)
                input = out_seq[i].unsqueeze(0).unsqueeze(0).float().to(device)
            else:
                x = input.argmax(1)[0]
                input = torch.zeros(1, 1, len(graphemes)).to(self.device)
                input[0][0][x] = 1
                
        return outputs

    def pred_new(self, in_seq):
        hidden, cell = self.encoder(in_seq)
        input = torch.zeros(1, 1, len(graphemes)).to(self.device)
        outs = []
        while True:
            out, hidden, cell = self.decoder(input, hidden, cell)
            outs.append(out)
            x = input.argmax(1)[0]
            input = torch.zeros(1, 1, len(graphemes)).to(self.device)
            input[0][0][x] = 1
            if one_hot_to_nemes(out) == ['1']:
                break
        return outs

In [148]:
"""training"""
from torch.utils.data import random_split
from torch.utils.data import DataLoader
device = "cuda" if torch.cuda.is_available() else "cpu"
EPOCHS = 100
model = seq2seq(device).to(device)
# what a beautiful architecture
print(model)
optimizer = optim.Adam(model.parameters(), lr=0.0001)
loss_func = nn.CrossEntropyLoss()
dataset = P2GDataset("phonemes-words.csv", device)
train, test = random_split(dataset, [35000, len(dataset)-35000])
dataloader = DataLoader(dataset=train, batch_size=1)
print(len(test))


seq2seq(
  (encoder): Encoder(
    (encoder): LSTM(51, 512, num_layers=2, batch_first=True, dropout=0.5)
  )
  (decoder): Decoder(
    (decoder): LSTM(28, 512, num_layers=2, batch_first=True, dropout=0.5)
    (fc): Sequential(
      (0): Linear(in_features=1024, out_features=28, bias=True)
    )
  )
)
2228


In [163]:
avg_losses = []
writer = SummaryWriter("tensorboard_data")

# 15 quite good
for epoch in range(15):
    tot_loss = 0
    for (in_seq, out_seq) in dataloader:
        in_seq = in_seq.squeeze(0)
        out_seq = out_seq.squeeze(0)
        model_output = model(in_seq, out_seq)
        model_output = model_output[1:]
        model_output = model_output.squeeze(1)
        out_seq = out_seq.squeeze(0)[1:]
        loss = loss_func(model_output, out_seq.argmax(1).to(device))
        tot_loss+=loss.detach().item()
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    tot_loss/=len(dataset)
    writer.add_scalar("tensorboard_data", loss.detach().item(), epoch)
    avg_losses.append(tot_loss)



In [None]:
dataset = P2GDataset("data.csv", "cuda")
p, g = dataset[0]

print(one_hot_to_nemes(p[0], "phonemes"))
p.shape



In [165]:
model.train()
def get_0_1_accuracy(test_set, model):
    correct = 0
    dataloader = DataLoader(dataset=test_set, batch_size=1)
    for (in_seq, out_seq) in dataloader:
        # print(out_seq.shape)
        # break
        prediction = model.pred_new(in_seq[0])
        # print(in_seq[0].shape)
        true = "".join(one_hot_to_nemes(out_seq[0][0], "graphemes"))[1:-1]
        print(true)
        # print(one_hot_to_nemes(prediction, "graphemes"))
        pred = "".join(one_hot_to_nemes(prediction, "graphemes"))[0:-1]
        print(pred)
        
        if true == pred:
            correct+= 1
    if correct == 0:
        return correct
    return correct/len(test_set)

def print_preds(path):
    global p
    print(one_hot_to_nemes(p[0], "phonemes"))
    s = model.pred_new(p)
    
    print(one_hot_to_nemes(s, "graphemes"))

# print_preds("data.csv")
print(get_0_1_accuracy(test, model))
# 36 great for train set
# print(test[0])
# print(one_hot_to_graphemes(torch.FloatTensor([[3,2,1],[0,0,1],[0,0,1]])))

swish
swish
lochus
loccus
phosphate
poosphate
gudgeon
gudgoon
hearty
hearty
naissance
naissace
kea
caya
empower
empowrr
drawdown
drawdown
cromlech
cromlec
rebar
rebar
unlike
unlike
randem
randem
moccasin
mockcii
besiege
besagee
valid
vallidd
haji
hajji
reheating
reteating
praxis
praxis
wiper
wiper
mooring
mooing
algae
algy
frock
frock
reviewer
reviurr
lodging
lodging
rockiness
rockuness
gnosis
noois
attuned
attuned
ohmage
omage
rappel
rappl
sinitic
cigetic
mishmi
mishmi
pivotal
pivoaale
adonic
adonic
ceyx
syyix
trouser
trowser
lungi
lungyi
molasse
malas
scatter
scatter
fellow
felloe
props
props
bundled
bundled
amidin
amidin
reread
reread
psyllium
cilium
quilted
quilled
zebrula
zebrila
brr
buhr
ess
s
ging
ghing
jink
jink
cacao
cacayo
strake
straie
verona
verone
salify
salify
wizard
wizzrrd
great
grate
saiva
siver
seaware
seawarr
benign
benine
solgel
solguelle
maltase
maltace
adipose
adapose
malison
mallion
grivet
grivet
figwort
figwort
embosom
embosom
stound
stouned
hymnary
hymnery
hafi

In [152]:
model.encoder.encoder.weight_ih_l0

Parameter containing:
tensor([[-0.7538,  0.1556,  0.0660,  ..., -0.0320, -0.0349, -0.0978],
        [-0.0865,  0.0106, -0.2135,  ...,  0.0184,  0.0110,  0.0298],
        [ 0.1121, -0.0444, -0.1053,  ...,  0.0754, -0.0277,  0.0016],
        ...,
        [-0.1423,  0.2361, -0.6792,  ...,  0.0250, -0.0080, -0.0016],
        [ 0.0889,  0.2124,  0.0398,  ..., -0.0352,  0.0361, -0.0685],
        [-0.2383,  0.0234, -0.0281,  ...,  0.0096, -0.0413, -0.0148]],
       device='cuda:0', requires_grad=True)