# Classifing last names with character-level RNN

In [4]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
import pandas as pd
import numpy as np
from pathlib import Path
import torch
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import random

## Dataset
`wget https://github.com/hunkim/PyTorchZeroToAll/blob/master/data/names_train.csv.gz`

`wget https://github.com/hunkim/PyTorchZeroToAll/blob/master/data/names_test.csv.gz`

In [5]:
PATH = Path("/data2/yinterian/name_dataset/")
list(PATH.iterdir())

[PosixPath('/data2/yinterian/name_dataset/names_test.csv'),
 PosixPath('/data2/yinterian/name_dataset/names_train.csv.gz'),
 PosixPath('/data2/yinterian/name_dataset/names_train.csv'),
 PosixPath('/data2/yinterian/name_dataset/names_test.csv.gz')]

In [6]:
! head /data2/yinterian/name_dataset/names_train.csv

"Adsit","Czech"
"Ajdrna","Czech"
"Antonowitsch","Czech"
"Antonowitz","Czech"
"Ballalatak","Czech"
"Ballaltick","Czech"
"Bastl","Czech"
"Baroch","Czech"
"Betlach","Czech"
"Biganska","Czech"


### Processing data

In [7]:
df = pd.read_csv(PATH/"names_train.csv", header=None)

In [8]:
# getting a vocabulary of characters
letters = [list(l) for l in df[0].values]
vocab = sorted(list(set(np.concatenate(np.array(letters)))))
vocab[:10]

[' ', "'", ',', 'A', 'B', 'C', 'D', 'E', 'F', 'G']

In [9]:
vocab2id = {key:i for i, key in enumerate(vocab)}
vocab2id[" "] # I am going to use 0 to pad sequences

0

In [10]:
labels = sorted(df[1].unique())
label2id = {key:i for i, key in enumerate(labels)}
label2id

{'Arabic': 0,
 'Chinese': 1,
 'Czech': 2,
 'Dutch': 3,
 'English': 4,
 'French': 5,
 'German': 6,
 'Greek': 7,
 'Irish': 8,
 'Italian': 9,
 'Japanese': 10,
 'Korean': 11,
 'Polish': 12,
 'Portuguese': 13,
 'Russian': 14,
 'Scottish': 15,
 'Spanish': 16,
 'Vietnamese': 17}

In [11]:
def pad_seq(x, seq_len=15, vocab2id=vocab2id):
    x = list(x)
    x = np.array([vocab2id[k] for k in x])
    z = np.zeros(seq_len, dtype=np.int32)
    n = min(seq_len, x.shape[0])
    z[seq_len - n:] = x[0:n]
    return z

In [12]:
x = pad_seq("aabbb")
x

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 29, 29, 30, 30, 30],
      dtype=int32)

In [13]:
# one hot encoding
def seq2matrix(x, vocab_len=55):
    z = np.zeros((x.shape[0], vocab_len))
    z[np.arange(len(x)), x] = 1
    return z

In [14]:
class NameDataset(Dataset):
    def __init__(self, path, vocab2id, label2id, seq_len=15, vocab_len=55):
        self.df = pd.read_csv(path, header=None)
        self.label2id = label2id
        self.vocab2id = vocab2id
        self.seq_len = seq_len
        self.vocab_len = vocab_len 
        self.x = df[0].values
        self.y = [self.label2id[l] for l in df[1].values]
        self.vocab2id = vocab2id
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        x = pad_seq(self.x[idx], self.seq_len, self.vocab2id)
        x = seq2matrix(x, self.vocab_len)
        return x, self.y[idx]

In [68]:
train = NameDataset(PATH/"names_train.csv", vocab2id, label2id)
val = NameDataset(PATH/"names_test.csv", vocab2id, label2id)

In [69]:
batch_size = 2000
n = len(test)
train_dl = DataLoader(train, batch_size=batch_size)
val_dl = DataLoader(test, batch_size=n)

In [70]:
len(train), len(test)

(13374, 13374)

In [71]:
x,y = train[0]
print(x.shape,y)

(15, 55) 2


## Model

In [72]:
class CharRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(CharRNN, self).__init__()

        self.hidden_size = hidden_size
        self.linear_i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.linear_h2o = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden):
        combined = torch.cat((x, hidden), 1)
        hidden = torch.tanh(self.linear_i2h(combined))
        output = self.linear_h2o(hidden)
        return output, hidden

    def initHidden(self, bash_size):
        return torch.zeros(bash_size, self.hidden_size)

## Debugging model

In [73]:
vocab_size = 55
hidden_size = 100
n_classes = 18
model = CharRNN(vocab_size, hidden_size, n_classes).cuda()

In [74]:
x, y = next(iter(train_dl))

In [75]:
x.shape, y.shape

(torch.Size([2000, 15, 55]), torch.Size([2000]))

In [76]:
batch = x.shape[0]
h = model.initHidden(batch).cuda()
x = x.cuda().float()
y = y.cuda().long()

In [77]:
torch.cat((x[:,0], h), 1).size()

torch.Size([2000, 155])

In [78]:
for ei in range(x.shape[1]):
    x_t, h = model(x[:,ei], h)

In [79]:
# note that just the last x_t is used in the loss
loss = F.cross_entropy(x_t, y)
loss.item()

2.8333206176757812

## Training

In [80]:
vocab_size = 55
hidden_size = 100
n_classes = 18
model = CharRNN(vocab_size, hidden_size, n_classes).cuda()

In [81]:
def get_optimizer(model, lr = 0.01, wd = 0.00001):
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optim = torch.optim.Adam(parameters, lr=lr, weight_decay=wd)
    return optim

In [82]:
def train(model, optim, train_dl):
    model.train()
    total = 0
    sum_loss = 0
    for x, y in train_dl:
        batch = x.shape[0]
        h = model.initHidden(batch).cuda()
        loss = 0
        x = x.cuda().float()
        y = y.cuda().long()
        
        for t in range(x.shape[1]):
            out, h = model(x[:,t], h)
        
        loss = F.cross_entropy(out, y)
        optim.zero_grad()
        loss.backward(retain_graph=True)
        optim.step()
        total += batch
        sum_loss += batch*(loss.item())
    return sum_loss/total

In [87]:
def val_metric(model, val_dl):
    model.eval()
    x, y = next(iter(val_dl))
    x = x.cuda().float()
    y = y.cuda().long()
    N = x.shape[0]
    h = model.initHidden(N).cuda()
    for t in range(x.shape[1]):
        out, h = model(x[:,t], h)
    loss = F.cross_entropy(out, y)
    _, pred = torch.max(out, 1)
    acc = pred.eq(y).sum().float()/N
    return loss.item(), acc.item()
    print("test loss %.3f and accuracy %.3f" % (loss.item(), acc.item()))

In [94]:
vocab_size = 55
hidden_size = 80
n_classes = 18

In [112]:
def train_loop(model, lr, train_dl, val_dl, epochs=20):
    optim = get_optimizer(model, lr =lr, wd = 0.0)
    for i in range(epochs):
        loss = train(model, optim, train_dl)
        val_loss, val_acc = val_metric(model, val_dl)
        if i%5 == 1: print("train loss %.3f val loss %.3f and val accuracy %.3f" % (loss, val_loss, val_acc))

In [99]:
model = CharRNN(vocab_size, hidden_size, n_classes).cuda()

In [100]:
train_loop(model, lr=0.01, epochs=20)

train loss 2.204 valid loss 1.791 and accuracy 0.469
train loss 1.771 valid loss 1.598 and accuracy 0.493
train loss 1.446 valid loss 1.348 and accuracy 0.578
train loss 1.257 valid loss 1.184 and accuracy 0.645


In [101]:
train_loop(model, lr=0.001, epochs=20)

train loss 1.219 valid loss 1.194 and accuracy 0.638
train loss 1.147 valid loss 1.132 and accuracy 0.665
train loss 1.101 valid loss 1.082 and accuracy 0.675
train loss 1.067 valid loss 1.049 and accuracy 0.685


In [102]:
train_loop(model, lr=0.001, epochs=20)

train loss 1.037 valid loss 1.036 and accuracy 0.688
train loss 1.022 valid loss 1.008 and accuracy 0.699
train loss 1.001 valid loss 0.985 and accuracy 0.704
train loss 0.984 valid loss 0.967 and accuracy 0.708


# Model with character embeddings 

In [113]:
class NameDatasetEmb(Dataset):
    def __init__(self, path, vocab2id, label2id, seq_len=15, vocab_len=55):
        self.df = pd.read_csv(path, header=None)
        self.label2id = label2id
        self.vocab2id = vocab2id
        self.seq_len = seq_len
        self.vocab_len = vocab_len 
        self.x = df[0].values
        self.y = [self.label2id[l] for l in df[1].values]
        self.vocab2id = vocab2id
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        x = pad_seq(self.x[idx], self.seq_len, self.vocab2id)
        return x, self.y[idx]

In [121]:
train_2 = NameDatasetEmb(PATH/"names_train.csv", vocab2id, label2id)
val_2 = NameDatasetEmb(PATH/"names_test.csv", vocab2id, label2id)

In [122]:
batch_size = 2000
n = len(test)
train_dl_2 = DataLoader(train_2, batch_size=batch_size)
val_dl_2 = DataLoader(val_2, batch_size=n)

In [123]:
train_2[0]

(array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  3, 32, 47, 37, 48],
       dtype=int32), 2)

In [124]:
class CharEmbRNN(nn.Module):
    def __init__(self, vocab_size, emb_size, hidden_size, output_size):
        super(CharEmbRNN, self).__init__()
        self.emb = nn.Embedding(vocab_size, emb_size)
        self.hidden_size = hidden_size
        self.linear_i2h = nn.Linear(emb_size + hidden_size, hidden_size)
        self.linear_h2o = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden):
        x = x.long()
        x = self.emb(x)
        combined = torch.cat((x, hidden), 1)
        hidden = F.tanh(self.linear_i2h(combined))
        output = self.linear_h2o(hidden)
        return output, hidden

    def initHidden(self, bash_size):
        return torch.zeros(bash_size, self.hidden_size)

## Train 

In [109]:
def train_loop(model, epochs, lr=0.01, wd=0.0):
    optim = get_optimizer(model, lr = lr, wd = wd)
    for i in range(epochs):
        loss = train(model, optim, train_2_dl)
        if i%5 == 1: print("train loss %.3f" % loss)
    predict(model, test_2_dl)

In [131]:
vocab_size = 55
emb_size = 30
hidden_size = 80
n_classes = 18
model = CharEmbRNN(vocab_size, emb_size, hidden_size, n_classes).cuda()

In [132]:
train_loop(model, 0.01, train_dl_2, val_dl_2, epochs=50)

train loss 1.761 val loss 1.898 and val accuracy 0.474
train loss 1.294 val loss 1.201 and val accuracy 0.622
train loss 1.096 val loss 1.023 and val accuracy 0.679
train loss 0.988 val loss 0.913 and val accuracy 0.718
train loss 0.886 val loss 0.832 and val accuracy 0.749
train loss 0.825 val loss 0.760 and val accuracy 0.772
train loss 0.851 val loss 0.752 and val accuracy 0.770
train loss 0.743 val loss 0.678 and val accuracy 0.792
train loss 0.673 val loss 0.624 and val accuracy 0.811
train loss 0.641 val loss 0.586 and val accuracy 0.821


In [133]:
train_loop(model, 0.01, train_dl_2, val_dl_2, epochs=50)

train loss 0.752 val loss 0.731 and val accuracy 0.772
train loss 0.614 val loss 0.568 and val accuracy 0.823
train loss 0.553 val loss 0.508 and val accuracy 0.844
train loss 0.508 val loss 0.464 and val accuracy 0.856
train loss 0.469 val loss 0.424 and val accuracy 0.870
train loss 0.440 val loss 0.391 and val accuracy 0.878
train loss 0.421 val loss 0.381 and val accuracy 0.882
train loss 0.369 val loss 0.332 and val accuracy 0.898
train loss 0.359 val loss 0.310 and val accuracy 0.907
train loss 0.343 val loss 0.299 and val accuracy 0.908


In [134]:
train_loop(model, 0.01, train_dl_2, val_dl_2, epochs=20)

train loss 0.512 val loss 0.463 and val accuracy 0.846
train loss 0.308 val loss 0.277 and val accuracy 0.916
train loss 0.276 val loss 0.255 and val accuracy 0.922
train loss 0.251 val loss 0.226 and val accuracy 0.932


In [135]:
train_loop(model, 0.01, train_dl_2, val_dl_2, epochs=20)

train loss 0.448 val loss 0.406 and val accuracy 0.861
train loss 0.234 val loss 0.215 and val accuracy 0.937
train loss 0.201 val loss 0.182 and val accuracy 0.947
train loss 0.241 val loss 0.203 and val accuracy 0.938


In [136]:
train_loop(model, 0.001, train_dl_2, val_dl_2, epochs=50)

train loss 0.178 val loss 0.172 and val accuracy 0.950
train loss 0.163 val loss 0.160 and val accuracy 0.955
train loss 0.156 val loss 0.154 and val accuracy 0.957
train loss 0.152 val loss 0.150 and val accuracy 0.958
train loss 0.148 val loss 0.146 and val accuracy 0.960
train loss 0.145 val loss 0.143 and val accuracy 0.961
train loss 0.142 val loss 0.140 and val accuracy 0.962
train loss 0.139 val loss 0.137 and val accuracy 0.963
train loss 0.136 val loss 0.134 and val accuracy 0.964
train loss 0.134 val loss 0.132 and val accuracy 0.965


## Exercise
* Change the first model to learn a character language model that generates last names.
* Use one cycle training on this problem.

# References
This notebook is a modified version of this tutorial
http://pytorch.org/tutorials/intermediate/char_rnn_classification_tutorial.html. Here I implement vanilla RNNs.