# Classifing last names with character-level RNN

In [2]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
import pandas as pd
import numpy as np
from pathlib import Path
import torch
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import random

## Dataset
`wget https://github.com/hunkim/PyTorchZeroToAll/blob/master/data/names_train.csv.gz`

`wget https://github.com/hunkim/PyTorchZeroToAll/blob/master/data/names_test.csv.gz`

In [3]:
PATH = Path("/data2/yinterian/name_dataset/")
list(PATH.iterdir())

[PosixPath('/data2/yinterian/name_dataset/names_test.csv'),
 PosixPath('/data2/yinterian/name_dataset/names_train.csv.gz'),
 PosixPath('/data2/yinterian/name_dataset/names_train.csv'),
 PosixPath('/data2/yinterian/name_dataset/names_test.csv.gz')]

In [4]:
! head /data2/yinterian/name_dataset/names_train.csv

"Adsit","Czech"
"Ajdrna","Czech"
"Antonowitsch","Czech"
"Antonowitz","Czech"
"Ballalatak","Czech"
"Ballaltick","Czech"
"Bastl","Czech"
"Baroch","Czech"
"Betlach","Czech"
"Biganska","Czech"


### Processing data

In [5]:
df = pd.read_csv(PATH/"names_train.csv", header=None)

In [6]:
# getting a vocabulary of characters
letters = [list(l) for l in df[0].values]
vocab = sorted(list(set(np.concatenate(np.array(letters)))))
vocab[:10]

[' ', "'", ',', 'A', 'B', 'C', 'D', 'E', 'F', 'G']

In [7]:
vocab2id = {key:i for i, key in enumerate(vocab)}
vocab2id[" "] # I am going to use 0 to pad sequences

0

In [8]:
labels = sorted(df[1].unique())
label2id = {key:i for i, key in enumerate(labels)}
label2id

{'Arabic': 0,
 'Chinese': 1,
 'Czech': 2,
 'Dutch': 3,
 'English': 4,
 'French': 5,
 'German': 6,
 'Greek': 7,
 'Irish': 8,
 'Italian': 9,
 'Japanese': 10,
 'Korean': 11,
 'Polish': 12,
 'Portuguese': 13,
 'Russian': 14,
 'Scottish': 15,
 'Spanish': 16,
 'Vietnamese': 17}

In [9]:
def pad_seq(x, seq_len=15, vocab2id=vocab2id):
    x = list(x)
    x = np.array([vocab2id[k] for k in x])
    z = np.zeros(seq_len, dtype=np.int32)
    n = min(seq_len, x.shape[0])
    z[seq_len - n:] = x[0:n]
    return z

In [10]:
x = pad_seq("aabbb")
x

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 29, 29, 30, 30, 30],
      dtype=int32)

In [11]:
def seq2matrix(x, vocab_len=55):
    z = np.zeros((x.shape[0], vocab_len))
    z[np.arange(len(x)), x] = 1
    return z

In [12]:
class NameDataset(Dataset):
    def __init__(self, path, vocab2id, label2id, seq_len=15, vocab_len=55):
        self.df = pd.read_csv(path, header=None)
        self.label2id = label2id
        self.vocab2id = vocab2id
        self.seq_len = seq_len
        self.vocab_len = vocab_len 
        self.x = df[0].values
        self.y = [self.label2id[l] for l in df[1].values]
        self.vocab2id = vocab2id
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        x = pad_seq(self.x[idx], self.seq_len, self.vocab2id)
        x = seq2matrix(x, self.vocab_len)
        return x, self.y[idx]

In [13]:
train = NameDataset(PATH/"names_train.csv", vocab2id, label2id)
test = NameDataset(PATH/"names_test.csv", vocab2id, label2id)

In [14]:
batch_size = 2000
n = len(test)
train_dl = DataLoader(train, batch_size=batch_size)
test_dl = DataLoader(test, batch_size=n)

In [15]:
len(train), len(test)

(13374, 13374)

In [16]:
x,y = train[0]
print(x.shape,y)

(15, 55) 2


## Model

In [25]:
class CharRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(CharRNN, self).__init__()

        self.hidden_size = hidden_size
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x, hidden):
        combined = torch.cat((x, hidden), 1)
        hidden = F.tanh(self.i2h(combined))
        output = self.i2o(hidden)
        return output, hidden

    def initHidden(self, bash_size):
        return torch.zeros(bash_size, self.hidden_size)

## Debugging model

In [26]:
vocab_size = 55
hidden_size = 100
n_classes = 18
model = CharRNN(vocab_size, hidden_size, n_classes).cuda()

In [27]:
x, y = next(iter(train_dl))

In [28]:
x.shape, y.shape

(torch.Size([2000, 15, 55]), torch.Size([2000]))

In [29]:
batch = x.shape[0]
h = model.initHidden(batch).cuda()
x = x.cuda().float()
y = y.cuda().long()

In [30]:
torch.cat((x[:,0], h), 1).size()

torch.Size([2000, 155])

In [31]:
for ei in range(x.shape[1]):
    x_t, h = model(x[:,ei], h)

In [32]:
loss = F.cross_entropy(x_t, y)
loss

tensor(2.9396, device='cuda:0')

## Training

In [39]:
vocab_size = 55
hidden_size = 100
n_classes = 18
model = CharRNN(vocab_size, hidden_size, n_classes).cuda()

In [40]:
def get_optimizer(model, lr = 0.01, wd = 0.00001):
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optim = torch.optim.Adam(parameters, lr=lr, weight_decay=wd)
    return optim

In [41]:
def train(model, optim, train_dl):
    model.train()
    total = 0
    sum_loss = 0
    for x, y in train_dl:
        batch = x.shape[0]
        h = model.initHidden(batch).cuda()
        loss = 0
        x = x.cuda().float()
        y = y.cuda().long()
        
        for ei in range(x.shape[1]):
            out, h = model(x[:,ei], h)
        
        loss = F.cross_entropy(out, y)
        optim.zero_grad()
        loss.backward(retain_graph=True)
        optim.step()
        total += batch
        sum_loss += batch*(loss.item())
    return sum_loss/total

In [46]:
def predict(model, test_dl):
    model.eval()
    res = []
    x, y = next(iter(test_dl))
    x = x.cuda().float()
    y = y.cuda().long()
    N = x.shape[0]
    h = model.initHidden(N).cuda()
    for ei in range(x.shape[1]):
        out, h = model(x[:,ei], h)
    loss = F.cross_entropy(out, y)
    _, pred = torch.max(out, 1)
    acc = pred.eq(y).sum().float()/N
    print("test loss %.3f and accuracy %.3f" % (loss.item(), acc.item()))

In [59]:
vocab_size = 55
hidden_size = 80
n_classes = 18
model = CharRNN(vocab_size, hidden_size, n_classes).cuda()

In [60]:
def train_loop(model, lr, epochs=20):
    optim = get_optimizer(model, lr =lr, wd = 0.0)
    for i in range(epochs):
        loss = train(model, optim, train_dl)
        if i%5 == 1: print("train loss %.3f" % loss)
    predict(model, test_dl)

In [61]:
train_loop(model, lr=0.01, epochs=20)

train loss 2.139
train loss 1.852
train loss 1.595
train loss 1.431
test loss 1.705 and accuracy 0.469


In [62]:
train_loop(model, lr=0.001, epochs=20)

train loss 1.544
train loss 1.375
train loss 1.326
train loss 1.290
test loss 1.254 and accuracy 0.614


In [63]:
train_loop(model, lr=0.001, epochs=20)

train loss 1.268
train loss 1.232
train loss 1.205
train loss 1.177
test loss 1.142 and accuracy 0.652


In [64]:
train_loop(model, lr=0.001, epochs=40)

train loss 1.147
train loss 1.123
train loss 1.100
train loss 1.077
train loss 1.056
train loss 1.037
train loss 1.019
train loss 1.005
test loss 0.990 and accuracy 0.699


In [65]:
train_loop(model, lr=0.001, epochs=40)

train loss 1.058
train loss 0.983
train loss 0.969
train loss 0.957
train loss 0.947
train loss 0.936
train loss 0.926
train loss 0.917
test loss 0.892 and accuracy 0.731


In [66]:
train_loop(model, lr=0.001, epochs=40)

train loss 1.000
train loss 0.909
train loss 0.897
train loss 0.887
train loss 0.880
train loss 0.872
train loss 0.865
train loss 0.858
test loss 0.835 and accuracy 0.749


In [67]:
train_loop(model, lr=0.001, epochs=40)

train loss 0.877
train loss 0.847
train loss 0.840
train loss 0.834
train loss 0.827
train loss 0.821
train loss 0.815
train loss 0.809
test loss 0.786 and accuracy 0.765


# Model with character embeddings 

In [73]:
class NameDatasetEmb(Dataset):
    def __init__(self, path, vocab2id, label2id, seq_len=15, vocab_len=55):
        self.df = pd.read_csv(path, header=None)
        self.label2id = label2id
        self.vocab2id = vocab2id
        self.seq_len = seq_len
        self.vocab_len = vocab_len 
        self.x = df[0].values
        self.y = [self.label2id[l] for l in df[1].values]
        self.vocab2id = vocab2id
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        x = pad_seq(self.x[idx], self.seq_len, self.vocab2id)
        return x, self.y[idx]

In [74]:
train_2 = NameDatasetEmb(PATH/"names_train.csv", vocab2id, label2id)
test_2 = NameDatasetEmb(PATH/"names_test.csv", vocab2id, label2id)

In [75]:
batch_size = 2000
n = len(test)
train_2_dl = DataLoader(train_2, batch_size=batch_size)
test_2_dl = DataLoader(test_2, batch_size=n)

In [76]:
train_2[0]

(array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  3, 32, 47, 37, 48],
       dtype=int32), 2)

In [77]:
class CharEmbRNN(nn.Module):
    def __init__(self, vocab_size, emb_size, hidden_size, output_size):
        super(CharEmbRNN, self).__init__()
        self.emb = nn.Embedding(vocab_size, emb_size)
        self.hidden_size = hidden_size
        self.i2h = nn.Linear(emb_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden):
        x = x.long()
        x = self.emb(x)
        combined = torch.cat((x, hidden), 1)
        hidden = F.tanh(self.i2h(combined))
        output = self.i2o(hidden)
        return output, hidden

    def initHidden(self, bash_size):
        return torch.zeros(bash_size, self.hidden_size)

## Train 

In [78]:
def train_loop(model, epochs, lr=0.01, wd=0.0):
    optim = get_optimizer(model, lr = lr, wd = wd)
    for i in range(epochs):
        loss = train(model, optim, train_2_dl)
        if i%5 == 1: print("train loss %.3f" % loss)
    predict(model, test_2_dl)

In [79]:
vocab_size = 55
emb_size = 30
hidden_size = 80
n_classes = 18
model = CharEmbRNN(vocab_size, emb_size, hidden_size, n_classes).cuda()
optim = get_optimizer(model, lr =0.01, wd = 0.0)

In [80]:
train_loop(model, epochs=40, lr=0.01, wd=0.0)

train loss 2.098
train loss 1.320
train loss 1.132
train loss 1.021
train loss 0.925
train loss 0.870
train loss 0.805
train loss 0.754
test loss 0.694 and accuracy 0.785


In [81]:
train_loop(model, epochs=20, lr=0.01, wd=0.0)

train loss 1.066
train loss 0.739
train loss 0.668
train loss 0.767
test loss 0.587 and accuracy 0.818


In [82]:
train_loop(model, epochs=20, lr=0.001, wd=0.0)

train loss 0.583
train loss 0.567
train loss 0.557
train loss 0.549
test loss 0.538 and accuracy 0.834


In [83]:
train_loop(model, epochs=40, lr=0.001, wd=0.00)

train loss 0.543
train loss 0.534
train loss 0.528
train loss 0.522
train loss 0.516
train loss 0.510
train loss 0.504
train loss 0.498
test loss 0.488 and accuracy 0.849


In [84]:
train_loop(model, epochs=20, lr=0.001, wd=0.0)
train_loop(model, epochs=20, lr=0.001, wd=0.0)

train loss 0.493
train loss 0.486
train loss 0.482
train loss 0.477
test loss 0.468 and accuracy 0.857
train loss 0.472
train loss 0.466
train loss 0.462
train loss 0.458
test loss 0.449 and accuracy 0.864


In [85]:
train_loop(model, epochs=20, lr=0.001, wd=0.0)
train_loop(model, epochs=20, lr=0.001, wd=0.0)

train loss 0.454
train loss 0.448
train loss 0.444
train loss 0.440
test loss 0.431 and accuracy 0.869
train loss 0.436
train loss 0.431
train loss 0.427
train loss 0.423
test loss 0.414 and accuracy 0.875


# References
This notebook is a modified version of this tutorial
http://pytorch.org/tutorials/intermediate/char_rnn_classification_tutorial.html