# Classifing last names with character-level RNN

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
import pandas as pd
import numpy as np
from pathlib import Path
import torch
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import random

## Dataset
`https://github.com/hunkim/PyTorchZeroToAll/blob/master/data/names_train.csv.gz`

`https://github.com/hunkim/PyTorchZeroToAll/blob/master/data/names_test.csv.gz`

In [2]:
def unpack_dataset():
    ! wget https://raw.githubusercontent.com/hunkim/PyTorchZeroToAll/master/data/names_train.csv.gz 
    ! wget https://raw.githubusercontent.com/hunkim/PyTorchZeroToAll/master/data/names_test.csv.gz 
    ! mkdir -p data
    ! gunzip names_train.csv.gz 
    ! gunzip names_test.csv.gz
    ! mv names*.csv data

In [3]:
#unpack_dataset()

In [4]:
PATH = Path("data")
list(PATH.iterdir())

[PosixPath('data/glove.6B.300d.txt'),
 PosixPath('data/glove.6B.100d.txt'),
 PosixPath('data/names_train.csv'),
 PosixPath('data/names_test.csv'),
 PosixPath('data/glove.6B.50d.txt'),
 PosixPath('data/plot.tok.gt9.5000'),
 PosixPath('data/subjdata.README.1.0'),
 PosixPath('data/quote.tok.gt9.5000'),
 PosixPath('data/hour.csv'),
 PosixPath('data/glove.6B.200d.txt'),
 PosixPath('data/Readme.txt'),
 PosixPath('data/train.csv'),
 PosixPath('data/day.csv'),
 PosixPath('data/glove.6B.zip'),
 PosixPath('data/train.csv.zip')]

In [5]:
! head data/names_train.csv

"Adsit","Czech"
"Ajdrna","Czech"
"Antonowitsch","Czech"
"Antonowitz","Czech"
"Ballalatak","Czech"
"Ballaltick","Czech"
"Bastl","Czech"
"Baroch","Czech"
"Betlach","Czech"
"Biganska","Czech"


### Processing data

In [6]:
df = pd.read_csv(PATH/"names_train.csv", header=None)

In [7]:
# getting a vocabulary of characters
letters = [list(l) for l in df[0].values]
vocab = sorted(list(set(np.concatenate(np.array(letters)))))
vocab[:10]

[' ', "'", ',', 'A', 'B', 'C', 'D', 'E', 'F', 'G']

In [8]:
vocab2id = {key:i for i, key in enumerate(vocab)}
vocab2id[" "] # I am going to use 0 to pad sequences

0

In [9]:
labels = sorted(df[1].unique())
label2id = {key:i for i, key in enumerate(labels)}
label2id

{'Arabic': 0,
 'Chinese': 1,
 'Czech': 2,
 'Dutch': 3,
 'English': 4,
 'French': 5,
 'German': 6,
 'Greek': 7,
 'Irish': 8,
 'Italian': 9,
 'Japanese': 10,
 'Korean': 11,
 'Polish': 12,
 'Portuguese': 13,
 'Russian': 14,
 'Scottish': 15,
 'Spanish': 16,
 'Vietnamese': 17}

In [10]:
def pad_seq(x, seq_len=15, vocab2id=vocab2id):
    x = list(x)
    x = np.array([vocab2id[k] for k in x])
    z = np.zeros(seq_len, dtype=np.int32)
    n = min(seq_len, x.shape[0])
    z[seq_len - n:] = x[0:n]
    return z

In [11]:
x = pad_seq("aabbb")
x

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 29, 29, 30, 30, 30],
      dtype=int32)

In [12]:
# one hot encoding
def seq2matrix(x, vocab_len=55):
    z = np.zeros((x.shape[0], vocab_len))
    z[np.arange(len(x)), x] = 1
    return z

In [13]:
class NameDataset(Dataset):
    def __init__(self, path, vocab2id, label2id, seq_len=15, vocab_len=55):
        self.df = pd.read_csv(path, header=None)
        self.label2id = label2id
        self.vocab2id = vocab2id
        self.seq_len = seq_len
        self.vocab_len = vocab_len 
        self.x = df[0].values
        self.y = [self.label2id[l] for l in df[1].values]
        self.vocab2id = vocab2id
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        x = pad_seq(self.x[idx], self.seq_len, self.vocab2id)
        x = seq2matrix(x, self.vocab_len)
        return x, self.y[idx]

In [14]:
train = NameDataset(PATH/"names_train.csv", vocab2id, label2id)
val = NameDataset(PATH/"names_test.csv", vocab2id, label2id)

In [15]:
batch_size = 2000
n=len(val)
train_dl = DataLoader(train, batch_size=batch_size)
val_dl = DataLoader(val, batch_size=n)

In [16]:
len(train), len(val)

(13374, 13374)

In [17]:
x,y = train[0]
print(x.shape,y)

(15, 55) 2


## Model with one-hot encoding input

In [18]:
class CharRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(CharRNN, self).__init__()

        self.hidden_size = hidden_size
        self.linear_i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.linear_h2o = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden):
        combined = torch.cat((x, hidden), 1)
        hidden = torch.tanh(self.linear_i2h(combined))
        output = self.linear_h2o(hidden)
        return output, hidden

    def initHidden(self, bash_size):
        return torch.zeros(bash_size, self.hidden_size)

## Debugging model

In [19]:
vocab_size = 55
hidden_size = 100
n_classes = 18
model = CharRNN(vocab_size, hidden_size, n_classes)

In [20]:
x, y = next(iter(train_dl))

In [21]:
x.shape, y.shape

(torch.Size([2000, 15, 55]), torch.Size([2000]))

In [22]:
batch = x.shape[0]
h = model.initHidden(batch)
x = x.float()
y = y.long()

In [23]:
torch.cat((x[:,0], h), 1).size()

torch.Size([2000, 155])

In [24]:
for ei in range(x.shape[1]):
    y_t, h = model(x[:,ei], h)

In [25]:
# note that just the last x_t is used in the loss
# update
loss = F.cross_entropy(y_t, y)
loss.item()

2.898435354232788

## Training

In [26]:
vocab_size = 55
hidden_size = 100
n_classes = 18
model = CharRNN(vocab_size, hidden_size, n_classes)

In [27]:
def get_optimizer(model, lr = 0.01, wd = 0.00001):
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optim = torch.optim.Adam(parameters, lr=lr, weight_decay=wd)
    return optim

In [28]:
def train(model, optim, train_dl):
    model.train()
    total = 0
    sum_loss = 0
    for x, y in train_dl:
        batch = x.shape[0]
        h = model.initHidden(batch)
        loss = 0
        x = x.float()
        y = y.long()
        
        for t in range(x.shape[1]):
            out, h = model(x[:,t], h)
        
        loss = F.cross_entropy(out, y)
        optim.zero_grad()
        loss.backward()
        optim.step()
        total += batch
        sum_loss += batch*(loss.item())
    return sum_loss/total

In [29]:
def val_metric(model, val_dl):
    model.eval()
    x, y = next(iter(val_dl))
    x = x.float()
    y = y.long()
    N = x.shape[0]
    h = model.initHidden(N)
    for t in range(x.shape[1]):
        out, h = model(x[:,t], h)
    loss = F.cross_entropy(out, y)
    _, pred = torch.max(out, 1)
    acc = pred.eq(y).sum().float()/N
    return loss.item(), acc.item()

In [30]:
vocab_size = 55
hidden_size = 80
n_classes = 18

In [31]:
def train_loop(model, lr, train_dl, val_dl, epochs=20):
    optim = get_optimizer(model, lr =lr, wd = 0.0)
    for i in range(epochs):
        loss = train(model, optim, train_dl)
        val_loss, val_acc = val_metric(model, val_dl)
        if i%5 == 1: print("train loss %.3f val loss %.3f and val accuracy %.3f" % (loss, val_loss, val_acc))

In [32]:
model = CharRNN(vocab_size, hidden_size, n_classes)

In [33]:
train_loop(model, 0.01, train_dl, val_dl, epochs=20)

train loss 2.117 val loss 1.857 and val accuracy 0.469
train loss 1.871 val loss 1.740 and val accuracy 0.469
train loss 1.617 val loss 1.502 and val accuracy 0.521
train loss 1.407 val loss 1.305 and val accuracy 0.593


In [34]:
train_loop(model, 0.001, train_dl, val_dl, epochs=20)

train loss 1.416 val loss 1.245 and val accuracy 0.608
train loss 1.210 val loss 1.193 and val accuracy 0.648
train loss 1.174 val loss 1.162 and val accuracy 0.656
train loss 1.149 val loss 1.137 and val accuracy 0.662


In [35]:
train_loop(model, 0.001, train_dl, val_dl, epochs=20)

train loss 1.197 val loss 1.142 and val accuracy 0.651
train loss 1.116 val loss 1.107 and val accuracy 0.670
train loss 1.093 val loss 1.081 and val accuracy 0.680
train loss 1.074 val loss 1.059 and val accuracy 0.686


# Model with character embeddings 

In [36]:
class NameDatasetEmb(Dataset):
    def __init__(self, path, vocab2id, label2id, seq_len=15, vocab_len=55):
        self.df = pd.read_csv(path, header=None)
        self.label2id = label2id
        self.vocab2id = vocab2id
        self.seq_len = seq_len
        self.vocab_len = vocab_len 
        self.x = df[0].values
        self.y = [self.label2id[l] for l in df[1].values]
        self.vocab2id = vocab2id
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        x = pad_seq(self.x[idx], self.seq_len, self.vocab2id)
        return x, self.y[idx]

In [37]:
train_2 = NameDatasetEmb(PATH/"names_train.csv", vocab2id, label2id)
val_2 = NameDatasetEmb(PATH/"names_test.csv", vocab2id, label2id)

In [38]:
batch_size = 2000
n = len(val_2)
train_dl_2 = DataLoader(train_2, batch_size=batch_size)
val_dl_2 = DataLoader(val_2, batch_size=n)

In [39]:
train_2[0]

(array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  3, 32, 47, 37, 48],
       dtype=int32),
 2)

In [40]:
class CharEmbRNN(nn.Module):
    def __init__(self, vocab_size, emb_size, hidden_size, output_size):
        super(CharEmbRNN, self).__init__()
        self.emb = nn.Embedding(vocab_size, emb_size)
        self.hidden_size = hidden_size
        self.linear_i2h = nn.Linear(emb_size + hidden_size, hidden_size)
        self.linear_h2o = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden):
        x = self.emb(x.long())
        combined = torch.cat((x, hidden), 1)
        hidden = torch.tanh(self.linear_i2h(combined))
        output = self.linear_h2o(hidden)
        return output, hidden

    def initHidden(self, bash_size):
        return torch.zeros(bash_size, self.hidden_size)

## Train 

In [42]:
vocab_size = 55
emb_size = 30
hidden_size = 80
n_classes = 18
model = CharEmbRNN(vocab_size, emb_size, hidden_size, n_classes)

In [43]:
train_loop(model, 0.01, train_dl_2, val_dl_2, epochs=50)

train loss 2.655 val loss 2.008 and val accuracy 0.437
train loss 2.554 val loss 1.768 and val accuracy 0.471
train loss 1.408 val loss 1.336 and val accuracy 0.580
train loss 1.302 val loss 1.210 and val accuracy 0.621
train loss 1.208 val loss 1.118 and val accuracy 0.646
train loss 1.134 val loss 1.049 and val accuracy 0.670
train loss 1.076 val loss 0.994 and val accuracy 0.686
train loss 1.027 val loss 0.949 and val accuracy 0.704
train loss 0.985 val loss 0.910 and val accuracy 0.719
train loss 0.950 val loss 0.879 and val accuracy 0.728


In [44]:
train_loop(model, 0.01, train_dl_2, val_dl_2, epochs=50)

train loss 1.075 val loss 1.098 and val accuracy 0.664
train loss 0.931 val loss 0.872 and val accuracy 0.732
train loss 0.885 val loss 0.814 and val accuracy 0.752
train loss 0.865 val loss 0.832 and val accuracy 0.738
train loss 0.808 val loss 0.759 and val accuracy 0.769
train loss 0.771 val loss 0.721 and val accuracy 0.784
train loss 0.752 val loss 0.692 and val accuracy 0.789
train loss 0.741 val loss 0.724 and val accuracy 0.778
train loss 0.739 val loss 0.712 and val accuracy 0.782
train loss 0.684 val loss 0.636 and val accuracy 0.809


In [45]:
train_loop(model, 0.01, train_dl_2, val_dl_2, epochs=20)

train loss 0.858 val loss 0.805 and val accuracy 0.759
train loss 0.668 val loss 0.624 and val accuracy 0.809
train loss 0.633 val loss 0.589 and val accuracy 0.821
train loss 0.611 val loss 0.564 and val accuracy 0.827


In [46]:
train_loop(model, 0.01, train_dl_2, val_dl_2, epochs=20)

train loss 0.681 val loss 0.746 and val accuracy 0.767
train loss 0.592 val loss 0.556 and val accuracy 0.828
train loss 0.566 val loss 0.523 and val accuracy 0.838
train loss 0.554 val loss 0.508 and val accuracy 0.842


In [47]:
train_loop(model, 0.001, train_dl_2, val_dl_2, epochs=50)

train loss 0.498 val loss 0.491 and val accuracy 0.851
train loss 0.490 val loss 0.486 and val accuracy 0.852
train loss 0.487 val loss 0.483 and val accuracy 0.854
train loss 0.484 val loss 0.480 and val accuracy 0.855
train loss 0.481 val loss 0.476 and val accuracy 0.855
train loss 0.478 val loss 0.473 and val accuracy 0.856
train loss 0.475 val loss 0.470 and val accuracy 0.857
train loss 0.472 val loss 0.467 and val accuracy 0.857
train loss 0.469 val loss 0.464 and val accuracy 0.858
train loss 0.466 val loss 0.461 and val accuracy 0.859


# References
This notebook is a modified version of this tutorial
http://pytorch.org/tutorials/intermediate/char_rnn_classification_tutorial.html. Here I implement vanilla RNNs.