# Classifing last names with character-level RNN

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
import pandas as pd
import numpy as np
from pathlib import Path
import torch
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import random

## Dataset
`wget https://github.com/hunkim/PyTorchZeroToAll/blob/master/data/names_train.csv.gz`

`wget https://github.com/hunkim/PyTorchZeroToAll/blob/master/data/names_test.csv.gz`

In [8]:
def unpack_dataset():
    ! mkdir -p data
    ! wget -O names_test.csv.gz https://github.com/hunkim/PyTorchZeroToAll/blob/master/data/names_test.csv.gz?raw=true
    ! wget -O names_train.csv.gz https://github.com/hunkim/PyTorchZeroToAll/blob/master/data/names_train.csv.gz?raw=true
    ! gunzip *.gz
    ! mv names_test.csv names_train.csv data/

In [9]:
#unpack_dataset()

--2019-09-23 11:07:56--  https://github.com/hunkim/PyTorchZeroToAll/blob/master/data/names_test.csv.gz?raw=true
Resolving github.com (github.com)... 192.30.255.112
Connecting to github.com (github.com)|192.30.255.112|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://github.com/hunkim/PyTorchZeroToAll/raw/master/data/names_test.csv.gz [following]
--2019-09-23 11:07:56--  https://github.com/hunkim/PyTorchZeroToAll/raw/master/data/names_test.csv.gz
Reusing existing connection to github.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/hunkim/PyTorchZeroToAll/master/data/names_test.csv.gz [following]
--2019-09-23 11:07:57--  https://raw.githubusercontent.com/hunkim/PyTorchZeroToAll/master/data/names_test.csv.gz
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.24.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.24.133|:443... connected.
HTTP requ

In [10]:
PATH = Path("data")
list(PATH.iterdir())

[PosixPath('data/glove.6B.300d.txt'),
 PosixPath('data/glove.6B.100d.txt'),
 PosixPath('data/names_train.csv'),
 PosixPath('data/names_test.csv'),
 PosixPath('data/glove.6B.50d.txt'),
 PosixPath('data/plot.tok.gt9.5000'),
 PosixPath('data/subjdata.README.1.0'),
 PosixPath('data/quote.tok.gt9.5000'),
 PosixPath('data/glove.6B.200d.txt'),
 PosixPath('data/glove.6B.zip')]

In [11]:
! head data/names_train.csv

"Adsit","Czech"
"Ajdrna","Czech"
"Antonowitsch","Czech"
"Antonowitz","Czech"
"Ballalatak","Czech"
"Ballaltick","Czech"
"Bastl","Czech"
"Baroch","Czech"
"Betlach","Czech"
"Biganska","Czech"


### Processing data

In [12]:
df = pd.read_csv(PATH/"names_train.csv", header=None)

In [13]:
# getting a vocabulary of characters
letters = [list(l) for l in df[0].values]
vocab = sorted(list(set(np.concatenate(np.array(letters)))))
vocab[:10]

[' ', "'", ',', 'A', 'B', 'C', 'D', 'E', 'F', 'G']

In [14]:
vocab2id = {key:i for i, key in enumerate(vocab)}
vocab2id[" "] # I am going to use 0 to pad sequences

0

In [15]:
labels = sorted(df[1].unique())
label2id = {key:i for i, key in enumerate(labels)}
label2id

{'Arabic': 0,
 'Chinese': 1,
 'Czech': 2,
 'Dutch': 3,
 'English': 4,
 'French': 5,
 'German': 6,
 'Greek': 7,
 'Irish': 8,
 'Italian': 9,
 'Japanese': 10,
 'Korean': 11,
 'Polish': 12,
 'Portuguese': 13,
 'Russian': 14,
 'Scottish': 15,
 'Spanish': 16,
 'Vietnamese': 17}

In [16]:
def pad_seq(x, seq_len=15, vocab2id=vocab2id):
    x = list(x)
    x = np.array([vocab2id[k] for k in x])
    z = np.zeros(seq_len, dtype=np.int32)
    n = min(seq_len, x.shape[0])
    z[seq_len - n:] = x[0:n]
    return z

In [17]:
x = pad_seq("aabbb")
x

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 29, 29, 30, 30, 30],
      dtype=int32)

In [18]:
# one hot encoding
def seq2matrix(x, vocab_len=55):
    z = np.zeros((x.shape[0], vocab_len))
    z[np.arange(len(x)), x] = 1
    return z

In [19]:
class NameDataset(Dataset):
    def __init__(self, path, vocab2id, label2id, seq_len=15, vocab_len=55):
        self.df = pd.read_csv(path, header=None)
        self.label2id = label2id
        self.vocab2id = vocab2id
        self.seq_len = seq_len
        self.vocab_len = vocab_len 
        self.x = df[0].values
        self.y = [self.label2id[l] for l in df[1].values]
        self.vocab2id = vocab2id
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        x = pad_seq(self.x[idx], self.seq_len, self.vocab2id)
        x = seq2matrix(x, self.vocab_len)
        return x, self.y[idx]

In [20]:
train_ds = NameDataset(PATH/"names_train.csv", vocab2id, label2id)
valid_ds = NameDataset(PATH/"names_test.csv", vocab2id, label2id)

In [21]:
batch_size = 2000
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=len(valid_ds))

In [23]:
len(train_ds), len(valid_ds)

(13374, 13374)

In [26]:
x, y = train_ds[0]
print(x.shape, y)

(15, 55) 2


In [27]:
x

array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0

## Model

In [28]:
class CharRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(CharRNN, self).__init__()

        self.hidden_size = hidden_size
        self.linear_i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.linear_h2o = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden):
        combined = torch.cat((x, hidden), 1)
        hidden = torch.tanh(self.linear_i2h(combined))
        output = self.linear_h2o(hidden)
        return output, hidden

    def initHidden(self, bash_size):
        return torch.zeros(bash_size, self.hidden_size)

## Debugging model

In [30]:
vocab_size = 55
hidden_size = 100
n_classes = 18
model = CharRNN(vocab_size, hidden_size, n_classes) #.cuda()

In [31]:
x, y = next(iter(train_dl))

In [32]:
x.shape, y.shape

(torch.Size([2000, 15, 55]), torch.Size([2000]))

In [34]:
batch = x.shape[0]
h = model.initHidden(batch) #.cuda()
x = x.float() #.cuda()
y = y.long() #.cuda()

In [35]:
torch.cat((x[:,0], h), 1).size()

torch.Size([2000, 155])

In [36]:
for ei in range(x.shape[1]):
    y_t, h = model(x[:,ei], h)

In [37]:
# note that just the last x_t is used in the loss
# update
loss = F.cross_entropy(y_t, y)
loss.item()

2.8595266342163086

## Training

In [38]:
vocab_size = 55
hidden_size = 100
n_classes = 18
model = CharRNN(vocab_size, hidden_size, n_classes) #.cuda()

In [39]:
def get_optimizer(model, lr = 0.01, wd = 0.00001):
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optim = torch.optim.Adam(parameters, lr=lr, weight_decay=wd)
    return optim

In [40]:
def train(model, optim, train_dl):
    model.train()
    total = 0
    sum_loss = 0
    for x, y in train_dl:
        batch = x.shape[0]
        h = model.initHidden(batch) #.cuda()
        x = x.float() #.cuda()
        y = y.long()  #.cuda()
        
        for t in range(x.shape[1]):
            out, h = model(x[:,t], h)
        
        loss = F.cross_entropy(out, y)
        optim.zero_grad()
        loss.backward()
        optim.step()
        total += batch
        sum_loss += batch*(loss.item())
    return sum_loss/total

In [65]:
def val_metric(model, valid_dl):
    model.eval()
    x, y = next(iter(valid_dl)) # just one batch
    x = x.float() #x.cuda()
    y = y.long()  # y.cuda()
    N = x.shape[0]
    h = model.initHidden(N) # .cuda()
    for t in range(x.shape[1]):
        out, h = model(x[:,t], h)
    loss = F.cross_entropy(out, y)
    _, pred = torch.max(out, 1)
    acc = pred.eq(y).sum().float()/N
    return loss.item(), acc.item()

In [42]:
vocab_size = 55
hidden_size = 80
n_classes = 18

In [49]:
def train_loop(model, lr, train_dl, valid_dl, epochs=20):
    optim = get_optimizer(model, lr =lr, wd = 0.0)
    for i in range(epochs):
        loss = train(model, optim, train_dl)
        val_loss, val_acc = val_metric(model, valid_dl)
        if i%5 == 1: print("train loss %.3f val loss %.3f and val accuracy %.3f" % (loss, val_loss, val_acc))

In [50]:
model = CharRNN(vocab_size, hidden_size, n_classes) #.cuda()

In [51]:
train_loop(model, 0.01, train_dl, valid_dl, epochs=20)

train loss 1.841 val loss 1.788 and val accuracy 0.469
train loss 1.332 val loss 1.268 and val accuracy 0.611
train loss 1.048 val loss 1.023 and val accuracy 0.700
train loss 0.879 val loss 0.854 and val accuracy 0.747


In [52]:
train_loop(model, 0.001, train_dl, valid_dl, epochs=20)

train loss 0.796 val loss 0.789 and val accuracy 0.767
train loss 0.768 val loss 0.766 and val accuracy 0.771
train loss 0.746 val loss 0.743 and val accuracy 0.778
train loss 0.728 val loss 0.725 and val accuracy 0.785


In [53]:
train_loop(model, 0.001, train_dl, valid_dl, epochs=20)

train loss 0.715 val loss 0.711 and val accuracy 0.788
train loss 0.697 val loss 0.694 and val accuracy 0.793
train loss 0.684 val loss 0.681 and val accuracy 0.796
train loss 0.671 val loss 0.667 and val accuracy 0.799


# Model with character embeddings 

In [54]:
class NameDatasetEmb(Dataset):
    def __init__(self, path, vocab2id, label2id, seq_len=15, vocab_len=55):
        self.df = pd.read_csv(path, header=None)
        self.label2id = label2id
        self.vocab2id = vocab2id
        self.seq_len = seq_len
        self.vocab_len = vocab_len 
        self.x = df[0].values
        self.y = [self.label2id[l] for l in df[1].values]
        self.vocab2id = vocab2id
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        x = pad_seq(self.x[idx], self.seq_len, self.vocab2id)
        return x, self.y[idx]

In [55]:
train_ds_2 = NameDatasetEmb(PATH/"names_train.csv", vocab2id, label2id)
valid_ds_2 = NameDatasetEmb(PATH/"names_test.csv", vocab2id, label2id)

In [56]:
batch_size = 2000
n = len(valid_ds_2)
train_dl_2 = DataLoader(train_ds_2, batch_size=batch_size)
valid_dl_2 = DataLoader(valid_ds_2, batch_size=n)

In [57]:
train_ds_2[0]

(array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  3, 32, 47, 37, 48],
       dtype=int32), 2)

In [62]:
class CharEmbRNN(nn.Module):
    def __init__(self, vocab_size, emb_size, hidden_size, output_size):
        super(CharEmbRNN, self).__init__()
        self.emb = nn.Embedding(vocab_size, emb_size)
        self.hidden_size = hidden_size
        self.linear_i2h = nn.Linear(emb_size + hidden_size, hidden_size)
        self.linear_h2o = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden):
        x = x.long() # this could be in the training loop
        x = self.emb(x)
        combined = torch.cat((x, hidden), 1)
        hidden = torch.tanh(self.linear_i2h(combined))
        output = self.linear_h2o(hidden)
        return output, hidden

    def initHidden(self, bash_size):
        return torch.zeros(bash_size, self.hidden_size)

## Train 

In [66]:
vocab_size = 55
emb_size = 30
hidden_size = 80
n_classes = 18
model = CharEmbRNN(vocab_size, emb_size, hidden_size, n_classes) #.cuda()

In [67]:
train_loop(model, 0.01, train_dl_2, valid_dl_2, epochs=50)

train loss 4.425 val loss 2.188 and val accuracy 0.393
train loss 1.477 val loss 1.371 and val accuracy 0.570
train loss 1.296 val loss 1.208 and val accuracy 0.628
train loss 1.195 val loss 1.115 and val accuracy 0.647
train loss 1.120 val loss 1.041 and val accuracy 0.672
train loss 1.057 val loss 0.979 and val accuracy 0.690
train loss 0.998 val loss 0.924 and val accuracy 0.713
train loss 0.955 val loss 0.874 and val accuracy 0.736
train loss 0.920 val loss 0.929 and val accuracy 0.710
train loss 0.862 val loss 0.805 and val accuracy 0.760


In [69]:
train_loop(model, 0.01, train_dl_2, valid_dl_2, epochs=50)

train loss 1.050 val loss 0.953 and val accuracy 0.711
train loss 0.839 val loss 0.785 and val accuracy 0.766
train loss 0.770 val loss 0.720 and val accuracy 0.782
train loss 0.739 val loss 0.704 and val accuracy 0.788
train loss 0.723 val loss 0.676 and val accuracy 0.794
train loss 0.670 val loss 0.627 and val accuracy 0.810
train loss 0.649 val loss 0.598 and val accuracy 0.815
train loss 0.622 val loss 0.583 and val accuracy 0.823
train loss 0.600 val loss 0.553 and val accuracy 0.829
train loss 0.564 val loss 0.526 and val accuracy 0.837


In [70]:
train_loop(model, 0.01, train_dl_2, valid_dl_2, epochs=40)

train loss 0.659 val loss 0.715 and val accuracy 0.771
train loss 0.557 val loss 0.520 and val accuracy 0.838
train loss 0.511 val loss 0.475 and val accuracy 0.853
train loss 0.487 val loss 0.449 and val accuracy 0.860
train loss 0.471 val loss 0.427 and val accuracy 0.867
train loss 0.460 val loss 0.417 and val accuracy 0.871
train loss 0.440 val loss 0.405 and val accuracy 0.877
train loss 0.425 val loss 0.388 and val accuracy 0.879


In [71]:
train_loop(model, 0.001, train_dl_2, valid_dl_2, epochs=50)

train loss 0.357 val loss 0.353 and val accuracy 0.894
train loss 0.350 val loss 0.347 and val accuracy 0.896
train loss 0.347 val loss 0.343 and val accuracy 0.897
train loss 0.343 val loss 0.339 and val accuracy 0.899
train loss 0.339 val loss 0.335 and val accuracy 0.900
train loss 0.335 val loss 0.332 and val accuracy 0.901
train loss 0.332 val loss 0.328 and val accuracy 0.902
train loss 0.328 val loss 0.325 and val accuracy 0.903
train loss 0.325 val loss 0.321 and val accuracy 0.904
train loss 0.322 val loss 0.318 and val accuracy 0.905


## Lab
* Add dropout to the lastest model. 
* Change some of the hyper-parameters.
* Play with different learning rates.

# References
This notebook is a modified version of this tutorial
http://pytorch.org/tutorials/intermediate/char_rnn_classification_tutorial.html. Here I implement vanilla RNNs.