# Classifing last names with character-level RNN

In [19]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
import pandas as pd
import numpy as np
from pathlib import Path
import torch
from torch.utils.data import Dataset, DataLoader
from torch.autograd import Variable
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import random

## Dataset
`wget https://github.com/hunkim/PyTorchZeroToAll/blob/master/data/names_train.csv.gz`

`wget https://github.com/hunkim/PyTorchZeroToAll/blob/master/data/names_test.csv.gz`

In [2]:
PATH = Path("/data2/yinterian/name_dataset/")
list(PATH.iterdir())

[PosixPath('/data2/yinterian/name_dataset/names_test.csv'),
 PosixPath('/data2/yinterian/name_dataset/names_train.csv.gz'),
 PosixPath('/data2/yinterian/name_dataset/names_train.csv'),
 PosixPath('/data2/yinterian/name_dataset/names_test.csv.gz')]

In [3]:
! head /data2/yinterian/name_dataset/names_train.csv

"Adsit","Czech"
"Ajdrna","Czech"
"Antonowitsch","Czech"
"Antonowitz","Czech"
"Ballalatak","Czech"
"Ballaltick","Czech"
"Bastl","Czech"
"Baroch","Czech"
"Betlach","Czech"
"Biganska","Czech"


### Processing data

In [20]:
from sklearn.preprocessing import OneHotEncoder
df = pd.read_csv(PATH/"names_train.csv", header=None)

In [21]:
# getting a vocabulary of characters
letters = [list(l) for l in df[0].values]
vocab = sorted(list(set(np.concatenate(np.array(letters)))))
vocab[:10]
len(vocab)

55

In [22]:
vocab2id = {key:i for i, key in enumerate(vocab)}
vocab2id[" "] # I am going to use 0 to pad sequences

0

In [57]:
labels = sorted(df[1].unique())
label2id = {key:i for i, key in enumerate(labels)}
len(labels)

18

In [24]:
y = np.zeros(seq_length)
y

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [42]:
def pad_seq(x, seq_len=15, vocab2id=vocab2id):
    x = list(x)
    x = np.array([vocab2id[k] for k in x])
    z = np.zeros(seq_length, dtype=np.int8)
    n = min(seq_len, x.shape[0])
    z[seq_len - n:] = x[0:n]
    return z

In [48]:
x = pad_seq("aabbb")
x

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 29, 29, 30, 30, 30],
      dtype=int8)

In [86]:
def seq2matrix(x, vocab_len=55):
    z = np.zeros((x.shape[0], vocab_len))
    z[np.arange(len(x)), x] = 1
    return z

In [135]:
class NameDataset(Dataset):
    def __init__(self, path, vocab2id, label2id, seq_len=15, vocab_len=55):
        self.df = pd.read_csv(path, header=None)
        self.label2id = label2id
        self.vocab2id = vocab2id
        self.seq_len = seq_len
        self.vocab_len = vocab_len 
        self.x = df[0].values
        self.y = [self.label2id[l] for l in df[1].values]
        self.vocab2id = vocab2id
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        x = pad_seq(self.x[idx], self.seq_len, self.vocab2id)
        x = seq2matrix(x, self.vocab_len)
        return x, self.y[idx]

In [176]:
train = NameDataset(PATH/"names_train.csv", vocab2id, label2id)
test = NameDataset(PATH/"names_test.csv", vocab2id, label2id)

In [177]:
batch_size = 2000
n = len(test)
train_dl = DataLoader(train, batch_size=batch_size)
test_dl = DataLoader(test, batch_size=n)

In [178]:
len(train)

13374

In [90]:
x,y = train[0]
print(x.shape,y)

(15, 55) 2


## Model

In [179]:
class CharRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(CharRNN, self).__init__()

        self.hidden_size = hidden_size
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x, hidden):
        combined = torch.cat((x, hidden), 1)
        hidden = self.i2h(combined)
        output = self.i2o(combined)
        output = self.softmax(output)
        return output, hidden

    def initHidden(self, bash_size):
        return Variable(torch.zeros(bash_size, self.hidden_size))

## Debugging model

In [180]:
vocab_size = 55
hidden_size = 100
n_classes = 18
model = CharRNN(vocab_size, hidden_size, n_classes).cuda()

In [181]:
x, y = next(iter(train_dl))

In [182]:
x.shape, y.shape

(torch.Size([2000, 15, 55]), torch.Size([2000]))

In [183]:
batch = x.shape[0]
h = model.initHidden(batch).cuda()
x = Variable(x).cuda().float()
y = Variable(y).cuda().long()

In [184]:
torch.cat((x[:,0], h), 1).size()

torch.Size([2000, 155])

In [185]:
for ei in range(x.shape[1]):
    x_t, h = model(x[:,ei], h)

In [108]:
loss = F.cross_entropy(x_t, y)
loss

Variable containing:
 2.8572
[torch.cuda.FloatTensor of size 1 (GPU 0)]

## Training

In [186]:
vocab_size = 55
hidden_size = 100
n_classes = 18
model = CharRNN(vocab_size, hidden_size, n_classes).cuda()

In [187]:
def get_optimizer(model, lr = 0.01, wd = 0.00001):
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optim = torch.optim.Adam(parameters, lr=lr, weight_decay=wd)
    return optim

In [212]:
def train(model, optim):
    model.train()
    total = 0
    sum_loss = 0
    for i, (x, y) in enumerate(train_dl):
        batch = x.shape[0]
        h = model.initHidden(batch).cuda()
        loss = 0
        x = Variable(x).cuda().float()
        y = Variable(y).cuda().long()
        
        for ei in range(x.shape[1]):
            out, h = model(x[:,ei], h)
        
        loss = F.nll_loss(out, y)
        
        optim.zero_grad()
        loss.backward(retain_graph=True)
        optim.step()
        total += batch
        sum_loss += batch*(loss.data[0])
    return sum_loss/total

In [189]:
def predict(model, valid_dl):
    model.eval()
    res = []
    x, y = next(iter(valid_dl))
    x = Variable(x).cuda().float()
    y = Variable(y).cuda().long()
    N = x.shape[0]
    h = model.initHidden(N).cuda()
    for ei in range(x.shape[1]):
        out, h = model(x[:,ei], h)
    loss = F.nll_loss(out, y)
    _, pred = torch.max(out.data, 1)
    correct = pred.eq(y.data).cpu().sum()
    print("loss and accuracy", loss.data[0], correct/N)
    return loss.data[0], correct/N 

In [221]:
vocab_size = 55
hidden_size = 80
n_classes = 18
model = CharRNN(vocab_size, hidden_size, n_classes).cuda()
optim = get_optimizer(model, lr =0.01, wd = 0.0)

In [222]:
for i in range(20):
    loss = train(model, optim)
    if i%5 == 1: print("loss ", loss)

loss  2.6379703053519634
loss  3.079397971237545
loss  4.699799905358827
loss  1.4068568627075246


In [223]:
predict(model, test_dl)

loss and accuracy 1.306892991065979 0.598175564528189


(1.306892991065979, 0.598175564528189)

In [224]:
optim = get_optimizer(model, lr =0.005, wd = 0.0)

In [225]:
for i in range(20):
    loss = train(model, optim)
    if i%5 == 1: print("loss ", loss)

loss  1.3531686674591243
loss  1.2070527767339985
loss  1.1234513457195452
loss  1.0495076240116266


In [227]:
predict(model, test_dl)

loss and accuracy 0.9648178219795227 0.7126514131897712


(0.9648178219795227, 0.7126514131897712)

In [228]:
optim = get_optimizer(model, lr =0.001, wd = 0.0)

In [229]:
for i in range(20):
    loss = train(model, optim)
    if i%5 == 1: print("loss ", loss)

loss  0.9698967509015145
loss  0.9602218927851489
loss  0.9478184662004998
loss  0.9370886844531185


In [230]:
predict(model, test_dl)

loss and accuracy 0.9145036935806274 0.7240915208613729


(0.9145036935806274, 0.7240915208613729)

## Model with word embeddings 

# References
This notebook is a modified version of this tutorial
http://pytorch.org/tutorials/intermediate/char_rnn_classification_tutorial.html