## Classifying surnames
with multi-class logistic regression and bag of letters

In [76]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
import pandas as pd
import numpy as np
from pathlib import Path
import torch
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import random

In [77]:
def unpack_dataset():
    """Download the train/test surname CSVs and place them under ./data.

    Fetches names_train.csv.gz and names_test.csv.gz from the
    PyTorchZeroToAll GitHub repository with wget, creates the data
    directory if needed, decompresses both archives with gunzip and moves
    the resulting names*.csv files into data.

    Every step is an IPython `!` shell escape, so this only runs inside
    IPython/Jupyter and needs wget, gunzip, mkdir and mv on the PATH.
    Files are downloaded into the current working directory first.
    """
    ! wget https://raw.githubusercontent.com/hunkim/PyTorchZeroToAll/master/data/names_train.csv.gz 
    ! wget https://raw.githubusercontent.com/hunkim/PyTorchZeroToAll/master/data/names_test.csv.gz 
    ! mkdir -p data
    ! gunzip names_train.csv.gz 
    ! gunzip names_test.csv.gz
    ! mv names*.csv data

In [78]:
#unpack_dataset()  # uncomment and run once to download the CSVs into ./data

In [79]:
PATH = Path("data")
list(PATH.iterdir())

[PosixPath('data/names_train.csv'), PosixPath('data/names_test.csv')]

## Processing data
Here we split every last name into letters and assign every letter an id. We represent a last name by a vector of letter frequencies.

In [80]:
df = pd.read_csv(PATH/"names_train.csv", header=None)

In [81]:
val = pd.read_csv(PATH/"names_test.csv", header=None)

In [82]:
df.head()

Unnamed: 0,0,1
0,Adsit,Czech
1,Ajdrna,Czech
2,Antonowitsch,Czech
3,Antonowitz,Czech
4,Ballalatak,Czech


In [85]:
## vocab is the sorted list of unique letters seen in the training names
letters = [list(l) for l in df[0].values]
# Collect the characters directly instead of going through np.array(letters):
# the per-name lists have different lengths, and building an array from such
# a ragged list raises ValueError on NumPy >= 1.24.
vocab = sorted({c for name in letters for c in name})
vocab[:5]

[' ', "'", ',', 'A', 'B']

In [87]:
## vocab2id maps every letter in vocab to its position in vocab (a unique id)
vocab2id = dict(zip(vocab, range(len(vocab))))
#vocab2id

In [88]:
## label2id maps every class name to an integer id, numbered in sorted order
labels = sorted(df[1].unique())
label2id = dict(zip(labels, range(len(labels))))
label2id

{'Arabic': 0,
 'Chinese': 1,
 'Czech': 2,
 'Dutch': 3,
 'English': 4,
 'French': 5,
 'German': 6,
 'Greek': 7,
 'Irish': 8,
 'Italian': 9,
 'Japanese': 10,
 'Korean': 11,
 'Polish': 12,
 'Portuguese': 13,
 'Russian': 14,
 'Scottish': 15,
 'Spanish': 16,
 'Vietnamese': 17}

In [89]:
num_letters = len(vocab)
num_letters

55

In [90]:
def encode_data(df, vocab2id, label2id, num_letters):
    """ Returns encoded data (bag-of-letters counts and label ids)

    The first column of df is read as the name and the second column as the
    label. Rows are processed by position, so df may carry any index (for
    example after filtering or shuffling): output row i always corresponds
    to the i-th row of df.

    inputs:
    df: DataFrame with names in its first column and labels in its second
    vocab2id: dict mapping each letter to its column in data
    label2id: dict mapping each label to its integer id
    num_letters: number of columns of data

    outputs:
    data: a np array of shape (df.shape[0], num_letters)
          data[i, j] counts the number of times the letter with id j
          occurs in observation i
    y: np array of floats of len df.shape[0]. Id of the label of each
       observation.

    raises:
    KeyError if a letter is missing from vocab2id or a label is missing
    from label2id.
    """
    n_rows = df.shape[0]
    data = np.zeros((n_rows, num_letters))
    y = np.zeros(n_rows)
    names = df.iloc[:, 0]
    targets = df.iloc[:, 1]
    # enumerate() yields the row position; the index labels returned by
    # iterrows() are only valid as positions when df has a default RangeIndex.
    for i, (name, label) in enumerate(zip(names, targets)):
        y[i] = label2id[label]
        for c in name:
            data[i, vocab2id[c]] += 1
    return data, y

In [91]:
x_train, y_train = encode_data(df, vocab2id, label2id, num_letters)
x_valid, y_valid = encode_data(val, vocab2id, label2id, num_letters)
x_train.shape, x_valid.shape

((13374, 55), (6700, 55))

In [92]:
y_train[0]

2.0

In [15]:
# `train` is never defined in this notebook; the encoded matrix is x_train.
x_train[0]

array([0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
       0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0.,
       0., 0., 0., 0.])

In [16]:
# Checking: decode the first encoded row back into the letters it contains.
# Uses x_train (the name `train` is never defined) and v > 0 so that letters
# occurring more than once in a name are listed too.
[vocab[i] for i, v in enumerate(x_train[0]) if v > 0]

['A', 'd', 'i', 's', 't']

In [17]:
# Same check for the second row; x_train replaces the undefined `train`,
# and v > 0 keeps letters that occur more than once.
[vocab[i] for i, v in enumerate(x_train[1]) if v > 0]

['A', 'a', 'd', 'j', 'n', 'r']

## Model
We are going to write a multiclass logistic regression model. Here are the equations:

\begin{align}
z_1 & = a_{11}x_1 + \dots + a_{1D}x_D + b_1 \\
z_2 & = a_{21}x_1 + \dots + a_{2D}x_D + b_2 \\
& \dots \\
z_K & = a_{K1}x_1 + \dots + a_{KD}x_D + b_K
\end{align}

$$\hat{y}_k = \frac{e^{z_k}}{ \sum_{i=1}^K e^{z_i}}$$


Here the observations are $D$ dimensional vectors $x = (x_1, \dots, x_D)$.

In order to get multiclass logistic regression, we do a linear transformation and then a softmax transformation.

For numerical reasons, it is better not to apply the softmax directly after the linear transformation but to apply it together with the loss function. The loss function `F.cross_entropy` combines log_softmax and nll_loss in a single function. Therefore to write the model just do the linear transformation with the appropriate parameters.

In [93]:
class MultiLogisticRegression(nn.Module):
    """Multiclass logistic regression written as a single linear layer.

    The module outputs raw class scores (logits); no softmax is applied in
    forward.

    input_dim: number of features of each observation
    output_dim: number of classes
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.linear = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        """Return the logits produced by the linear layer for input x."""
        return self.linear(x)

In [109]:
# One input per letter in the vocabulary, one output per class; the sizes
# come from the data instead of the hard-coded 55 and 18.
linear = nn.Linear(num_letters, len(label2id))

In [141]:
[p.shape for p in linear.parameters()]

[torch.Size([18, 55]), torch.Size([18])]

In [110]:
# First five encoded names as a float tensor
x = torch.FloatTensor(x_train[:5])

In [113]:
linear(x).shape

torch.Size([5, 18])

In [142]:
# Sizes come from the data (vocabulary size, number of classes) rather than
# the hard-coded 55 and 18.
model = MultiLogisticRegression(num_letters, len(label2id))

In [143]:
[p.shape for p in model.parameters()]

[torch.Size([18, 55]), torch.Size([18])]

In [115]:
y_hat = model(x)

In [116]:
# Labels of the first five names as a long (integer) tensor
y = torch.LongTensor(y_train[:5])
y.shape

torch.Size([5])

In [117]:
y

tensor([2, 2, 2, 2, 2])

In [118]:
y_hat

tensor([[ 0.1813,  0.0577, -0.1362,  0.0857, -0.0179, -0.0444, -0.1344, -0.0322,
          0.0118, -0.1159, -0.0112,  0.1755,  0.1616,  0.0831,  0.1713,  0.1547,
         -0.2563, -0.0202],
        [ 0.1791,  0.1832, -0.2689, -0.0783, -0.0911, -0.2611, -0.1542,  0.2432,
          0.2308, -0.0384, -0.1001, -0.1897,  0.1539,  0.2216, -0.1053,  0.4789,
         -0.2517,  0.0345],
        [ 0.1286, -0.3850,  0.1606,  0.0854,  0.0234,  0.1580, -0.1072, -0.5261,
         -0.0843, -0.0926, -0.5665,  0.2665, -0.0182, -0.1218, -0.4369,  0.1597,
         -0.1874,  0.0051],
        [ 0.4305, -0.4202,  0.2180, -0.1700,  0.1111,  0.3632, -0.3402, -0.1704,
         -0.2563, -0.1188, -0.3392,  0.3766,  0.0647, -0.0113, -0.3780,  0.2348,
         -0.1337,  0.1811],
        [ 0.3074, -0.2640, -0.1997,  0.1764, -0.7316, -0.3513,  0.3223, -0.4395,
          0.7176, -0.3365,  0.1911, -0.2637,  0.2245, -0.1673, -0.0556,  0.6120,
         -0.3820, -0.0515]], grad_fn=<AddmmBackward>)

In [121]:
_, pred = torch.max(y_hat, 1)

In [122]:
pred

tensor([ 0, 15, 11,  0,  8])

In [108]:
F.cross_entropy(y_hat, y)

tensor(3.1395, grad_fn=<NllLossBackward>)

## Training loop

Use `loss.item()` to get a Python number from a tensor containing a single value.

In [130]:
def train_epochs(model, x_train, y_train, x_valid, y_valid, epochs, lr=0.01, wd=1e-4):
    """Train model on the whole training set for a number of epochs.

    Each epoch is one full-batch AdamW step on the cross-entropy loss,
    followed by a call to valid_metrics and a printed line with the train
    loss, validation loss and validation accuracy.

    model: module mapping a float tensor of inputs to class logits
    x_train, y_train: training inputs and integer class ids
    x_valid, y_valid: validation inputs and integer class ids
    epochs: number of optimisation steps
    lr: learning rate passed to AdamW
    wd: weight decay passed to AdamW

    Returns None; progress is reported through print only.
    """
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=wd)
    inputs = torch.FloatTensor(x_train)
    targets = torch.LongTensor(y_train)
    for _epoch in range(epochs):
        # valid_metrics puts the model in eval mode, so switch back each epoch
        model.train()
        optimizer.zero_grad()
        train_loss = F.cross_entropy(model(inputs), targets)
        train_loss.backward()
        optimizer.step()
        val_loss, val_acc = valid_metrics(model, x_valid, y_valid)
        report = (train_loss.item(), val_loss, val_acc)
        print("train loss %.3f val loss %.3f and val accuracy %.3f" % report)

In [131]:
def valid_metrics(model, x_valid, y_valid):
    """Compute cross-entropy loss and accuracy of model on a validation set.

    The model is switched to eval mode (and left in it), and the forward pass
    runs under torch.no_grad() so no autograd graph is built for evaluation.

    model: module mapping a float tensor of inputs to class logits
    x_valid: validation inputs, convertible with torch.FloatTensor
    y_valid: validation class ids, convertible with torch.LongTensor

    Returns (loss, val_acc): loss is a Python float, val_acc is a 0-dim
    float tensor holding the fraction of correctly predicted observations.
    """
    model.eval()
    x = torch.FloatTensor(x_valid)
    y = torch.LongTensor(y_valid)
    # Evaluation needs no gradients; disabling autograd saves memory and time.
    with torch.no_grad():
        y_out = model(x)
        loss = F.cross_entropy(y_out, y)
        # predicted class = index of the largest logit in each row
        _, y_hat = torch.max(y_out, 1)
        val_acc = y_hat.eq(y).sum().float()/y.size(0)
    return loss.item(), val_acc

In [132]:
# Fresh model for the first training run; sizes come from the data instead
# of the hard-coded 55 and 18.
model = MultiLogisticRegression(num_letters, len(label2id))

In [133]:
train_epochs(model, x_train, y_train, x_valid, y_valid, 300, lr=0.001, wd=1e-4)

train loss 2.995 val loss 2.984 and val accuracy 0.011
train loss 2.984 val loss 2.974 and val accuracy 0.013
train loss 2.974 val loss 2.964 and val accuracy 0.016
train loss 2.963 val loss 2.953 and val accuracy 0.019
train loss 2.953 val loss 2.943 and val accuracy 0.020
train loss 2.942 val loss 2.932 and val accuracy 0.022
train loss 2.932 val loss 2.922 and val accuracy 0.026
train loss 2.922 val loss 2.912 and val accuracy 0.028
train loss 2.911 val loss 2.902 and val accuracy 0.031
train loss 2.901 val loss 2.892 and val accuracy 0.034
train loss 2.891 val loss 2.881 and val accuracy 0.036
train loss 2.881 val loss 2.871 and val accuracy 0.043
train loss 2.870 val loss 2.861 and val accuracy 0.048
train loss 2.860 val loss 2.851 and val accuracy 0.053
train loss 2.850 val loss 2.841 and val accuracy 0.058
train loss 2.840 val loss 2.831 and val accuracy 0.064
train loss 2.830 val loss 2.821 and val accuracy 0.071
train loss 2.820 val loss 2.811 and val accuracy 0.080
train loss

train loss 1.867 val loss 1.869 and val accuracy 0.519
train loss 1.863 val loss 1.866 and val accuracy 0.520
train loss 1.860 val loss 1.862 and val accuracy 0.520
train loss 1.856 val loss 1.859 and val accuracy 0.520
train loss 1.853 val loss 1.855 and val accuracy 0.520
train loss 1.849 val loss 1.852 and val accuracy 0.520
train loss 1.846 val loss 1.849 and val accuracy 0.520
train loss 1.843 val loss 1.845 and val accuracy 0.520
train loss 1.839 val loss 1.842 and val accuracy 0.519
train loss 1.836 val loss 1.839 and val accuracy 0.520
train loss 1.833 val loss 1.835 and val accuracy 0.520
train loss 1.829 val loss 1.832 and val accuracy 0.520
train loss 1.826 val loss 1.829 and val accuracy 0.520
train loss 1.823 val loss 1.826 and val accuracy 0.519
train loss 1.820 val loss 1.822 and val accuracy 0.519
train loss 1.816 val loss 1.819 and val accuracy 0.519
train loss 1.813 val loss 1.816 and val accuracy 0.519
train loss 1.810 val loss 1.813 and val accuracy 0.519
train loss

In [138]:
# Re-initialise the model so this run starts from scratch, then train with a
# larger learning rate. Sizes come from the data instead of 55 and 18.
model = MultiLogisticRegression(num_letters, len(label2id))
train_epochs(model, x_train, y_train, x_valid, y_valid, 300, lr=0.05, wd=1e-4)

train loss 2.840 val loss 2.379 and val accuracy 0.479
train loss 2.372 val loss 2.023 and val accuracy 0.511
train loss 2.015 val loss 1.788 and val accuracy 0.512
train loss 1.781 val loss 1.653 and val accuracy 0.517
train loss 1.646 val loss 1.582 and val accuracy 0.510
train loss 1.578 val loss 1.549 and val accuracy 0.504
train loss 1.546 val loss 1.529 and val accuracy 0.503
train loss 1.527 val loss 1.511 and val accuracy 0.511
train loss 1.510 val loss 1.489 and val accuracy 0.524
train loss 1.488 val loss 1.464 and val accuracy 0.540
train loss 1.462 val loss 1.438 and val accuracy 0.566
train loss 1.436 val loss 1.414 and val accuracy 0.586
train loss 1.410 val loss 1.392 and val accuracy 0.601
train loss 1.387 val loss 1.372 and val accuracy 0.614
train loss 1.366 val loss 1.352 and val accuracy 0.622
train loss 1.345 val loss 1.331 and val accuracy 0.623
train loss 1.323 val loss 1.309 and val accuracy 0.626
train loss 1.300 val loss 1.287 and val accuracy 0.629
train loss

train loss 0.960 val loss 0.993 and val accuracy 0.683
train loss 0.959 val loss 0.992 and val accuracy 0.683
train loss 0.959 val loss 0.992 and val accuracy 0.683
train loss 0.959 val loss 0.992 and val accuracy 0.683
train loss 0.959 val loss 0.992 and val accuracy 0.683
train loss 0.959 val loss 0.992 and val accuracy 0.683
train loss 0.958 val loss 0.991 and val accuracy 0.683
train loss 0.958 val loss 0.991 and val accuracy 0.684
train loss 0.958 val loss 0.991 and val accuracy 0.684
train loss 0.958 val loss 0.991 and val accuracy 0.684
train loss 0.957 val loss 0.991 and val accuracy 0.683
train loss 0.957 val loss 0.991 and val accuracy 0.683
train loss 0.957 val loss 0.990 and val accuracy 0.683
train loss 0.957 val loss 0.990 and val accuracy 0.683
train loss 0.957 val loss 0.990 and val accuracy 0.683
train loss 0.956 val loss 0.990 and val accuracy 0.683
train loss 0.956 val loss 0.990 and val accuracy 0.683
train loss 0.956 val loss 0.990 and val accuracy 0.683
train loss