# Feed forward NN for text classification

In [2]:
# import pytorch libraries
%matplotlib inline
import torch 
import torch.autograd as autograd 
import torch.nn as nn 
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd

## Dataset
wget https://github.com/hunkim/PyTorchZeroToAll/blob/master/data/names_train.csv.gz

wget https://github.com/hunkim/PyTorchZeroToAll/blob/master/data/names_test.csv.gz

In [5]:
def unpack_dataset():
    ! mkdir -p data
    ! wget -O names_test.csv.gz https://github.com/hunkim/PyTorchZeroToAll/blob/master/data/names_test.csv.gz?raw=true
    ! wget -O names_train.csv.gz https://github.com/hunkim/PyTorchZeroToAll/blob/master/data/names_train.csv.gz?raw=true
    ! gunzip *.gz
    ! mv names_test.csv names_train.csv data/

In [6]:
unpack_dataset()

--2019-09-26 10:02:19--  https://github.com/hunkim/PyTorchZeroToAll/blob/master/data/names_test.csv.gz?raw=true
Resolving github.com (github.com)... 192.30.255.112
Connecting to github.com (github.com)|192.30.255.112|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://github.com/hunkim/PyTorchZeroToAll/raw/master/data/names_test.csv.gz [following]
--2019-09-26 10:02:19--  https://github.com/hunkim/PyTorchZeroToAll/raw/master/data/names_test.csv.gz
Reusing existing connection to github.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/hunkim/PyTorchZeroToAll/master/data/names_test.csv.gz [following]
--2019-09-26 10:02:19--  https://raw.githubusercontent.com/hunkim/PyTorchZeroToAll/master/data/names_test.csv.gz
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.188.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.188.133|:443... connected.
HTTP re

In [7]:
from pathlib import Path
PATH = Path("data")
list(PATH.iterdir())

[PosixPath('data/Genia4ERtest.tar.gz'),
 PosixPath('data/sampletest2.iob2'),
 PosixPath('data/LICENSE'),
 PosixPath('data/Genia4ERtask2.iob2'),
 PosixPath('data/names_train.csv'),
 PosixPath('data/names_test.csv'),
 PosixPath('data/Genia4EReval1.iob2'),
 PosixPath('data/Genia4EReval2.iob2'),
 PosixPath('data/sampletest1.raw'),
 PosixPath('data/sampletest2.raw'),
 PosixPath('data/README.txt'),
 PosixPath('data/Genia4ERtask1.iob2'),
 PosixPath('data/Genia4EReval1.raw'),
 PosixPath('data/Genia4EReval2.raw'),
 PosixPath('data/sampletest1.iob2'),
 PosixPath('data/Genia4ERtraining.tar.gz')]

In [8]:
! head data/names_train.csv

"Adsit","Czech"
"Ajdrna","Czech"
"Antonowitsch","Czech"
"Antonowitz","Czech"
"Ballalatak","Czech"
"Ballaltick","Czech"
"Bastl","Czech"
"Baroch","Czech"
"Betlach","Czech"
"Biganska","Czech"


## vocab2id and label2id
Computing vocab2id dictionary.

In [9]:
df = pd.read_csv(PATH/"names_train.csv", header=None)

In [10]:
letters = [list(l) for l in df[0].values]
vocab = sorted(list(set(np.concatenate(np.array(letters)))))
vocab[:10]

[' ', "'", ',', 'A', 'B', 'C', 'D', 'E', 'F', 'G']

In [11]:
vocab2id = {key:i for i, key in enumerate(vocab)}
vocab2id[" "] # I am going to use 0 to pad sequences

0

In [12]:
labels = sorted(df[1].unique())
label2id = {key:i for i, key in enumerate(labels)}
label2id

{'Arabic': 0,
 'Chinese': 1,
 'Czech': 2,
 'Dutch': 3,
 'English': 4,
 'French': 5,
 'German': 6,
 'Greek': 7,
 'Irish': 8,
 'Italian': 9,
 'Japanese': 10,
 'Korean': 11,
 'Polish': 12,
 'Portuguese': 13,
 'Russian': 14,
 'Scottish': 15,
 'Spanish': 16,
 'Vietnamese': 17}

In [15]:
def pad_seq(x, seq_len=15, vocab2id=vocab2id):
    x = list(x)
    x = np.array([vocab2id[k] for k in x])
    z = np.zeros(seq_len, dtype=np.int32)
    n = min(seq_len, x.shape[0])
    z[0:n] = x[0:n]
    return z

In [16]:
x = pad_seq("aabbb")
x

array([29, 29, 30, 30, 30,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
      dtype=int32)

In [17]:
# one hot encoding
def seq2matrix(x, vocab_len=55):
    z = np.zeros((x.shape[0], vocab_len))
    z[np.arange(len(x)), x] = 1
    return z

## Dataset

In [24]:
class NameDataset(Dataset):
    def __init__(self, path, vocab2id, label2id, seq_len=15, vocab_len=55):
        self.df = pd.read_csv(path, header=None)
        self.label2id = label2id
        self.vocab2id = vocab2id
        self.seq_len = seq_len
        self.vocab_len = vocab_len 
        self.x = df[0].values
        self.y = [self.label2id[l] for l in df[1].values]
        self.vocab2id = vocab2id
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        x = pad_seq(self.x[idx], self.seq_len, self.vocab2id)
        return x, self.y[idx]

In [25]:
train_ds = NameDataset(PATH/"names_train.csv", vocab2id, label2id)
valid_ds = NameDataset(PATH/"names_test.csv", vocab2id, label2id)

In [34]:
batch_size = 5
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=batch_size)

In [28]:
x, y = train_ds[0]
x, y

(array([ 3, 32, 47, 37, 48,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       dtype=int32), 2)

In [29]:
len(x)

15

## Model 

In [54]:
class NN(nn.Module):
    def __init__(self, vocab_size=55, emb_size=50, n_class=18, seq_len=15, hidden=50):
        super(NN, self).__init__()
        self.emb = nn.Embedding(vocab_size, emb_size, padding_idx=0)
        self.dropout = nn.Dropout(0.5)
        self.linear1 = nn.Linear( seq_len*emb_size, hidden)
        self.linear2 = nn.Linear(hidden, n_class)
        self.bn = nn.BatchNorm1d(hidden)
        
    def forward(self, x):
        x = self.emb(x)
        x = x.view(x.size(0), -1)
        x = self.dropout(x)
        x = self.linear1(x)
        x = self.bn(F.relu(x))
        x = self.linear2(x)
        return x

In [55]:
x, y = next(iter(train_dl))

In [56]:
x

tensor([[28, 36, 40, 49, 39, 48, 43, 50,  0,  0,  0,  0,  0,  0,  0],
        [ 3, 50, 31, 36, 37, 42, 42, 37, 39, 43, 50,  0,  0,  0,  0],
        [ 4, 29, 30, 29, 32, 54, 36, 29, 42, 53, 29, 42,  0,  0,  0],
        [ 9, 40, 29, 32, 31, 36, 33, 42, 39, 43,  0,  0,  0,  0,  0],
        [16, 29, 39, 36, 43, 32, 39, 37, 42,  0,  0,  0,  0,  0,  0]],
       dtype=torch.int32)

In [57]:
y

tensor([14, 14, 14, 14, 14])

In [58]:
vocab_size=55
emb_size=50
emb = nn.Embedding(vocab_size, emb_size, padding_idx=0)

In [59]:
emb(x.long()).shape

torch.Size([5, 15, 50])

In [60]:
emb(x.long()).view(x.size(0), -1).shape

torch.Size([5, 750])

In [61]:
model = NN()

In [62]:
y_hat = model(x.long())
y_hat

tensor([[ 0.6068, -0.1797,  0.1763, -1.1019,  0.6065,  0.2611, -0.0560, -0.0980,
          0.1155, -0.2292,  0.9285,  0.2749, -0.6643,  0.2179,  0.0859,  0.9098,
          0.5109,  0.4601],
        [-0.5979, -0.7824,  0.3607,  0.1397,  0.1994, -0.2073,  0.7742,  0.9978,
          0.4974, -0.4033, -0.1074,  0.9378,  0.9813, -0.7749,  0.7348, -0.6909,
         -0.3219, -0.5288],
        [-0.5690,  0.8720,  0.3111, -0.3945, -1.0187,  0.1962,  0.0953, -0.5634,
          0.2079,  0.4241, -0.0719, -0.4937, -0.2775,  0.4149, -0.1146,  0.8249,
          0.4472, -0.1056],
        [ 0.7055,  0.1352, -0.0606,  0.5378,  0.7861, -0.5110, -0.5375, -0.3216,
         -0.6660,  0.0518, -0.2574, -0.1374,  0.2959,  0.1700, -0.4750, -1.2977,
          0.6809, -0.0407],
        [-0.2461, -0.4997, -0.7868,  0.6758,  0.0514,  0.1365,  0.2867, -0.4872,
          0.2300, -0.2027,  0.1064, -0.1115, -0.0092, -0.0769,  0.0920,  0.2657,
         -0.9941, -0.1773]], grad_fn=<AddmmBackward>)

In [63]:
y_hat.shape

torch.Size([5, 18])

In [64]:
F.softmax(y_hat, dim=1)

tensor([[0.0780, 0.0355, 0.0507, 0.0141, 0.0780, 0.0552, 0.0402, 0.0385, 0.0477,
         0.0338, 0.1076, 0.0560, 0.0219, 0.0529, 0.0463, 0.1056, 0.0708, 0.0673],
        [0.0236, 0.0196, 0.0616, 0.0494, 0.0524, 0.0349, 0.0931, 0.1164, 0.0706,
         0.0287, 0.0385, 0.1096, 0.1145, 0.0198, 0.0895, 0.0215, 0.0311, 0.0253],
        [0.0277, 0.1169, 0.0667, 0.0330, 0.0177, 0.0595, 0.0538, 0.0278, 0.0602,
         0.0747, 0.0455, 0.0298, 0.0370, 0.0740, 0.0436, 0.1116, 0.0765, 0.0440],
        [0.1038, 0.0587, 0.0483, 0.0878, 0.1126, 0.0308, 0.0300, 0.0372, 0.0263,
         0.0540, 0.0396, 0.0447, 0.0689, 0.0608, 0.0319, 0.0140, 0.1013, 0.0492],
        [0.0445, 0.0345, 0.0259, 0.1119, 0.0599, 0.0653, 0.0758, 0.0350, 0.0717,
         0.0465, 0.0633, 0.0509, 0.0564, 0.0527, 0.0624, 0.0743, 0.0211, 0.0477]],
       grad_fn=<SoftmaxBackward>)

In [65]:
F.softmax(y_hat, dim=1).sum(1)

tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000], grad_fn=<SumBackward1>)

In [66]:
F.cross_entropy(y_hat, y)

tensor(2.9675, grad_fn=<NllLossBackward>)

In [67]:
y

tensor([14, 14, 14, 14, 14])

In [68]:
torch.max(y_hat, dim=1)[1]

tensor([10,  7,  1,  4,  3])

## Training 

In [69]:
def get_optimizer(model, lr = 0.01, wd = 0.0):
    optim = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    return optim

In [70]:
def train_model(model, optimizer, train_dl, epochs=10):
    for i in range(epochs):
        model.train()
        total = 0
        sum_loss = 0
        for x, y in train_dl:
            batch = y.shape[0]
            out = model(x.long())
            loss = F.cross_entropy(out, y)   
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total += batch
            sum_loss += batch*(loss.item())
        train_loss = sum_loss/total
        valid_loss, valid_acc = valid_metrics(model, valid_dl)
        print("train loss  %.3f val loss %.3f and accuracy %.3f" % (
            train_loss, valid_loss, valid_acc))   

In [76]:
def valid_metrics(model, valid_dl):
    model.eval()
    total = 0
    sum_loss = 0
    correct = 0
    for x, y in valid_dl:
        batch = y.shape[0]
        out = model(x.long())
        loss = F.cross_entropy(out, y)
        sum_loss += batch*(loss.item())
        total += batch
        pred = torch.max(out, dim=1)[1]
        correct += (pred == y).float().sum().item()
    return sum_loss/total, correct/total

In [72]:
model = NN()

In [73]:
optimizer = get_optimizer(model, lr = 0.01, wd = 1e-5)

In [74]:
batch_size = 1000
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=batch_size)

In [77]:
valid_metrics(model, valid_dl)

(2.955097976100985, 0.027516075968296694)

In [78]:
train_model(model, optimizer, train_dl, epochs=10)

train loss  1.980 val loss 1.349 and accuracy 0.658
train loss  1.361 val loss 1.045 and accuracy 0.697
train loss  1.170 val loss 0.927 and accuracy 0.734
train loss  1.062 val loss 0.847 and accuracy 0.751
train loss  0.987 val loss 0.792 and accuracy 0.761
train loss  0.944 val loss 0.740 and accuracy 0.778
train loss  0.891 val loss 0.700 and accuracy 0.789
train loss  0.868 val loss 0.662 and accuracy 0.797
train loss  0.841 val loss 0.645 and accuracy 0.798
train loss  0.822 val loss 0.628 and accuracy 0.807
