In [7]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
from collections import OrderedDict
import numpy as np
import pandas as pd
from tqdm import tqdm
from torch.utils.data import DataLoader, Dataset, TensorDataset
from nltk.tokenize import word_tokenize

In [8]:
train_data = pd.read_csv("./data/train.csv")
train_data.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [9]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(train_data['author'])

LabelEncoder()

In [10]:
encoded_authors = le.transform(train_data['author'])
encoded_authors

array([0, 1, 0, ..., 0, 0, 1])

In [14]:
le.inverse_transform(2)

'MWS'

In [35]:
EMBEDDING_DIM = 100
FIXED_LENGTH = 64
HIDDEN_SIZE = 128
LINEAR_SIZE = 64

In [15]:
def tokenize(text):
    ret = word_tokenize(text)
    return ret

In [16]:
tokenized_text = train_data['text'].str.lower().apply(tokenize)
tokenized_text.head()

0    [this, process, ,, however, ,, afforded, me, n...
1    [it, never, once, occurred, to, me, that, the,...
2    [in, his, left, hand, was, a, gold, snuff, box...
3    [how, lovely, is, spring, as, we, looked, from...
4    [finding, nothing, else, ,, not, even, gold, ,...
Name: text, dtype: object

In [21]:
vocab = set()
for text in tokenized_text:
    vocab = vocab.union(set(text))
len(vocab)

25369

In [22]:
word_to_ix = {word : i + 1  for i, word in enumerate(vocab)}

In [23]:
def wti(tokens, padding=True, fixed_length=FIXED_LENGTH):
    pad = []
    ret = [word_to_ix[w] for w in tokens]
    if len(ret) > fixed_length:
        ret = ret[:fixed_length]
    pad = [0] * (fixed_length - len(tokens))    
    return np.array(ret + pad)

In [24]:
tokenized_ix = [] 
for tokens in tokenized_text:
    tokenized_ix.append(wti(tokens))
tokenized_ix = np.array(tokenized_ix)

In [25]:
tokenized_ix.shape

(19579, 64)

In [26]:
tokenized_ix[0]

array([  579, 16699, 19870,  3341, 19870, 14653, 10516,   877,  3060,
       18406, 11372, 18610, 11377, 18406,  5710, 20149,  4500,   451,
         329, 12439, 23335, 16586,  2047, 19870, 25234, 17228,  7339,
       18610,  9644, 24490,   329, 12444, 17727, 19870,  3781,  9115,
       21985, 18406, 18610, 20983,  4500,  7239,  8353, 13420, 16430,
       18610,  8386, 15232,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,     0])

In [56]:
if torch.cuda.is_available:
    dtype = torch.cuda.LongTensor
else:
    dtype = torch.LongTensor

In [57]:
dataset = TensorDataset(dtype(tokenized_ix), dtype(encoded_authors))

In [58]:
dataloader = DataLoader(dataset, batch_size=50, shuffle=True)

## lstm : input(seqlen, batch, inputsize) ## 

In [59]:
class Model(nn.Module):
    def __init__(self, vocab_size,
                 embedding_dim,
                 hidden_size,
                 linear_size,
                 cuda=False,
                 nlayers=1):
        super(Model, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.embedding_dim = embedding_dim
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_size, num_layers=nlayers, batch_first=True)
        self.sequential = nn.Sequential(
        OrderedDict([
            ('lin1', nn.Linear(hidden_size, linear_size)),
            ('relu', nn.ReLU()),
            ('lin2', nn.Linear(linear_size, 3)),
        ]))
        
    def forward(self, inputs):
        embeds = self.embedding(inputs) 
        lstm, (h, _) = self.lstm(embeds)
        last = lstm[:, -1, :]
        output = self.sequential(last)
        return output

In [4]:
import random
import numpy as np

In [5]:
a = np.arange(10)
random.shuffle(a)
a

array([2, 9, 7, 5, 6, 1, 8, 3, 0, 4])

In [6]:
b = np.arange(10)
b[a]

array([2, 9, 7, 5, 6, 1, 8, 3, 0, 4])

In [60]:
if torch.cuda.is_available:
    net = Model(len(vocab) + 1, EMBEDDING_DIM, HIDDEN_SIZE, LINEAR_SIZE).cuda()
else:
    net = Model(len(vocab) + 1, EMBEDDING_DIM, HIDDEN_SIZE, LINEAR_SIZE)
net

Model(
  (embedding): Embedding(25370, 100, padding_idx=0)
  (lstm): LSTM(100, 128, batch_first=True)
  (sequential): Sequential(
    (lin1): Linear(in_features=128, out_features=64)
    (relu): ReLU()
    (lin2): Linear(in_features=64, out_features=3)
  )
)

In [61]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters())

In [63]:
for epoch in range(100):
    for i, (sentence, target) in enumerate(dataloader):
        var_x, var_y = Variable(sentence).cuda(), Variable(target).cuda()
        pred_score = net(var_x)
        _, pred = pred_score.max(dim=1)

    #     import IPython.core.debugger
    #     IPython.core.debugger.set_trace()
        equals = (pred == var_y).float() 
        acc = equals.sum() / sentence.size(0)

        y_loss = criterion(pred_score, var_y)
        optimizer.zero_grad()
        y_loss.backward()
        optimizer.step()
        if i % 20 == 19:
            print('loss', y_loss.data[0], 'acc', acc.data[0])

loss 1.0752227306365967 acc 0.5199999809265137
loss 1.104432463645935 acc 0.3799999952316284
loss 1.073684573173523 acc 0.4599999785423279
loss 1.081127405166626 acc 0.3999999761581421
loss 1.047908902168274 acc 0.5
loss 1.1002520322799683 acc 0.3799999952316284
loss 1.084128975868225 acc 0.41999998688697815
loss 1.0829085111618042 acc 0.3999999761581421
loss 1.0986785888671875 acc 0.3400000035762787
loss 1.1084715127944946 acc 0.3999999761581421
loss 1.091080904006958 acc 0.35999998450279236
loss 1.0823174715042114 acc 0.4399999976158142
loss 1.0900958776474 acc 0.3400000035762787
loss 1.0789662599563599 acc 0.4399999976158142
loss 1.0730787515640259 acc 0.47999998927116394
loss 1.0952273607254028 acc 0.3999999761581421
loss 1.083926796913147 acc 0.4399999976158142
loss 1.0768883228302002 acc 0.4399999976158142
loss 1.0723313093185425 acc 0.4399999976158142
loss 1.0716025829315186 acc 0.47999998927116394
loss 1.1180696487426758 acc 0.41999998688697815
loss 1.0079481601715088 acc 0.459

loss 0.12266556918621063 acc 0.9599999785423279
loss 0.10538741946220398 acc 0.9599999785423279
loss 0.11226654797792435 acc 0.9599999785423279
loss 0.0872671902179718 acc 0.9599999785423279
loss 0.43745553493499756 acc 0.8799999952316284
loss 0.0910278707742691 acc 0.9599999785423279
loss 0.0871838629245758 acc 0.9799999594688416
loss 0.07187561690807343 acc 0.9799999594688416
loss 0.024505339562892914 acc 1.0
loss 0.05355822667479515 acc 0.9799999594688416
loss 0.2436973601579666 acc 0.9199999570846558
loss 0.1825934797525406 acc 0.9799999594688416
loss 0.04527368023991585 acc 0.9799999594688416
loss 0.10311517119407654 acc 0.9599999785423279
loss 0.08516164124011993 acc 0.9799999594688416
loss 0.01796910911798477 acc 1.0
loss 0.07351718097925186 acc 0.9799999594688416
loss 0.06333431601524353 acc 0.9799999594688416
loss 0.12687212228775024 acc 0.9399999976158142
loss 0.014121640473604202 acc 1.0
loss 0.03722082078456879 acc 0.9799999594688416
loss 0.009759149514138699 acc 1.0
loss 0

loss 0.004997623153030872 acc 1.0
loss 0.0004960346268489957 acc 1.0
loss 0.003001241711899638 acc 1.0
loss 0.011879375204443932 acc 1.0
loss 0.006246996112167835 acc 1.0
loss 0.0021658730693161488 acc 1.0
loss 0.0009072351385839283 acc 1.0
loss 0.007164201699197292 acc 1.0
loss 0.0032293510157614946 acc 1.0
loss 0.0018472671508789062 acc 1.0
loss 0.022092105820775032 acc 0.9799999594688416
loss 0.022001300007104874 acc 1.0
loss 0.0017378473421558738 acc 1.0
loss 0.004268376622349024 acc 1.0
loss 0.00036798001383431256 acc 1.0
loss 0.0004966449923813343 acc 1.0
loss 0.0004028511175420135 acc 1.0
loss 0.0035031079314649105 acc 1.0
loss 0.00226032268255949 acc 1.0
loss 0.00031558514456264675 acc 1.0
loss 0.00024177551676984876 acc 1.0
loss 0.0023622154258191586 acc 1.0
loss 0.0004102849925402552 acc 1.0
loss 0.0019222497940063477 acc 1.0
loss 0.00011712074046954513 acc 1.0
loss 0.08683198690414429 acc 0.9799999594688416
loss 0.0015444325981661677 acc 1.0
loss 0.006526284385472536 acc 1.0

loss 0.00010214805661235005 acc 1.0
loss 0.00048251153202727437 acc 1.0
loss 0.003456738078966737 acc 1.0
loss 0.00017432689492125064 acc 1.0
loss 5.3758620197186247e-05 acc 1.0
loss 0.00034088134998455644 acc 1.0
loss 0.0029952717013657093 acc 1.0
loss 0.07750865817070007 acc 0.9799999594688416
loss 0.047856640070676804 acc 0.9799999594688416
loss 0.00012214183516334742 acc 1.0
loss 0.00040005205664783716 acc 1.0
loss 0.00461939349770546 acc 1.0
loss 0.0017993641085922718 acc 1.0
loss 0.0018564533675089478 acc 1.0
loss 0.002983155194669962 acc 1.0
loss 0.001886663492769003 acc 1.0
loss 0.0002585125039331615 acc 1.0
loss 0.001394562772475183 acc 1.0
loss 0.00010123253014171496 acc 1.0
loss 0.0009524965425953269 acc 1.0
loss 0.00022385120973922312 acc 1.0
loss 0.0007463169167749584 acc 1.0
loss 0.017142323777079582 acc 0.9799999594688416
loss 0.0020403671078383923 acc 1.0
loss 0.001033191685564816 acc 1.0
loss 8.696556324139237e-05 acc 1.0
loss 0.00011130332859465852 acc 1.0
loss 0.0007

loss 0.01963825896382332 acc 0.9799999594688416
loss 1.7070769899873994e-05 acc 1.0
loss 0.000393586145946756 acc 1.0
loss 0.0001228713954333216 acc 1.0
loss 1.8043518139165826e-05 acc 1.0
loss 1.5277862985385582e-05 acc 1.0
loss 2.079010073430254e-06 acc 1.0
loss 5.492210402735509e-05 acc 1.0
loss 0.000586662266869098 acc 1.0
loss 8.707046617928427e-06 acc 1.0
loss 0.013278109952807426 acc 1.0
loss 0.0002257871674373746 acc 1.0
loss 0.2050478309392929 acc 0.9599999785423279
loss 0.0004987144493497908 acc 1.0
loss 0.00021675109746865928 acc 1.0
loss 0.00016542435332667083 acc 1.0
loss 0.005892486777156591 acc 1.0
loss 5.743980364059098e-05 acc 1.0
loss 6.007194679114036e-05 acc 1.0
loss 0.0005168628413230181 acc 1.0
loss 0.0001349163067061454 acc 1.0
loss 3.279685915913433e-05 acc 1.0
loss 0.0007378864102065563 acc 1.0
loss 8.94546537892893e-05 acc 1.0
loss 5.042076008976437e-05 acc 1.0
loss 0.00027531624073162675 acc 1.0
loss 0.00013645172293763608 acc 1.0
loss 0.025001034140586853 ac

loss 7.629394360719743e-08 acc 1.0
loss 2.574920756615029e-07 acc 1.0
loss 1.4305115314527939e-07 acc 1.0
loss 1.23977656585339e-07 acc 1.0
loss 3.814697322468419e-07 acc 1.0
loss 2.0027160019253643e-07 acc 1.0
loss 7.629394360719743e-08 acc 1.0
loss 9.536743306171047e-08 acc 1.0
loss 2.574920756615029e-07 acc 1.0
loss 4.482269275740691e-07 acc 1.0
loss 3.814697180359872e-08 acc 1.0
loss 6.580352760465757e-07 acc 1.0
loss 4.95910626341356e-07 acc 1.0
loss 1.23977656585339e-06 acc 1.0
loss 8.1062319168268e-07 acc 1.0
loss 4.1007996287589776e-07 acc 1.0
loss 2.384185791015625e-07 acc 1.0
loss 7.629394360719743e-08 acc 1.0
loss 3.337860050578456e-07 acc 1.0
loss 1.6212463549436507e-07 acc 1.0
loss 1.497268726780021e-06 acc 1.0
loss 1.0490417423625331e-07 acc 1.0
loss 1.1444091541079615e-07 acc 1.0
loss 3.1471253691961465e-07 acc 1.0
loss 1.5258788721439487e-07 acc 1.0
loss 1.907348590179936e-08 acc 1.0
loss 7.152557373046875e-07 acc 1.0
loss 9.822845186135964e-07 acc 1.0
loss 1.8119811784

loss 0.0 acc 1.0
loss 0.0 acc 1.0
loss 0.0 acc 1.0
loss 1.907348590179936e-08 acc 1.0
loss 0.0 acc 1.0
loss 2.8610228852699038e-08 acc 1.0
loss 8.583069188716763e-08 acc 1.0
loss 0.0 acc 1.0
loss 3.814697180359872e-08 acc 1.0
loss 9.53674295089968e-09 acc 1.0
loss 4.7683716530855236e-08 acc 1.0
loss 0.0 acc 1.0
loss 3.814697180359872e-08 acc 1.0
loss 0.0 acc 1.0
loss 0.0 acc 1.0
loss 0.0 acc 1.0
loss 1.907348590179936e-08 acc 1.0
loss 0.0 acc 1.0
loss 0.0 acc 1.0
loss 7.629394360719743e-08 acc 1.0
loss 1.907348590179936e-08 acc 1.0
loss 3.814697180359872e-08 acc 1.0
loss 0.0 acc 1.0
loss 0.0 acc 1.0
loss 0.0 acc 1.0
loss 3.814697180359872e-08 acc 1.0
loss 0.0 acc 1.0
loss 4.7683716530855236e-08 acc 1.0
loss 0.0 acc 1.0
loss 0.0 acc 1.0
loss 3.814697180359872e-08 acc 1.0
loss 0.0 acc 1.0
loss 0.0 acc 1.0
loss 6.67572024326546e-08 acc 1.0
loss 1.907348590179936e-08 acc 1.0
loss 0.0 acc 1.0
loss 1.907348590179936e-08 acc 1.0
loss 0.0 acc 1.0
loss 1.907348590179936e-08 acc 1.0
loss 1.90734

loss 0.0 acc 1.0
loss 0.0 acc 1.0
loss 0.0 acc 1.0
loss 0.0 acc 1.0
loss 0.0 acc 1.0
loss 0.0 acc 1.0
loss 0.0 acc 1.0
loss 0.0 acc 1.0
loss 0.0 acc 1.0
loss 0.0 acc 1.0
loss 0.0 acc 1.0
loss 0.0 acc 1.0
loss 0.0 acc 1.0
loss 0.0 acc 1.0
loss 0.0 acc 1.0
loss 0.0 acc 1.0
loss 0.0 acc 1.0
loss 0.0 acc 1.0
loss 0.0 acc 1.0
loss 0.0 acc 1.0
loss 0.0 acc 1.0
loss 0.0 acc 1.0
loss 0.0 acc 1.0
loss 0.0 acc 1.0
loss 0.0 acc 1.0
loss 0.0 acc 1.0
loss 0.0 acc 1.0
loss 0.0 acc 1.0
loss 0.0 acc 1.0
loss 0.0 acc 1.0
loss 0.0 acc 1.0
loss 0.0 acc 1.0
loss 0.0 acc 1.0
loss 0.0 acc 1.0
loss 0.0 acc 1.0
loss 0.0 acc 1.0
loss 0.0 acc 1.0
loss 0.0 acc 1.0
loss 0.0 acc 1.0
loss 0.0 acc 1.0
loss 0.0 acc 1.0
loss 0.0 acc 1.0
loss 0.0 acc 1.0
loss 0.0 acc 1.0
loss 0.0 acc 1.0
loss 0.0 acc 1.0
loss 0.0 acc 1.0
loss 0.0 acc 1.0
loss 0.0 acc 1.0
loss 0.0 acc 1.0
loss 0.0 acc 1.0
loss 0.0 acc 1.0
loss 0.0 acc 1.0
loss 0.0 acc 1.0
loss 0.0 acc 1.0
loss 0.0 acc 1.0
loss 0.0 acc 1.0
loss 0.0 acc 1.0
loss 0.0 acc 1

In [55]:
sentence


  6653  22290   7339  ...       0      0      0
 11367  12732   9077  ...       0      0      0
 15714  11367  24849  ...       0      0      0
        ...            ⋱           ...         
 10448   4012  12551  ...       0      0      0
  6548  19870  25207  ...       0      0      0
   329  21983  24473  ...       0      0      0
[torch.cuda.LongTensor of size 50x64 (GPU 0)]

In [53]:
target


 1
 0
 0
 0
 1
 1
 1
 2
 0
 0
 1
 2
 1
 0
 0
 1
 2
 2
 0
 1
 0
 2
 2
 2
 0
 0
 1
 0
 0
 2
 0
 2
 1
 2
 0
 0
 0
 1
 0
 2
 0
 0
 2
 0
 0
 0
 0
 2
 2
 2
[torch.LongTensor of size 50]

In [52]:
var_y

Variable containing:
 1
 0
 0
 0
 1
 1
 1
 2
 0
 0
 1
 2
 1
 0
 0
 1
 2
 2
 0
 1
 0
 2
 2
 2
 0
 0
 1
 0
 0
 2
 0
 2
 1
 2
 0
 0
 0
 1
 0
 2
 0
 0
 2
 0
 0
 0
 0
 2
 2
 2
[torch.LongTensor of size 50]

In [43]:
pred

Variable containing:
 25  10  11
[torch.cuda.LongTensor of size 1x3 (GPU 0)]

In [None]:
sentence

In [None]:
target.shape

In [None]:
pred.shape

In [None]:
loss = criterion(pred, Variable(target))

In [None]:
loss