In [2]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
from collections import OrderedDict
import numpy as np
import pandas as pd
from tqdm import tqdm
from torch.utils.data import DataLoader, Dataset, TensorDataset

In [3]:
train_data = pd.read_csv("./data/train.csv")
train_data.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [4]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(train_data['author'])

LabelEncoder()

In [5]:
encoded_authors = le.transform(train_data['author'])
encoded_authors

array([0, 1, 0, ..., 0, 0, 1], dtype=int64)

In [6]:
CONTEXT_SIZE = 2
EMBEDDING_DIM = 10

In [13]:
sample_text = train_data.loc[train_data['id'] == 'id26305']['text']
sample_text

0    This process, however, afforded me no means of...
Name: text, dtype: object

In [15]:
sample_text.values[0]

'This process, however, afforded me no means of ascertaining the dimensions of my dungeon; as I might make its circuit, and return to the point whence I set out, without being aware of the fact; so perfectly uniform seemed the wall.'

In [17]:
sample_texts = sample_text.values[0].lower().split()
sample_texts[:5]

['this', 'process,', 'however,', 'afforded', 'me']

In [19]:
trigrams = [([sample_texts[i], sample_texts[i+1]], sample_texts[i+2])
           for i in range(len(sample_texts) - 2)]
trigrams[:3]

[(['this', 'process,'], 'however,'),
 (['process,', 'however,'], 'afforded'),
 (['however,', 'afforded'], 'me')]

In [21]:
vocab = set(sample_texts)
word_to_ix = {word : i for i, word in enumerate(vocab)}

In [22]:
import torch.functional as F

In [41]:
class NN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, context_size):
        super(NN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.net = nn.Sequential(
            OrderedDict(
            [ 
             ('linear1', nn.Linear(context_size * embedding_dim, 128)),
              ('relu1', nn.ReLU()),
             ('linear2', nn.Linear(128, vocab_size)),
             ]
            )
        )
    
    def forward(self, inputs):
        embeds = self.embedding(inputs).view((1, -1))
        return self.net(embeds)

In [42]:
net = NN(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE).cuda()


In [43]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters())

In [44]:
dtype = torch.cuda.LongTensor

In [45]:
context, target = trigrams[0]
context, target

(['this', 'process,'], 'however,')

In [46]:
context_idxs = [word_to_ix[w] for w in context]
context_idxs

[19, 11]

In [47]:
var = Variable(dtype(context_idxs))
var

Variable containing:
 19
 11
[torch.cuda.LongTensor of size 2 (GPU 0)]

In [48]:
net(var)

Variable containing:

Columns 0 to 9 
-0.1311 -0.2707 -0.0057 -0.4845 -0.0356 -0.0558  0.1130 -0.0104  0.3062 -0.1768

Columns 10 to 19 
-0.1327 -0.0752 -0.2897  0.0669  0.1060  0.1651 -0.1942 -0.0347  0.0777 -0.1201

Columns 20 to 29 
-0.0574  0.0980  0.0162  0.1112  0.1792  0.1688 -0.3074 -0.1089  0.2785 -0.0360

Columns 30 to 34 
-0.3749  0.0251 -0.2026 -0.0339 -0.2261
[torch.cuda.FloatTensor of size 1x35 (GPU 0)]