In [26]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import random
import numpy as np
from collections import Counter, OrderedDict
import re
from copy import deepcopy
flatten = lambda l: [item for sublist in l for item in sublist]
random.seed(1024)

In [27]:
USE_CUDA = torch.cuda.is_available()
gpus = [0]
torch.cuda.set_device(gpus[0])

In [28]:
FloatTensor = torch.cuda.FloatTensor if USE_CUDA else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if USE_CUDA else torch.LongTensor
ByteTensor = torch.cuda.ByteTensor if USE_CUDA else torch.ByteTensor

In [29]:
def getBatch(batch_size, train_data):
    random.shuffle(train_data)
    sindex = 0
    eindex = batch_size
    while eindex < len(train_data):
        batch = train_data[sindex: eindex]
        temp = eindex
        eindex = eindex + batch_size
        sindex = temp
        yield batch
    
    if eindex >= len(train_data):
        batch = train_data[sindex:]
        yield batch

In [31]:
def pad_to_batch(batch):
    x,y = zip(*batch)
    max_x = max([s.size(1) for s in x])
    x_p = []
    for i in range(len(batch)):
        if x[i].size(1) < max_x:
            x_p.append(torch.cat([x[i], Variable(LongTensor([word2index['<PAD>']] * (max_x - x[i].size(1)))).view(1, -1)], 1))
        else:
            x_p.append(x[i])
    return torch.cat(x_p), torch.cat(y).view(-1)

In [30]:
def prepare_sequence(seq, to_index):
    idxs = list(map(lambda w: to_index[w] if to_index.get(w) is not None else to_index["<UNK>"], seq))
    return Variable(LongTensor(idxs))

In [32]:
data = open('./data/train_1000.label.txt', 'r', encoding='latin-1').readlines()

In [33]:
data[:1]

['DESC:manner How did serfdom develop in and then leave Russia ?\n']

In [34]:
data = [[d.split(':')[1][:-1], d.split(':')[0]] for d in data]

In [35]:
data

[['manner How did serfdom develop in and then leave Russia ?', 'DESC'],
 ['cremat What films featured the character Popeye Doyle ?', 'ENTY'],
 ["manner How can I find a list of celebrities ' real names ?", 'DESC'],
 ['animal What fowl grabs the spotlight after the Chinese Year of the Monkey ?',
  'ENTY'],
 ['exp What is the full form of .com ?', 'ABBR'],
 ['ind What contemptible scoundrel stole the cork from my lunch ?', 'HUM'],
 ["gr What team did baseball 's St. Louis Browns become ?", 'HUM'],
 ['title What is the oldest profession ?', 'HUM'],
 ['def What are liver enzymes ?', 'DESC'],
 ['ind Name the scar-faced bounty hunter of The Old West .', 'HUM'],
 ['date When was Ozzy Osbourne born ?', 'NUM'],
 ['reason Why do heavier objects travel downhill faster ?', 'DESC'],
 ['ind Who was The Pride of the Yankees ?', 'HUM'],
 ['ind Who killed Gandhi ?', 'HUM'],
 ['event What is considered the costliest disaster the insurance industry has ever faced ?',
  'ENTY'],
 ['state What sprawling U.

In [24]:
X, y = list(zip(*data))

In [37]:
X[:2]

('manner How did serfdom develop in and then leave Russia ?',
 'cremat What films featured the character Popeye Doyle ?')

In [38]:
y[:2]

('DESC', 'ENTY')

In [39]:
X = list(X)

In [40]:
X[:2]

['manner How did serfdom develop in and then leave Russia ?',
 'cremat What films featured the character Popeye Doyle ?']

In [41]:
for i, x in enumerate(X):
    X[i] = re.sub('\d', '#', x).split()

In [42]:
X[:5]

[['manner',
  'How',
  'did',
  'serfdom',
  'develop',
  'in',
  'and',
  'then',
  'leave',
  'Russia',
  '?'],
 ['cremat',
  'What',
  'films',
  'featured',
  'the',
  'character',
  'Popeye',
  'Doyle',
  '?'],
 ['manner',
  'How',
  'can',
  'I',
  'find',
  'a',
  'list',
  'of',
  'celebrities',
  "'",
  'real',
  'names',
  '?'],
 ['animal',
  'What',
  'fowl',
  'grabs',
  'the',
  'spotlight',
  'after',
  'the',
  'Chinese',
  'Year',
  'of',
  'the',
  'Monkey',
  '?'],
 ['exp', 'What', 'is', 'the', 'full', 'form', 'of', '.com', '?']]

In [46]:
flatten(X[:5])

['manner',
 'How',
 'did',
 'serfdom',
 'develop',
 'in',
 'and',
 'then',
 'leave',
 'Russia',
 '?',
 'cremat',
 'What',
 'films',
 'featured',
 'the',
 'character',
 'Popeye',
 'Doyle',
 '?',
 'manner',
 'How',
 'can',
 'I',
 'find',
 'a',
 'list',
 'of',
 'celebrities',
 "'",
 'real',
 'names',
 '?',
 'animal',
 'What',
 'fowl',
 'grabs',
 'the',
 'spotlight',
 'after',
 'the',
 'Chinese',
 'Year',
 'of',
 'the',
 'Monkey',
 '?',
 'exp',
 'What',
 'is',
 'the',
 'full',
 'form',
 'of',
 '.com',
 '?']

In [43]:
vocab = list(set(flatten(X)))

In [44]:
len(vocab)

2846

In [47]:
len(set(y)) # num of class

6

In [48]:
word2index={'<PAD>': 0, '<UNK>': 1}

for vo in vocab:
    if word2index.get(vo) is None:
        word2index[vo] = len(word2index)

In [49]:
word2index

{'<PAD>': 0,
 '<UNK>': 1,
 'Renoir': 2,
 'Gatsby': 3,
 'Alvin': 4,
 'recorded': 5,
 'level': 6,
 'Dane': 7,
 'drills': 8,
 'advertizing': 9,
 'Democratic': 10,
 'fare': 11,
 'Janelle': 12,
 'def': 13,
 'tokens': 14,
 'duck': 15,
 'nicknamed': 16,
 'Anopheles': 17,
 'layers': 18,
 'breeding': 19,
 '.dbf': 20,
 'cash-conscious': 21,
 'Genesis': 22,
 'mehitabel': 23,
 'named': 24,
 'piece': 25,
 'bought': 26,
 'profession': 27,
 'Thatcher': 28,
 'Brown': 29,
 'National': 30,
 'first': 31,
 'systems': 32,
 'cancer': 33,
 'popcorn': 34,
 'Dr.': 35,
 'steel': 36,
 'death': 37,
 'biggest': 38,
 'J.': 39,
 'motto': 40,
 'Dolphins': 41,
 'Night': 42,
 'colleges': 43,
 'had': 44,
 'total': 45,
 'Chinese': 46,
 'dare': 47,
 'claim': 48,
 'deck': 49,
 'produced': 50,
 'album': 51,
 'Game': 52,
 'two': 53,
 'Challengers': 54,
 'organization': 55,
 'gold': 56,
 'embassy': 57,
 'where': 58,
 'him': 59,
 'Club': 60,
 'Las': 61,
 'Loop': 62,
 'shower': 63,
 'Internet': 64,
 'Representatives': 65,
 'tou

In [50]:
index2word = {v:k for k, v in word2index.items()}

In [51]:
target2index = {}

for cl in set(y):
    if target2index.get(cl) is None:
        target2index[cl] = len(target2index)

index2target = {v:k for k, v in target2index.items()}

In [52]:
target2index

{'ABBR': 0, 'DESC': 1, 'ENTY': 2, 'LOC': 3, 'HUM': 4, 'NUM': 5}

In [54]:
list(zip(X))

[(['manner',
   'How',
   'did',
   'serfdom',
   'develop',
   'in',
   'and',
   'then',
   'leave',
   'Russia',
   '?'],),
 (['cremat',
   'What',
   'films',
   'featured',
   'the',
   'character',
   'Popeye',
   'Doyle',
   '?'],),
 (['manner',
   'How',
   'can',
   'I',
   'find',
   'a',
   'list',
   'of',
   'celebrities',
   "'",
   'real',
   'names',
   '?'],),
 (['animal',
   'What',
   'fowl',
   'grabs',
   'the',
   'spotlight',
   'after',
   'the',
   'Chinese',
   'Year',
   'of',
   'the',
   'Monkey',
   '?'],),
 (['exp', 'What', 'is', 'the', 'full', 'form', 'of', '.com', '?'],),
 (['ind',
   'What',
   'contemptible',
   'scoundrel',
   'stole',
   'the',
   'cork',
   'from',
   'my',
   'lunch',
   '?'],),
 (['gr',
   'What',
   'team',
   'did',
   'baseball',
   "'s",
   'St.',
   'Louis',
   'Browns',
   'become',
   '?'],),
 (['title', 'What', 'is', 'the', 'oldest', 'profession', '?'],),
 (['def', 'What', 'are', 'liver', 'enzymes', '?'],),
 (['ind',
   '

In [55]:
X_p, y_p = [], []
for pair in zip(X,y):
    X_p.append(prepare_sequence(pair[0], word2index).view(1, -1))
    y_p.append(Variable(LongTensor([target2index[pair[1]]])).view(1, -1))
    
data_p = list(zip(X_p, y_p))
random.shuffle(data_p)

train_data = data_p[: int(len(data_p) * 0.9)]
test_data = data_p[int(len(data_p) * 0.9):]

In [56]:
train_data[:5]

[(tensor([[1636,  194, 2435, 2003, 1130, 1607,   45,  532, 1918, 2270, 1607, 2335,
           2521]], device='cuda:0'),
  tensor([[1]], device='cuda:0')),
 (tensor([[ 898, 2067,  192, 2740, 1537, 1783,  170, 1490, 1931,  292, 1384, 2740,
           2291, 2521]], device='cuda:0'),
  tensor([[4]], device='cuda:0')),
 (tensor([[ 843, 2067,   51,  263,  463, 1065, 2270, 2740,  289, 1759, 2159, 1384,
           1592, 2521]], device='cuda:0'),
  tensor([[2]], device='cuda:0')),
 (tensor([[ 843, 2067, 1064, 1968,  811,  917, 2738, 1759,  811, 1384, 2740, 1251,
           2521]], device='cuda:0'),
  tensor([[2]], device='cuda:0')),
 (tensor([[ 782, 2067, 2126, 2740, 2193, 1759, 1921, 2521]], device='cuda:0'),
  tensor([[5]], device='cuda:0'))]

In [58]:
import gensim

In [61]:
model = gensim.models.KeyedVectors.load_word2vec_format('./data/GoogleNews-vectors-negative300.bin', binary=True)

ValueError: invalid literal for int() with base 10: 'version'