In [0]:
%matplotlib inline

## train.txt, test.txtをアップロード

In [2]:
from google.colab import files

# Open the Colab upload dialog and keep whatever it returns.
uploaded = files.upload()

# Report the name and size of every uploaded entry.
for fn, content in uploaded.items():
  summary = 'User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(content))
  print(summary)

Saving test.txt to test (1).txt
Saving train.txt to train (1).txt
User uploaded file "test.txt" with length 464128 bytes
User uploaded file "train.txt" with length 3721758 bytes


## pytorchのインストール

In [0]:
# Install PyTorch only when it cannot already be imported in this runtime.
try:
  import torch
except ModuleNotFoundError:
  # http://pytorch.org/
  from os import path
  from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
  # Assemble the interpreter/ABI tag that is substituted into the wheel file name below.
  platform = '{}{}-{}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag())

  # Pick the 'cu80' wheel directory (presumably the CUDA 8.0 build) when
  # nvidia-smi exists at this path, otherwise the 'cpu' one.
  accelerator = 'cu80' if path.exists('/opt/bin/nvidia-smi') else 'cpu'

  # NOTE(review): torch is pinned to 0.3.0.post4 while torchvision is unpinned, and
  # the wheel is fetched over plain http — confirm both are still intended.
  !pip install -q http://download.pytorch.org/whl/{accelerator}/torch-0.3.0.post4-{platform}-linux_x86_64.whl torchvision
  import torch

In [3]:
# Author: Robert Guthrie

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Seed PyTorch's random number generator so every run starts from the same RNG state.
torch.manual_seed(1)

<torch._C.Generator at 0x7fd73a2cc350>

In [0]:
# word_to_ix maps each word in the vocab to a unique integer, which will be its
# index into the Bag of words vector
# NOTE(review): `f` below is the text file object returned by open(); file
# objects have no .items() method, so this loop raises AttributeError as
# written. Presumably a dict-like store (e.g. shelve/dbm, given the ".db"
# name) was intended — confirm the on-disk format before changing the loader.
word_to_ix = {}
with open("word_to_idx.db", "r") as f:
    for key, value in f.items():
        word_to_ix[key] = value

# Each list keeps every line of the file as one raw string (newline included).
# NOTE(review): the upload cell reports "test.txt" / "train.txt", but the paths
# opened here are "test_data" / "train_data" — confirm which names exist.
with open("test_data", "r") as f:
    test_data = [line for line in f]
    
with open("train_data", "r") as f:
    train_data = [line for line in f]

# Vocabulary size is the number of entries loaded into word_to_ix.
VOCAB_SIZE = len(word_to_ix)
# Number of output classes handed to the classifier.
NUM_LABELS = 2

In [0]:
class BoWClassifier(nn.Module):
    """Bag-of-words classifier: a single affine layer followed by log-softmax."""

    def __init__(self, num_labels, vocab_size):
        """Create the affine map from a vocab_size-long count vector to num_labels scores."""
        # Call the superclass (nn.Module) initialiser before assigning any layer.
        super().__init__()

        # Define the parameters this model needs: nn.Linear provides the affine
        # map (A and b). Its input size is vocab_size because the input is a
        # bag-of-words vector; its output size is one score per label.
        # log-softmax has no parameters, so nothing else is stored here.
        self.linear = nn.Linear(in_features=vocab_size, out_features=num_labels)

    def forward(self, bow_vec):
        """Return log-probabilities over the labels for a batch of BoW row vectors."""
        scores = self.linear(bow_vec)
        # Normalise across dimension 1, i.e. across the label scores of each row.
        return F.log_softmax(scores, dim=1)


def make_bow_vector(sentence, word_to_ix):
    """Count the words of `sentence` into a 1 x len(word_to_ix) tensor.

    Every item of `sentence` is looked up in `word_to_ix` and the matching
    position is incremented by one; an item missing from `word_to_ix`
    raises KeyError.
    """
    counts = torch.zeros(len(word_to_ix))
    for position in (word_to_ix[token] for token in sentence):
        counts[position] += 1
    # Add a leading batch dimension so the result is a single row.
    return counts.unsqueeze(0)


def make_target(label, label_to_ix):
    """Return the index of `label` in `label_to_ix` as a one-element LongTensor."""
    target_index = label_to_ix[label]
    return torch.LongTensor([target_index])


# Build the classifier sized for the loaded vocabulary and the label count.
model = BoWClassifier(num_labels=NUM_LABELS, vocab_size=VOCAB_SIZE)

{'me': 0, 'gusta': 1, 'comer': 2, 'en': 3, 'la': 4, 'cafeteria': 5, 'Give': 6, 'it': 7, 'to': 8, 'No': 9, 'creo': 10, 'que': 11, 'sea': 12, 'una': 13, 'buena': 14, 'idea': 15, 'is': 16, 'not': 17, 'a': 18, 'good': 19, 'get': 20, 'lost': 21, 'at': 22, 'Yo': 23, 'si': 24, 'on': 25}
Parameter containing:

Columns 0 to 9 
 0.1860 -0.1301  0.0245  0.1464  0.1421  0.1218 -0.1419 -0.1412 -0.1186  0.0246
-0.1341 -0.1647 -0.0899 -0.0228 -0.1202  0.0717  0.0607 -0.0444  0.0754  0.0634

Columns 10 to 19 
 0.1955 -0.1239  0.1045 -0.1085 -0.1844 -0.0417  0.1130  0.1821 -0.1218  0.0426
 0.1197  0.1321 -0.0664  0.1916 -0.0227 -0.0067 -0.1851 -0.1262 -0.1146 -0.0839

Columns 20 to 25 
 0.1692  0.1300  0.1222  0.1394  0.1240  0.0507
 0.1394 -0.0641 -0.1466  0.0755  0.0628  0.1270
[torch.FloatTensor of size 2x26]

Parameter containing:
-0.1015
 0.0425
[torch.FloatTensor of size 2]

Variable containing:
-0.3691 -1.1756
[torch.FloatTensor of size 1x2]



In [0]:
# The model knows its parameters. The first output below is A, the second is b.
# Assigning a component to an attribute in __init__, as done with
#     self.linear = nn.Linear(...)
# is what lets the module (here BoWClassifier) keep track of the nn.Linear's
# parameters.
for param in model.parameters():
    print(param)

# To run the model, pass in a BoW vector wrapped in an autograd.Variable.
# The training loop parses each line of train_data as
# "<label> <idx>:<freq> <idx>:<freq> ..." (see make_bow_vector2 in a later
# cell), so the line is not a list of words: feeding it to
# make_bow_vector(sample, word_to_ix) would iterate over single characters and
# look each one up in the word vocabulary. Parse the indexed features directly.
sample = train_data[0]
sample_fields = sample.strip().split(maxsplit=1)
bow_vector = torch.zeros(VOCAB_SIZE)
# sample_fields[0] is the label; a label-only line leaves the vector all zero.
if len(sample_fields) == 2:
    for pair in sample_fields[1].split():
        idx, freq = pair.split(":")
        bow_vector[int(idx)] = int(freq)
bow_vector = bow_vector.view(1, -1)
log_probs = model(autograd.Variable(bow_vector))
print(log_probs)

In [0]:
def fvec2dic(fvec_str):
  """Convert a feature-vector string into a dict of index -> frequency.

  `fvec_str` is a whitespace-separated sequence of "idx:freq" tokens. Both
  parts are converted with int(); a token that does not split into exactly
  two parts on ":" raises ValueError. A repeated idx keeps its last freq.
  """
  pairs = (token.split(":") for token in fvec_str.strip().split())
  return {int(idx): int(freq) for idx, freq in pairs}

def make_bow_vector2(line, n_words):
    """Build a 1 x n_words bag-of-words tensor from one data line.

    `line` has the form "<label> <idx>:<freq> <idx>:<freq> ...", for example
    "1 1:3 2:4 3:1 ... 654:1". The leading label is ignored here; every idx is
    used directly as a position in the vector and set to its freq.

    A line that carries only a label (no features) yields an all-zero vector
    instead of failing on tuple unpacking. Malformed "idx:freq" tokens raise
    whatever fvec2dic raises, and an idx outside the vector raises an
    indexing error from torch.
    """
    vec = torch.zeros(n_words)
    fields = line.strip().split(maxsplit=1)
    # fields[0] is the label; the feature string is absent for label-only lines.
    if len(fields) == 2:
        for idx, freq in fvec2dic(fields[1]).items():
            vec[idx] = freq
    # Reshape to a single row so the model receives a batch of size one.
    return vec.view(1, -1)

def make_target2(line, n_labels):
    """Return the label of one data line as a one-element LongTensor.

    `line` has the form "<label> <idx>:<freq> ..."; only the first
    whitespace-separated field is read and converted with int(). A line that
    carries only a label is accepted. `n_labels` is kept for interface
    compatibility and is not used.

    Raises ValueError when the line is empty or the label is not an integer.
    """
    fields = line.strip().split(maxsplit=1)
    if not fields:
        raise ValueError("empty line: no label found")
    # Wrap the value in a list: torch.LongTensor(n) with a bare int allocates
    # an uninitialised tensor of n elements rather than a tensor holding n.
    return torch.LongTensor([int(fields[0])])

In [0]:
import random

# Run on test data before we train, just to see a before-and-after
for line in test_data:
    bow_vec = autograd.Variable(make_bow_vector2(line, VOCAB_SIZE))
    log_probs = model(bow_vec)
    print(log_probs)

loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# Shuffle a copy with a dedicated, seeded generator: the instance order is then
# the same on every run, and re-running this cell does not reshuffle an
# already-shuffled train_data.
SHUFFLE_SEED = 1
shuffled_train_data = list(train_data)
random.Random(SHUFFLE_SEED).shuffle(shuffled_train_data)

# Number of passes over the training data. Usually somewhere between 5 and 30
# epochs is reasonable; lower this value if a full run takes too long.
N_EPOCHS = 100

for epoch in range(N_EPOCHS):
    for line in shuffled_train_data:
        # Step 1. Remember that PyTorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Make our BOW vector and wrap the integer target in a
        # Variable. For example, if the line's label is 0, the loss function
        # reads element 0 of the log probabilities as the log probability of
        # the correct class.
        bow_vec = autograd.Variable(make_bow_vector2(line, VOCAB_SIZE))
        target = autograd.Variable(make_target2(line, NUM_LABELS))

        # Step 3. Run our forward pass.
        log_probs = model(bow_vec)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss = loss_function(log_probs, target)
        loss.backward()
        optimizer.step()

print("[test data]")
for line in test_data:
    bow_vec = autograd.Variable(make_bow_vector2(line, VOCAB_SIZE))
    log_probs = model(bow_vec)
    print(log_probs)

# Show the learned weight column for one vocabulary entry. The first parameter
# of the model is the weight matrix of the linear layer, so slicing a column
# gives the per-label weights of that entry.
# NOTE(review): "creo" comes from the original two-sentence tutorial; guard the
# lookup so a vocabulary without it does not abort the cell with KeyError.
print("matrix column for corresponding to 'creo'.")
if "creo" in word_to_ix:
    print(next(model.parameters())[:, word_to_ix["creo"]])
else:
    print("'creo' is not in word_to_ix; skipping.")

Variable containing:
-0.0736 -2.6452
[torch.FloatTensor of size 1x2]

Variable containing:
-3.1347 -0.0445
[torch.FloatTensor of size 1x2]

matrix column for corresponding to 'creo'.
Variable containing:
 0.8268
-0.5116
[torch.FloatTensor of size 2]

[test data]
instance: Yo creo que si
label: SPANISH
Variable containing:
-0.0678 -2.7255
[torch.FloatTensor of size 1x2]

instance: it is lost on me
label: ENGLISH
Variable containing:
-3.2532 -0.0394
[torch.FloatTensor of size 1x2]

matrix column for corresponding to 'creo'.
Variable containing:
 0.8466
-0.5314
[torch.FloatTensor of size 2]



We got the right answer! You can see that the log probability for
Spanish is much higher in the first example, and the log probability for
English is much higher in the second for the test data, as it should be.

Now you have seen how to build a PyTorch component, pass some data through it,
and perform gradient updates. We are ready to dig deeper into what deep NLP
has to offer.


