## 前準備

In [0]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [38]:
USING_COLAB = False
try:
    import google.colab
    USING_COLAB = True
    !rm *
    !apt-get install python3.6-gdbm

except ImportError:
    pass

rm: cannot remove 'datalab': Is a directory
Reading package lists... Done
Building dependency tree       
Reading state information... Done
Note, selecting 'python3-gdbm' instead of 'python3.6-gdbm'
python3-gdbm is already the newest version (3.6.3-0ubuntu1).
0 upgraded, 0 newly installed, 0 to remove and 2 not upgraded.


### train.txt, devel.txt, test.txt, word2idx.dbをアップロード

In [39]:
if USING_COLAB:
    from google.colab import files

    # Open Colab's upload dialog; the result is keyed by uploaded file name.
    uploaded = files.upload()

    # Echo the name and size of every uploaded file as a sanity check.
    for fn, content in uploaded.items():
        report = 'User uploaded file "{name}" with length {length} bytes'
        print(report.format(name=fn, length=len(content)))

### pytorchのインストール

In [0]:
# Colab only: install PyTorch when `import torch` fails.
if USING_COLAB:
    try:
      import torch
    except ModuleNotFoundError:
      # http://pytorch.org/
      from os import path
      # NOTE(review): wheel.pep425tags may be missing from recent `wheel`
      # releases -- confirm before relying on this branch.
      from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
      # Wheel tag "<impl><version>-<abi>" that is substituted into the URL below.
      platform = '{}{}-{}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag())

      # Use the 'cu80' wheel (presumably the CUDA 8.0 build) when nvidia-smi
      # exists at this path, otherwise the 'cpu' wheel.
      accelerator = 'cu80' if path.exists('/opt/bin/nvidia-smi') else 'cpu'

      # Installs the pinned torch 0.3.0.post4 wheel plus torchvision.
      # NOTE(review): the wheel is fetched over plain http -- consider https.
      !pip install -q http://download.pytorch.org/whl/{accelerator}/torch-0.3.0.post4-{platform}-linux_x86_64.whl torchvision
      import torch

## pytorchのimportとモデルの定義
参考:http://pytorch.org/tutorials/beginner/nlp/deep_learning_tutorial.html?highlight=logistic%20regression

In [11]:
# Author: Robert Guthrie

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Fix PyTorch's random seed so that runs are repeatable.
torch.manual_seed(1)

<torch._C.Generator at 0x7f22c0994290>

In [0]:
class BoWClassifier(nn.Module):
    """Bag-of-words classifier: one affine layer followed by log-softmax.

    Args:
        num_labels: number of output classes.
        vocab_size: length of the bag-of-words input vector.
    """

    def __init__(self, num_labels, vocab_size):
        # nn.Module has to be initialised before any sub-module is attached.
        super().__init__()

        # The only trainable component: an affine map (A, b) from word counts
        # (vocab_size inputs) to per-label scores (num_labels outputs).
        # log-softmax has no parameters, so nothing else is registered here.
        self.linear = nn.Linear(in_features=vocab_size, out_features=num_labels)

    def forward(self, bow_vec):
        """Return log-probabilities over the labels for each row of `bow_vec`."""
        scores = self.linear(bow_vec)
        return F.log_softmax(scores, dim=1)


def make_bow_vector(sentence, word_to_ix):
    """Count the words of `sentence` into a (1, len(word_to_ix)) float tensor.

    Args:
        sentence: iterable of words; every word must be a key of `word_to_ix`.
        word_to_ix: mapping from word to its column in the vector.

    Returns:
        A tensor of shape (1, len(word_to_ix)) holding the per-word counts.
    """
    counts = torch.zeros(len(word_to_ix))
    for column in (word_to_ix[token] for token in sentence):
        counts[column] += 1
    return counts.reshape(1, -1)


def make_target(label, label_to_ix):
    """Return the index of `label` in `label_to_ix` as a one-element LongTensor."""
    class_index = label_to_ix[label]
    return torch.LongTensor([class_index])

## 新しい関数の定義

In [0]:
def fvec2dic(fvec_str):
    """Parse a feature-vector string into an {index: frequency} dict.

    `fvec_str` holds whitespace-separated "idx:freq" tokens, e.g. "1:3 2:4";
    both parts are converted to int.  When an index occurs more than once the
    last occurrence wins.
    """
    pairs = (token.split(":") for token in fvec_str.strip().split())
    return {int(idx): int(freq) for idx, freq in pairs}

def make_bow_vector2(line, n_words):
    """Build a (1, n_words) bag-of-words tensor from one data line.

    Args:
        line: "<label> <idx>:<freq> <idx>:<freq> ...", for example
            "1 1:3 2:4 654:1".  Indices are 1-based.  A line that holds only
            the label (no features) yields an all-zero vector.
        n_words: vocabulary size, i.e. the length of the vector.

    Returns:
        Float tensor of shape (1, n_words); column ``idx - 1`` holds ``freq``.
        When an index occurs more than once the last occurrence wins.

    Raises:
        ValueError: if the line is empty or a token is not "int:int".
        IndexError: if an index lies outside 1..n_words.
    """
    vec = torch.zeros(n_words)
    fields = line.split()
    if not fields:
        raise ValueError("empty data line: expected '<label> idx:freq ...'")
    # fields[0] is the label; everything after it is an "idx:freq" token.
    for token in fields[1:]:
        idx_str, freq_str = token.split(":")
        idx, freq = int(idx_str), int(freq_str)
        # Indices are 1-based.  Without this check idx <= 0 would silently
        # wrap around and write to the end of the vector.
        if not 1 <= idx <= n_words:
            raise IndexError(
                "feature index {} out of range 1..{}".format(idx, n_words))
        vec[idx - 1] = freq
    return vec.view(1, -1)

def make_target2(line):
    """Return the label of one data line as a one-element LongTensor.

    Args:
        line: "<label> <idx>:<freq> ...".  Only the first whitespace-separated
            field is read, so a line that holds just the label is accepted.

    Raises:
        ValueError: if the line is empty or the label is not an integer.
    """
    fields = line.split(maxsplit=1)
    if not fields:
        raise ValueError("empty data line: no label found")
    return torch.LongTensor([int(fields[0])])

## ファイルの読み込み

In [0]:
# Load the vocabulary: word_to_ix maps every word to a unique integer, its
# index into the bag-of-words vector.  The shelve database is copied into a
# plain dict so it can be closed straight away.
import shelve

with shelve.open("word2idx.db") as vocab_db:
    word_to_ix = dict(vocab_db.items())


def _read_lines(path):
    """Return the stripped content of the text file `path`, split on newlines."""
    with open(path, "r") as handle:
        return handle.read().strip().split("\n")


# One data line per example.
test_data = _read_lines("test.txt")
devel_data = _read_lines("devel.txt")
train_data = _read_lines("train.txt")

VOCAB_SIZE = len(word_to_ix)
NUM_LABELS = 2

In [41]:
# This could be done more neatly with TensorDataset / DataLoader.  The sketch
# below is an inert string literal: it is displayed as the cell's value but
# never executed (Xtrain, ytrain, ... are not defined in the visible cells).
"""
train = torch.utils.data.TensorDataset(torch.from_numpy(Xtrain), torch.from_numpy(ytrain))
train_loader = torch.utils.data.DataLoader(train, batch_size=100, shuffle=True)

devel = torch.utils.data.TensorDataset(torch.from_numpy(Xdevel), torch.from_numpy(ydevel))
devel_loader = torch.utils.data.DataLoader(devel, batch_size=100, shuffle=True)

test = torch.utils.data.TensorDataset(torch.from_numpy(Xtest), torch.from_numpy(ytest))
test_loader = torch.utils.data.DataLoader(test, batch_size=100, shuffle=True)
"""

'\ntrain = torch.utils.data.TensorDataset(torch.from_numpy(Xtrain), torch.from_numpy(ytrain))\ntrain_loader = torch.utils.data.DataLoader(train, batch_size=100, shuffle=True)\n\ndevel = torch.utils.data.TensorDataset(torch.from_numpy(Xdevel), torch.from_numpy(ydevel))\ndevel_loader = torch.utils.data.DataLoader(devel, batch_size=100, shuffle=True)\n\ntest = torch.utils.data.TensorDataset(torch.from_numpy(Xtest), torch.from_numpy(ytest))\ntest_loader = torch.utils.data.DataLoader(test, batch_size=100, shuffle=True)\n'

In [15]:
# Instantiate the (still untrained) classifier.
model = BoWClassifier(NUM_LABELS, VOCAB_SIZE)

# nn.Module registers the parameters of every sub-module assigned in
# __init__ (here: self.linear), so the model can enumerate them itself.
# The first entry printed is the weight matrix A, the second the bias b.
for named_param in model.named_parameters():
    print(*named_param, sep="\n")

# Smoke test: push the first training example through the model.  The BoW
# vector has to be wrapped in an autograd.Variable before the forward pass.
sample = train_data[0]
bow_vector = make_bow_vector2(sample, VOCAB_SIZE)
log_probs = model(autograd.Variable(bow_vector))
print(log_probs)

linear.weight
Parameter containing:
 2.3134e-03 -1.9817e-03 -8.7040e-04  ...   2.4336e-03 -9.8698e-04 -6.1097e-04
-1.9158e-03 -4.9809e-04  4.1348e-03  ...   2.1656e-03 -2.2442e-03  3.8343e-03
[torch.FloatTensor of size 2x49607]

linear.bias
Parameter containing:
1.00000e-03 *
  1.7290
  2.9111
[torch.FloatTensor of size 2]

Variable containing:
-0.5520 -0.8576
[torch.FloatTensor of size 1x2]



In [42]:
# Spot-check the predictions on the first and the last five dev-set lines.
def peep_in_dev(verbose=False, use_cuda=False):
    """Print predicted vs. gold label for the first and last 5 lines of the dev data.

    Reads the notebook globals `model`, `devel_data` and `VOCAB_SIZE`.

    Args:
        verbose: also print the log-probabilities of every example.
        use_cuda: move each input to the GPU before the forward pass; the
            model is then expected to be on the GPU as well.
    """
    head_and_tail = devel_data[:5] + devel_data[-5:]
    for line in head_and_tail:
        bow = autograd.Variable(make_bow_vector2(line, VOCAB_SIZE))
        if use_cuda:
            bow = bow.cuda()
        log_probs = model(bow)
        gold = make_target2(line).tolist()[0]
        # Index of the larger log-probability, as a plain numpy scalar.
        predicted = torch.max(log_probs, dim=1)[1].cpu().data.numpy()[0]
        print("pred: {}\tans: {}".format(predicted, gold))
        if verbose:
            print(log_probs)


peep_in_dev()

pred: 1	ans: 1
pred: 1	ans: 1
pred: 1	ans: 1
pred: 0	ans: 1
pred: 1	ans: 1
pred: 0	ans: 0
pred: 0	ans: 0
pred: 1	ans: 0
pred: 0	ans: 0
pred: 0	ans: 0


## 学習

In [20]:
# from ipywidgets import FloatProgress
# from IPython.display import display
import random

n_epochs = 50

# Optional progress bar (disabled).
# progress = FloatProgress(min=0, max=n_epochs) # instantiate the bar
# display(progress) # display the bar

# Train on the GPU when one is available, otherwise stay on the CPU so the
# cell also runs on machines without CUDA.
use_gpu = torch.cuda.is_available()
if use_gpu:
    # Move the model before the optimizer is constructed, so the optimizer is
    # built from the parameters in their final location.
    model.cuda()

loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# Shuffle the training data once, in place.  Seeding Python's RNG makes the
# example order repeatable after a kernel restart (torch.manual_seed does not
# affect the `random` module).
random.seed(1)
random.shuffle(train_data)

# Every epoch is one full pass over the (shuffled) training data, one example
# per parameter update.
for epoch in range(n_epochs):
    if epoch % 10 == 0:
        print("epoch {}/{}".format(epoch, n_epochs))
    # progress.value += 1
    for line in train_data:
        # Step 1. PyTorch accumulates gradients, so clear them before each
        # example.
        model.zero_grad()

        # Step 2. Build the BoW input and wrap the integer class label as the
        # target; NLLLoss reads the log-probability at that index.
        bow_vec = autograd.Variable(make_bow_vector2(line, VOCAB_SIZE))
        target = autograd.Variable(make_target2(line))
        if use_gpu:
            bow_vec, target = bow_vec.cuda(), target.cuda()

        # Step 3. Run the forward pass.
        log_prob = model(bow_vec)

        # Step 4. Compute the loss and the gradients, then update the
        # parameters with optimizer.step().
        loss = loss_function(log_prob, target)
        loss.backward()
        optimizer.step()

epoch 0/50
epoch 1/50
epoch 2/50
epoch 3/50
epoch 4/50
epoch 5/50
epoch 6/50
epoch 7/50
epoch 8/50
epoch 9/50
epoch 10/50
epoch 11/50
epoch 12/50
epoch 13/50
epoch 14/50
epoch 15/50
epoch 16/50
epoch 17/50
epoch 18/50
epoch 19/50
epoch 20/50
epoch 21/50
epoch 22/50
epoch 23/50
epoch 24/50
epoch 25/50
epoch 26/50
epoch 27/50
epoch 28/50
epoch 29/50
epoch 30/50
epoch 31/50
epoch 32/50
epoch 33/50
epoch 34/50
epoch 35/50
epoch 36/50
epoch 37/50
epoch 38/50
epoch 39/50
epoch 40/50
epoch 41/50
epoch 42/50
epoch 43/50
epoch 44/50
epoch 45/50
epoch 46/50
epoch 47/50
epoch 48/50
epoch 49/50


In [25]:
# Repeat the spot check after training, this time printing the
# log-probabilities too.  use_cuda=True moves each input to the GPU, so the
# model is expected to be on the GPU at this point.
peep_in_dev(verbose=True, use_cuda=True)

pred: 1	ans: 1
Variable containing:
-1031.9666     0.0000
[torch.cuda.FloatTensor of size 1x2 (GPU 0)]

pred: 1	ans: 1
Variable containing:
-472.7870    0.0000
[torch.cuda.FloatTensor of size 1x2 (GPU 0)]

pred: 1	ans: 1
Variable containing:
-291.0772    0.0000
[torch.cuda.FloatTensor of size 1x2 (GPU 0)]

pred: 0	ans: 1
Variable containing:
   0.0000 -102.4945
[torch.cuda.FloatTensor of size 1x2 (GPU 0)]

pred: 1	ans: 1
Variable containing:
-339.0583    0.0000
[torch.cuda.FloatTensor of size 1x2 (GPU 0)]

pred: 0	ans: 0
Variable containing:
   0.0000 -464.7249
[torch.cuda.FloatTensor of size 1x2 (GPU 0)]

pred: 0	ans: 0
Variable containing:
    0.0000 -1586.5566
[torch.cuda.FloatTensor of size 1x2 (GPU 0)]

pred: 1	ans: 0
Variable containing:
-513.5421    0.0000
[torch.cuda.FloatTensor of size 1x2 (GPU 0)]

pred: 0	ans: 0
Variable containing:
-0.4282 -1.0547
[torch.cuda.FloatTensor of size 1x2 (GPU 0)]

pred: 0	ans: 0
Variable containing:
   0.0000 -179.0793
[torch.cuda.FloatTensor of

In [0]:
# Two hand-written sample reviews as raw text.  No cell visible in this
# section reads them -- presumably meant for manual prediction tests.
my_review01 = "this movie was great . \n i felt so happy to be able to watch it . "
my_review02 = "this movie was boring bad obscure sad dislike hate unrealistic outdated . "

In [0]:
def fscore(preds, labels):
    """Compute precision, recall and F1 for the positive class (label 1).

    Args:
        preds: predicted labels (0 or 1).
        labels: gold labels, aligned with `preds`.

    Returns:
        (precision, recall, f1); a ratio whose denominator is zero is
        reported as 0.
    """
    pairs = list(zip(preds, labels))
    true_pos = sum(1 for pred, gold in pairs if pred == 1 and gold == 1)
    false_pos = sum(1 for pred, gold in pairs if pred == 1 and not gold == 1)
    false_neg = sum(1 for pred, gold in pairs if gold == 1 and pred == 0)

    predicted_pos = true_pos + false_pos
    actual_pos = true_pos + false_neg
    precision = true_pos / predicted_pos if predicted_pos != 0 else 0
    recall = true_pos / actual_pos if actual_pos != 0 else 0
    if recall + precision != 0:
        f1 = 2 * precision * recall / (recall + precision)
    else:
        f1 = 0
    return precision, recall, f1

In [32]:
# Score the whole dev set on the CPU with the hand-written fscore().
model.cpu()
labels = [make_target2(line).tolist()[0] for line in devel_data]
preds = []
for line in devel_data:
    log_prob = model(autograd.Variable(make_bow_vector2(line, VOCAB_SIZE)))
    # Index of the larger log-probability, as a plain numpy scalar.
    preds.append(torch.max(log_prob, dim=1)[1].cpu().data.numpy()[0])
pre, rec, f1 = fscore(preds, labels)
print("precision:{:.4f}\nrecall:{:.4f}\nf-measure:{:.4f}".format(pre, rec, f1))

precision:0.7500
recall:0.8100
f-measure:0.7788


In [33]:
from sklearn.metrics import precision_score, recall_score, f1_score
def sk_fscore(labels, preds):
    """Return (precision, recall, f1) as computed by scikit-learn.

    Note the argument order: gold `labels` first, then `preds`.
    """
    assert len(labels) == len(preds)
    metrics = (precision_score, recall_score, f1_score)
    return tuple(metric(labels, preds) for metric in metrics)


# Cross-check the hand-written fscore() against scikit-learn.
pre, rec, f1 = sk_fscore(labels, preds)
print("precision:{:.4f}\nrecall:{:.4f}\nf-measure:{:.4f}".format(pre, rec, f1))

precision:0.7500
recall:0.8100
f-measure:0.7788


In [0]:
import numpy as np
from sklearn.linear_model import LogisticRegression
# Baseline: scikit-learn logistic regression on the same BoW features.
lr = LogisticRegression()


def _dense_rows(lines):
    """Turn data lines into plain-Python BoW rows (one list of floats per line)."""
    return [make_bow_vector2(line, VOCAB_SIZE).tolist()[0] for line in lines]


def _gold_labels(lines):
    """Extract the integer label of every data line."""
    return [make_target2(line).tolist()[0] for line in lines]


train_bow, train_labels = _dense_rows(train_data), _gold_labels(train_data)
devel_bow, devel_labels = _dense_rows(devel_data), _gold_labels(devel_data)


In [0]:
# Fit the logistic-regression baseline on the training rows, then predict a
# label for every dev row.
lr.fit(train_bow, train_labels)
pred = lr.predict(devel_bow)

In [36]:
# Score the baseline's dev predictions with the scikit-learn metrics wrapper.
pre, rec, f1 = sk_fscore(devel_labels, pred)
print("precision:{:.4f}\nrecall:{:.4f}\nf-measure:{:.4f}".format(pre, rec, f1))

precision:0.7636
recall:0.8400
f-measure:0.8000
