In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%matplotlib inline
import torch
import numpy as np

knock80

In [56]:
import pandas as pd
import re

def preprc(data):
    x = []
    y = []
    label = {"b":0, "e":1, "t":2, "m":3} #ビジネス:0, エンターテインメント:1, 科学技術:2, 健康:3
    for l in data:
        d = l.split("\t")
        xd = [i.rstrip("\n").strip(":'$%?,.()!;").strip('"') for i in re.sub("[0-9]+", "0", d[1]).lower().split(" ") if i != "" and i != "...\n"]
        x.append(xd)
        y.append(label[d[0]])
    return x, y

train = []
with open('/content/drive/MyDrive/train.txt', encoding='utf-8') as f:
  for l in f:
    train.append(l)
X_train, Y_train = preprc(train)

In [57]:
def cwi(ws, wid):
  '''
  単語のリストとid番号の辞書を受け取り、id番号のリストを返す
  '''
  ids = [wid.get(w, 0) for w in ws]
  return ids

w_cnt = {}
for t in X_train:
  for w in t:
    w_cnt[w] = w_cnt.get(w, 0) + 1
w_cnt_sorted = sorted(w_cnt.items(), key=lambda x: x[1], reverse=True)
w_id = {}
for i, w in enumerate(w_cnt_sorted, 1):
  if w[1] < 2:
    break
  w_id[w[0]] = i

knock81

In [97]:
Xid_train = [cwi(t, w_id) for t in X_train]

In [90]:
import torch.nn as nn
from torch.utils.data import Dataset

class TextDataset(Dataset):
  def __init__(self, X, y):
    self.X = X
    self.y = y

  def __len__(self):
    return len(self.X)

  def __getitem__(self, idx):
    X_list = [x for x in self.X[idx]]
    inputs = torch.tensor(X_list)
    label = torch.tensor(self.y[idx])
    return inputs, label

class RNN(nn.Module):
  def __init__(self, vocab_size, emb_dim=300, hidden_size=50, output_size=4):
    super().__init__()
    self.emb = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
    self.rnn = nn.RNN(input_size=emb_dim, hidden_size=hidden_size, num_layers=1, nonlinearity="tanh", bias=True)
    self.fc = nn.Linear(in_features=hidden_size, out_features=output_size, bias=True)
    self.softmax = nn.Softmax(dim=1)

  def forward(self, x, h_0=None):
    x = self.emb(x)
    x, h_t = self.rnn(x, h_0)
    x = self.fc(x)
    x = self.softmax(x)
    return x

model = RNN(len(w_id))
ds = TextDataset(Xid_train, Y_train)

for i in range(10):
  X = ds[i][0]
  X.unsqueeze(0)
  print(model(x=X)[-1])

tensor([0.3437, 0.1723, 0.2938, 0.1902], grad_fn=<SelectBackward0>)
tensor([0.2344, 0.2446, 0.1978, 0.3232], grad_fn=<SelectBackward0>)
tensor([0.2510, 0.1975, 0.3507, 0.2007], grad_fn=<SelectBackward0>)
tensor([0.1803, 0.4115, 0.2490, 0.1592], grad_fn=<SelectBackward0>)
tensor([0.1972, 0.2187, 0.4696, 0.1145], grad_fn=<SelectBackward0>)
tensor([0.3315, 0.2255, 0.2565, 0.1865], grad_fn=<SelectBackward0>)
tensor([0.3176, 0.1294, 0.1767, 0.3763], grad_fn=<SelectBackward0>)
tensor([0.1634, 0.3666, 0.3025, 0.1676], grad_fn=<SelectBackward0>)
tensor([0.1535, 0.2949, 0.4009, 0.1507], grad_fn=<SelectBackward0>)
tensor([0.3186, 0.2656, 0.1583, 0.2576], grad_fn=<SelectBackward0>)


knock82

In [98]:
valid = []
test = []
with open('/content/drive/MyDrive/valid.txt', encoding='utf-8') as f:
  for l in f:
    valid.append(l)
with open('/content/drive/MyDrive/test.txt', encoding='utf-8') as f:
  for l in f:
    test.append(l)
X_valid, Y_valid = preprc(valid)
X_test, Y_test = preprc(test)
Xid_valid = [cwi(t, w_id) for t in X_valid]
Xid_test = [cwi(t, w_id) for t in X_test]

In [110]:
import os
import random
import time
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
from tqdm import tqdm

def seed_everything(seed=42, use_torch=False):
  random.seed(seed)
  os.environ["PYTHONHASHSEED"] = str(seed)
  np.random.seed(seed)
  if use_torch:
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

seed_everything(use_torch=True)

def train_fn(model, loader, optimizer, criterion, BATCHSIZE, HIDDEN_SIZE) -> float:
  model.train()
  train_running_loss = 0.0
  for dataloader_x, dataloader_y in loader:
    optimizer.zero_grad()
    dataloader_y_pred_prob = model(x=dataloader_x)
    print(dataloader_y_pred_prob)
    loss = criterion(dataloader_y_pred_prob, dataloader_y)
    loss.backward()
    optimizer.step()
    train_running_loss += loss.item() / len(loader)
  return train_running_loss

def calculate_loss_and_accuracy(model, dataset, device=None, criterion=None):
  model.eval()
  dataloader = DataLoader(dataset, batch_size=1, shuffle=False)

  loss = 0.0
  total = 0
  correct = 0

  with torch.no_grad():
    for dataloader_x, dataloader_y in dataloader:
      outputs = model(dataloader_x)
      loss += criterion(outputs, dataloader_y).item()
      pred = torch.argmax(outputs, dim=-1)
      total += len(dataloader_x)
      correct += (pred == dataloader_y).sum().item()

  return loss / len(dataset), correct / total

def padding(id_list, max_len):
  if len(id_list) > max_len:
    id_list = id_list[:max_len]
  else:
    pad_num = max_len - len(id_list)
    for _ in range(pad_num):
      id_list.append(0)
  return id_list

def make_graph(value_dict: dict, value_name: str, bn: int, method: str):
  for phase in ["train", "test"]:
    plt.plot(value_dict[phase], label=phase)
  plt.xlabel("epoch")
  plt.ylabel(value_name)
  plt.title(f"{value_name} per epoch at bn{bn}")
  plt.legend()
  plt.savefig(f"{method}_{value_name}_bn{bn}.png")
  plt.close()


start = time.time()

max_len = 10
Xid_train = [padding(i, max_len) for i in Xid_train]
Xid_test = [padding(i, max_len) for i in Xid_test]

N_LETTERS = len(w_id) + 1
EMB_SIZE = 300
HIDDEN_SIZE = 50
N_CATEGORIES = 4

model = RNN(vocab_size=N_LETTERS, emb_dim=EMB_SIZE, hidden_size=HIDDEN_SIZE, output_size=N_CATEGORIES)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.05)

dataset_train = TextDataset(Xid_train, Y_train)
dataset_test = TextDataset(Xid_test, Y_test)

BATCHSIZE = 1
dataloader_train = DataLoader(dataset_train, batch_size=BATCHSIZE, shuffle=False, drop_last=True)

train_losses = []
train_accs = []
test_losses = []
test_accs = []

device = (torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu"))

EPOCH = 10
for epoch in tqdm(range(EPOCH)):

  train_running_loss = train_fn(model, dataloader_train, optimizer, criterion, BATCHSIZE, HIDDEN_SIZE)
  print(train_running_loss)

  train_loss, train_acc = calculate_loss_and_accuracy(model, dataset_train, device, criterion)
  test_loss, test_acc = calculate_loss_and_accuracy(model, dataset_test, device, criterion)

  train_losses.append(train_loss)
  train_accs.append(train_acc)

  test_losses.append(test_loss)
  test_accs.append(test_acc)

  if epoch % 20 == 0:
      torch.save(model.state_dict(), f"82_model_epoch{epoch}.pth")
      torch.save(
          optimizer.state_dict(),
          f"82_optimizer_epoch{epoch}.pth",
      )

losses = {"train": train_losses, "test": test_losses}

accs = {"train": train_accs, "test": test_accs}

make_graph(losses, "losses", bn=BATCHSIZE, method="rnn")
make_graph(accs, "accs", bn=BATCHSIZE, method="rnn")

print(f"train_acc: {train_acc}")
print(f"test_acc: {test_acc}")

elapsed_time = time.time() - start
print(elapsed_time)

  0%|          | 0/10 [00:00<?, ?it/s]

tensor([[[0.1272, 0.1184, 0.0731, 0.0681],
         [0.0858, 0.0557, 0.0519, 0.1484],
         [0.0519, 0.0915, 0.1495, 0.1712],
         [0.1978, 0.1605, 0.1368, 0.0493],
         [0.1034, 0.0379, 0.1144, 0.0885],
         [0.0334, 0.0947, 0.1129, 0.0950],
         [0.1274, 0.1390, 0.0488, 0.1111],
         [0.0333, 0.0990, 0.1015, 0.1157],
         [0.0750, 0.0987, 0.1766, 0.0726],
         [0.1648, 0.1046, 0.0344, 0.0800]]], grad_fn=<SoftmaxBackward0>)





RuntimeError: ignored

knock83

In [113]:
def train_fn(model, loader, device, optimizer, criterion, BATCHSIZE, HIDDEN_SIZE) -> float:
    model.train()
    train_running_loss = 0.0

    for dataloader_x, dataloader_y in loader:
        dataloader_x.to(device)
        dataloader_y.to(device)
        optimizer.zero_grad()

        dataloader_y_pred_prob = model(x=dataloader_x, h_0=torch.zeros(1 * 1, BATCHSIZE, HIDDEN_SIZE))

        loss = criterion(dataloader_y_pred_prob, dataloader_y)
        loss.backward()
        optimizer.step()

        train_running_loss += loss.item() / len(loader)

    return train_running_loss

N_LETTERS = len(w_id) + 1
EMB_SIZE = 300
HIDDEN_SIZE = 50
N_CATEGORIES = 4

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

model = RNN(vocab_size=N_LETTERS, emb_dim=EMB_SIZE, hidden_size=HIDDEN_SIZE, output_size=N_CATEGORIES).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.05)

BATCHSIZE = 32
dataloader_train = DataLoader(dataset_train, batch_size=BATCHSIZE, shuffle=True, drop_last=True)

train_losses = []
train_accs = []
test_losses = []
test_accs = []

EPOCH = 10
for epoch in tqdm(range(EPOCH)):
  train_running_loss = train_fn(
    model,
    dataloader_train,
    device,
    optimizer,
    criterion,
    BATCHSIZE,
    HIDDEN_SIZE,
  )
  print(train_running_loss)

  train_loss, train_acc = calculate_loss_and_accuracy(model, dataset_train, device, criterion)
  test_loss, test_acc = calculate_loss_and_accuracy(model, dataset_test, device, criterion)

  train_losses.append(train_loss)
  train_accs.append(train_acc)

  test_losses.append(test_loss)
  test_accs.append(test_acc)

  if epoch % 20 == 0:
      torch.save(model.state_dict(), f"83_model_epoch{epoch}.pth")
      torch.save(optimizer.state_dict(), f"83_optimizer_epoch{epoch}.pth")

losses = {"train": train_losses, "test": test_losses}

accs = {"train": train_accs, "test": test_accs}

make_graph(losses, "losses", method="rnn")
make_graph(accs, "accs", method="rnn")

print(f"train_acc: {train_acc}")
print(f"test_acc: {test_acc}")

  0%|          | 0/10 [00:00<?, ?it/s]


RuntimeError: ignored

knock84

In [121]:
from gensim.models import KeyedVectors
model_GN = KeyedVectors.load_word2vec_format('/content/drive/MyDrive/GoogleNews-vectors-negative300.bin.gz', binary=True)

In [124]:
class RNN(nn.Module):
  def __init__(self, vocab_size, emb_dim, hidden_size, output_size, word_id_dict):
    super().__init__()
    model = model_GN
    weight = torch.zeros(len(word_id_dict) + 1, 300)
    for word, idx in word_id_dict.items():
      if word in model.vocab.keys():
        weight[idx] = torch.tensor(model[word])

    self.emb = nn.Embedding.from_pretrained(weight, padding_idx=0)

    self.rnn = nn.RNN(
        input_size=emb_dim,
        hidden_size=hidden_size,
        num_layers=1,
        nonlinearity="tanh",
        bias=True,
        batch_first=True,
    )

    self.fc = nn.Linear(in_features=hidden_size, out_features=output_size, bias=True)

    self.softmax = nn.Softmax(dim=1)

  def forward(self, x, h_0=None):
    x = self.emb(x)
    x, h_t = self.rnn(x, h_0)
    x = x[:, -1, :]
    x = self.fc(x)
    x = self.softmax(x)
    return x

In [125]:
model = RNN(
  vocab_size=N_LETTERS,
  emb_dim=EMB_SIZE,
  hidden_size=HIDDEN_SIZE,
  output_size=N_CATEGORIES,
  word_id_dict=w_id,
).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.05)

train_losses = []
train_accs = []
test_losses = []
test_accs = []

EPOCH = 10
for epoch in tqdm(range(EPOCH)):

  train_running_loss = train_fn(
    model,
    dataloader_train,
    device,
    optimizer,
    criterion,
    BATCHSIZE,
    HIDDEN_SIZE,
  )
  print(train_running_loss)

  train_loss, train_acc = calculate_loss_and_accuracy(
      model, dataset_train, device, criterion
  )
  test_loss, test_acc = calculate_loss_and_accuracy(
      model, dataset_test, device, criterion
  )

  train_losses.append(train_loss)
  train_accs.append(train_acc)

  test_losses.append(test_loss)
  test_accs.append(test_acc)

losses = {"train": train_losses, "test": test_losses}

accs = {"train": train_accs, "test": test_accs}

make_graph(losses, "losses", bn=BATCHSIZE, method="rnn_pretrain")
make_graph(accs, "accs", bn=BATCHSIZE, method="rnn_pretrain")

print(f"train_acc: {train_acc}")
print(f"test_acc: {test_acc}")

AttributeError: ignored

knock85

In [128]:
model = RNN(
  vocab_size=N_LETTERS,
  emb_dim=EMB_SIZE,
  hidden_size=HIDDEN_SIZE,
  output_size=N_CATEGORIES,
  word_id_dict=w_id,
).to(device)

EPOCH = 10
for epoch in tqdm(range(EPOCH)):

  train_running_loss = train_fn(
    model,
    dataloader_train,
    device,
    optimizer,
    criterion,
    BATCHSIZE,
    HIDDEN_SIZE,
  )
  print(train_running_loss)

  train_loss, train_acc = calculate_loss_and_accuracy(
    model, dataset_train, device, criterion
  )
  test_loss, test_acc = calculate_loss_and_accuracy(
    model, dataset_test, device, criterion
  )

  train_losses.append(train_loss)
  train_accs.append(train_acc)

  test_losses.append(test_loss)
  test_accs.append(test_acc)

losses = {"train": train_losses, "test": test_losses}

accs = {"train": train_accs, "test": test_accs}

make_graph(losses, "losses", bn=BATCHSIZE, method="rnn_bidirectional_3layer")
make_graph(accs, "accs", bn=BATCHSIZE, method="rnn_bidirectional_3layer")

print(f"train_acc: {train_acc: .4f}")
print(f"test_acc: {test_acc: .4f}")

AttributeError: ignored