In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%matplotlib inline
import torch
import numpy as np

knock80

In [12]:
import collections
import pandas as pd
import re
import string


def word_to_id(input_word, word_id):
  result_list = []
  for word in input_word.split():
    if word in word_id:
      result_list.append(str(word_id[word]))
    else:
      result_list.append("0")

  return " ".join(result_list)

def preprocess(text: str) -> str:
  text = re.sub(r'\s*\bNaN\b\s*', '', text)
  text = text.lower()
  text = re.sub(r'\d+', '', text)
  text = text.translate(str.maketrans('', '', string.punctuation))
  text = re.sub(r'\s+', ' ', text).strip()
  return text

train = pd.read_csv("/content/drive/MyDrive/train.txt", sep="\t", index_col=0)
test = pd.read_csv("/content/drive/MyDrive/test.txt", sep="\t", index_col=0)

train["flg"] = "train"
test["flg"] = "test"
train_test = pd.concat([train, test])

train_test["TITLE"] = train_test["TITLE"].apply(preprocess)

all_sentence_list = " ".join(train_test["TITLE"].tolist()).split(" ")

all_word_cnt = collections.Counter(all_sentence_list)

word_cnt_over2 = [i for i in all_word_cnt.items() if i[1] >= 2]
word_cnt_over2 = sorted(word_cnt_over2, key=lambda x: x[1], reverse=True)

word_over2 = [i[0] for i in word_cnt_over2]
id_list = [i for i in range(1, len(word_over2))]

word_id_dict = dict(zip(word_over2, id_list))

train_test["TITLE"] = train_test["TITLE"].apply(
    lambda x: word_to_id(x, word_id_dict)
)

train = train_test.query('flg == "train"')
test = train_test.query('flg == "test"')

knock81

In [15]:
label = {"b":0, "t":1, "e":2, "m":3}
train["CATEGORY"] = train["CATEGORY"].map(label)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train["CATEGORY"] = train["CATEGORY"].map(label)


In [5]:
import torch.nn as nn
from torch.utils.data import Dataset

class TextDataset(Dataset):
  def __init__(self, X, y):
    self.X = X
    self.y = y

  def __len__(self):
    return len(self.X)

  def __getitem__(self, idx):
    X_list = [int(x) for x in self.X[idx].split()]
    inputs = torch.tensor(X_list)
    label = torch.tensor(self.y[idx])
    return inputs, label

class RNN(nn.Module):
  def __init__(self, vocab_size, emb_dim=300, hidden_size=50, output_size=4):
    super().__init__()
    self.emb = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
    self.rnn = nn.RNN(input_size=emb_dim, hidden_size=hidden_size, num_layers=1, nonlinearity="tanh", bias=True)
    self.fc = nn.Linear(in_features=hidden_size, out_features=output_size, bias=True)
    self.softmax = nn.Softmax(dim=1)

  def forward(self, x, h_0=None):
    x = self.emb(x)
    x, h_t = self.rnn(x, h_0)
    x = x[:, -1, :]
    x = self.fc(x)
    x = self.softmax(x)
    return x

model = RNN(len(word_id_dict))
ds = TextDataset(train["TITLE"], train["CATEGORY"])

for i in range(10):
  X = ds[i][0]
  X = X.unsqueeze(0)
  print(model(x=X))

tensor([[0.2857, 0.2739, 0.1043, 0.3361]], grad_fn=<SoftmaxBackward0>)
tensor([[0.1539, 0.3277, 0.3565, 0.1619]], grad_fn=<SoftmaxBackward0>)
tensor([[0.1887, 0.3303, 0.2299, 0.2510]], grad_fn=<SoftmaxBackward0>)
tensor([[0.2876, 0.2322, 0.2893, 0.1909]], grad_fn=<SoftmaxBackward0>)
tensor([[0.1269, 0.3080, 0.2982, 0.2669]], grad_fn=<SoftmaxBackward0>)
tensor([[0.1269, 0.3080, 0.2982, 0.2669]], grad_fn=<SoftmaxBackward0>)
tensor([[0.2670, 0.2618, 0.1298, 0.3414]], grad_fn=<SoftmaxBackward0>)
tensor([[0.2670, 0.2618, 0.1298, 0.3414]], grad_fn=<SoftmaxBackward0>)
tensor([[0.2073, 0.2658, 0.2786, 0.2484]], grad_fn=<SoftmaxBackward0>)
tensor([[0.3211, 0.3112, 0.2228, 0.1449]], grad_fn=<SoftmaxBackward0>)


knock82

In [32]:
class RNN(nn.Module):
  def __init__(self, vocab_size, emb_dim=300, hidden_size=50, output_size=4):
    super().__init__()
    self.emb = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
    self.rnn = nn.RNN(input_size=emb_dim, hidden_size=hidden_size, num_layers=1, nonlinearity="tanh", bias=True, batch_first=True)
    self.fc = nn.Linear(in_features=hidden_size, out_features=output_size, bias=True)
    self.softmax = nn.Softmax(dim=1)

  def forward(self, x, h_0=None):
    x = self.emb(x)
    x, h_t = self.rnn(x, h_0)
    x = x[:, -1, :]
    x = self.fc(x)
    x = self.softmax(x)
    return x

In [34]:
import os
import random
import time
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
from tqdm import tqdm

def seed_everything(seed=42, use_torch=False):
  random.seed(seed)
  os.environ["PYTHONHASHSEED"] = str(seed)
  np.random.seed(seed)
  if use_torch:
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

seed_everything(use_torch=True)

def train_fn(model, loader, optimizer, criterion, BATCHSIZE, HIDDEN_SIZE) -> float:
  model.train()
  train_running_loss = 0.0
  for dataloader_x, dataloader_y in loader:
    optimizer.zero_grad()
    dataloader_y_pred_prob = model(x=dataloader_x, h_0=torch.zeros(1 * 1, BATCHSIZE, HIDDEN_SIZE))
    loss = criterion(dataloader_y_pred_prob, dataloader_y)
    loss.backward()
    optimizer.step()
    train_running_loss += loss.item() / len(loader)
  return train_running_loss

def calculate_loss_and_accuracy(model, dataset, device=None, criterion=None):
  model.eval()
  dataloader = DataLoader(dataset, batch_size=1, shuffle=False)

  loss = 0.0
  total = 0
  correct = 0

  with torch.no_grad():
    for dataloader_x, dataloader_y in dataloader:
      outputs = model(dataloader_x)
      loss += criterion(outputs, dataloader_y).item()
      pred = torch.argmax(outputs, dim=-1)
      total += len(dataloader_x)
      correct += (pred == dataloader_y).sum().item()

  return loss / len(dataset), correct / total

def padding(id_seq: str, max_len: int):
  id_list = id_seq.split(" ")
  if len(id_list) > max_len:
    id_list = id_list[:max_len]
  else:
    pad_num = max_len - len(id_list)
    for _ in range(pad_num):
      id_list.append("0")
  return " ".join(id_list)


start = time.time()

max_len = 10
train["TITLE"] = train["TITLE"].apply(lambda x: padding(x, max_len))
test["TITLE"] = test["TITLE"].apply(lambda x: padding(x, max_len))
test["CATEGORY"] = test["CATEGORY"].map(label)

N_LETTERS = len(word_id_dict) + 1
EMB_SIZE = 300
HIDDEN_SIZE = 50
N_CATEGORIES = 4

model = RNN(vocab_size=N_LETTERS, emb_dim=EMB_SIZE, hidden_size=HIDDEN_SIZE, output_size=N_CATEGORIES)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.05)

dataset_train = TextDataset(train["TITLE"], train["CATEGORY"])
dataset_test = TextDataset(test["TITLE"], test["CATEGORY"])

BATCHSIZE = 1
dataloader_train = DataLoader(dataset_train, batch_size=BATCHSIZE, shuffle=False, drop_last=True)

train_losses = []
train_accs = []
test_losses = []
test_accs = []

device = (torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu"))

EPOCH = 10
for epoch in tqdm(range(EPOCH)):

  train_running_loss = train_fn(model, dataloader_train, optimizer, criterion, BATCHSIZE, HIDDEN_SIZE)
  print(train_running_loss)

  train_loss, train_acc = calculate_loss_and_accuracy(model, dataset_train, device, criterion)
  test_loss, test_acc = calculate_loss_and_accuracy(model, dataset_test, device, criterion)

  train_losses.append(train_loss)
  train_accs.append(train_acc)

  test_losses.append(test_loss)
  test_accs.append(test_acc)

  if epoch % 20 == 0:
      torch.save(model.state_dict(), f"82_model_epoch{epoch}.pth")
      torch.save(
          optimizer.state_dict(),
          f"82_optimizer_epoch{epoch}.pth",
      )

print(f"train_acc: {train_acc}")
print(f"test_acc: {test_acc}")

elapsed_time = time.time() - start
print(elapsed_time)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train["TITLE"] = train["TITLE"].apply(lambda x: padding(x, max_len))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test["TITLE"] = test["TITLE"].apply(lambda x: padding(x, max_len))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test["CATEGORY"] = test["CATEGORY"].map(label)
  0%|          | 0/10 

1.1971740226416323


  0%|          | 0/10 [00:47<?, ?it/s]


KeyError: ignored

knock83

In [51]:
def train_fn(model, loader, device, optimizer, criterion, BATCHSIZE, HIDDEN_SIZE) -> float:
    model.train()
    train_running_loss = 0.0

    for dataloader_x, dataloader_y in loader:
        dataloader_x = dataloader_x.to(device)
        dataloader_y = dataloader_y.to(device)
        optimizer.zero_grad()
        dataloader_y_pred_prob = model(x=dataloader_x, h_0=torch.zeros(1 * 1, BATCHSIZE, HIDDEN_SIZE, device=device))
        loss = criterion(dataloader_y_pred_prob, dataloader_y)
        loss.backward()
        optimizer.step()
        train_running_loss += loss.item() / len(loader)

    return train_running_loss

device = torch.device("cuda")

In [31]:
model = RNN(vocab_size=N_LETTERS, emb_dim=EMB_SIZE, hidden_size=HIDDEN_SIZE, output_size=N_CATEGORIES).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.05)

dataset_train = TextDataset(train["TITLE"], train["CATEGORY"])
dataset_test = TextDataset(test["TITLE"], test["CATEGORY"])

BATCHSIZE = 32
dataloader_train = DataLoader(dataset_train, batch_size=BATCHSIZE, shuffle=True, drop_last=True)

train_losses = []
train_accs = []
test_losses = []
test_accs = []

EPOCH = 10
for epoch in tqdm(range(EPOCH)):
  train_running_loss = train_fn(
    model,
    dataloader_train,
    device,
    optimizer,
    criterion,
    BATCHSIZE,
    HIDDEN_SIZE,
  )
  print(train_running_loss)

  train_loss, train_acc = calculate_loss_and_accuracy(model, dataset_train, device, criterion)
  test_loss, test_acc = calculate_loss_and_accuracy(model, dataset_test, device, criterion)

  train_losses.append(train_loss)
  train_accs.append(train_acc)

  test_losses.append(test_loss)
  test_accs.append(test_acc)

  if epoch % 20 == 0:
      torch.save(model.state_dict(), f"83_model_epoch{epoch}.pth")
      torch.save(optimizer.state_dict(), f"83_optimizer_epoch{epoch}.pth")

print(f"train_acc: {train_acc}")
print(f"test_acc: {test_acc}")

  0%|          | 0/10 [00:00<?, ?it/s]

1.3201183052034346





RuntimeError: ignored

knock84

In [35]:
from gensim.models import KeyedVectors
model_GN = KeyedVectors.load_word2vec_format('/content/drive/MyDrive/GoogleNews-vectors-negative300.bin.gz', binary=True)

In [49]:
from numpy.lib.function_base import kaiser

class TextDataset(Dataset):
    def __init__(self, X, y, device):
        self.X = X
        self.y = y
        self.device = device

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        X_list = [int(x) for x in self.X[idx].split()]
        inputs = torch.tensor(X_list, device=self.device)
        label = torch.tensor(self.y[idx], device=self.device)
        return inputs, label

class RNN(nn.Module):
  def __init__(self, vocab_size, emb_dim, hidden_size, output_size, word_id_dict):
    super().__init__()
    model = model_GN
    weight = torch.zeros(len(word_id_dict) + 1, 300)
    for word, idx in word_id_dict.items():
      try:
        a = model[word]
        weight[idx] = torch.tensor(model[word])
      except:
        continue

    self.emb = nn.Embedding.from_pretrained(weight, padding_idx=0)

    self.rnn = nn.RNN(
        input_size=emb_dim,
        hidden_size=hidden_size,
        num_layers=1,
        nonlinearity="tanh",
        bias=True,
        batch_first=True,
    )

    self.fc = nn.Linear(in_features=hidden_size, out_features=output_size, bias=True)

    self.softmax = nn.Softmax(dim=1)

  def forward(self, x, h_0=None):
    x = self.emb(x)
    x, h_t = self.rnn(x, h_0)
    x = x[:, -1, :]
    x = self.fc(x)
    x = self.softmax(x)
    return x

In [53]:
test = test.reset_index(drop=True)

model = RNN(
  vocab_size=N_LETTERS,
  emb_dim=EMB_SIZE,
  hidden_size=HIDDEN_SIZE,
  output_size=N_CATEGORIES,
  word_id_dict=word_id_dict,
).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.05)

dataset_train = TextDataset(train["TITLE"], train["CATEGORY"], device)
dataset_test = TextDataset(test["TITLE"], test["CATEGORY"], device)

train_losses = []
train_accs = []
test_losses = []
test_accs = []

EPOCH = 10
for epoch in tqdm(range(EPOCH)):

  train_running_loss = train_fn(
    model,
    dataloader_train,
    device,
    optimizer,
    criterion,
    BATCHSIZE,
    HIDDEN_SIZE,
  )
  print(train_running_loss)

  train_loss, train_acc = calculate_loss_and_accuracy(
      model, dataset_train, device, criterion
  )
  test_loss, test_acc = calculate_loss_and_accuracy(
      model, dataset_test, device, criterion
  )

  train_losses.append(train_loss)
  train_accs.append(train_acc)

  test_losses.append(test_loss)
  test_accs.append(test_acc)

print(f"train_acc: {train_acc}")
print(f"test_acc: {test_acc}")

  0%|          | 0/10 [00:00<?, ?it/s]

1.2637914074514989


  0%|          | 0/10 [00:27<?, ?it/s]


RuntimeError: ignored

knock85

In [54]:
model = RNN(
  vocab_size=N_LETTERS,
  emb_dim=EMB_SIZE,
  hidden_size=HIDDEN_SIZE,
  output_size=N_CATEGORIES,
  word_id_dict=word_id_dict,
).to(device)

optimizer = torch.optim.SGD(model.parameters(), lr=0.05)

train_losses = []
train_accs = []
test_losses = []
test_accs = []

EPOCH = 10
for epoch in tqdm(range(EPOCH)):

  train_running_loss = train_fn(
    model,
    dataloader_train,
    device,
    optimizer,
    criterion,
    BATCHSIZE,
    HIDDEN_SIZE,
  )
  print(train_running_loss)

  train_loss, train_acc = calculate_loss_and_accuracy(model, dataset_train, device, criterion)
  test_loss, test_acc = calculate_loss_and_accuracy(model, dataset_test, device, criterion)

  train_losses.append(train_loss)
  train_accs.append(train_acc)

  test_losses.append(test_loss)
  test_accs.append(test_acc)

print(f"train_acc: {train_acc: .4f}")
print(f"test_acc: {test_acc: .4f}")

  0%|          | 0/10 [00:00<?, ?it/s]

1.313560871845826


  0%|          | 0/10 [00:27<?, ?it/s]


RuntimeError: ignored