In [3]:
!pwd
%cd drive/MyDrive/nlp100/chapter08

/content
/content/drive/MyDrive/nlp100/chapter08


In [4]:
# knock50

import csv

import pandas as pd
from sklearn.model_selection import train_test_split
# FORMAT: ID \t TITLE \t URL \t PUBLISHER \t CATEGORY \t STORY \t HOSTNAME \t TIMESTAMP
SEED = 42  # fixed seed so the train/valid/test split is reproducible across runs

# Read with quoting disabled: a stray double quote inside a title would
# otherwise be treated as a field delimiter and swallow the following rows.
df = pd.read_csv(
    "newsCorpora.csv",
    sep="\t",
    header=None,
    names=["ID", "TITLE", "URL", "PUBLISHER", "CATEGORY", "STORY", "HOSTNAME", "TIMESTAMP"],
    quoting=csv.QUOTE_NONE,
)

# Keep only the articles from the target publishers, and only TITLE / CATEGORY
publishers = ['Reuters', 'Huffington Post', 'Businessweek', 'Contactmusic.com', 'Daily Mail']
df = df.loc[df['PUBLISHER'].isin(publishers), ["TITLE", "CATEGORY"]]

# Shuffle and split 80% / 10% / 10% into train / valid / test
train, holdout = train_test_split(df, test_size=0.2, shuffle=True, random_state=SEED)
test, valid = train_test_split(holdout, test_size=0.5, shuffle=True, random_state=SEED)

# Save each split as a headerless TSV file
train.to_csv("train.txt", sep="\t", index=False, header=False)
valid.to_csv("valid.txt", sep="\t", index=False, header=False)
test.to_csv("test.txt", sep="\t", index=False, header=False)

# Show the number of examples per category in each split
print("train\n", train["CATEGORY"].value_counts())
print("valid\n", valid["CATEGORY"].value_counts())
print("test\n", test["CATEGORY"].value_counts())

train
 b    4496
e    4220
t    1210
m     746
Name: CATEGORY, dtype: int64
valid
 b    559
e    535
t    158
m     82
Name: CATEGORY, dtype: int64
test
 b    572
e    524
t    156
m     82
Name: CATEGORY, dtype: int64


In [5]:
# knock60

from gensim.models import KeyedVectors
# Load the pretrained word vectors from the gzipped binary word2vec file.
model = KeyedVectors.load_word2vec_format(
    "GoogleNews-vectors-negative300.bin.gz",
    binary=True,
)

In [10]:
# knock70
import string
import torch
from tqdm import tqdm

# Convert a title into a feature vector
def transform_w2v(text, w2v=None, dim=300):
  """Embed a title as the mean of the vectors of its in-vocabulary words.

  Args:
    text: Title string.
    w2v: Object supporting `word in w2v` and `w2v[word]`. Defaults to the
      notebook-level `model`. NOTE: a later cell rebinds `model` to the
      neural network, so pass the word vectors explicitly if this function
      is called after that point.
    dim: Length of the zero vector returned when no word is in the
      vocabulary; only used when `w2v` has no `vector_size` attribute.

  Returns:
    A 1-D torch tensor holding the averaged word vector, or an all-zero
    vector when none of the words is in the vocabulary (previously this
    case raised ZeroDivisionError).
  """
  if w2v is None:
    w2v = model
  # Replace every punctuation character with a space, then split on whitespace
  table = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
  words = text.translate(table).split()
  # Look up only the words that have a vector
  vecs = [w2v[word] for word in words if word in w2v]
  if not vecs:
    # No known word: fall back to a zero vector instead of dividing by zero
    return torch.zeros(getattr(w2v, 'vector_size', dim))
  return torch.tensor(sum(vecs) / len(vecs))

# Build the feature matrices: one averaged word vector per title, stacked row-wise
X_train = torch.stack(list(map(transform_w2v, train['TITLE'])))
X_valid = torch.stack(list(map(transform_w2v, valid['TITLE'])))
X_test = torch.stack(list(map(transform_w2v, test['TITLE'])))

print(X_train)

# Build the label vectors
category = {'b': 0, 't': 1, 'e': 2, 'm': 3}
# Translate each category letter to its integer id and wrap the result in a tensor
y_train = torch.tensor(train['CATEGORY'].map(category.__getitem__).to_numpy())
y_valid = torch.tensor(valid['CATEGORY'].map(category.__getitem__).to_numpy())
y_test = torch.tensor(test['CATEGORY'].map(category.__getitem__).to_numpy())

print(y_train)

# Persist every tensor under "<variable name>.pt"
for _tensor, _fname in (
    (X_train, 'X_train.pt'),
    (X_valid, 'X_valid.pt'),
    (X_test, 'X_test.pt'),
    (y_train, 'y_train.pt'),
    (y_valid, 'y_valid.pt'),
    (y_test, 'y_test.pt'),
):
  torch.save(_tensor, _fname)

tensor([[ 0.0221,  0.1608, -0.0503,  ..., -0.0142,  0.0284,  0.1073],
        [ 0.0812,  0.0148, -0.0837,  ..., -0.0396,  0.0188,  0.0505],
        [-0.0361,  0.0535, -0.0784,  ..., -0.0192,  0.0441, -0.0964],
        ...,
        [-0.0222,  0.1884,  0.0219,  ..., -0.0295,  0.0500, -0.0215],
        [ 0.0419, -0.0294,  0.0578,  ...,  0.0140,  0.0212,  0.0078],
        [ 0.0942, -0.0175, -0.0270,  ..., -0.0232,  0.1205, -0.0556]])
tensor([2, 2, 0,  ..., 2, 2, 2])


In [11]:
# knock71
from torch import nn

class SLPNet(nn.Module):
  """Single-layer network: one bias-free linear map from features to class scores."""

  def __init__(self, input_size, output_size):
    """Create the linear layer and re-initialise its weights from N(0, 1).

    Args:
      input_size: Number of input features.
      output_size: Number of output scores (one per class).
    """
    super().__init__()
    # Fully connected layer without a bias term
    self.fc = nn.Linear(in_features=input_size, out_features=output_size, bias=False)
    # Overwrite the default initialisation with standard-normal samples
    nn.init.normal_(self.fc.weight, mean=0.0, std=1.0)

  def forward(self, x):
    """Return the raw (pre-softmax) scores for `x`."""
    return self.fc(x)

# Initialise the single-layer neural network.
# NOTE: this rebinds `model`, which until now held the word2vec vectors that
# transform_w2v reads; reload them before re-running the feature-extraction cell.
model = SLPNet(300, 4)
# Class probabilities for the first training example
y_hat_1 = torch.softmax(model(X_train[:1]), dim=-1)
print(y_hat_1)
# Class probabilities for the first four examples. Call the module itself
# rather than .forward() so that any registered hooks are run as well.
Y_hat = torch.softmax(model(X_train[:4]), dim=-1)
print(Y_hat)

tensor([[0.4494, 0.0115, 0.1775, 0.3617]], grad_fn=<SoftmaxBackward0>)
tensor([[0.4494, 0.0115, 0.1775, 0.3617],
        [0.1374, 0.0695, 0.3424, 0.4507],
        [0.2929, 0.0241, 0.0339, 0.6491],
        [0.2620, 0.0510, 0.2106, 0.4764]], grad_fn=<SoftmaxBackward0>)


In [13]:
# knock72

# Cross-entropy loss applied to the raw scores produced by the network
criterion = nn.CrossEntropyLoss()

# --- Loss and gradient for the first training example ---
model.zero_grad()  # clear any gradient left over from a previous backward pass
l_1 = criterion(model(X_train[:1]), y_train[:1])
l_1.backward()
print(f'損失: {l_1}')
print(f'勾配:\n{model.fc.weight.grad}')

# --- Loss and gradient for the first four training examples ---
model.zero_grad()
l = criterion(model(X_train[:4]), y_train[:4])
l.backward()
print(f'損失: {l}')
print(f'勾配:\n{model.fc.weight.grad}')

損失: 1.728939414024353
勾配:
tensor([[ 0.0099,  0.0723, -0.0226,  ..., -0.0064,  0.0128,  0.0482],
        [ 0.0003,  0.0018, -0.0006,  ..., -0.0002,  0.0003,  0.0012],
        [-0.0182, -0.1322,  0.0414,  ...,  0.0117, -0.0233, -0.0882],
        [ 0.0080,  0.0582, -0.0182,  ..., -0.0052,  0.0103,  0.0388]])
損失: 1.3420171737670898
勾配:
tensor([[ 0.0025,  0.0096,  0.0384,  ..., -0.0146, -0.0183,  0.0496],
        [ 0.0019,  0.0010, -0.0044,  ...,  0.0002,  0.0017, -0.0007],
        [-0.0156, -0.0352,  0.0140,  ...,  0.0136, -0.0045, -0.0365],
        [ 0.0112,  0.0246, -0.0480,  ...,  0.0008,  0.0211, -0.0124]])


In [15]:
# knock73

from torch.utils.data import Dataset
from torch.utils.data import DataLoader

# Dataset wrapper
class NewsDataset(Dataset):
  """Pairs a feature container `X` with a label container `y`, indexed in parallel."""

  def __init__(self, X, y):
    """Store the features and labels without copying them."""
    self.X, self.y = X, y

  def __len__(self):
    """Return the number of labels."""
    return len(self.y)

  def __getitem__(self, idx):
    """Return `[features, label]` for position `idx`."""
    features = self.X[idx]
    label = self.y[idx]
    return [features, label]

# Wrap each split in a Dataset
dataset_train = NewsDataset(X=X_train, y=y_train)
dataset_valid = NewsDataset(X=X_valid, y=y_valid)
dataset_test = NewsDataset(X=X_test, y=y_test)

# Create the DataLoaders: training is shuffled and fed one example at a time,
# while validation and test are each served as a single full-size batch.
dataloader_train = DataLoader(
    dataset_train,
    batch_size=1,
    shuffle=True,
)
dataloader_valid = DataLoader(
    dataset_valid,
    batch_size=len(dataset_valid),
    shuffle=False,
)
dataloader_test = DataLoader(
    dataset_test,
    batch_size=len(dataset_test),
    shuffle=False,
)

# Fresh network, loss function and plain SGD optimiser
model = SLPNet(300, 4)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
# Training
num_epochs = 100
for epoch in tqdm(range(num_epochs)):
  model.train()
  loss_train = 0.0
  for inputs, labels in dataloader_train:
    optimizer.zero_grad()
    # Forward pass
    outputs = model(inputs)
    # Backpropagation
    loss = criterion(outputs, labels)
    loss.backward()
    # Parameter update
    optimizer.step()

    loss_train += loss.item()
  # Average over the number of mini-batches. The previous code divided by the
  # last enumerate index (batches - 1), which slightly overstated the loss.
  loss_train = loss_train / len(dataloader_train)
  # Loss on the validation data (the loader yields it as one full batch)
  model.eval()
  with torch.no_grad():
    inputs, labels = next(iter(dataloader_valid))
    outputs = model(inputs)
    loss_valid = criterion(outputs, labels)
  print(f'epoch: {epoch + 1}, loss_train: {loss_train:.4f}, loss_valid: {loss_valid:.4f}')

  1%|          | 1/100 [00:05<08:51,  5.37s/it]

epoch: 1, loss_train: 0.4767, loss_valid: 0.3416


  2%|▏         | 2/100 [00:09<07:28,  4.57s/it]

epoch: 2, loss_train: 0.3166, loss_valid: 0.3013


  3%|▎         | 3/100 [00:13<06:58,  4.32s/it]

epoch: 3, loss_train: 0.2868, loss_valid: 0.2857


  4%|▍         | 4/100 [00:18<07:21,  4.60s/it]

epoch: 4, loss_train: 0.2701, loss_valid: 0.2806


  5%|▌         | 5/100 [00:22<06:54,  4.36s/it]

epoch: 5, loss_train: 0.2595, loss_valid: 0.2764


  6%|▌         | 6/100 [00:26<06:37,  4.23s/it]

epoch: 6, loss_train: 0.2524, loss_valid: 0.2710


  7%|▋         | 7/100 [00:31<06:59,  4.51s/it]

epoch: 7, loss_train: 0.2461, loss_valid: 0.2690


  8%|▊         | 8/100 [00:35<06:41,  4.36s/it]

epoch: 8, loss_train: 0.2419, loss_valid: 0.2709


  9%|▉         | 9/100 [00:39<06:27,  4.25s/it]

epoch: 9, loss_train: 0.2384, loss_valid: 0.2688


 10%|█         | 10/100 [00:44<06:34,  4.38s/it]

epoch: 10, loss_train: 0.2351, loss_valid: 0.2707


 11%|█         | 11/100 [00:48<06:32,  4.41s/it]

epoch: 11, loss_train: 0.2324, loss_valid: 0.2734


 12%|█▏        | 12/100 [00:52<06:15,  4.26s/it]

epoch: 12, loss_train: 0.2308, loss_valid: 0.2675


 13%|█▎        | 13/100 [00:56<06:08,  4.24s/it]

epoch: 13, loss_train: 0.2286, loss_valid: 0.2695


 14%|█▍        | 14/100 [01:01<06:21,  4.44s/it]

epoch: 14, loss_train: 0.2281, loss_valid: 0.2699


 15%|█▌        | 15/100 [01:05<06:06,  4.32s/it]

epoch: 15, loss_train: 0.2256, loss_valid: 0.2668


 16%|█▌        | 16/100 [01:09<05:54,  4.22s/it]

epoch: 16, loss_train: 0.2249, loss_valid: 0.2703


 17%|█▋        | 17/100 [01:14<06:12,  4.48s/it]

epoch: 17, loss_train: 0.2236, loss_valid: 0.2744


 18%|█▊        | 18/100 [01:18<05:55,  4.34s/it]

epoch: 18, loss_train: 0.2229, loss_valid: 0.2696


 19%|█▉        | 19/100 [01:22<05:41,  4.22s/it]

epoch: 19, loss_train: 0.2221, loss_valid: 0.2730


 20%|██        | 20/100 [01:27<05:53,  4.42s/it]

epoch: 20, loss_train: 0.2208, loss_valid: 0.2729


 21%|██        | 21/100 [01:31<05:45,  4.37s/it]

epoch: 21, loss_train: 0.2205, loss_valid: 0.2767


 22%|██▏       | 22/100 [01:35<05:32,  4.26s/it]

epoch: 22, loss_train: 0.2196, loss_valid: 0.2747


 23%|██▎       | 23/100 [01:40<05:44,  4.47s/it]

epoch: 23, loss_train: 0.2194, loss_valid: 0.2754


 24%|██▍       | 24/100 [01:45<05:38,  4.46s/it]

epoch: 24, loss_train: 0.2185, loss_valid: 0.2860


 25%|██▌       | 25/100 [01:49<05:24,  4.32s/it]

epoch: 25, loss_train: 0.2179, loss_valid: 0.2752


 26%|██▌       | 26/100 [01:53<05:18,  4.31s/it]

epoch: 26, loss_train: 0.2173, loss_valid: 0.2820


 27%|██▋       | 27/100 [01:58<05:28,  4.51s/it]

epoch: 27, loss_train: 0.2166, loss_valid: 0.2761


 28%|██▊       | 28/100 [02:02<05:14,  4.37s/it]

epoch: 28, loss_train: 0.2165, loss_valid: 0.2781


 29%|██▉       | 29/100 [02:06<05:08,  4.34s/it]

epoch: 29, loss_train: 0.2160, loss_valid: 0.2809


 30%|███       | 30/100 [02:12<05:24,  4.64s/it]

epoch: 30, loss_train: 0.2159, loss_valid: 0.2774


 31%|███       | 31/100 [02:16<05:07,  4.46s/it]

epoch: 31, loss_train: 0.2157, loss_valid: 0.2806


 32%|███▏      | 32/100 [02:20<04:54,  4.33s/it]

epoch: 32, loss_train: 0.2151, loss_valid: 0.2793


 33%|███▎      | 33/100 [02:25<05:11,  4.65s/it]

epoch: 33, loss_train: 0.2155, loss_valid: 0.2817


 34%|███▍      | 34/100 [02:29<04:59,  4.53s/it]

epoch: 34, loss_train: 0.2143, loss_valid: 0.2825


 35%|███▌      | 35/100 [02:34<04:49,  4.46s/it]

epoch: 35, loss_train: 0.2140, loss_valid: 0.2894


 36%|███▌      | 36/100 [02:39<04:56,  4.64s/it]

epoch: 36, loss_train: 0.2141, loss_valid: 0.2803


 37%|███▋      | 37/100 [02:43<04:45,  4.52s/it]

epoch: 37, loss_train: 0.2140, loss_valid: 0.2795


 38%|███▊      | 38/100 [02:47<04:30,  4.37s/it]

epoch: 38, loss_train: 0.2136, loss_valid: 0.2817


 39%|███▉      | 39/100 [02:52<04:30,  4.44s/it]

epoch: 39, loss_train: 0.2138, loss_valid: 0.2853


 40%|████      | 40/100 [02:56<04:27,  4.46s/it]

epoch: 40, loss_train: 0.2137, loss_valid: 0.2823


 41%|████      | 41/100 [03:00<04:14,  4.32s/it]

epoch: 41, loss_train: 0.2136, loss_valid: 0.2838


 42%|████▏     | 42/100 [03:04<04:09,  4.30s/it]

epoch: 42, loss_train: 0.2127, loss_valid: 0.2841


 43%|████▎     | 43/100 [03:09<04:16,  4.50s/it]

epoch: 43, loss_train: 0.2137, loss_valid: 0.2818


 44%|████▍     | 44/100 [03:13<04:04,  4.36s/it]

epoch: 44, loss_train: 0.2124, loss_valid: 0.2846


 45%|████▌     | 45/100 [03:17<03:54,  4.26s/it]

epoch: 45, loss_train: 0.2126, loss_valid: 0.2848


 46%|████▌     | 46/100 [03:23<04:05,  4.55s/it]

epoch: 46, loss_train: 0.2131, loss_valid: 0.2877


 47%|████▋     | 47/100 [03:27<03:52,  4.38s/it]

epoch: 47, loss_train: 0.2128, loss_valid: 0.2831


 48%|████▊     | 48/100 [03:31<03:42,  4.27s/it]

epoch: 48, loss_train: 0.2123, loss_valid: 0.2881


 49%|████▉     | 49/100 [03:36<03:48,  4.48s/it]

epoch: 49, loss_train: 0.2120, loss_valid: 0.2886


 50%|█████     | 50/100 [03:40<03:40,  4.40s/it]

epoch: 50, loss_train: 0.2117, loss_valid: 0.2885


 51%|█████     | 51/100 [03:44<03:30,  4.29s/it]

epoch: 51, loss_train: 0.2114, loss_valid: 0.2850


 52%|█████▏    | 52/100 [03:48<03:30,  4.38s/it]

epoch: 52, loss_train: 0.2110, loss_valid: 0.2853


 53%|█████▎    | 53/100 [03:53<03:28,  4.44s/it]

epoch: 53, loss_train: 0.2112, loss_valid: 0.2862


 54%|█████▍    | 54/100 [03:57<03:18,  4.32s/it]

epoch: 54, loss_train: 0.2112, loss_valid: 0.2864


 55%|█████▌    | 55/100 [04:01<03:13,  4.29s/it]

epoch: 55, loss_train: 0.2117, loss_valid: 0.2864


 56%|█████▌    | 56/100 [04:06<03:17,  4.49s/it]

epoch: 56, loss_train: 0.2113, loss_valid: 0.2920


 57%|█████▋    | 57/100 [04:10<03:07,  4.36s/it]

epoch: 57, loss_train: 0.2110, loss_valid: 0.2868


 58%|█████▊    | 58/100 [04:14<02:58,  4.25s/it]

epoch: 58, loss_train: 0.2110, loss_valid: 0.2870


 59%|█████▉    | 59/100 [04:19<03:05,  4.52s/it]

epoch: 59, loss_train: 0.2109, loss_valid: 0.2914


 60%|██████    | 60/100 [04:23<02:55,  4.38s/it]

epoch: 60, loss_train: 0.2105, loss_valid: 0.2880


 61%|██████    | 61/100 [04:27<02:46,  4.27s/it]

epoch: 61, loss_train: 0.2112, loss_valid: 0.2872


 62%|██████▏   | 62/100 [04:35<03:15,  5.16s/it]

epoch: 62, loss_train: 0.2107, loss_valid: 0.2907


 63%|██████▎   | 63/100 [04:39<02:58,  4.82s/it]

epoch: 63, loss_train: 0.2100, loss_valid: 0.2885


 64%|██████▍   | 64/100 [04:43<02:45,  4.60s/it]

epoch: 64, loss_train: 0.2106, loss_valid: 0.2919


 65%|██████▌   | 65/100 [04:48<02:45,  4.72s/it]

epoch: 65, loss_train: 0.2107, loss_valid: 0.2896


 66%|██████▌   | 66/100 [04:52<02:35,  4.58s/it]

epoch: 66, loss_train: 0.2104, loss_valid: 0.2882


 67%|██████▋   | 67/100 [04:56<02:25,  4.42s/it]

epoch: 67, loss_train: 0.2101, loss_valid: 0.2906


 68%|██████▊   | 68/100 [05:01<02:23,  4.48s/it]

epoch: 68, loss_train: 0.2108, loss_valid: 0.2902


 69%|██████▉   | 69/100 [05:05<02:20,  4.52s/it]

epoch: 69, loss_train: 0.2103, loss_valid: 0.2869


 70%|███████   | 70/100 [05:09<02:11,  4.38s/it]

epoch: 70, loss_train: 0.2101, loss_valid: 0.2907


 71%|███████   | 71/100 [05:14<02:06,  4.36s/it]

epoch: 71, loss_train: 0.2104, loss_valid: 0.2876


 72%|███████▏  | 72/100 [05:19<02:06,  4.53s/it]

epoch: 72, loss_train: 0.2097, loss_valid: 0.2883


 73%|███████▎  | 73/100 [05:23<01:58,  4.38s/it]

epoch: 73, loss_train: 0.2098, loss_valid: 0.2895


 74%|███████▍  | 74/100 [05:27<01:50,  4.27s/it]

epoch: 74, loss_train: 0.2096, loss_valid: 0.2913


 75%|███████▌  | 75/100 [05:32<01:53,  4.54s/it]

epoch: 75, loss_train: 0.2105, loss_valid: 0.2907


 76%|███████▌  | 76/100 [05:36<01:45,  4.38s/it]

epoch: 76, loss_train: 0.2100, loss_valid: 0.2961


 77%|███████▋  | 77/100 [05:40<01:38,  4.27s/it]

epoch: 77, loss_train: 0.2102, loss_valid: 0.2961


 78%|███████▊  | 78/100 [05:45<01:39,  4.52s/it]

epoch: 78, loss_train: 0.2097, loss_valid: 0.2916


 79%|███████▉  | 79/100 [05:49<01:32,  4.41s/it]

epoch: 79, loss_train: 0.2098, loss_valid: 0.2929


 80%|████████  | 80/100 [05:53<01:25,  4.29s/it]

epoch: 80, loss_train: 0.2097, loss_valid: 0.2909


 81%|████████  | 81/100 [05:58<01:24,  4.46s/it]

epoch: 81, loss_train: 0.2097, loss_valid: 0.2900


 82%|████████▏ | 82/100 [06:02<01:20,  4.46s/it]

epoch: 82, loss_train: 0.2094, loss_valid: 0.2903


 83%|████████▎ | 83/100 [06:06<01:13,  4.33s/it]

epoch: 83, loss_train: 0.2094, loss_valid: 0.2905


 84%|████████▍ | 84/100 [06:11<01:09,  4.37s/it]

epoch: 84, loss_train: 0.2094, loss_valid: 0.2941


 85%|████████▌ | 85/100 [06:16<01:07,  4.50s/it]

epoch: 85, loss_train: 0.2095, loss_valid: 0.2930


 86%|████████▌ | 86/100 [06:20<01:00,  4.35s/it]

epoch: 86, loss_train: 0.2096, loss_valid: 0.2914


 87%|████████▋ | 87/100 [06:24<00:55,  4.25s/it]

epoch: 87, loss_train: 0.2094, loss_valid: 0.2945


 88%|████████▊ | 88/100 [06:29<00:54,  4.51s/it]

epoch: 88, loss_train: 0.2095, loss_valid: 0.2913


 89%|████████▉ | 89/100 [06:33<00:48,  4.37s/it]

epoch: 89, loss_train: 0.2097, loss_valid: 0.2925


 90%|█████████ | 90/100 [06:37<00:42,  4.28s/it]

epoch: 90, loss_train: 0.2091, loss_valid: 0.2927


 91%|█████████ | 91/100 [06:42<00:40,  4.55s/it]

epoch: 91, loss_train: 0.2091, loss_valid: 0.3012


 92%|█████████▏| 92/100 [06:46<00:35,  4.40s/it]

epoch: 92, loss_train: 0.2095, loss_valid: 0.2925


 93%|█████████▎| 93/100 [06:50<00:30,  4.29s/it]

epoch: 93, loss_train: 0.2090, loss_valid: 0.2942


 94%|█████████▍| 94/100 [06:55<00:26,  4.44s/it]

epoch: 94, loss_train: 0.2092, loss_valid: 0.2924


 95%|█████████▌| 95/100 [06:59<00:22,  4.42s/it]

epoch: 95, loss_train: 0.2093, loss_valid: 0.2955


 96%|█████████▌| 96/100 [07:03<00:17,  4.31s/it]

epoch: 96, loss_train: 0.2091, loss_valid: 0.2948


 97%|█████████▋| 97/100 [07:08<00:13,  4.35s/it]

epoch: 97, loss_train: 0.2092, loss_valid: 0.2918


 98%|█████████▊| 98/100 [07:13<00:08,  4.45s/it]

epoch: 98, loss_train: 0.2089, loss_valid: 0.2931


 99%|█████████▉| 99/100 [07:17<00:04,  4.32s/it]

epoch: 99, loss_train: 0.2089, loss_valid: 0.2928


100%|██████████| 100/100 [07:21<00:00,  4.41s/it]

epoch: 100, loss_train: 0.2092, loss_valid: 0.2937





In [16]:
# knock74

def calculate_accuracy(model, loader):
  """Return the fraction of examples in `loader` that `model` classifies correctly.

  The model is switched to eval mode and gradients are disabled while scoring.

  Args:
    model: Callable mapping a batch of inputs to per-class scores.
    loader: Iterable of `(inputs, labels)` batches.

  Returns:
    Number of correct predictions divided by the number of examples seen.
  """
  model.eval()
  n_seen, n_hit = 0, 0
  with torch.no_grad():
    for batch_x, batch_y in loader:
      # Predicted class = index of the highest score along the last axis
      predicted = model(batch_x).argmax(dim=-1)
      n_seen += len(batch_x)
      n_hit += (predicted == batch_y).sum().item()
  return n_hit / n_seen

# Accuracy of the trained network on the training split, then on the test split
acc_train, acc_test = (
    calculate_accuracy(model, loader)
    for loader in (dataloader_train, dataloader_test)
)
print(f'学習データ: {acc_train}', f'評価データ: {acc_test}', sep='\n')

学習データ: 0.9324400299850075
評価データ: 0.8883058470764618
