In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


**knock70.py**

In [2]:
#knock50 train test valid
import pandas as pd
from sklearn.model_selection import train_test_split

#ファイルを読み込む
data = pd.read_csv('/content/drive/MyDrive/chapter08/newsCorpora.csv', sep = '\t', header = None, names = ['ID', 'TITLE', 'URL', 'PUBLISHER', 'CATEGORY', 'STORY', 'HOSTNAME', 'TIMESTAMP'])

#事例（記事）を抽出する
publishers = ['Reuters', 'Huffington Post', 'Businessweek', '“Contactmusic.com', 'Daily Mail']
##isin:探す
data = data[data['PUBLISHER'].isin(publishers)]
data = data[['TITLE', 'CATEGORY']]

#分割する
##shuffle：分割する前dataをランダムにする
train, valid_test = train_test_split(data, test_size = 0.2, random_state = 0, shuffle = True, stratify = data['CATEGORY'])
valid, test = train_test_split(valid_test, test_size=0.5, random_state=0, shuffle = True, stratify=valid_test['CATEGORY'])

In [3]:
#knock60の単語ベクトル
from gensim.models import KeyedVectors
file = '/content/drive/MyDrive/chapter07/GoogleNews-vectors-negative300.bin.gz'
model = KeyedVectors.load_word2vec_format(file, binary = True)

In [4]:
#knock70
import string
import torch
def tensor1(text): #テキストの平均ベクトル化
  table = str.maketrans(string.punctuation, ' '*len(string.punctuation)) #マークをspaceに変換
  words = text.translate(table).split()  # スペースで分割してリスト化
  vec = [model[word] for word in words if word in model]  # 1語ずつベクトル化

  return torch.tensor(sum(vec) / len(vec))  # 平均ベクトルをTensor型に変換して出力 （multidimensional array）

# 特徴ベクトルxi
X_train = torch.stack([tensor1(text) for text in train['TITLE']])
X_valid = torch.stack([tensor1(text) for text in valid['TITLE']])
X_test = torch.stack([tensor1(text) for text in test['TITLE']])


# ラベルベクトルy
category_dict = {'b': 0, 't': 1, 'e':2, 'm':3}
y_train = torch.tensor(train['CATEGORY'].map(lambda x: category_dict[x]).values)
y_valid = torch.tensor(valid['CATEGORY'].map(lambda x: category_dict[x]).values)
y_test = torch.tensor(test['CATEGORY'].map(lambda x: category_dict[x]).values)

torch.save(X_train, '/content/drive/MyDrive/chapter08/X_train.pt')
torch.save(X_valid, '/content/drive/MyDrive/chapter08/X_valid.pt')
torch.save(X_test, '/content/drive/MyDrive/chapter08/X_test.pt')
torch.save(y_train, '/content/drive/MyDrive/chapter08/y_train.pt')
torch.save(y_valid, '/content/drive/MyDrive/chapter08/y_valid.pt')
torch.save(y_test, '/content/drive/MyDrive/chapter08/y_test.pt')

**knock71.py**

In [15]:
from torch import nn

#単層のニューラルネットワークを定義
class net(nn.Module):
  def __init__(self, input, output):
    super().__init__() #親クラスのメソッドを呼び出す
    self.fc = nn.Linear(input, output, bias=False)
    nn.init.normal_(self.fc.weight, 0.0, 1.0) #重みを更新

  def forward(self, x): #順伝播
    x = self.fc(x)
    return x

model = net(300, 4)
y1 = torch.softmax(model(X_train[:1]), dim=-1)
Y1 = torch.softmax(model.forward(X_train[:4]), dim=-1)
print(y1)
print(Y1)

tensor([[0.2365, 0.1750, 0.5700, 0.0185]], grad_fn=<SoftmaxBackward0>)
tensor([[0.2365, 0.1750, 0.5700, 0.0185],
        [0.1835, 0.5266, 0.1383, 0.1516],
        [0.0948, 0.3698, 0.5280, 0.0074],
        [0.1957, 0.2949, 0.2856, 0.2237]], grad_fn=<SoftmaxBackward0>)


**knock72.py**

In [16]:
criterion = nn.CrossEntropyLoss() #クロスエントロピー損失を計算する

loss = criterion(model(X_train[:1]), y_train[:1]) #(outputs, labels)
model.zero_grad() #勾配を0で初期化
loss.backward() #逆伝播を実行

print(f'損失: {loss}')
print(f'勾配:\n{model.fc.weight.grad}') #行列Wに関する勾配

損失: 1.441821575164795
勾配:
tensor([[-0.0460,  0.0163, -0.0442,  ...,  0.0223, -0.0839, -0.0049],
        [ 0.0105, -0.0037,  0.0101,  ..., -0.0051,  0.0192,  0.0011],
        [ 0.0343, -0.0122,  0.0330,  ..., -0.0167,  0.0627,  0.0036],
        [ 0.0011, -0.0004,  0.0011,  ..., -0.0005,  0.0020,  0.0001]])


In [52]:
loss1 = criterion(model(X_train[:4]), y_train[:4])
model.zero_grad()
loss1.backward()

print(f'損失: {loss1}')
print(f'勾配:\n{model.fc.weight.grad}')

損失: 1.8929638862609863
勾配:
tensor([[ 0.0002,  0.0234, -0.0146,  ..., -0.0061, -0.0038, -0.0023],
        [ 0.0056,  0.0027,  0.0009,  ..., -0.0047,  0.0071, -0.0012],
        [-0.0309, -0.0327,  0.0122,  ...,  0.0321, -0.0286,  0.0114],
        [ 0.0251,  0.0067,  0.0015,  ..., -0.0213,  0.0253, -0.0079]])


**knock73.py**

In [38]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

class dataset(Dataset):
  def __init__(self, X, y):
    self.X = X
    self.y = y
  def __len__(self): #データセットのサイズを返す
    return len(self.y)
  def __getitem__(self, index): #indexに対応するデータを取得する
    return [self.X[index], self.y[index]]

#datasetを作成
dataset_train = dataset(X_train, y_train)
dataset_valid = dataset(X_valid, y_valid)
dataset_test = dataset(X_test, y_test)
#dataloaderを作成
dataloader_train = DataLoader(dataset_train, batch_size= 1, shuffle= False) #shuffle:データをランダムにシャッフルする
dataloader_valid = DataLoader(dataset_valid, batch_size= len(dataset_valid), shuffle= False)
dataloader_test = DataLoader(dataset_test, batch_size= len(dataset_test), shuffle= False)


In [39]:
model = net(300, 4)
criterion = nn.CrossEntropyLoss() #loss
optimizer = torch.optim.SGD(model.parameters(), lr=1e-1) #optimizer,SGDを応用

#学習
num_epochs = 10
for epoch in range(num_epochs):
  model.train() #訓練モード
  loss_train = 0.0
  for i, (inputs, labels) in enumerate(dataloader_train):
    optimizer.zero_grad() #勾配をゼロで初期化
    outputs = model(inputs) #順伝播
    loss = criterion(outputs, labels)
    loss.backward() #逆伝播
    optimizer.step() #重みを更新
    loss_train += loss.item() #損失を記録
  loss_train = loss_train / i #バッチ単位の平均損失、最後のi

  #valid dataの損失
  model.eval()
  with torch.no_grad():
    inputs, labels = next(iter(dataloader_valid))
    outputs = model(inputs)
    loss_valid = criterion(outputs, labels)

  #lossを出力
  print(f'epoch: {epoch + 1}, loss_train: {loss_train:.4f}, loss_valid: {loss_valid:.4f}')

epoch: 1, loss_train: 0.5607, loss_valid: 0.4005
epoch: 2, loss_train: 0.3619, loss_valid: 0.3621
epoch: 3, loss_train: 0.3277, loss_valid: 0.3459
epoch: 4, loss_train: 0.3095, loss_valid: 0.3366
epoch: 5, loss_train: 0.2976, loss_valid: 0.3307
epoch: 6, loss_train: 0.2892, loss_valid: 0.3267
epoch: 7, loss_train: 0.2828, loss_valid: 0.3240
epoch: 8, loss_train: 0.2777, loss_valid: 0.3222
epoch: 9, loss_train: 0.2737, loss_valid: 0.3210
epoch: 10, loss_train: 0.2703, loss_valid: 0.3202


**knock74.py**

In [60]:
def calculate_accuracy(model, loader):
  model.eval()
  total = 0
  correct = 0
  with torch.no_grad(): #バッチごとにデータを取得
    for inputs, labels in loader:
      outputs = model(inputs)
      pred = torch.argmax(outputs, dim=-1)
      total += len(inputs)
      correct += (pred == labels).sum().item()
  return correct / len(loader.dataset)

acc_train = calculate_accuracy(model, dataloader_train)
acc_test = calculate_accuracy(model, dataloader_test)
print(acc_train)
print(acc_test)

0.9125397546569741
0.8955495004541326
