## どうにも詰まってしまったので[こちら](https://mori-memo.hateblo.jp/entry/2022/10/16/173011)をとりあえず動かしてみる

## 80

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# ディレクトリの移動
%cd /content/drive/MyDrive/nlp100
%pwd
!ls
%cd ./nlp100/

/content/drive/MyDrive/nlp100
 analogy.txt		  datas_ch8			       set1.csv
 chapter09.ipynb	  github.ipynb			       set1.tab
 chapter6.ipynb		  GoogleNews-vectors-negative300.bin   set2.csv
 chapter7.ipynb		  instructions.txt		       set2.tab
'chapter8 (1).ipynb'	  NewsAggregatorDataset		       sytactic_analogy.csv
 chapter8.ipynb		  NewsAggregatorDatasettest.txt        test_feature.txt
 chapter9_archive.ipynb   NewsAggregatorDatasettrain.txt       train_feature.txt
 combined.csv		  NewsAggregatorDatasetval.txt	       valid_feature.txt
 combined.tab		  nlp100			       wordsim353
 countries.csv		  semantic_analogy.csv		       wordsim353.zip
/content/drive/MyDrive/nlp100/nlp100


In [3]:
# データのロード
import pandas as pd
import re
import numpy as np

# ファイル読み込み
file = "../NewsAggregatorDataset/newsCorpora.csv"
data = pd.read_csv(file, encoding='utf-8', header=None, sep='\t', names=['ID', 'TITLE', 'URL', 'PUBLISHER', 'CATEGORY', 'STORY', 'HOSTNAME', 'TIMESTAMP'])
data = data.replace('"', "'")
# 特定のpublisherのみ抽出
publishers = ['Reuters', 'Huffington Post', 'Businessweek', 'Contactmusic.com', 'Daily Mail']
data = data.loc[data['PUBLISHER'].isin(publishers), ['TITLE', 'CATEGORY']].reset_index(drop=True)

# 前処理
def preprocessing(text):
    text_clean = re.sub(r'[\"\'.,:;\(\)#\|\*\+\!\?#$%&/\]\[\{\}]', '', text)
    text_clean = re.sub('[0-9]+', '0', text_clean)
    text_clean = re.sub('\s-\s', ' ', text_clean)
    return text_clean

data['TITLE'] = data['TITLE'].apply(preprocessing)

# 学習用、検証用、評価用に分割する
from sklearn.model_selection import train_test_split

train, valid_test = train_test_split(data, test_size=0.2, shuffle=True, random_state=64, stratify=data['CATEGORY'])
valid, test = train_test_split(valid_test, test_size=0.5, shuffle=True, random_state=64, stratify=valid_test['CATEGORY'])

train = train.reset_index(drop=True)
valid = valid.reset_index(drop=True)
test = test.reset_index(drop=True)

# データ数の確認
print('学習データ')
print(train['CATEGORY'].value_counts())
print('検証データ')
print(valid['CATEGORY'].value_counts())
print('評価データ')
print(test['CATEGORY'].value_counts())

学習データ
b    4502
e    4223
t    1219
m     728
Name: CATEGORY, dtype: int64
検証データ
b    562
e    528
t    153
m     91
Name: CATEGORY, dtype: int64
評価データ
b    563
e    528
t    152
m     91
Name: CATEGORY, dtype: int64


In [4]:
# 単語の辞書を作成
from collections import Counter
words = []
for text in train['TITLE']:
    for word in text.rstrip().split():
        words.append(word)
c = Counter(words)
word2id = {}
for i, cnt in enumerate(c.most_common()):
    if cnt[1] > 1:
        word2id[cnt[0]] = i + 1
for i, cnt in enumerate(word2id.items()):
    if i >= 10:
        break
    print(cnt[0], cnt[1])


to 1
0 2
in 3
as 4
on 5
UPDATE 6
for 7
The 8
of 9
US 10


In [5]:
# 単語のID化
def tokenizer(text):
    words = text.rstrip().split()
    return [word2id.get(word, 0) for word in words]

sample = train.at[0, 'TITLE']
print(sample)
print(tokenizer(sample))

Justin Bieber Under Investigation For Attempted Robbery At Dave  Busters
[68, 76, 782, 1974, 21, 5054, 5055, 34, 1602, 0]


## 81

In [6]:
import torch
from torch import nn

VOCAB_SIZE = 4
EMB_SIZE = 3
emb = nn.Embedding(VOCAB_SIZE, EMB_SIZE)

words = torch.tensor([1, 3, 0, 2, 1, 2])
embed_words = emb(words)
print(embed_words)
print(words.shape, '->', embed_words.shape)

tensor([[-0.0076, -1.0310,  1.1935],
        [-0.3319, -1.1299, -0.5302],
        [ 0.6527, -1.4976,  0.2532],
        [-1.2356,  1.4862, -1.6536],
        [-0.0076, -1.0310,  1.1935],
        [-1.2356,  1.4862, -1.6536]], grad_fn=<EmbeddingBackward0>)
torch.Size([6]) -> torch.Size([6, 3])


In [7]:
# RNNの作成
# モデルの構築
import random
import torch
from torch import nn
import torch.utils.data as data

# 乱数のシードを設定
# parserなどで指定
seed = 1234

random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

def seed_worker(worker_id):
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)

g = torch.Generator()
g.manual_seed(seed)

class RNN(nn.Module):
    def __init__(self, vocab_size, emb_size, padding_idx, hidden_size, output_size, num_layers=1):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_size, padding_idx=padding_idx)
        self.rnn = nn.LSTM(emb_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, h0=None):
        x = self.emb(x)
        x, h = self.rnn(x, h0)
        x = x[:, -1, :]
        logits = self.fc(x)
        return logits

# パラメータの設定
VOCAB_SIZE = len(set(word2id.values())) + 2  # 辞書のID数 + unknown + パディングID
EMB_SIZE = 300
PADDING_IDX = len(set(word2id.values())) + 1
OUTPUT_SIZE = 4
HIDDEN_SIZE = 50
NUM_LAYERS = 1

# モデルの定義
model = RNN(VOCAB_SIZE, EMB_SIZE, PADDING_IDX, HIDDEN_SIZE, OUTPUT_SIZE, NUM_LAYERS)
print(model)

RNN(
  (emb): Embedding(9725, 300, padding_idx=9724)
  (rnn): LSTM(300, 50, batch_first=True)
  (fc): Linear(in_features=50, out_features=4, bias=True)
)


In [8]:
# 初期状態での推論
x = torch.tensor([tokenizer(sample)], dtype=torch.int64)
print(x)
print(x.size())
print(nn.Softmax(dim=-1)(model(x)))


tensor([[  68,   76,  782, 1974,   21, 5054, 5055,   34, 1602,    0]])
torch.Size([1, 10])
tensor([[0.2783, 0.2129, 0.2804, 0.2284]], grad_fn=<SoftmaxBackward0>)


## 82

In [9]:
# ターゲットのテンソル化
category_dict = {'b': 0, 't': 1, 'e':2, 'm':3}
Y_train = torch.from_numpy(train['CATEGORY'].map(category_dict).values)
Y_valid = torch.from_numpy(valid['CATEGORY'].map(category_dict).values)
Y_test = torch.from_numpy(test['CATEGORY'].map(category_dict).values)
print(Y_train.size())
print(Y_train)

torch.Size([10672])
tensor([2, 0, 2,  ..., 0, 0, 0])


In [19]:
class NewsDataset(data.Dataset):
    """
    newsのDatasetクラス

    Attributes
    ----------------------------
    X : データフレーム
        単語ベクトルの平均をまとめたテンソル
    y : テンソル
        カテゴリをラベル化したテンソル
    phase : 'train' or 'val'
        学習か訓練かを設定する
    """
    def __init__(self, X, y, phase='train'):
        self.X = X['TITLE']
        self.y = y
        self.phase = phase

    def __len__(self):
        """全データサイズを返す"""
        return len(self.y)

    def __getitem__(self, idx):
        """idxに対応するテンソル形式のデータとラベルを取得"""
        inputs = torch.tensor(tokenizer(self.X[idx]))
        return inputs, self.y[idx]

train_dataset = NewsDataset(train, Y_train, phase='train')
valid_dataset = NewsDataset(valid, Y_valid, phase='val')
test_dataset = NewsDataset(test, Y_test, phase='val')
# 動作確認
idx = 0
print(train_dataset.__getitem__(idx)[0].size())
print(train_dataset.__getitem__(idx)[1])
print(valid_dataset.__getitem__(idx)[0].size())
print(valid_dataset.__getitem__(idx)[1])
print(test_dataset.__getitem__(idx)[0].size())
print(test_dataset.__getitem__(idx)[1])

print(train)
print(type(train_dataset))

torch.Size([10])
tensor(2)
torch.Size([11])
tensor(3)
torch.Size([13])
tensor(2)
                                                   TITLE CATEGORY
0      Justin Bieber Under Investigation For Attempte...        e
1      Exxon Report Claims World Highly Unlikely To L...        b
2      Jack White Records Releases Single In Hours Fo...        e
3      President Barack Obama Releases Proclamation D...        t
4      Samsung Shares Steady After Chairmans Heart At...        m
...                                                  ...      ...
10667  JK Rowling Brings Her Magic To TV HBO And BBC ...        e
10668  UPDATE 0-Peace Corps pulls volunteers from Wes...        m
10669  GRAINS-US soybean prices climb on rising Chine...        b
10670  Seafood Fraud Under Fire As Lawmakers Look To ...        b
10671  A Hedge Fund Wants to Teach PetSmart Some New ...        b

[10672 rows x 2 columns]
<class '__main__.NewsDataset'>


In [17]:
print(train_dataset.__len__())
print(train_dataset.__getitem__(0))

#train_datasetはtorch.utils.data.Datasetクラス
    #自分のデータもdataset.Datasetクラスにする

10672
(tensor([  68,   76,  782, 1974,   21, 5054, 5055,   34, 1602,    0]), tensor(2))


In [11]:
# DataLoaderを作成
batch_size = 1

train_dataloader = data.DataLoader(
            train_dataset, batch_size=batch_size, shuffle=True, worker_init_fn=seed_worker, generator=g)
valid_dataloader = data.DataLoader(
            valid_dataset, batch_size=batch_size, shuffle=False, worker_init_fn=seed_worker, generator=g)
test_dataloader = data.DataLoader(
            test_dataset, batch_size=batch_size, shuffle=False, worker_init_fn=seed_worker, generator=g)

dataloaders_dict = {'train': train_dataloader,
                    'val': valid_dataloader,
                    'test': test_dataloader,
                   }

# 動作確認
batch_iter = iter(dataloaders_dict['train'])
inputs, labels = next(batch_iter)
print(inputs.size())
print(labels)

torch.Size([1, 11])
tensor([2])


In [None]:
import matplotlib.pyplot as plt
from tqdm import tqdm

# 学習用の関数を定義
def train_model(net, dataloaders_dict, criterion, optimizer, num_epochs):
    train_loss = []
    train_acc = []
    valid_loss = []
    valid_acc = []
    # epochのループ
    for epoch in range(num_epochs):
        print('Epoch {} / {}'.format(epoch + 1, num_epochs))
        print('--------------------------------------------')

        # epochごとの学習と検証のループ
        for phase in ['train', 'val']:
            if phase == 'train':
                net.train() # 訓練モード
            else:
                net.eval() # 検証モード

            epoch_loss = 0.0 # epochの損失和
            epoch_corrects = 0 # epochの正解数

            # データローダーからミニバッチを取り出すループ
            for inputs, labels in tqdm(dataloaders_dict[phase]):
                optimizer.zero_grad() # optimizerを初期化

                # 順伝播計算(forward)
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = net(inputs)
                    loss = criterion(outputs, labels) # 損失を計算
                    _, preds = torch.max(outputs, 1) # ラベルを予想

                    # 訓練時は逆伝播
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                    # イテレーション結果の計算
                    # lossの合計を更新
                    epoch_loss += loss.item() * inputs.size(0)
                    # 正解数の合計を更新
                    epoch_corrects += torch.sum(preds == labels.data)

            # epochごとのlossと正解率の表示
            epoch_loss = epoch_loss / len(dataloaders_dict[phase].dataset)
            epoch_acc = epoch_corrects.double() / len(dataloaders_dict[phase].dataset)
            if phase == 'train':
                train_loss.append(epoch_loss)
                train_acc.append(epoch_acc)
            else:
                valid_loss.append(epoch_loss)
                valid_acc.append(epoch_acc)

            print('{} Loss: {:.4f}, Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))
    return train_loss, train_acc, valid_loss, valid_acc

# 学習を実行する

# モデルの定義
net = RNN(VOCAB_SIZE, EMB_SIZE, PADDING_IDX, HIDDEN_SIZE, OUTPUT_SIZE, NUM_LAYERS)
net.train()

# 損失関数の定義
criterion = nn.CrossEntropyLoss()

# 最適化手法の定義
optimizer = torch.optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

num_epochs = 5
train_loss, train_acc, valid_loss, valid_acc = train_model(net,
            dataloaders_dict, criterion, optimizer, num_epochs=num_epochs)

Epoch 1 / 5
--------------------------------------------


100%|██████████| 10672/10672 [00:52<00:00, 204.50it/s]


train Loss: 0.9959, Acc: 0.6041


100%|██████████| 1334/1334 [00:00<00:00, 1903.02it/s]


val Loss: 0.7686, Acc: 0.7391
Epoch 2 / 5
--------------------------------------------


100%|██████████| 10672/10672 [00:50<00:00, 211.37it/s]


train Loss: 0.6217, Acc: 0.7737


100%|██████████| 1334/1334 [00:00<00:00, 1936.94it/s]


val Loss: 0.6238, Acc: 0.7729
Epoch 3 / 5
--------------------------------------------


100%|██████████| 10672/10672 [00:49<00:00, 216.05it/s]


train Loss: 0.3893, Acc: 0.8621


100%|██████████| 1334/1334 [00:00<00:00, 1883.28it/s]


val Loss: 0.5844, Acc: 0.8021
Epoch 4 / 5
--------------------------------------------


100%|██████████| 10672/10672 [00:48<00:00, 217.86it/s]


train Loss: 0.2284, Acc: 0.9231


100%|██████████| 1334/1334 [00:00<00:00, 1924.60it/s]


val Loss: 0.5823, Acc: 0.8096
Epoch 5 / 5
--------------------------------------------


100%|██████████| 10672/10672 [00:48<00:00, 220.69it/s]


train Loss: 0.1206, Acc: 0.9639


100%|██████████| 1334/1334 [00:00<00:00, 1934.03it/s]

val Loss: 0.6249, Acc: 0.8193





- ↑そもそもなんか学習時間がこっちのほうが圧倒的に短いので、データに問題がある気もする
    - padding周りだったりするかなー
    - 学習の精度もちゃんと(概ね参考記事どおり)出てる。