# train filter

スレタイの尤もらしさをスコアリングするFilterを学習する。

FilterはCNNによる文書分類器、詳細は[scripts/model.py](scripts/model.py)を参照


|パラメータ名|内容|
|:--|:--|
| `conv_filters` | Filterのパラメータ、詳細は[scripts/model.py](scripts/model.py)を参照 |
| `conv_kernel_sizes` | Filterのパラメータ、詳細は[scripts/model.py](scripts/model.py)を参照 |
| `d_model` | Filterのパラメータ、詳細は[scripts/model.py](scripts/model.py)を参照 |
| `EPOCHS` | 学習のエポック数 |
| `BATCH_SIZE` | 学習のバッチサイズ |

## 入力ファイル

* `real_dataset.pickle`
* `fake_dataset.pickle`

## 出力ファイル

* `model/filter/weights_epoch*.h5`


In [1]:
import os
import time
import pickle
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split

In [2]:
# Filter (CNN) architecture parameters; scripts/model.py documents how each
# value is used.
d_model = 128
conv_filters = [32, 64, 128]
conv_kernel_sizes = [16, 8, 4]

# Training parameters
BATCH_SIZE = 128
EPOCHS = 20

In [3]:
import sentencepiece as spm
# Tokenizer used later in this notebook to obtain the vocabulary size and to
# decode token ids back into text.
sp = spm.SentencePieceProcessor()
# Loads the model file from the current working directory.
sp.load('sentencepiece.model')

True

In [4]:
# Real thread titles (the positive class), stored as pickled token-id sequences.
# NOTE: pickle can execute arbitrary code on load -- only open trusted files.
with open("real_dataset.pickle", "rb") as real_file:
    ids = pickle.load(real_file)

# Pad at the end ('post') so that every sequence has the same length and the
# result is a single 2-D array.
pos_tensor = tf.keras.preprocessing.sequence.pad_sequences(
    ids,
    padding='post',
)

In [5]:
# Fake thread titles (the negative class).
# NOTE: pickle can execute arbitrary code on load -- only open trusted files.
# NOTE(review): unlike the real titles, this data is not padded here --
# presumably the pickle already holds a padded array; confirm.
with open("fake_dataset.pickle", "rb") as fake_file:
    neg_tensor = pickle.load(fake_file)

In [6]:
# For debugging: shrink the training data (uncomment to enable)
# pos_tensor = pos_tensor[:10000]

In [7]:
# Balance the classes: keep the same number of real (positive) and fake
# (negative) samples. Truncating both sides to the common minimum also covers
# the case where the fake dataset is the smaller one; when it is not, the
# positive samples are left untouched.
n_samples = min(len(pos_tensor), len(neg_tensor))
pos_tensor = pos_tensor[:n_samples]
neg_tensor = neg_tensor[:n_samples]

In [8]:
# Hold out 10% of the real titles for validation. No random_state is passed,
# so the split differs from run to run.
pos_tensor_train, pos_tensor_valid = train_test_split(
    pos_tensor,
    test_size=0.1,
)
print(f"{len(pos_tensor_train)} {len(pos_tensor_valid)}")

1536857 170762


In [9]:
# Hold out 10% of the fake titles for validation. No random_state is passed,
# so the split differs from run to run.
neg_tensor_train, neg_tensor_valid = train_test_split(
    neg_tensor,
    test_size=0.1,
)
print(f"{len(neg_tensor_train)} {len(neg_tensor_valid)}")

1536857 170762


In [10]:
def _labeled_dataset(samples, label):
    """Pair every row of `samples` with the constant integer class `label`."""
    # int32 is the dtype TensorFlow infers for a plain Python list of ints.
    labels = np.full(len(samples), label, dtype=np.int32)
    return tf.data.Dataset.from_tensor_slices((samples, labels))


# Label 1 marks real titles (positives); label 0 marks fake titles (negatives).
dataset_pos_train = _labeled_dataset(pos_tensor_train, 1)
dataset_neg_train = _labeled_dataset(neg_tensor_train, 0)
dataset_pos_valid = _labeled_dataset(pos_tensor_valid, 1)
dataset_neg_valid = _labeled_dataset(neg_tensor_valid, 0)

In [11]:
# Positives and negatives are concatenated back to back, so the shuffle buffer
# is sized to hold the whole split; this lets the two classes mix uniformly.
BUFFER_SIZE = len(pos_tensor_train) + len(neg_tensor_train)
train_examples = dataset_pos_train.concatenate(dataset_neg_train)
dataset_train = train_examples.shuffle(BUFFER_SIZE)

# Same treatment for the validation split (BUFFER_SIZE is reused).
BUFFER_SIZE = len(pos_tensor_valid) + len(neg_tensor_valid)
valid_examples = dataset_pos_valid.concatenate(dataset_neg_valid)
dataset_valid = valid_examples.shuffle(BUFFER_SIZE)

In [12]:
# Model dimensions derived from the data and the tokenizer.
seq_len = pos_tensor.shape[1]      # padded length of the real-title sequences
vocab_size = sp.get_piece_size()   # number of pieces in the SentencePiece model

In [13]:
# Project-local CNN document classifier; see scripts/model.py for details.
from scripts.model import Filter
# Build the filter from the architecture hyperparameters defined earlier in
# this notebook and the tokenizer's vocabulary size.
nanj_filter = Filter(conv_filters, conv_kernel_sizes, d_model, vocab_size)

In [None]:
# Sanity check (positive examples): decode the first ten real validation
# titles and print the score the filter assigns to each of them.
test_case_pos = tf.constant(pos_tensor_valid[:10])
# The filter output is passed through a sigmoid before being displayed.
scores = tf.math.sigmoid(nanj_filter(test_case_pos))
for ids, score in zip(test_case_pos.numpy(), scores):
    # Convert every id to a plain Python int before decoding.
    ids_int = [int(token_id) for token_id in ids]
    print(sp.decode_ids(ids_int))
    print(f"score: {score}")

In [None]:
# Sanity check (negative examples): decode the first ten fake validation
# titles and print the score the filter assigns to each of them.
test_case_neg = tf.constant(neg_tensor_valid[:10])
# The filter output is passed through a sigmoid before being displayed.
scores = tf.math.sigmoid(nanj_filter(test_case_neg))
for ids, score in zip(test_case_neg.numpy(), scores):
    # Convert every id to a plain Python int before decoding.
    ids_int = [int(token_id) for token_id in ids]
    print(sp.decode_ids(ids_int))
    print(f"score: {score}")

In [16]:
# Number of full batches per epoch; floor division ignores a trailing partial
# batch.
num_train_samples = len(pos_tensor_train) + len(neg_tensor_train)
num_valid_samples = len(pos_tensor_valid) + len(neg_tensor_valid)
steps_per_epoch_train = num_train_samples // BATCH_SIZE
steps_per_epoch_valid = num_valid_samples // BATCH_SIZE

In [17]:
# from_logits=True: predictions are treated as raw logits, not probabilities.
# reduction='none': per-sample losses are returned; filter_loss averages them.
binary_cross_entropy = tf.keras.losses.BinaryCrossentropy(
    reduction='none',
    from_logits=True,
)

# Adam with its default hyperparameters.
optimizer = tf.keras.optimizers.Adam()


def filter_loss(real, pred):
    """Return the mean binary cross-entropy between labels and predictions.

    Args:
        real: ground-truth labels.
        pred: filter outputs, interpreted as logits by `binary_cross_entropy`.

    Returns:
        The per-sample losses averaged into a single scalar tensor.
    """
    return tf.reduce_mean(binary_cross_entropy(real, pred))

In [18]:
@tf.function
def train_step(inp, label):
    """Run one optimization step of the filter on a single batch.

    Args:
        inp: batch of token-id sequences; per the original note the shape is
            (BATCH_SIZE, seq_len).
        label: class labels of the batch (1 = real title, 0 = fake title).

    Returns:
        The mean binary cross-entropy loss of the batch.
    """
    with tf.GradientTape() as tape:
        # Forward pass in training mode, recorded for differentiation.
        loss = filter_loss(label, nanj_filter(inp, training=True))

    # Variables are read after the forward pass, as in the original ordering.
    variables = nanj_filter.trainable_variables
    grads = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(grads, variables))
    return loss

In [19]:
@tf.function
def valid_step(inp, label):
    """Compute the loss of the filter on one validation batch (no update).

    Args:
        inp: batch of token-id sequences; per the original note the shape is
            (BATCH_SIZE, seq_len).
        label: class labels of the batch (1 = real title, 0 = fake title).

    Returns:
        The mean binary cross-entropy loss of the batch.
    """
    # training=False runs the model in inference mode.
    return filter_loss(label, nanj_filter(inp, training=False))

In [20]:
# drop_remainder=True discards a trailing partial batch, so every batch holds
# exactly BATCH_SIZE samples.
dataset_train, dataset_valid = (
    split.batch(BATCH_SIZE, drop_remainder=True)
    for split in (dataset_train, dataset_valid)
)

In [21]:
# Directory that receives one weights checkpoint per epoch.
model_dir = "model/filter"
# makedirs also creates the missing parent directory ("model"), and
# exist_ok=True makes the call a no-op when the directory is already there.
os.makedirs(model_dir, exist_ok=True)

In [None]:
def _print_scores(title, test_cases):
    """Print each decoded test title followed by its sigmoid filter score."""
    print(title)
    # training=False keeps monitoring in inference mode, like valid_step.
    scores = tf.math.sigmoid(nanj_filter(test_cases, training=False))
    for ids, score in zip(test_cases.numpy(), scores):
        # Convert every id to a plain Python int before decoding.
        print(sp.decode_ids([int(token_id) for token_id in ids]))
        print(f"score: {score}")


# Main loop: train for EPOCHS epochs; after each epoch save a checkpoint,
# measure the validation loss and print the scores of the fixed test cases.
for epoch in range(EPOCHS):
    start = time.time()

    # TRAIN
    total_loss = 0
    for batch, (inp, label) in enumerate(dataset_train.take(steps_per_epoch_train)):
        batch_loss = train_step(inp, label)
        total_loss += batch_loss

        # Progress report every 500 batches.
        if batch % 500 == 0:
            print(f'Epoch {epoch+1} Batch {batch} Loss {batch_loss.numpy():.4f}')

    # One checkpoint per epoch, numbered from 1.
    nanj_filter.save_weights(f"{model_dir}/weights_epoch{epoch+1}.h5")

    # Mean training loss of the epoch (previously mislabelled "Gen Loss").
    print(f'Train Epoch {epoch+1} Loss {total_loss/steps_per_epoch_train:.4f}')

    # VALIDATION
    total_valid_loss = 0
    for inp, label in dataset_valid.take(steps_per_epoch_valid):
        total_valid_loss += valid_step(inp, label)

    print(f'Validation Loss {total_valid_loss/steps_per_epoch_valid:.4f}')

    # Check how the same fixed samples are scored as training progresses.
    _print_scores(">>> pos test case <<<", test_case_pos)
    _print_scores(">>> neg test case <<<", test_case_neg)

    print(f'Time taken for 1 epoch {time.time() - start} sec\n')