<a href="https://colab.research.google.com/github/yujiimt/NLP/blob/master/book/RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!mkdir data
!wget https://s3-ap-northeast-1.amazonaws.com/dev.tech-sketch.jp/chakki/public/ja.text8.zip -P data/
!unzip data/ja.text8.zip -d data/

--2020-05-03 07:02:31--  https://s3-ap-northeast-1.amazonaws.com/dev.tech-sketch.jp/chakki/public/ja.text8.zip
Resolving s3-ap-northeast-1.amazonaws.com (s3-ap-northeast-1.amazonaws.com)... 52.219.0.166
Connecting to s3-ap-northeast-1.amazonaws.com (s3-ap-northeast-1.amazonaws.com)|52.219.0.166|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 33905114 (32M) [application/zip]
Saving to: ‘data/ja.text8.zip’


2020-05-03 07:02:36 (7.73 MB/s) - ‘data/ja.text8.zip’ saved [33905114/33905114]

Archive:  data/ja.text8.zip
  inflating: data/ja.text8           


In [2]:
!wget https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_multilingual_JP_v1_00.tsv.gz -P data/
!gunzip -d data/amazon_reviews_multilingual_JP_v1_00.tsv.gz

--2020-05-03 07:02:57--  https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_multilingual_JP_v1_00.tsv.gz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.110.5
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.110.5|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 94688992 (90M) [application/x-gzip]
Saving to: ‘data/amazon_reviews_multilingual_JP_v1_00.tsv.gz’


2020-05-03 07:03:00 (28.7 MB/s) - ‘data/amazon_reviews_multilingual_JP_v1_00.tsv.gz’ saved [94688992/94688992]



In [3]:
!pip install janome

Collecting janome
[?25l  Downloading https://files.pythonhosted.org/packages/79/f0/bd7f90806132d7d9d642d418bdc3e870cfdff5947254ea3cab27480983a7/Janome-0.3.10-py2.py3-none-any.whl (21.5MB)
[K     |████████████████████████████████| 21.5MB 1.3MB/s 
[?25hInstalling collected packages: janome
Successfully installed janome-0.3.10


In [0]:
!mkdir models

In [0]:
import string
import pandas as pd
import tensorflow as tf
from bs4 import BeautifulSoup
from janome.tokenizer import Tokenizer
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Input, Embedding, SimpleRNN
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import numpy as np
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

# Module-level Janome tokenizer used by tokenize() below.
# NOTE(review): wakati=True presumably makes tokenize() yield plain strings
# rather than Token objects (the results are joined with " " later) — confirm
# against the installed Janome version.
t = Tokenizer(wakati = True)

In [0]:
def filter_by_ascii_rate(text, threshold=0.9):
    """Tell whether ``text`` should be kept because it is not dominated by ASCII.

    The fraction of characters that belong to ``string.printable`` is computed
    and compared against ``threshold``.

    Args:
        text: The text to inspect.
        threshold: Largest fraction of printable-ASCII characters that is
            still accepted.

    Returns:
        bool: True when the printable-ASCII fraction is ``<= threshold``.
        False when ``text`` is empty or has no length at all (``None``, or the
        float NaN pandas uses for missing cells), because there is nothing to
        measure and such rows should be dropped rather than crash the filter.
    """
    try:
        length = len(text)
    except TypeError:
        # Unsized input such as None / NaN: treat as "do not keep".
        return False
    if length == 0:
        # Avoid dividing by zero for empty text.
        return False

    printable = set(string.printable)
    ascii_count = sum(ch in printable for ch in text)
    return ascii_count / length <= threshold

def load_dataset(filename, n=5000, state = 6):
    """Load review texts and binary sentiment labels from a tab-separated file.

    The file must provide ``star_rating`` and ``review_body`` columns. Ratings
    1-2 become label 0 and ratings 4-5 become label 1; every other rating
    (including 3 and missing values) is discarded. Rows are then filtered with
    ``filter_by_ascii_rate``, shuffled, and cut down to at most ``n`` rows per
    label.

    Args:
        filename: Path to the TSV file.
        n: Maximum number of rows kept for each label.
        state: Random seed used for shuffling.

    Returns:
        tuple: ``(review_body values, star_rating label values)`` as arrays of
        equal length.
    """
    df = pd.read_csv(filename, sep='\t')

    mapping = {1: 0, 2: 0, 4: 1, 5: 1}
    # Keep only ratings that have a label; this also removes 3-star rows and
    # any missing/unexpected rating that would otherwise map to NaN.
    df = df[df.star_rating.isin(list(mapping))]
    # A missing review body is read as NaN (a float), which the per-character
    # ASCII filter cannot iterate over.
    df = df.dropna(subset=['review_body'])
    # assign() builds a new frame, so no SettingWithCopyWarning is raised for
    # writing into a filtered slice.
    df = df.assign(star_rating=df.star_rating.map(mapping))

    is_jp = df.review_body.apply(filter_by_ascii_rate)
    df = df[is_jp]

    # Shuffle first so that head() takes a random subset of each label.
    df = df.sample(frac=1, random_state=state)
    df = df.groupby('star_rating').head(n=n)
    return df.review_body.values, df.star_rating.values

In [0]:
def build_vocabulary(texts, num_words=None):
    """Fit a Keras text ``Tokenizer`` on ``texts`` and return it.

    Args:
        texts: Texts used to fit the vocabulary.
        num_words: Forwarded to the Keras ``Tokenizer`` as ``num_words``.

    Returns:
        The fitted tokenizer, created with ``'<UNK>'`` as its out-of-vocabulary
        token.
    """
    vocab = tf.keras.preprocessing.text.Tokenizer(num_words=num_words,
                                                  oov_token='<UNK>')
    vocab.fit_on_texts(texts)
    return vocab


def clean_html(html, strip = False):
    """Return the text content of ``html`` with the markup removed.

    Args:
        html: Markup string, parsed with BeautifulSoup's ``"html.parser"``.
        strip: Forwarded to ``get_text`` as its ``strip`` argument.

    Returns:
        The extracted text.
    """
    return BeautifulSoup(html, "html.parser").get_text(strip=strip)

def tokenize(text):
    """Tokenize ``text`` with the module-level Janome tokenizer ``t``.

    Returns whatever ``t.tokenize`` produces for the given text.
    """
    tokens = t.tokenize(text)
    return tokens

def preprocess_dataset(texts):
    """Strip HTML from each text and re-join its tokens with single spaces.

    Args:
        texts: Iterable of raw review strings.

    Returns:
        list: One space-separated token string per input text, in input order.
    """
    # Must call the tokenize() helper: no module-level name `tokenizer` is
    # defined in this notebook, so the previous call only worked with leftover
    # kernel state and raised NameError on a fresh kernel.
    return [" ".join(tokenize(clean_html(text))) for text in texts]


In [0]:
class RNNModel:
    """Holds the layers of an Embedding -> SimpleRNN -> Dense(softmax) classifier.

    The layers are created in ``__init__``; ``build`` wires them into a Keras
    ``Model``.
    """

    def __init__(self, input_dim, output_dim, emb_dim = 300,
                 hid_dim = 100, embeddings = None, trainable = True):
        """Create the input, embedding, recurrent and output layers.

        Args:
            input_dim: Embedding input size, used when ``embeddings`` is None.
            output_dim: Number of units of the softmax output layer.
            emb_dim: Embedding vector size, used when ``embeddings`` is None.
            hid_dim: Number of units of the SimpleRNN layer.
            embeddings: Optional embedding matrix. When given, its shape sets
                the embedding layer's dimensions and it is passed as the
                layer's initial weights.
            trainable: Whether the embedding layer is trainable.
        """
        self.input = Input(shape=(None,), name='input')

        # Settings shared by both embedding variants.
        common = dict(mask_zero=True, trainable=trainable, name='embedding')
        if embeddings is None:
            self.embedding = Embedding(input_dim=input_dim,
                                       output_dim=emb_dim,
                                       **common)
        else:
            self.embedding = Embedding(input_dim=embeddings.shape[0],
                                       output_dim=embeddings.shape[1],
                                       weights=[embeddings],
                                       **common)

        self.rnn = SimpleRNN(hid_dim, name='rnn')
        self.fc = Dense(output_dim, activation='softmax')

    def build(self):
        """Connect the layers and return the resulting Keras ``Model``."""
        token_ids = self.input
        hidden = self.rnn(self.embedding(token_ids))
        probabilities = self.fc(hidden)
        return Model(inputs=token_ids, outputs=probabilities)

In [0]:
class InferenceAPI:
    """Combine a trained model, its vocabulary and text preprocessing for prediction.

    Args:
        model: Object whose ``predict`` is called with the padded sequences.
        vocab: Object whose ``texts_to_sequences`` turns preprocessed texts
            into index sequences.
        preprocess: Callable applied to the raw texts before vocabulary lookup.
    """

    def __init__(self, model, vocab, preprocess):
        self.model = model
        self.vocab = vocab
        self.preprocess = preprocess

    def predict_from_texts(self, texts):
        """Predict class indices for raw texts.

        The texts are preprocessed, converted to index sequences with the
        stored vocabulary, and handed to ``predict_from_sequences``.
        """
        x = self.preprocess(texts)
        # Use the attribute set in __init__ (`self.vocab`); `self.voca.texts`
        # does not exist and raised AttributeError.
        x = self.vocab.texts_to_sequences(x)
        return self.predict_from_sequences(x)

    def predict_from_sequences(self, sequences):
        """Predict class indices for already-indexed sequences.

        The sequences are padded with ``pad_sequences`` (no ``maxlen`` is
        passed here), scored by the model, and reduced to the index of the
        largest score along the last axis.
        """
        sequences = pad_sequences(sequences, truncating='post')
        y = self.model.predict(sequences)
        return np.argmax(y, -1)

In [0]:
def main():
    """Train the RNN review classifier and print precision, recall and F1.

    Pipeline: load and preprocess the reviews, hold out 20% as a test split,
    fit the vocabulary on the training texts only, pad/truncate every sequence
    to ``maxlen`` tokens, train with early stopping while checkpointing with
    ``save_best_only=True``, then reload the checkpoint and evaluate it on the
    test split.
    """
    import os  # local import: only needed to prepare the checkpoint directory

    batch_size = 128
    epochs = 100
    maxlen = 300
    model_path = 'models/rnn_models.h5'
    num_words = 40000
    num_label = 2

    # Relative path, matching where the download cell stores the file
    # (`-P data/`), instead of assuming Colab's /content working directory.
    x, y = load_dataset('data/amazon_reviews_multilingual_JP_v1_00.tsv')

    # Preprocessing
    x = preprocess_dataset(x)
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, random_state=42)
    # The vocabulary is fitted on the training texts only.
    vocab = build_vocabulary(x_train, num_words)
    x_train = vocab.texts_to_sequences(x_train)
    x_test = vocab.texts_to_sequences(x_test)
    x_train = pad_sequences(x_train, maxlen=maxlen, truncating='post')
    x_test = pad_sequences(x_test, maxlen=maxlen, truncating='post')

    model = RNNModel(num_words, num_label, embeddings=None).build()
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['acc'])

    # Make sure the checkpoint directory exists even if the `mkdir models`
    # cell was skipped.
    checkpoint_dir = os.path.dirname(model_path)
    if checkpoint_dir:
        os.makedirs(checkpoint_dir, exist_ok=True)

    callbacks = [
        EarlyStopping(patience=3),
        ModelCheckpoint(model_path, save_best_only=True),
    ]

    model.fit(
        x=x_train,
        y=y_train,
        batch_size=batch_size,
        epochs=epochs,
        validation_split=0.2,
        callbacks=callbacks,
        shuffle=True,
    )

    # Evaluate the checkpointed model rather than the weights of the last epoch.
    model = load_model(model_path)
    api = InferenceAPI(model, vocab, preprocess_dataset)
    y_pred = api.predict_from_sequences(x_test)
    print('precision: {:.4f}'.format(precision_score(y_test, y_pred, average = 'binary')))
    print('recall : {:.4f}'.format(recall_score(y_test, y_pred, average = 'binary')))
    print('f1 : {:.4f}'.format(f1_score(y_test, y_pred, average = 'binary')))

In [50]:
# Entry point: run the full train/evaluate pipeline defined in main().
if __name__ == "__main__":
    main()

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
precision: 0.6631
recall : 0.8018
f1 : 0.7259
