# AmazonレビューのClassification

In [1]:
import pandas as pd
import math
import numpy as np

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D, Dropout, Bidirectional
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.callbacks import EarlyStopping

# まずはデータを準備

In [2]:
# Load the train/test review CSVs; the first column of each file becomes the index.
files = dict(train="amazon_reviews_train.csv", test="amazon_reviews_test.csv")
train_df, test_df = (
    pd.read_csv(files[split], index_col=0) for split in ("train", "test")
)

In [3]:
# Segment each review into space-separated terms with MeCab (wakati-gaki).
import MeCab
from tqdm import tqdm
tqdm.pandas()

# Build the tagger once and share it across calls; creating a MeCab.Tagger
# inside the function would repeat the setup work for every single review.
_wakati_tagger = MeCab.Tagger("-Owakati")

def yield_wakati_terms(text):
    """Return `text` segmented by MeCab as one space-separated string.

    Despite the name, this returns a plain string (it does not yield terms).
    Surrounding whitespace in MeCab's output is stripped.
    """
    return _wakati_tagger.parse(text).strip()

# Store the segmented text in a new "terms" column (progress bar via tqdm).
train_df["terms"] = train_df["sentence1"].progress_map(yield_wakati_terms)
test_df["terms"]  = test_df["sentence1"].progress_map(yield_wakati_terms)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17647/17647 [00:35<00:00, 502.05it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1961/1961 [00:03<00:00, 499.83it/s]


# TF-IDF＋Naive Bayesで分類する

In [4]:
# Fit a TF-IDF vectorizer (unigrams + bigrams) on the segmented training text.
# The token pattern matches any run of one or more word characters, so
# single-character terms are kept as tokens.
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    use_idf=True,
    token_pattern=u'(?u)\\b\\w+\\b',
    ngram_range=(1, 2),
).fit(train_df["terms"].values)
vectorizer

TfidfVectorizer(ngram_range=(1, 2), token_pattern='(?u)\\b\\w+\\b')

In [5]:
# Extract the TF-IDF feature matrices and the labels (cast to float) for both splits.
X_train, X_test = (
    vectorizer.transform(frame["terms"].values) for frame in (train_df, test_df)
)
y_train, y_test = (
    frame["label"].astype(float) for frame in (train_df, test_df)
)
# Number of distinct label values seen in the training split.
num_labels = len(y_train.unique())

In [25]:
# Build and train the model: grid-search the smoothing parameter `alpha` of a
# multinomial naive Bayes classifier with 5-fold CV, then score the refit
# estimator on the test split.
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score

parameters = [{'alpha': [1, 0.1, 0.01, 0.001, 1e-4, 1e-5, 1e-6]}]

# Both accuracy and log loss are tracked; the final estimator is refit using
# the candidate with the best (negated) log loss.
clf = GridSearchCV(
    estimator=MultinomialNB(),
    param_grid=parameters,
    scoring={"acc": "accuracy", "logloss": "neg_log_loss"},
    refit="logloss",
    cv=5,
    verbose=0,
)
clf.fit(X_train, y_train)
print("best params:", clf.best_params_)
print("best score:", clf.best_score_)

# Evaluate the refit best estimator on the held-out split.
y_pred = clf.best_estimator_.predict(X_test)
y_pred_proba = clf.best_estimator_.predict_proba(X_test)
print("validation log_loss:", log_loss(y_test, y_pred_proba))
print("validation accuracy:", accuracy_score(y_test, y_pred))

best params: {'alpha': 0.01}
best score: -0.6757729264785761
validation log_loss: 0.5519795891567499
validation accuracy: 0.8077511473737888


# TensorFlowで分類する

In [7]:
# Re-extract the inputs: this time the raw segmented text, not TF-IDF vectors.
X_train, X_test = (frame["terms"].values for frame in (train_df, test_df))

# Labels run from 1 to 5, so shift them down to the 0..4 range.
y_train, y_test = (
    frame["label"].astype(float) - 1 for frame in (train_df, test_df)
)

# Number of distinct label values seen in the training split.
num_labels = len(y_train.unique())

In [8]:
# Define the layer that turns each text into a fixed-length sequence of
# integer token ids.
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

vocab_size, sequence_length = 20000, 256

vectorize_layer = TextVectorization(
    output_sequence_length=sequence_length,
    output_mode='int',
    max_tokens=vocab_size,
)

# Learn the vocabulary from the training texts.
vectorize_layer.adapt(X_train)

# 1. Dense (+Dropout) でモデル構築

In [11]:
embedding_dim=256

# Averaged-embedding classifier: token ids -> embeddings -> mean over the
# sequence axis -> small dense head with dropout.
model = Sequential([
  vectorize_layer,
  Embedding(vocab_size, embedding_dim, name="embedding"), # output shape=(batch, 256, 256): batch_size, sequence_length, embedding_dim
  GlobalAveragePooling1D(),
  Dropout(0.2),
  Dense(256, activation='relu'),
  Dropout(0.2),
  Dense(64, activation='relu'),
  Dense(num_labels, activation="softmax")
])
# The output layer already applies softmax, so the loss receives probabilities.
# from_logits must therefore be False; with True the loss would apply softmax
# a second time and optimise/report a distorted cross-entropy.
model.compile(optimizer='adam',
              loss=SparseCategoricalCrossentropy(from_logits=False),
              metrics=['acc'])

# Stop training once val_loss has not improved for 5 consecutive epochs.
callbacks = [EarlyStopping(
    monitor='val_loss', patience=5)]

# NOTE: the test split is passed as validation data, so early stopping and the
# minimum reported below are both selected on the test set.
dense_history = model.fit(
    X_train, 
    y_train,
    validation_data=(X_test, y_test),
    epochs=20,
    callbacks=callbacks)

# Report the epoch with the lowest validation loss and its accuracy.
min_index = np.argmin(dense_history.history["val_loss"])
print("min val_loss:", dense_history.history["val_loss"][min_index], "val_acc:", dense_history.history["val_acc"][min_index])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
min val_loss: 0.7799156308174133 val_acc: 0.7542070150375366


# 2. LSTM

In [16]:
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D, Dropout, Bidirectional, LSTM, BatchNormalization

# Two stacked bidirectional LSTMs over the token embeddings, followed by a
# dense head with dropout. mask_zero=True makes the embedding emit a mask for
# token id 0 so downstream recurrent layers can skip those positions.
model_lstm = Sequential([
    vectorize_layer,
    Embedding(vocab_size, embedding_dim, mask_zero=True), # output shape=(batch, 256, 256): batch_size, sequence_length, embedding_dim
    Bidirectional(LSTM(256,  return_sequences=True)),
    Bidirectional(LSTM(128)),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(num_labels, activation="softmax")
])
# The output layer already applies softmax, so the loss receives probabilities.
# from_logits must therefore be False; with True the loss would apply softmax
# a second time and optimise/report a distorted cross-entropy.
model_lstm.compile(optimizer='adam',
              loss=SparseCategoricalCrossentropy(from_logits=False),
              metrics=['acc'])
# `callbacks` is the early-stopping list defined in an earlier cell.
lstm_history = model_lstm.fit(
    X_train, 
    y_train,
    validation_data=(X_test, y_test),
    epochs=20,
    callbacks=callbacks)

# Report the epoch with the lowest validation loss and its accuracy.
min_index = np.argmin(lstm_history.history["val_loss"])
print("min val_loss:", lstm_history.history["val_loss"][min_index], "val_acc:", lstm_history.history["val_acc"][min_index])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
min val_loss: 0.8076110482215881 val_acc: 0.7256501913070679


# 3. TD-IDF + Dense

In [25]:
# Define a vectorization layer that tokenizes the text and emits TF-IDF
# weighted features over unigrams and bigrams.
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

vocab_size = 20000

tfidf_vectorize_layer = TextVectorization(max_tokens=vocab_size, output_mode='tf-idf', ngrams=(1, 2))

# Learn the vocabulary and document statistics from the training texts.
tfidf_vectorize_layer.adapt(train_df["terms"].values)

In [33]:
# Dense classifier on top of the TF-IDF vectorization layer.
model_tfidf = Sequential([
    tfidf_vectorize_layer,
    Dense(256, activation='relu'),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(num_labels, activation="softmax")
])
# The output layer already applies softmax, so the loss receives probabilities.
# from_logits must therefore be False; with True the loss would apply softmax
# a second time and optimise/report a distorted cross-entropy.
model_tfidf.compile(optimizer='adam',
              loss=SparseCategoricalCrossentropy(from_logits=False),
              metrics=['acc'])
# `callbacks` is the early-stopping list defined in an earlier cell.
tfidf_history = model_tfidf.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=10,
    callbacks=callbacks)

# Report the epoch with the lowest validation loss and its accuracy.
min_index = np.argmin(tfidf_history.history["val_loss"])
print("min val_loss:", tfidf_history.history["val_loss"][min_index], "val_acc:", tfidf_history.history["val_acc"][min_index])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
min val_loss: 0.6191643476486206 val_acc: 0.7904130816459656


# 参考. nnlm-ja-dim128 + Dense

In [60]:
# Re-extract the raw segmented text and the zero-based labels, then wrap them
# in tf.data datasets.
# tensorflow is imported here because this cell uses `tf` and, in notebook
# order, runs before the later cell that imports it; without this a fresh
# "Restart & Run All" fails with NameError.
import tensorflow as tf

X_train = train_df["terms"].values
X_test  = test_df["terms"].values
y_train = train_df["label"].astype(float) - 1
y_test  = test_df["label"].astype(float) - 1
num_labels = len(y_train.unique())

# One (text, label) element per review; batching/shuffling is applied at fit time.
train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test))

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17647/17647 [00:42<00:00, 420.16it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1961/1961 [00:04<00:00, 414.01it/s]


In [85]:
import tensorflow_hub as hub
import tensorflow as tf
# Pretrained Japanese NNLM sentence embedding from TF-Hub; it takes one scalar
# string per example (input_shape=[], dtype=tf.string).
hub_layer = hub.KerasLayer("https://tfhub.dev/google/nnlm-ja-dim128-with-normalization/2",
                           input_shape=[], dtype=tf.string)
# Small dense head on top of the hub embedding.
model = Sequential()
model.add(hub_layer)
model.add(Dense(16, activation='relu'))
model.add(Dense(num_labels, activation='softmax'))

model.summary()

# The output layer already applies softmax, so the loss receives probabilities.
# from_logits must therefore be False; with True the loss would apply softmax
# a second time and optimise/report a distorted cross-entropy.
model.compile(optimizer='adam',
              loss=SparseCategoricalCrossentropy(from_logits=False),
              metrics=['acc'])









Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
keras_layer_6 (KerasLayer)   (None, 128)               117568128 
_________________________________________________________________
dense_6 (Dense)              (None, 16)                2064      
_________________________________________________________________
dense_7 (Dense)              (None, 5)                 85        
Total params: 117,570,277
Trainable params: 2,149
Non-trainable params: 117,568,128
_________________________________________________________________


In [86]:
# Stop training once val_loss has gone 5 epochs without improving.
callbacks = [EarlyStopping(monitor='val_loss', patience=5)]

# Train on shuffled batches of 512; the test dataset serves as validation data.
history = model.fit(
    x=train_dataset.shuffle(10000).batch(512),
    validation_data=test_dataset.batch(512),
    callbacks=callbacks,
    epochs=200,
)

# Report the epoch with the lowest validation loss and its accuracy.
min_index = np.argmin(history.history["val_loss"])
print(
    "min val_loss:", history.history["val_loss"][min_index],
    "val_acc:", history.history["val_acc"][min_index],
)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
min val_loss: 1.1055011749267578 val_acc: 0.5543090105056763
