Сентимент-анализ корпуса текстов

In [None]:
import os
import time
import numpy as np
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.layers import (Input, Embedding, LSTM, Bidirectional, Dense, Dropout,
                                     SpatialDropout1D, GlobalMaxPooling1D, Attention)
from tensorflow.keras.models import Model
from tensorflow.keras.initializers import Constant

In [None]:
# Hyperparameters shared by the data-loading, embedding and training cells.
max_features = 20000  # vocabulary cap: used as num_words for imdb.load_data and as the Embedding input_dim
maxlen = 200  # length every review is padded/truncated to by pad_sequences
embedding_dim = 100  # embedding width; matches the glove.6B.100d vector file read later
batch_size = 64  # mini-batch size passed to model.fit
epochs = 20  # number of epochs passed to model.fit

In [None]:
# Load the IMDB reviews as integer sequences; num_words caps the vocabulary at max_features.
print("IMDB...")
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
# Bring every review to exactly maxlen tokens so each split becomes a 2-D array.
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)

print("Train shape:", x_train.shape, "Test shape:", x_test.shape)

# Download and unpack the GloVe archive only when the 100-d vector file is not already present.
if not os.path.exists("glove.6B.100d.txt"):
    !wget http://nlp.stanford.edu/data/glove.6B.zip
    !unzip -q glove.6B.zip

IMDB...
Train shape: (25000, 200) Test shape: (25000, 200)


In [None]:
# Build the id -> token lookup. Ranks from get_word_index() are shifted by 3
# because ids 0, 1 and 2 are reserved for the special tokens set below.
word_index = imdb.get_word_index()
reverse_word_index = {}
for token, rank in word_index.items():
    reverse_word_index[rank + 3] = token
reverse_word_index.update({0: "<PAD>", 1: "<START>", 2: "<UNK>"})

In [None]:
print("GloVe...")
# Parse the GloVe text file: each line is a token followed by its vector components.
embedding_index = {}
with open("glove.6B.100d.txt", encoding="utf8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype="float32")
        embedding_index[word] = vector


print(f"Загружено векторов: {len(embedding_index)}")


# Row k of embedding_matrix must hold the vector for token id k as it appears
# in x_train / x_test. imdb.load_data() shifts every rank returned by
# imdb.get_word_index() by index_from=3 (ids 0-2 are padding/start/OOV), so the
# row for a word of rank i is i + 3, not i. Rows without a GloVe vector
# (including the reserved ids) stay all-zero.
embedding_matrix = np.zeros((max_features, embedding_dim))
for word, i in word_index.items():
    token_id = i + 3
    if token_id < max_features:
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[token_id] = embedding_vector

GloVe...
Загружено векторов: 400000


In [None]:
def build_model():
    """Build and compile the BiLSTM + self-attention sentiment classifier.

    Reads the module-level `maxlen`, `max_features`, `embedding_dim` and
    `embedding_matrix`. The embedding layer is initialised from
    `embedding_matrix` and kept frozen (`trainable=False`).

    Returns:
        A compiled Keras `Model` mapping a `(maxlen,)` sequence of token ids
        to a single sigmoid output, using binary cross-entropy loss, the Adam
        optimizer and the accuracy metric.
    """
    inputs = Input(shape=(maxlen,))
    # The sequence length is already fixed by the Input layer, so the
    # deprecated `input_length` argument is not passed to Embedding.
    x = Embedding(
        input_dim=max_features,
        output_dim=embedding_dim,
        embeddings_initializer=Constant(embedding_matrix),
        trainable=False
    )(inputs)

    x = SpatialDropout1D(0.25)(x)
    x = Bidirectional(LSTM(128, return_sequences=True))(x)

    # Self-attention: the BiLSTM output is used as both query and value.
    attention = Attention()([x, x])
    x = tf.keras.layers.Add()([x, attention])  # residual connection

    # Collapse the time axis before the dense classification head.
    x = GlobalMaxPooling1D()(x)
    x = Dense(64, activation="relu")(x)
    x = Dropout(0.3)(x)
    outputs = Dense(1, activation="sigmoid")(x)

    model = Model(inputs, outputs)
    model.compile(loss="binary_crossentropy",
                  optimizer="adam",
                  metrics=["accuracy"])
    return model

In [None]:
# Record whether TensorFlow sees a GPU.
# NOTE(review): `device` is not used anywhere in the visible cells.
device = "GPU" if tf.config.list_physical_devices("GPU") else "CPU"

model = build_model()
model.summary()

# NOTE(review): `start` is captured but no elapsed time is computed in the visible cells.
start = time.time()
# NOTE(review): the test split is also passed as validation data, so the
# val_* metrics are not an independent hold-out from the final evaluation.
history = model.fit(
    x_train, y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(x_test, y_test),
    verbose=1
)

Epoch 1/20
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 43ms/step - accuracy: 0.5132 - loss: 0.6978 - val_accuracy: 0.5690 - val_loss: 0.6903
Epoch 2/20
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 41ms/step - accuracy: 0.5486 - loss: 0.6850 - val_accuracy: 0.6471 - val_loss: 0.6255
Epoch 3/20
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 41ms/step - accuracy: 0.6721 - loss: 0.6030 - val_accuracy: 0.7517 - val_loss: 0.5074
Epoch 4/20
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 39ms/step - accuracy: 0.7537 - loss: 0.5021 - val_accuracy: 0.7863 - val_loss: 0.4520
Epoch 5/20
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 41ms/step - accuracy: 0.8027 - loss: 0.4358 - val_accuracy: 0.8090 - val_loss: 0.4151
Epoch 6/20
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 41ms/step - accuracy: 0.8271 - loss: 0.3988 - val_accuracy: 0.8245 - val_loss: 0.3801
Epoch 7/20
[1m3

In [None]:
# Evaluate on the test split; the model is compiled with metrics=["accuracy"],
# so evaluate() yields the loss followed by the accuracy.
loss, acc = model.evaluate(x_test, y_test, verbose=0)
print(f"Точность на тесте: {acc*100:.2f}%")

Точность на тесте: 86.34%
