In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ModelCheckpoint
import gensim.downloader as api
from datasets import load_dataset

2025-04-18 19:16:15.104112: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-18 19:16:15.315917: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744992975.400125     972 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744992975.422270     972 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1744992975.565598     972 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

1. Загрузка GloVe 

In [2]:
glove = api.load("glove-wiki-gigaword-100")
embedding_dim = 100
max_words = 10000
max_len = 50

2. Загрузка датасета Sentiment140

In [3]:
dataset = load_dataset("sentiment140", trust_remote_code=True)
texts = dataset["train"]["text"]
labels = dataset["train"]["sentiment"]
labels = [0 if l == 0 else 1 for l in labels]

3. Токенизация

In [4]:
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
X = pad_sequences(sequences, maxlen=max_len, padding='post')
y = np.array(labels)

4. Эмбеддинг матрицы

In [6]:
embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in tokenizer.word_index.items():
    if i < max_words:
        try:
            embedding_matrix[i] = glove[word]
        except KeyError:
            continue

5. Модель LSTM

In [7]:
model = Sequential([
    Embedding(input_dim=max_words,
              output_dim=embedding_dim,
              input_length=max_len,
              weights=[embedding_matrix],
              trainable=False),
    LSTM(128),
    Dense(2, activation='softmax')  # 1 выход для бинарной классификации
])

model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',  # бинарная кросс-энтропия
              metrics=['accuracy'])

model.summary()

I0000 00:00:1744993036.343851     972 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 9571 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3080, pci bus id: 0000:01:00.0, compute capability: 8.6


Загрузка модели

In [8]:
model = tf.keras.models.load_model("model_epoch_05.h5")



Callback для сохранения модели после каждой эпохи

In [9]:
checkpoint_callback = ModelCheckpoint(
    filepath="model_epoch_{epoch+5:02d}.h5",
    save_weights_only=False,  # True — сохранять только веса, False — всю модель
    save_freq='epoch',        # сохранять после каждой эпохи
    verbose=1                 # вывод уведомлений
)

6. Обучение

In [15]:
model.fit(X, y, epochs=5, batch_size=64, validation_split=0.2, callbacks=[checkpoint_callback])

Epoch 1/5


I0000 00:00:1744939520.970707   11937 cuda_dnn.cc:529] Loaded cuDNN version 90800


[1m19997/20000[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 9ms/step - accuracy: 0.7719 - loss: 0.4775
Epoch 1: saving model to model_epoch_01.h5




[1m20000/20000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m221s[0m 11ms/step - accuracy: 0.7719 - loss: 0.4775 - val_accuracy: 0.6634 - val_loss: 0.6181
Epoch 2/5
[1m19997/20000[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 10ms/step - accuracy: 0.8207 - loss: 0.3950
Epoch 2: saving model to model_epoch_02.h5




[1m20000/20000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m223s[0m 11ms/step - accuracy: 0.8207 - loss: 0.3950 - val_accuracy: 0.6939 - val_loss: 0.5957
Epoch 3/5
[1m19997/20000[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 10ms/step - accuracy: 0.8310 - loss: 0.3750
Epoch 3: saving model to model_epoch_03.h5




[1m20000/20000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m227s[0m 11ms/step - accuracy: 0.8310 - loss: 0.3750 - val_accuracy: 0.6470 - val_loss: 0.6639
Epoch 4/5
[1m19999/20000[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 10ms/step - accuracy: 0.8376 - loss: 0.3624
Epoch 4: saving model to model_epoch_04.h5




[1m20000/20000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m223s[0m 11ms/step - accuracy: 0.8376 - loss: 0.3624 - val_accuracy: 0.7041 - val_loss: 0.5936
Epoch 5/5
[1m19996/20000[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 10ms/step - accuracy: 0.8430 - loss: 0.3531
Epoch 5: saving model to model_epoch_05.h5




[1m20000/20000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m217s[0m 11ms/step - accuracy: 0.8430 - loss: 0.3531 - val_accuracy: 0.7249 - val_loss: 0.5579


<keras.src.callbacks.history.History at 0x7fac44167f40>

7. Сохранение модели

In [10]:
with open("tokenizer.json", "w", encoding="utf-8") as f:
    f.write(tokenizer.to_json())

Предсказывание

In [None]:
label_names = ["Negative", "Positive"]
print("Введите текст (или 'exit' для выхода):")
while True:
    text = input(">>> ").strip()
    if text.lower() == "exit":
        break

    # Токенизация и паддинг
    sequence = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(sequence, maxlen=max_len, padding='post')

    # Предсказание
    prediction = model.predict(padded, verbose=0)
    class_index = np.argmax(prediction)
    confidence = float(np.max(prediction))

    print(f"Класс: {label_names[class_index]} (уверенность: {confidence:.2f})\n")

Введите текст (или 'exit' для выхода):
Класс: Negative (уверенность: 0.72)

Класс: Negative (уверенность: 0.93)

Класс: Negative (уверенность: 0.77)

Класс: Positive (уверенность: 0.53)

Класс: Positive (уверенность: 0.51)

Класс: Negative (уверенность: 0.83)

Класс: Negative (уверенность: 0.83)

Класс: Positive (уверенность: 0.53)

Класс: Positive (уверенность: 0.53)



Oh, I just love spending my entire weekend responding to emails. There’s nothing quite as thrilling as watching my inbox grow faster than my will to live. And the best part? The constant reminder that ‘this could have been an email’ after every pointless meeting. Truly, modern work culture is a masterpiece of efficiency

Случайные ошибочные примеры

In [22]:
# ========== Быстрый анализ ошибок (оптимизированная версия) ==========
print("\nАнализ ошибок на небольшой выборке (для скорости):")

# Берем только 5000 случайных примеров для анализа
sample_size = 5000
random_indices = np.random.choice(len(texts), sample_size, replace=False)
sample_texts = [texts[i] for i in random_indices]
sample_labels = np.array([labels[i] for i in random_indices])

# Токенизация и паддинг
sample_sequences = tokenizer.texts_to_sequences(sample_texts)
X_sample = pad_sequences(sample_sequences, maxlen=max_len, padding='post')

# Получаем предсказания
sample_pred = model.predict(X_sample, batch_size=512, verbose=1)  # Увеличиваем batch_size для скорости
sample_pred_classes = np.argmax(sample_pred, axis=1)

# Находим ошибочные предсказания
wrong_indices = np.where(sample_pred_classes != sample_labels)[0]

# Выводим 10 случайных ошибочных примеров
num_errors_to_show = min(10, len(wrong_indices))
selected_errors = np.random.choice(wrong_indices, num_errors_to_show, replace=False)

print(f"\nПроанализировано {sample_size} примеров. Найдено {len(wrong_indices)} ошибок.")
print(f"Показываем {num_errors_to_show} случайных примеров ошибок:\n")

for i, idx in enumerate(selected_errors):
    original_text = sample_texts[idx]
    true_label = "Negative" if sample_labels[idx] == 0 else "Positive"
    pred_label = "Negative" if sample_pred_classes[idx] == 0 else "Positive"
    confidence = np.max(sample_pred[idx])
    
    print(f"Пример {i+1}:")
    print(f"Текст: {original_text}")
    print(f"Истинный класс: {true_label}")
    print(f"Предсказанный класс: {pred_label} (уверенность: {confidence:.2f})")
    print("-" * 80)

print("\nАнализ завершен. Можно продолжать работу.")


Анализ ошибок на небольшой выборке (для скорости):
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 2s/step

Проанализировано 5000 примеров. Найдено 856 ошибок.
Показываем 10 случайных примеров ошибок:

Пример 1:
Текст: @cece_newnew thanks  n I was dead serious bout me bein less entertainin den u cuz idk wat 2 do....i think ima jus freestyle it
Истинный класс: Positive
Предсказанный класс: Negative (уверенность: 0.58)
--------------------------------------------------------------------------------
Пример 2:
Текст: @benpatrick90069 but im not waiting for him to do so...chaste=he's and undercover freak who'll unleash it on the 3rd or 4th date! 
Истинный класс: Positive
Предсказанный класс: Negative (уверенность: 0.82)
--------------------------------------------------------------------------------
Пример 3:
Текст: @michbek that's exactly how I felt when I ran outta pages in book 1 - love to say itoldyaso  have u listened to any Nathan Lowell yet?
Истинный класс: Positive