In [1]:
import numpy as np
from datasets import load_dataset
from keras_preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import (
    Input,
    Embedding,
    Conv1D,
    MaxPooling1D,
    Flatten,
    Reshape,
    LSTM,
    Dense
)
from keras.models import Model
from keras.optimizers import Adam

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data = load_dataset("k1tub/sentiment_dataset")  

print("Dataset splits:", data)

Dataset splits: DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'src'],
        num_rows: 290458
    })
})


In [3]:
train_texts = data["train"]["text"]
train_labels = np.array(data["train"]["label"])

train_labels = np.clip(train_labels, 0, 2)


In [4]:
max_words = 30000
max_len = 100

In [5]:
tokenizer = Tokenizer(num_words=max_words, oov_token="<UNK>")
tokenizer.fit_on_texts(train_texts)


In [6]:
X_train = pad_sequences(
    tokenizer.texts_to_sequences(train_texts),
    maxlen=max_len
)
y_train = train_labels

In [7]:
inputs = Input(shape=(max_len,))

x = Embedding(max_words, 100, input_length=max_len)(inputs)
x = Conv1D(128, 3, activation="relu")(x)
x = MaxPooling1D(2)(x)
x = Flatten()(x)

x = Reshape((49, 128))(x)

x = LSTM(64)(x)

x = Dense(64, activation="relu")(x)
outputs = Dense(3, activation="softmax")(x)

model = Model(inputs, outputs)
model.compile(
    optimizer=Adam(1e-3),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

model.summary()



In [8]:

model.fit(
    X_train,
    y_train,
    epochs=5,
    batch_size=32,
    validation_split=0.1
)

Epoch 1/5
[1m8170/8170[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m270s[0m 32ms/step - accuracy: 0.6955 - loss: 0.6637 - val_accuracy: 0.4815 - val_loss: 1.2480
Epoch 2/5
[1m8170/8170[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m275s[0m 34ms/step - accuracy: 0.7585 - loss: 0.5461 - val_accuracy: 0.4872 - val_loss: 1.1780
Epoch 3/5
[1m8170/8170[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m278s[0m 34ms/step - accuracy: 0.8095 - loss: 0.4477 - val_accuracy: 0.4144 - val_loss: 1.7793
Epoch 4/5
[1m8170/8170[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m284s[0m 35ms/step - accuracy: 0.8624 - loss: 0.3368 - val_accuracy: 0.4641 - val_loss: 1.8175
Epoch 5/5
[1m8170/8170[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m270s[0m 33ms/step - accuracy: 0.9024 - loss: 0.2467 - val_accuracy: 0.4351 - val_loss: 2.3161


<keras.src.callbacks.history.History at 0x26b9924e4b0>

In [10]:


labels_map = {0: "neutral", 1: "positive", 2: "negative"}

texts_to_predict = [
    "Это просто ужасно, сервис отвратительный",
    "Ну нормально, пойдет",
    "Мне очень понравилось, всё супер"
]

seq = tokenizer.texts_to_sequences(texts_to_predict)
pad = pad_sequences(seq, maxlen=max_len)

preds = model.predict(pad)

for text, pred in zip(texts_to_predict, preds):
    idx = np.argmax(pred)
    print(f"Text: {text}")
    print(f"→ {labels_map[idx]} ({pred[idx]:.2%})")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
Text: Это просто ужасно, сервис отвратительный
→ negative (99.01%)
Text: Ну нормально, пойдет
→ neutral (52.73%)
Text: Мне очень понравилось, всё супер
→ positive (98.82%)
