In [1]:
import kagglehub
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import nltk
from nltk.tokenize import word_tokenize

Скачиваем необходимые данные для токенизации


In [4]:
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /home/bastard/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/bastard/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

Загружаем датасет

In [None]:
path = kagglehub.dataset_download("suchintikasarkar/sentiment-analysis-for-mental-health")
df = pd.read_csv(f"{path}/Combined Data.csv")

Преобразуем тексты в строковый формат и чистим данные

In [6]:
df['statement'] = df['statement'].astype(str)
df = df.dropna(subset=['status']) 

Преобразуем текстовые метки в числовые

In [7]:
label_encoder = LabelEncoder()
df['status'] = label_encoder.fit_transform(df['status'])

Подготавливаем данные

In [8]:
texts = df["statement"].values
labels = df["status"].values

Разделяем на обучающую и тестовую выборки

In [9]:
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

Токенизируем тексты

In [10]:
tokenized_texts = [word_tokenize(text.lower()) for text in train_texts]

Обучаем Word2Vec модель

In [11]:
from gensim.models import Word2Vec
word2vec_model = Word2Vec(
    sentences=tokenized_texts,
    vector_size=300,  # Размерность векторов
    window=5,        # Размер окна контекста
    min_count=1,     # Минимальная частота слова
    workers=4        # Количество потоков
)
word2vec_model.save("word2vec.model")


Подготавливаем данные для нейросети

In [14]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

vocab_size = 10000
max_length = 200

# Создаем и настраиваем токенизатор
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(train_texts)

# Преобразуем тексты в последовательности чисел
train_sequences = tokenizer.texts_to_sequences(train_texts)
test_sequences = tokenizer.texts_to_sequences(test_texts)

# Добавляем паддинг (дополняем нулями до одинаковой длины)
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding="post", truncating="post")
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding="post", truncating="post")


Создаем матрицу эмбеддингов

In [16]:
embedding_dim = 300
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, i in tokenizer.word_index.items():
    if i < vocab_size and word in word2vec_model.wv:
        embedding_matrix[i] = word2vec_model.wv[word]


Строим модель нейросети

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout

num_classes = len(label_encoder.classes_)  # Количество уникальных классов

model = Sequential([
    Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        input_length=max_length,
        weights=[embedding_matrix],
        trainable=False
    ),
    Conv1D(128, 5, activation="relu"),  # Сверточный слой
    GlobalMaxPooling1D(),               # Пулинг
    Dense(64, activation="relu"),      # Полносвязный слой
    Dropout(0.5),                      # Регуляризация
    Dense(num_classes, activation="softmax")  # Выходной слой
])

# Компилируем модель
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"]
)

model.summary()

In [None]:
model = tf.keras.models.load_model("model_epoch_05.h5")

Обучаем модель

In [19]:
history = model.fit(
    train_padded,
    train_labels,
    epochs=10,
    validation_data=(test_padded, test_labels),
    batch_size=32
)

Epoch 1/10


I0000 00:00:1744948317.544723   37986 service.cc:152] XLA service 0x7f470c005200 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1744948317.544771   37986 service.cc:160]   StreamExecutor device (0): NVIDIA GeForce RTX 3080, Compute Capability 8.6
2025-04-18 06:51:57.572424: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1744948317.673910   37986 cuda_dnn.cc:529] Loaded cuDNN version 90800


[1m  72/1327[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m2s[0m 2ms/step - accuracy: 0.3567 - loss: 2.2912

I0000 00:00:1744948319.261987   37986 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m1321/1327[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 2ms/step - accuracy: 0.5360 - loss: 1.2680

2025-04-18 06:52:02.341937: W external/local_xla/xla/tsl/framework/bfc_allocator.cc:310] Allocator (GPU_0_bfc) ran out of memory trying to allocate 18.91GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.


[1m1327/1327[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step - accuracy: 0.5363 - loss: 1.2670 - val_accuracy: 0.6703 - val_loss: 0.8372
Epoch 2/10
[1m1327/1327[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.6401 - loss: 0.8895 - val_accuracy: 0.7149 - val_loss: 0.7392
Epoch 3/10
[1m1327/1327[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.6774 - loss: 0.7975 - val_accuracy: 0.7252 - val_loss: 0.7133
Epoch 4/10
[1m1327/1327[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 5ms/step - accuracy: 0.7028 - loss: 0.7224 - val_accuracy: 0.7367 - val_loss: 0.6865
Epoch 5/10
[1m1327/1327[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.7207 - loss: 0.6801 - val_accuracy: 0.7344 - val_loss: 0.6773
Epoch 6/10
[1m1327/1327[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.7292 - loss: 0.6544 - val_accuracy: 0.7441 - val_loss: 0.6866
Epoch 7/10
[1m1327/1327[0

Сохраняем модель

In [20]:
model.save("sentiment_model.h5")



Предсказывание

In [None]:
def predict_sentiment(text):
    text = str(text)
    sequence = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(sequence, maxlen=max_length, padding="post", truncating="post")
    prediction = model.predict(padded)
    return label_encoder.inverse_transform([np.argmax(prediction)])[0]
print("Введите текст (или 'exit' для выхода):")
while True:
    text = input(">>> ").strip()
    if text.lower() == "exit":
        break
    
    print(f"Класс: {predict_sentiment(text)}")

Случайные ошибочные примеры

In [22]:
# Получаем предсказания для тестовой выборки
test_pred = model.predict(test_padded)
test_pred_classes = np.argmax(test_pred, axis=1)

# Находим индексы ошибочных предсказаний
wrong_indices = np.where(test_pred_classes != test_labels)[0]

# Выбираем 10 случайных ошибок (или меньше, если ошибок мало)
num_errors_to_show = min(10, len(wrong_indices))
selected_errors = np.random.choice(wrong_indices, num_errors_to_show, replace=False)

# Выводим ошибочные примеры с деталями
print(f"\nНайдено {len(wrong_indices)} ошибок. Показываем {num_errors_to_show} примеров:\n")

for i, idx in enumerate(selected_errors):
    original_text = test_texts[idx]
    true_label = label_encoder.inverse_transform([test_labels[idx]])[0]
    pred_label = label_encoder.inverse_transform([test_pred_classes[idx]])[0]
    confidence = np.max(test_pred[idx])
    
    print(f"Пример {i+1}:")
    print(f"Текст: {original_text}")
    print(f"Истинная метка: {true_label}")
    print(f"Предсказанная метка: {pred_label} (уверенность: {confidence:.2f})")
    print("-" * 80)

# Дополнительная статистика
accuracy = 1 - len(wrong_indices)/len(test_labels)
print(f"\nОбщая точность на тестовой выборке: {accuracy:.2%}")
print(f"Примеры ошибок показывают, где модель испытывает наибольшие трудности")

[1m332/332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step

Найдено 2687 ошибок. Показываем 10 примеров:

Пример 1:
Текст: can anxiety because 24/7 confusion? confused, intrusive thoughts. I am afraid of dementia. I have chronic anxiety, panic, hipochondriac, etc Anxiety and confusion
Истинная метка: Depression
Предсказанная метка: Anxiety (уверенность: 0.93)
--------------------------------------------------------------------------------
Пример 2:
Текст: Grad School Vent Anyone else in a Master's program and hate it? Nothing is ever explained, such complicated assignments, not to mention professors that are not invested in their students but care more about their research interests, a department that is so disorganized it is falling to pieces. Anyone else can relate? Thank god it is almost over, it has been one of the WORST experiences, most stressful, and its been so draining on my mental health.
Истинная метка: Stress
Предсказанная метка: Anxiety (уверенность: 0.35)
