In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import string
from collections import Counter
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from nltk.tokenize import RegexpTokenizer
stop_words = set(stopwords.words('russian'))
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [3]:
import pandas as pd

# Замените 'path_to_file.jsonl' на путь к вашему файлу
file_path = '/Users/valeriaalesnikova/Desktop/bootcamp/nlp_project-1/restaurants_reviews.jsonl'

# Чтение файла jsonl в DataFrame
df = pd.read_json(file_path, lines=True)

# Печать первых 5 строк для проверки
print(df.head())


   review_id  general food interior service  \
0          0        0   10       10      10   
1          1        0    9       10       9   
2          2        0    9       10      10   
3          3        0    -        5      10   
4          4        0    7       10      10   

                                                text  
0  Вытянули меня сегодня в город и раз уж была в ...  
1  проводили корпоратив на 60 чел. в этот - уже т...  
2  Был в Гостях с женой один раз и еще раз с жено...  
3  Бар понравился на первый взгляд .  Интерьер к ...  
4  В « Bel Canto » мы отмечали юбилей моего отца ...  


In [5]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Токенизация
tokenizer = Tokenizer(num_words=10000, oov_token='<OOV>')
tokenizer.fit_on_texts(df['text'])

# Преобразование текстов в последовательности
sequences = tokenizer.texts_to_sequences(df['text'])
padded_sequences = pad_sequences(sequences, padding='post', maxlen=100)

# Подготовка меток
labels = pd.get_dummies(df['general']).values


In [6]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Attention, Bidirectional, Layer

class AttentionLayer(Layer):
    def __init__(self, **kwargs):
        super(AttentionLayer, self).__init__(**kwargs)
        
    def build(self, input_shape):
        self.W = self.add_weight(name='attention_weight', shape=(input_shape[-1], input_shape[-1]),
                                 initializer='glorot_uniform', trainable=True)
        self.b = self.add_weight(name='attention_bias', shape=(input_shape[-1],),
                                 initializer='glorot_uniform', trainable=True)
        self.u = self.add_weight(name='context_vector', shape=(input_shape[-1], 1),
                                 initializer='glorot_uniform', trainable=True)
        super(AttentionLayer, self).build(input_shape)
        
    def call(self, x):
        u_t = tf.tanh(tf.tensordot(x, self.W, axes=1) + self.b)
        a_t = tf.nn.softmax(tf.tensordot(u_t, self.u, axes=1), axis=1)
        output = x * a_t
        return tf.reduce_sum(output, axis=1)

# Определение архитектуры модели
input_layer = Input(shape=(100,))
embedding_layer = Embedding(10000, 128)(input_layer)
lstm_layer = Bidirectional(LSTM(64, return_sequences=True))(embedding_layer)
attention_layer = AttentionLayer()(lstm_layer)
output_layer = Dense(labels.shape[1], activation='softmax')(attention_layer)

model = Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Обучение модели
model.fit(padded_sequences, labels, epochs=10, batch_size=32, validation_split=0.2)


Epoch 1/10
[1m1179/1179[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m87s[0m 72ms/step - accuracy: 0.9147 - loss: 0.4013 - val_accuracy: 0.8347 - val_loss: 0.4229
Epoch 2/10
[1m1179/1179[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m90s[0m 76ms/step - accuracy: 0.9230 - loss: 0.2561 - val_accuracy: 0.9612 - val_loss: 0.1613
Epoch 3/10
[1m1179/1179[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m104s[0m 88ms/step - accuracy: 0.9428 - loss: 0.1750 - val_accuracy: 0.9015 - val_loss: 0.2380
Epoch 4/10
[1m1179/1179[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m86s[0m 73ms/step - accuracy: 0.9596 - loss: 0.1223 - val_accuracy: 0.8720 - val_loss: 0.3038
Epoch 5/10
[1m1179/1179[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m89s[0m 75ms/step - accuracy: 0.9752 - loss: 0.0774 - val_accuracy: 0.8350 - val_loss: 0.4862
Epoch 6/10
[1m1179/1179[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m89s[0m 76ms/step - accuracy: 0.9860 - loss: 0.0457 - val_accuracy: 0.8573 - val_loss: 0.4943
Epo

<keras.src.callbacks.history.History at 0x3084b8310>

In [None]:
# Оценка модели на тестовых данных (при наличии)
loss, accuracy = model.evaluate(padded_sequences, labels)
print(f'Accuracy: {accuracy:.4f}')


In [None]:
# Функция для предсказания класса отзыва
def predict_review(review):
    seq = tokenizer.texts_to_sequences([review])
    padded = pad_sequences(seq, padding='post', maxlen=100)
    pred = model.predict(padded)
    return df['general'].unique()[pred.argmax()]

# Пример предсказания
review = "The food was great and the service was excellent."
print(f'Predicted class: {predict_review(review)}')


In [None]:
# Сохранение модели
model.save('restaurant_review_model.h5')

# Загрузка модели
from tensorflow.keras.models import load_model
model = load_model('restaurant_review_model.h5', custom_objects={'AttentionLayer': AttentionLayer})
