In [2]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.9.2-py3-none-any.whl.metadata (3.2 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2025.11.3-cp310-cp310-win_amd64.whl.metadata (41 kB)
Downloading nltk-3.9.2-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ------------- -------------------------- 0.5/1.5 MB 3.3 MB/s eta 0:00:01
   ---------------------------------------- 1.5/1.5 MB 5.0 MB/s eta 0:00:00
Downloading regex-2025.11.3-cp310-cp310-win_amd64.whl (277 kB)
Installing collected packages: regex, nltk

   -------------------- ------------------- 1/2 [nltk]
   -------------------- ------------------- 1/2 [nltk]
   -------------------- ------------------- 1/2 [nltk]
   -------------------- ------------------- 1/2 [nltk]
   -------------------- ------------------- 1/2 [nltk]
   -------------------- ------------------- 1/2 [nltk]
   -------------------- ------------------- 1/2 [nltk]
   -------------------- ------------------- 1/2 [n

In [3]:
import pandas as pd    # to load dataset
import numpy as np     # for mathematic equation
from nltk.corpus import stopwords   # to get collection of stopwords
from sklearn.model_selection import train_test_split       # for splitting dataset
from tensorflow.keras.preprocessing.text import Tokenizer  # to encode text to int
from tensorflow.keras.preprocessing.sequence import pad_sequences   # to do padding or truncating
from tensorflow.keras.models import Sequential     # the model
from tensorflow.keras.layers import Embedding, LSTM, Dense # layers of the architecture
from tensorflow.keras.callbacks import ModelCheckpoint   # save model
from tensorflow.keras.models import load_model   # load saved model
import re

In [6]:
imdb_path = 'datasets/IMDB_Dataset.csv'

In [7]:
data = pd.read_csv(imdb_path)

print(data)

                                                  review sentiment
0      One of the other reviewers has mentioned that ...  positive
1      A wonderful little production. <br /><br />The...  positive
2      I thought this was a wonderful way to spend ti...  positive
3      Basically there's a family where a little boy ...  negative
4      Petter Mattei's "Love in the Time of Money" is...  positive
...                                                  ...       ...
49995  I thought this movie did a down right good job...  positive
49996  Bad plot, bad dialogue, bad acting, idiotic di...  negative
49997  I am a Catholic taught in parochial elementary...  negative
49998  I'm going to have to disagree with the previou...  negative
49999  No one expects the Star Trek movies to be high...  negative

[50000 rows x 2 columns]


In [8]:
import nltk

nltk.download('stopwords')
nltk.download('punkt')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

english_stops = set(stopwords.words('english'))
print(english_stops)

{'as', 'them', 'up', 'do', 'aren', 'same', 't', 'through', 'had', 'isn', 'itself', 'nor', "you're", 'yourself', "won't", 'i', 'when', 'whom', 'the', 'd', "hasn't", 'too', "they've", 'at', "i'd", 'few', 'her', 'over', "wasn't", 'wouldn', 'y', 'during', 'most', 'just', 'from', 'doesn', 'he', 'you', 'should', 'own', 'for', 'those', 'does', 'o', "hadn't", 'am', 've', "it'll", "wouldn't", 's', 'while', 'there', 'did', "didn't", 'more', 'then', 'don', 'has', 'who', "i'm", 'no', 'what', 'doing', 'or', 'where', 'which', 'won', 'shouldn', 'but', "needn't", 'himself', 'being', 'were', 'your', "don't", "haven't", "it's", 'that', "you'll", 'both', "isn't", 'be', "we'd", 'so', 'very', 'couldn', 'their', 'between', 'can', "weren't", 'weren', "we'll", 'once', 'here', 'this', 'yours', "that'll", "you'd", 'our', 'his', 'some', "we've", 'ours', 'if', "aren't", 'into', 'ourselves', 'all', 'm', "couldn't", "he'd", 'we', 'above', "you've", "i've", 'these', 'ma', "doesn't", 'of', 'now', "they're", "she's", 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Пользователь\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Пользователь\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [9]:
def load_dataset():
    df = pd.read_csv(imdb_path)
    x_data = df['review']       # Reviews/Input
    y_data = df['sentiment']    # Sentiment/Output

    # PRE-PROCESS REVIEW
    x_data = x_data.replace({'<.*?>': ''}, regex = True)          # remove html tag
    x_data = x_data.replace({'[^A-Za-z]': ' '}, regex = True)     # remove non alphabet
    x_data = x_data.apply(lambda review: [w for w in review.split() if w not in english_stops])  # remove stop words
    x_data = x_data.apply(lambda review: [w.lower() for w in review])   # lower case

    # ENCODE SENTIMENT -> 0 & 1
    y_data = y_data.replace('positive', 1)
    y_data = y_data.replace('negative', 0)

    return x_data, y_data

x_data, y_data = load_dataset()

print('Reviews')
print(x_data, '\n')
print('Sentiment')
print(y_data)

Reviews
0        [one, reviewers, mentioned, watching, oz, epis...
1        [a, wonderful, little, production, the, filmin...
2        [i, thought, wonderful, way, spend, time, hot,...
3        [basically, family, little, boy, jake, thinks,...
4        [petter, mattei, love, time, money, visually, ...
                               ...                        
49995    [i, thought, movie, right, good, job, it, crea...
49996    [bad, plot, bad, dialogue, bad, acting, idioti...
49997    [i, catholic, taught, parochial, elementary, s...
49998    [i, going, disagree, previous, comment, side, ...
49999    [no, one, expects, star, trek, movies, high, a...
Name: review, Length: 50000, dtype: object 

Sentiment
0        1
1        1
2        1
3        0
4        1
        ..
49995    1
49996    0
49997    0
49998    0
49999    0
Name: sentiment, Length: 50000, dtype: int64


  y_data = y_data.replace('negative', 0)


In [10]:
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size = 0.2)

print('Train Set')
print(x_train, '\n')
print(x_test, '\n')
print('Test Set')
print(y_train, '\n')
print(y_test)

Train Set
40244    [significant, spoilers, this, sick, disturbing...
3205     [i, excited, hyped, watching, film, promos, fi...
38579    [this, film, released, year, i, born, like, i,...
35314    [two, years, ago, berlin, film, festival, watc...
23174    [having, seen, full, length, film, kieslowski,...
                               ...                        
15768    [alice, kind, movie, made, never, attempts, ev...
32163    [pet, sematary, good, horror, film, believe, s...
13057    [i, sophomore, college, movie, came, i, never,...
30972    [this, movie, somewhat, based, exit, rob, half...
11186    [the, positive, reviews, page, planted, filmma...
Name: review, Length: 40000, dtype: object 

13627    [an, unusual, revisionist, western, well, wort...
31765    [this, movie, blows, feet, this, debut, movie,...
47660    [the, best, scene, the, people, across, the, l...
36459    [i, luxury, seeing, movie, i, rather, young, m...
34913    [john, huston, actor, director, better, known,...
 

In [11]:
def get_max_length():
    review_length = []
    for review in x_train:
        review_length.append(len(review))

    return int(np.ceil(np.mean(review_length)))

In [12]:
# ENCODE REVIEW
token = Tokenizer(lower=False)    # no need lower, because already lowered the data in load_data()
token.fit_on_texts(x_train)
x_train = token.texts_to_sequences(x_train)
x_test = token.texts_to_sequences(x_test)

max_length = get_max_length()

x_train = pad_sequences(x_train, maxlen=max_length, padding='post', truncating='post')
x_test = pad_sequences(x_test, maxlen=max_length, padding='post', truncating='post')

total_words = len(token.word_index) + 1   # add 1 because of 0 padding

print('Encoded X Train\n', x_train, '\n')
print('Encoded X Test\n', x_test, '\n')
print('Maximum review length: ', max_length)

Encoded X Train
 [[ 2659   983     8 ... 20121   480  9495]
 [    1  2267  5870 ...   662     1    31]
 [    8     4   523 ...  2948 10634  1296]
 ...
 [    1 15897  1013 ...  4389   557     6]
 [    8     3   569 ...     3     9   330]
 [    2  1073   750 ...     0     0     0]] 

Encoded X Test
 [[  691  1747 17768 ...     0     0     0]
 [    8     3  3224 ...     0     0     0]
 [    2    45    57 ... 17468 21011  5349]
 ...
 [12277   695    42 ...   189    12  1977]
 [    1    71    70 ...     0     0     0]
 [    1    38     3 ...     1   167     5]] 

Maximum review length:  130


In [13]:
# ARCHITECTURE
EMBED_DIM = 32
LSTM_OUT = 64

model = Sequential()
model.add(Embedding(total_words, EMBED_DIM, input_length = max_length))
model.add(LSTM(LSTM_OUT))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 130, 32)           2957664   
                                                                 
 lstm (LSTM)                 (None, 64)                24832     
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 2,982,561
Trainable params: 2,982,561
Non-trainable params: 0
_________________________________________________________________
None


In [19]:
import os
models_path = 'models'
os.makedirs(models_path, exist_ok=True)

checkpoint = ModelCheckpoint(
    models_path,
    monitor='accuracy',
    save_best_only=True,
    verbose=1
)

In [20]:
model.fit(x_train, y_train, batch_size = 128, epochs = 10, callbacks=[checkpoint])

Epoch 1/10
Epoch 1: accuracy improved from -inf to 0.96167, saving model to models




INFO:tensorflow:Assets written to: models\assets


INFO:tensorflow:Assets written to: models\assets


Epoch 2/10
Epoch 2: accuracy improved from 0.96167 to 0.97887, saving model to models




INFO:tensorflow:Assets written to: models\assets


INFO:tensorflow:Assets written to: models\assets


Epoch 3/10
Epoch 3: accuracy improved from 0.97887 to 0.98600, saving model to models




INFO:tensorflow:Assets written to: models\assets


INFO:tensorflow:Assets written to: models\assets


Epoch 4/10
Epoch 4: accuracy improved from 0.98600 to 0.98820, saving model to models




INFO:tensorflow:Assets written to: models\assets


INFO:tensorflow:Assets written to: models\assets


Epoch 5/10
Epoch 5: accuracy improved from 0.98820 to 0.98913, saving model to models




INFO:tensorflow:Assets written to: models\assets


INFO:tensorflow:Assets written to: models\assets


Epoch 6/10
Epoch 6: accuracy improved from 0.98913 to 0.99063, saving model to models




INFO:tensorflow:Assets written to: models\assets


INFO:tensorflow:Assets written to: models\assets


Epoch 7/10
Epoch 7: accuracy improved from 0.99063 to 0.99415, saving model to models




INFO:tensorflow:Assets written to: models\assets


INFO:tensorflow:Assets written to: models\assets


Epoch 8/10
Epoch 8: accuracy did not improve from 0.99415
Epoch 9/10
Epoch 9: accuracy improved from 0.99415 to 0.99520, saving model to models




INFO:tensorflow:Assets written to: models\assets


INFO:tensorflow:Assets written to: models\assets


Epoch 10/10
Epoch 10: accuracy did not improve from 0.99520


<keras.callbacks.History at 0x14276b2b0>

In [21]:
y_pred_proba = model.predict(x_test, batch_size=128)

# Для многоклассовой классификации - берем класс с максимальной вероятностью
y_pred = np.argmax(y_pred_proba, axis=1)

# Если y_test тоже в one-hot encoding, преобразуем его
if y_test.ndim > 1:
    y_test_labels = np.argmax(y_test, axis=1)
else:
    y_test_labels = y_test

true = 0
for i, y in enumerate(y_test_labels):
    if y == y_pred[i]:
        true += 1

print('Correct Prediction: {}'.format(true))
print('Wrong Prediction: {}'.format(len(y_pred) - true))
print('Accuracy: {}'.format(true/len(y_pred)*100))

Correct Prediction: 4984
Wrong Prediction: 5016
Accuracy: 49.84


In [22]:
loaded_model = load_model(models_path)

In [23]:
review = str(input('Movie Review: '))

Movie Review:  it's amazing, googd film


In [24]:
# Pre-process input
regex = re.compile(r'[^a-zA-Z\s]')
review = regex.sub('', review)
print('Cleaned: ', review)

words = review.split(' ')
filtered = [w for w in words if w not in english_stops]
filtered = ' '.join(filtered)
filtered = [filtered.lower()]

print('Filtered: ', filtered)

Cleaned:  its amazing googd film
Filtered:  ['amazing googd film']


In [25]:
tokenize_words = token.texts_to_sequences(filtered)
tokenize_words = pad_sequences(tokenize_words, maxlen=max_length, padding='post', truncating='post')
print(tokenize_words)

[[396   4   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0]]


In [26]:
result = loaded_model.predict(tokenize_words)
print(result)

[[0.9993506]]


In [27]:
if result >= 0.7:
    print('positive')
else:
    print('negative')

positive


In [36]:
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    print(f"GPU available: {gpus[0]}")
    # Вывод информации о GPU
    for gpu in gpus:
        details = tf.config.experimental.get_device_details(gpu)
        print(f"{details.get('device_name', 'Unknown GPU')}")
else:
    print("GPU doesn\t available, CPU using")

GPU available: PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')
NVIDIA GeForce GTX 1650 Ti


In [1]:
import tensorflow as tf
from tensorflow.keras.layers import Dropout, BatchNormalization
from tensorflow.keras.preprocessing.text import Tokenizer
import random
import numpy as np

def prepare_char_level_data(texts, seq_length=5, max_texts=1000):
    if len(texts) > max_texts:
        texts = random.sample(texts, max_texts)

    print(f"texts: {len(texts)}")

    all_text = ' '.join(texts)
    
    if len(all_text) == 0:
        return [], [], {}, {}, 0

    chars = sorted(list(set(all_text)))
    print(f"unique symbols: {len(chars)}")

    char_to_idx = {char: idx for idx, char in enumerate(chars)}
    idx_to_char = {idx: char for char, idx in char_to_idx.items()}

    sequences = []
    next_chars = []

    for text in texts:
        for i in range(0, len(text) - seq_length):
            seq = text[i:i + seq_length]
            next_char = text[i + seq_length]
            sequences.append([char_to_idx[char] for char in seq])
            next_chars.append(char_to_idx[next_char])

    print(f"sequences count: {len(sequences)}")
    return sequences, next_chars, char_to_idx, idx_to_char, len(chars)

def build_char_model(vocab_size, seq_length=5):
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size, 128, input_length=seq_length),
        BatchNormalization(),
        
        tf.keras.layers.LSTM(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
        tf.keras.layers.LSTM(64, dropout=0.2, recurrent_dropout=0.2),
        
        tf.keras.layers.Dense(128, activation='relu'),
        BatchNormalization(),
        Dropout(0.3),
        
        tf.keras.layers.Dense(64, activation='relu'),
        Dropout(0.2),
        
        tf.keras.layers.Dense(vocab_size, activation='softmax')
    ])

    optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
    model.compile(optimizer=optimizer,
                 loss='sparse_categorical_crossentropy',
                 metrics=['accuracy'])
    return model

def generate_char_text(model, seed_text, char_to_idx, idx_to_char, seq_length=5, num_chars=50):
    generated = seed_text

    for _ in range(num_chars):
        seq = [char_to_idx.get(char, 0) for char in seed_text[-seq_length:]]
        seq = tf.keras.preprocessing.sequence.pad_sequences([seq], maxlen=seq_length, padding='pre')

        pred = model.predict(seq, verbose=0)[0]
        next_idx = random.choices(range(len(pred)), weights=pred)[0]
        next_char = idx_to_char.get(next_idx, ' ')

        generated += next_char
        seed_text = seed_text[1:] + next_char if len(seed_text) >= seq_length else seed_text + next_char

    return generated

def prepare_word_level_data(texts, seq_length=3, max_texts=1000):
    if len(texts) > max_texts:
        texts = random.sample(texts, max_texts)

    print(f"texts: {len(texts)}")

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(texts)

    sequences = []
    for text in texts:
        words = text.split()
        for i in range(seq_length, len(words)):
            seq = words[i-seq_length:i]
            sequences.append(' '.join(seq))

    sequences = tokenizer.texts_to_sequences(sequences)

    X = []
    y = []

    for seq in sequences:
        if len(seq) == seq_length:
            X.append(seq)  # Вся последовательность
            y.append(seq[-1])  # Последнее слово как целевое

    # Теперь X содержит последовательности длины seq_length, нужно разделить на входы и цели
    X_input = [seq[:-1] for seq in X]  # Все кроме последнего слова
    y_target = [seq[-1] for seq in X]   # Только последнее слово

    print(f"word sequences: {len(X_input)}")
    print(f"dictionary size: {len(tokenizer.word_index) + 1}")
    print(f"input sequence length: {len(X_input[0]) if len(X_input) > 0 else 0}")

    return X_input, y_target, tokenizer, len(tokenizer.word_index) + 1

def build_word_model(vocab_size, seq_length=2):  # Изменено на 2, т.к. входная длина = seq_length - 1
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size, 256, input_length=seq_length),
        BatchNormalization(),
        
        tf.keras.layers.LSTM(256, return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
        tf.keras.layers.LSTM(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
        tf.keras.layers.LSTM(64, dropout=0.2, recurrent_dropout=0.2),
        
        tf.keras.layers.Dense(256, activation='relu'),
        BatchNormalization(),
        Dropout(0.3),
        
        tf.keras.layers.Dense(128, activation='relu'),
        BatchNormalization(),
        Dropout(0.3),
        
        tf.keras.layers.Dense(64, activation='relu'),
        Dropout(0.2),
        
        tf.keras.layers.Dense(vocab_size, activation='softmax')
    ])

    optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
    model.compile(optimizer=optimizer,
                 loss='sparse_categorical_crossentropy',
                 metrics=['accuracy'])
    return model

def generate_word_text(model, seed_text, tokenizer, seq_length=2, num_words=10, temperature=1.0):
    generated = seed_text.split()

    for _ in range(num_words):
        seed_words = generated[-seq_length:]  # Берем последние seq_length слов
        token_list = tokenizer.texts_to_sequences([' '.join(seed_words)])[0]

        if len(token_list) < seq_length:
            token_list = tf.keras.preprocessing.sequence.pad_sequences([token_list], 
                                                                     maxlen=seq_length, 
                                                                     padding='pre')[0]
        else:
            token_list = token_list[:seq_length]

        predictions = model.predict(np.array([token_list]), verbose=0)[0]

        predictions = np.log(predictions + 1e-7) / temperature
        exp_preds = np.exp(predictions)
        predictions = exp_preds / np.sum(exp_preds)

        next_idx = random.choices(range(len(predictions)), weights=predictions)[0]

        next_word = ""
        for word, idx in tokenizer.word_index.items():
            if idx == next_idx:
                next_word = word
                break

        if next_word:
            generated.append(next_word)

    return ' '.join(generated)

# Проверка GPU
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    print(f"GPU: {gpus[0]}")
else:
    print("CPU")

# Загрузка данных
from tensorflow.keras.datasets import imdb

(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=15000)

word_index = imdb.get_word_index()
reverse_word_index = {value: key for key, value in word_index.items()}

def decode_review(encoded_review):
    return ' '.join([reverse_word_index.get(i - 3, '?') for i in encoded_review])

texts = []
for i in range(min(10000, len(x_train))):
    texts.append(decode_review(x_train[i]))

print(f"Загружено текстов: {len(texts)}")

# Обучение моделей
print("\n1. symbols gen")
char_sequences, next_chars, char_to_idx, idx_to_char, char_vocab_size = prepare_char_level_data(texts)

if len(char_sequences) > 0:
    X_char = np.array(char_sequences)
    y_char = np.array(next_chars)

    char_model = build_char_model(char_vocab_size)
    print("training symbols model")
    char_model.fit(X_char, y_char, batch_size=256, epochs=15, verbose=1)

    print("testing symbols gen")
    seed_texts = ["the movie", "this film", "i think", "the story"]
    for seed in seed_texts:
        if all(char in char_to_idx for char in seed):
            generated = generate_char_text(char_model, seed, char_to_idx, idx_to_char)
            print(f"'{seed}' -> '{generated}'")
        else:
            print(f"symbols '{seed}' not found in dictionary")

# Демонстрация
if 'char_model' in locals() and len(char_sequences) > 0:
    print("\ntext gen: SYMBOLS")
    demo_seeds = ["the movie was", "i really liked", "the acting is"]
    for seed in demo_seeds:
        if all(char in char_to_idx for char in seed):
            result = generate_char_text(char_model, seed, char_to_idx, idx_to_char, num_chars=30)
            print(f"'{seed}' -> '{result}'")

print('done')

GPU: PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')
Загружено текстов: 10000

1. symbols gen
texts: 1000
unique symbols: 46
sequences count: 1224915
training symbols model
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
testing symbols gen
'the movie' -> 'the movie is unlik ever the night on to a personals used by'
'this film' -> 'this film rane's stroughout any to be deeply films passion '
'i think' -> 'i thinking ? but caption heavid is a movie take drams muc'
'the story' -> 'the story time ? guy the interest is working quite people v'

2. words gen
texts: 1000
word sequences: 206586
dictionary size: 11622
input sequence length: 2
training words model
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
testing words gen
'this mo

In [10]:
def prepare_word_level_data(texts, seq_length=4, max_texts=2000, num_words=3000):
    """Улучшенная подготовка данных с фильтрацией и балансировкой"""
    if len(texts) > max_texts:
        texts = random.sample(texts, max_texts)

    print(f"Processing {len(texts)} texts")
    print(f"Target vocabulary: {num_words} words")
    print(f"Sequence length: {seq_length}")

    # Фильтруем слишком короткие тексты
    filtered_texts = []
    for text in texts:
        words = text.split()
        if len(words) >= seq_length + 2:  # Минимум seq_length + 2 слова
            filtered_texts.append(text)
    
    texts = filtered_texts
    print(f"After length filtering: {len(texts)} texts")

    tokenizer = Tokenizer(
        num_words=num_words, 
        oov_token="<OOV>",
        filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'  # Базовые фильтры
    )
    tokenizer.fit_on_texts(texts)

    sequences = tokenizer.texts_to_sequences(texts)

    X = []
    y = []

    for seq in sequences:
        if len(seq) >= seq_length + 1:
            for i in range(seq_length, len(seq)):
                # Проверяем, что все слова в последовательности ненулевые
                if all(token != 0 for token in seq[i-seq_length:i+1]):
                    X.append(seq[i-seq_length:i])
                    y.append(seq[i])

    print(f"Generated {len(X)} clean training sequences")
    print(f"Actual vocabulary used: {len([w for w in tokenizer.word_counts.items() if w[1] >= 2])} words (appearing ≥2 times)")
    
    return X, y, tokenizer, min(num_words, len(tokenizer.word_index) + 1)

def generate_word_text(model, seed_text, tokenizer, seq_length=4, num_words=8, temperature=1.0):
    """Улучшенная генерация с обработкой ошибок"""
    generated = seed_text.split()
    
    # Убедимся, что начальное семя не слишком короткое
    if len(generated) < seq_length:
        # Дополним случайными словами из словаря если нужно
        while len(generated) < seq_length:
            random_word = random.choice(list(tokenizer.word_index.keys())[:100])  # Только частые слова
            generated.insert(0, random_word)
    
    for _ in range(num_words):
        try:
            # Берем последние seq_length слов
            seed_words = generated[-seq_length:]
            
            # Преобразуем в последовательность токенов
            token_list = tokenizer.texts_to_sequences([' '.join(seed_words)])[0]
            
            # Дополняем до нужной длины
            if len(token_list) < seq_length:
                token_list = [0] * (seq_length - len(token_list)) + token_list
            else:
                token_list = token_list[-seq_length:]
            
            # Предсказание
            predictions = model.predict(np.array([token_list]), verbose=0)[0]
            
            # Температурное преобразование
            predictions = np.log(predictions + 1e-7) / temperature
            exp_preds = np.exp(predictions)
            predictions = exp_preds / np.sum(exp_preds)
            
            # Исключаем OOV токены из выбора
            valid_indices = [i for i in range(len(predictions)) 
                           if i in tokenizer.index_word and i != 0]  # Исключаем padding и OOV
            
            if not valid_indices:
                next_word = "the"  # Fallback слово
            else:
                valid_predictions = [predictions[i] for i in valid_indices]
                valid_predictions = np.array(valid_predictions) / np.sum(valid_predictions)
                
                next_idx = random.choices(valid_indices, weights=valid_predictions)[0]
                next_word = tokenizer.index_word[next_idx]
            
            generated.append(next_word)
            
        except Exception as e:
            print(f"Warning in generation: {e}")
            generated.append("the")  # Fallback
            break
    
    return ' '.join(generated)

print("\n=== IMPROVED WORD-LEVEL TEXT GENERATION ===")

# Улучшенная подготовка данных
word_sequences, next_words, word_tokenizer, word_vocab_size = prepare_word_level_data(
    texts, 
    seq_length=4,  # Уменьшил для лучшего обучения
    max_texts=2000,  # Уменьшил количество текстов
    num_words=3000   # Сильно уменьшил словарь
)

if len(word_sequences) > 0:
    X_word = np.array(word_sequences)
    y_word = np.array(next_words)
    
    print(f"Training data shape: {X_word.shape}")
    print(f"Vocabulary size: {word_vocab_size}")
    print(f"Sample sequences: {X_word[:3]}")
    print(f"Sample targets: {y_word[:3]}")

    # УПРОЩЕННАЯ модель для лучшего обучения
    word_model = tf.keras.Sequential([
        tf.keras.layers.Embedding(
            word_vocab_size, 
            64,  # Уменьшил размерность
            input_length=4,
            mask_zero=True
        ),
        
        # Упростил архитектуру - убрал BatchNormalization и уменьшил слои
        tf.keras.layers.LSTM(128, return_sequences=False, dropout=0.1),
        
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dropout(0.2),
        
        tf.keras.layers.Dense(32, activation='relu'),
        
        tf.keras.layers.Dense(word_vocab_size, activation='softmax')
    ])

    # Увеличил learning rate и настроил оптимизатор
    optimizer = tf.keras.optimizers.Adam(
        learning_rate=0.005,  # Увеличил в 5 раз
        beta_1=0.9,
        beta_2=0.999
    )
    
    word_model.compile(
        optimizer=optimizer,
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    
    print("Model summary:")
    word_model.summary()

    # Улучшенные callback'и
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='loss', 
            patience=5,
            min_delta=0.01,
            restore_best_weights=True
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='loss', 
            factor=0.5, 
            patience=3, 
            min_lr=0.0001,
            verbose=1
        )
    ]
    
    print("\nTraining word-level model...")
    history = word_model.fit(
        X_word, 
        y_word, 
        batch_size=64,  # Уменьшил batch_size
        epochs=50,       # Увеличил эпохи
        verbose=1,
        callbacks=callbacks,
        validation_split=0.15,
        shuffle=True     # Добавил перемешивание
    )

    # Детальный анализ обучения
    print("\n=== TRAINING ANALYSIS ===")
    final_accuracy = history.history['accuracy'][-1]
    final_loss = history.history['loss'][-1]
    print(f"Final training accuracy: {final_accuracy:.4f}")
    print(f"Final training loss: {final_loss:.4f}")
    
    if 'val_accuracy' in history.history:
        val_accuracy = history.history['val_accuracy'][-1]
        val_loss = history.history['val_loss'][-1]
        print(f"Final validation accuracy: {val_accuracy:.4f}")
        print(f"Final validation loss: {val_loss:.4f}")

    # Проверка на тренировочных данных
    print("\n=== QUICK TRAINING SAMPLE CHECK ===")
    sample_indices = np.random.choice(len(X_word), min(5, len(X_word)), replace=False)
    for idx in sample_indices:
        sample_input = X_word[idx:idx+1]
        sample_target = y_word[idx]
        prediction = word_model.predict(sample_input, verbose=0)
        pred_word_idx = np.argmax(prediction[0])
        actual_word = word_tokenizer.index_word.get(sample_target, '<?>')
        pred_word = word_tokenizer.index_word.get(pred_word_idx, '<?>')
        print(f"Input seq: {sample_input[0]} -> Actual: '{actual_word}', Pred: '{pred_word}'")

    # Тестирование генерации
    print("\n=== IMPROVED GENERATION TESTING ===")
    
    # Функция для безопасной генерации
    def safe_generate(seed, temperature=0.7, num_words=6):
        try:
            return generate_word_text(
                word_model, 
                seed, 
                word_tokenizer, 
                seq_length=4, 
                num_words=num_words, 
                temperature=temperature
            )
        except Exception as e:
            return f"Generation error: {e}"

    # Простые тестовые сиды
    test_seeds = [
        "i like this",
        "the movie is",
        "this film",
        "it was very"
    ]
    
    print("\nSimple generation tests:")
    for seed in test_seeds:
        result = safe_generate(seed, temperature=0.8, num_words=4)
        print(f"'{seed}' -> '{result}'")

    # Основная демонстрация
    if final_accuracy > 0.3:  # Только если модель обучилась достаточно хорошо
        print("\n" + "="*50)
        print("MAIN DEMONSTRATION")
        print("="*50)
        
        demo_seeds = [
            "this movie is",
            "i really think", 
            "the story was",
            "the acting is"
        ]
        
        for seed in demo_seeds:
            print(f"\n--- {seed} ---")
            for temp in [0.5, 0.7, 1.0]:
                result = safe_generate(seed, temperature=temp, num_words=6)
                print(f"Temp {temp}: {result}")
    else:
        print(f"\nModel accuracy too low ({final_accuracy:.4f}) for meaningful generation.")
        print("Consider: increasing training data, decreasing vocabulary size, or simplifying model further.")

print('\nWord generation process completed!')


=== IMPROVED WORD-LEVEL TEXT GENERATION ===
Processing 2000 texts
Target vocabulary: 3000 words
Sequence length: 4
After length filtering: 2000 texts
Generated 454074 clean training sequences
Actual vocabulary used: 11735 words (appearing ≥2 times)
Training data shape: (454074, 4)
Vocabulary size: 3000
Sample sequences: [[ 465   75 1171   12]
 [  75 1171   12  105]
 [1171   12  105    1]]
Sample targets: [105   1  17]
Model summary:
Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_8 (Embedding)     (None, 4, 64)             192000    
                                                                 
 lstm_15 (LSTM)              (None, 128)               98816     
                                                                 
 dense_26 (Dense)            (None, 64)                8256      
                                                                 
 dropout_15 

KeyboardInterrupt: 