In [1]:
import os
import numpy as np
import cv2
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, RepeatVector, concatenate
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [34]:
video_folder = '/Users/vlyrdv/Desktop/hack/rutube_hackathon_novosibirsk/train/train_video'
caption_folder = '/Users/vlyrdv/Desktop/hack/rutube_hackathon_novosibirsk/train/train_stt'

# Загрузка видео и описаний
video_files = os.listdir(video_folder)
caption_files = os.listdir(caption_folder)

videos = []
captions = []

for video_file in video_files:
    video_path = os.path.join(video_folder, video_file)
    video = cv2.VideoCapture(video_path)
    frames = []
    count = 0
    while video.isOpened():
        ret, frame = video.read()
        if count % 60 == 0:
            frames.append(frame)
        if not ret:
            break
        if len(frames) >= 25:
            videos.append(frames)
            break
 
video.release()
print(len(videos))
print(len(videos[0]))
print(videos[0][0].shape)
print()


for caption_file in caption_files:
    caption_path = os.path.join(caption_folder, caption_file)
    with open(caption_path, 'r') as f:
        caption = f.read()
        captions.append(caption)

# Предобработка данных
def preprocess_data(videos, captions):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(captions)
    captions_seq = tokenizer.texts_to_sequences(captions)

    max_len = max(len(seq) for seq in captions_seq)
    captions_seq_padded = pad_sequences(captions_seq, maxlen=max_len, padding='post')

    return np.array(videos), captions_seq_padded, tokenizer

videos, captions_seq_padded, tokenizer = preprocess_data(videos, captions)

def create_model(max_video_length, max_caption_length, num_words):
    input_video = Input((360, 640, 3))
    input_caption = Input(shape=(max_caption_length))

    video_encoder = LSTM(256)(input_video)

    caption_encoder = Embedding(num_words, 256, mask_zero=True)(input_caption)
    caption_encoder = LSTM(256)(caption_encoder)

    decoder = Dense(256, activation="relu")(video_encoder)
    decoder = RepeatVector(max_caption_length)(decoder)
    decoder = concatenate([decoder, caption_encoder])
    decoder = LSTM(512, video[0].shape, return_sequences=True)(decoder)
    decoder = Dense(num_words, activation="softmax")(decoder)

    model = Model(inputs=[input_video, input_caption], outputs=decoder)
    return model

max_video_length = 25
max_caption_length = captions_seq_padded.shape[1]
num_words = len(tokenizer.word_index) + 1


model = create_model(max_video_length, max_caption_length, num_words)


model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
model.fit([videos, captions_seq_padded[:, :-1]], np.expand_dims(captions_seq_padded[:, 1:], axis=-1), epochs=10, batch_size=64)

500
25
(360, 640, 3)



ValueError: Input 0 of layer "lstm_8" is incompatible with the layer: expected ndim=3, found ndim=4. Full shape received: (None, 360, 640, 3)