In [1]:
import os
import json
import time
import random
import requests

from tqdm.notebook import tqdm

### 1. Настройка API TMDB

In [None]:
# Prefer a key supplied through the environment so the secret never has to be
# stored in the notebook; the placeholder is kept as the fallback value.
TMDB_API_KEY = os.environ.get("TMDB_API_KEY", "<TMDB API KEY>")
url = f"https://api.themoviedb.org/3/movie/popular?api_key={TMDB_API_KEY}&language=ru-RU&page=1"

# Smoke test of the API. The timeout keeps the cell from hanging on a dead
# connection, and raise_for_status() reports HTTP errors (e.g. an invalid key)
# directly instead of as a KeyError on "results".
response = requests.get(url, timeout=10)
response.raise_for_status()
movies = response.json()["results"]
movies

ConnectionError: HTTPSConnectionPool(host='api.themoviedb.org', port=443): Max retries exceeded with url: /3/movie/popular?api_key=<TMDB API KEY>&language=ru-RU&page=1 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f5f830412d0>: Failed to establish a new connection: [Errno 111] Connection refused'))

### 2. Сбор метаданных

In [None]:
NUM_MOVIES = 500  # number of movies kept after de-duplication and shuffling
OUTPUT_DIR = "movie_data"  # directory that receives the JSON files written by this cell
LANGUAGE = "ru-RU"  # value sent as the TMDB `language` query parameter

def get_diverse_movies(api_key, num_movies=500, language="ru-RU"):
    """Collect a varied, de-duplicated sample of movies from several TMDB listings.

    Popular, top-rated, most-voted, per-genre and per-release-year listings are
    queried page by page. Movies without a release date are dropped, duplicates
    are removed by id, and the combined list is shuffled before truncation.

    Args:
        api_key: TMDB API key sent as the `api_key` query parameter.
        num_movies: Maximum number of movies to return.
        language: Value of the `language` query parameter.

    Returns:
        A list of at most `num_movies` movie dicts as returned by the API.
    """
    request_timeout = 10  # seconds; keeps a dead connection from hanging the run
    max_rate_limit_retries = 5  # upper bound on retries of one page after HTTP 429

    all_movies = []
    movie_ids = set()

    endpoints = [
        "movie/popular",
        "movie/top_rated",
        "discover/movie?sort_by=vote_count.desc",
        "discover/movie?with_genres=28",  # Action
        "discover/movie?with_genres=35",  # Comedy
        "discover/movie?with_genres=18",  # Drama
        "discover/movie?with_genres=27",  # Horror
        "discover/movie?with_genres=10749",  # Romance
        "discover/movie?with_genres=878",  # Science fiction
    ]
    # One extra listing for every fifth release year from 1980 onwards.
    endpoints.extend(
        f"discover/movie?primary_release_year={year}&sort_by=popularity.desc"
        for year in range(1980, 2024, 5)
    )

    target_per_endpoint = num_movies // len(endpoints) + 1
    # Ceiling division by 20, the page size this calculation assumes.
    pages_per_endpoint = (target_per_endpoint + 19) // 20

    for endpoint in tqdm(endpoints, desc="Запрос разных категорий"):
        endpoint_movies = 0
        # Endpoints that already carry a query string need "&" instead of "?".
        separator = "&" if "?" in endpoint else "?"

        for page in range(1, pages_per_endpoint + 1):
            url = f"https://api.themoviedb.org/3/{endpoint}{separator}api_key={api_key}&language={language}&page={page}"

            results = None
            # Retry the SAME page after a rate-limit response. Decrementing the
            # loop variable of a `for ... in range()` loop has no effect, so the
            # retry has to be an explicit inner loop.
            for attempt in range(max_rate_limit_retries + 1):
                try:
                    response = requests.get(url, timeout=request_timeout)

                    if response.status_code == 200:
                        results = response.json().get("results", [])
                    elif response.status_code == 429 and attempt < max_rate_limit_retries:
                        print(f"Превышен лимит запросов. Ожидание 10 секунд...")
                        time.sleep(10)
                        continue
                    else:
                        print(f"Ошибка при загрузке {endpoint}, страница {page}: {response.status_code}")
                        time.sleep(1)
                except Exception as e:
                    print(f"Исключение при запросе {url}: {e}")
                    time.sleep(2)
                break

            if results is None:
                # The page could not be fetched; move on to the next one.
                continue

            for movie in results:
                movie_id = movie.get("id")
                if movie_id is None or movie_id in movie_ids or not movie.get("release_date"):
                    continue
                all_movies.append(movie)
                movie_ids.add(movie_id)
                endpoint_movies += 1

            if endpoint_movies >= target_per_endpoint:
                break

            time.sleep(0.5)

    print(f"Собрано {len(all_movies)} уникальных фильмов")

    random.shuffle(all_movies)
    return all_movies[:num_movies]

def get_movie_details(movie_id, api_key, language="ru-RU", max_retries=5):
    """Fetch one movie from TMDB (with credits and keywords) as a flat dict.

    Args:
        movie_id: TMDB movie identifier.
        api_key: TMDB API key sent as the `api_key` query parameter.
        language: Value of the `language` query parameter.
        max_retries: How many times the request is repeated after an HTTP 429
            response before giving up. The original implementation recursed
            without a limit, which could never terminate on a persistent 429.

    Returns:
        A dict with id, titles, release date/year, genres, overview, image
        paths, popularity, rating and runtime; plus "director"/"cast" when the
        response contains credits and "keywords" when it contains keywords.
        Returns None on any HTTP error or exception.
    """
    movie_url = f"https://api.themoviedb.org/3/movie/{movie_id}?api_key={api_key}&language={language}&append_to_response=credits,keywords"

    for attempt in range(max_retries + 1):
        # Random pause before every request to spread the calls over time.
        time.sleep(random.uniform(0.2, 0.7))

        try:
            # The timeout keeps a stalled connection from blocking the whole run.
            response = requests.get(movie_url, timeout=10)

            if response.status_code == 429 and attempt < max_retries:
                print(f"Превышен лимит запросов для фильма {movie_id}. Ожидание 10 секунд...")
                time.sleep(10)
                continue

            if response.status_code != 200:
                print(f"Ошибка при получении информации о фильме {movie_id}: {response.status_code}")
                return None

            movie_data = response.json()

            movie_details = {
                "id": movie_data["id"],
                "title": movie_data["title"],
                "original_title": movie_data["original_title"],
                "release_date": movie_data.get("release_date", ""),
                # First four characters of the release date, when one is present.
                "year": movie_data["release_date"][:4] if movie_data.get("release_date") else None,
                "genres": [genre["name"] for genre in movie_data.get("genres", [])],
                "overview": movie_data.get("overview", ""),
                "poster_path": movie_data.get("poster_path"),
                "backdrop_path": movie_data.get("backdrop_path"),
                "popularity": movie_data.get("popularity"),
                "vote_average": movie_data.get("vote_average"),
                "runtime": movie_data.get("runtime"),
            }

            if "credits" in movie_data:
                credits = movie_data["credits"]
                directors = [
                    person["name"]
                    for person in credits.get("crew", [])
                    if person.get("job") == "Director"
                ]
                movie_details["director"] = directors[0] if directors else None
                # Only the first five cast entries are kept.
                movie_details["cast"] = [person["name"] for person in credits.get("cast", [])[:5]]

            if "keywords" in movie_data and "keywords" in movie_data["keywords"]:
                movie_details["keywords"] = [kw["name"] for kw in movie_data["keywords"]["keywords"]]

            return movie_details

        except Exception as e:
            print(f"Исключение при получении данных о фильме {movie_id}: {e}")
            return None

    return None

# Make sure the target directory exists before anything is written into it.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Step 1: gather the mixed movie listing and store it unchanged.
print("Шаг 1: Получение списка фильмов из разных категорий")
movies_list = get_diverse_movies(TMDB_API_KEY, num_movies=NUM_MOVIES, language=LANGUAGE)

with open(os.path.join(OUTPUT_DIR, "movies_list.json"), "w", encoding="utf-8") as f:
    json.dump(movies_list, f, ensure_ascii=False, indent=4)

# Step 2: fetch the details of every listed movie; ids that yield nothing
# are collected separately.
print("Шаг 2: Получение детальной информации о фильмах")
detailed_movies, errors = {}, []

for movie in tqdm(movies_list, desc="Получение детальной информации"):
    movie_id = movie["id"]
    movie_details = get_movie_details(movie_id, TMDB_API_KEY, LANGUAGE)

    if not movie_details:
        errors.append(movie_id)
        continue

    # Keys are stored as strings.
    detailed_movies[str(movie_id)] = movie_details

with open(os.path.join(OUTPUT_DIR, "detailed_movies.json"), "w", encoding="utf-8") as f:
    json.dump(detailed_movies, f, ensure_ascii=False, indent=4)

# The error file is only written when at least one movie failed.
if errors:
    with open(os.path.join(OUTPUT_DIR, "errors.json"), "w") as f:
        json.dump(errors, f)

# Summary of the run.
print("Сбор метаданных завершен!")
print(f"Собрана детальная информация о {len(detailed_movies)} фильмах")
print(f"Ошибки при получении данных для {len(errors)} фильмов")
print(f"Результаты сохранены в директории: {OUTPUT_DIR}")

# Show the first collected movie as a sample of the stored structure.
if detailed_movies:
    first_movie_id = next(iter(detailed_movies))
    print("\nПример данных о первом фильме:")
    print(f"Название: {detailed_movies[first_movie_id]['title']}")
    print(f"Год: {detailed_movies[first_movie_id].get('year')}")
    print(f"Жанры: {', '.join(detailed_movies[first_movie_id].get('genres', []))}")
    print(f"Режиссер: {detailed_movies[first_movie_id].get('director')}")

Шаг 1: Получение списка фильмов из разных категорий


Запрос разных категорий:   0%|          | 0/18 [00:00<?, ?it/s]

Собрано 580 уникальных фильмов
Шаг 2: Получение детальной информации о фильмах


Получение детальной информации:   0%|          | 0/500 [00:00<?, ?it/s]

Сбор метаданных завершен!
Собрана детальная информация о 500 фильмах
Ошибки при получении данных для 0 фильмов
Результаты сохранены в директории: movie_data

Пример данных о первом фильме:
Название: Мстители
Год: 2012
Жанры: фантастика, боевик, приключения
Режиссер: Джосс Уидон


### 3. Сбор кадров к фильмам с TMDB

In [None]:
import os
import json
import time
import random
import requests
from PIL import Image
import numpy as np
from io import BytesIO
from tqdm import tqdm
import concurrent.futures

# Configuration of the frame download stage.
INPUT_FILE = "movie_data/detailed_movies.json"  # metadata JSON read below
OUTPUT_DIR = "movie_data/frames"  # root directory; one sub-directory per movie id is created in it
MAX_FRAMES_PER_MOVIE = 30  # passed to process_movie as max_frames
MIN_WIDTH = 500  # frames narrower than this are rejected
MIN_BRIGHTNESS = 30  # frames whose mean grayscale value is below this are rejected
MIN_FILE_SIZE = 20000  # bytes
MAX_THREADS = 5  # worker count of the thread pool in process_all_movies

TMDB_IMAGE_BASE_URL = "https://image.tmdb.org/t/p/original"
TMDB_API_URL = "https://api.themoviedb.org/3"

# NOTE(review): not referenced by any function visible in this cell.
MOVIE_STILLS_DB_BASE_URL = "https://www.moviestillsdb.com/movies"

os.makedirs(OUTPUT_DIR, exist_ok=True)

# Load the per-movie metadata mapping (movie id -> details).
with open(INPUT_FILE, "r", encoding="utf-8") as f:
    movies_metadata = json.load(f)

print(f"Загружены метаданные для {len(movies_metadata)} фильмов")

def get_movie_frames_from_tmdb(movie_id, api_key, max_frames=15, max_retries=5):
    """Return up to `max_frames` image URLs for a movie from the TMDB images endpoint.

    Entries under "stills" are taken first, then "backdrops" fill the remaining
    slots. Within each group the largest images (width * height) come first.

    Args:
        movie_id: TMDB movie identifier.
        api_key: TMDB API key sent as the `api_key` query parameter.
        max_frames: Maximum number of URLs to return.
        max_retries: How many times the request is repeated after an HTTP 429
            response. The original implementation recursed without a limit.

    Returns:
        A list of absolute image URLs; empty on HTTP errors or exceptions.
    """
    images_url = f"{TMDB_API_URL}/movie/{movie_id}/images?api_key={api_key}&include_image_language=en,null,ru"

    def largest_first(images):
        # Order by pixel area, biggest image first.
        return sorted(images, key=lambda x: x.get("width", 0) * x.get("height", 0), reverse=True)

    frame_urls = []

    for attempt in range(max_retries + 1):
        try:
            # The timeout keeps a stalled connection from blocking a worker thread.
            response = requests.get(images_url, timeout=10)

            if response.status_code == 429 and attempt < max_retries:
                print(f"Превышен лимит запросов для кадров фильма {movie_id}. Ожидание 10 секунд...")
                time.sleep(10)
                continue

            if response.status_code == 200:
                image_data = response.json()

                for still in largest_first(image_data.get("stills", []))[:max_frames]:
                    frame_urls.append(f"{TMDB_IMAGE_BASE_URL}{still['file_path']}")

                if len(frame_urls) < max_frames:
                    backdrops = largest_first(image_data.get("backdrops", []))
                    for backdrop in backdrops[:max_frames - len(frame_urls)]:
                        frame_urls.append(f"{TMDB_IMAGE_BASE_URL}{backdrop['file_path']}")
            else:
                # Previously any other status was dropped without a trace.
                print(f"Ошибка при получении кадров для фильма {movie_id}: {response.status_code}")

        except Exception as e:
            print(f"Ошибка при получении кадров для фильма {movie_id}: {e}")

        break

    return frame_urls



def search_alternate_sources(movie_data, max_frames=15):
    """Placeholder for frame lookup outside TMDB; currently always returns an empty list.

    The title slug and the year are derived from `movie_data`, but no source
    is queried with them yet. `max_frames` is accepted and not used.
    """
    found_urls = []

    slug = movie_data.get("original_title", "").lower().replace(" ", "-")
    release_year = movie_data.get("year", "")

    if not (slug and release_year):
        return found_urls

    # Nothing is implemented for the case where both values are present.
    return found_urls

def download_and_validate_frame(url, output_path, min_width=500, min_brightness=30, min_file_size=20000):
    """Download an image and store it only if it passes the quality checks.

    A frame is rejected when the HTTP status is not 200, the payload is smaller
    than `min_file_size` bytes, the image is narrower than `min_width`, or the
    mean grayscale value is below `min_brightness`. Accepted frames are written
    to `output_path` exactly as received.

    Returns:
        True when the file was written, False when the frame was rejected or
        any exception occurred (the exception is printed).
    """
    try:
        reply = requests.get(url, timeout=10)

        # Cheap checks first: status, then payload size.
        if reply.status_code != 200 or len(reply.content) < min_file_size:
            return False

        picture = Image.open(BytesIO(reply.content))

        frame_width, _frame_height = picture.size
        if frame_width < min_width:
            return False

        # Mean of the grayscale pixels serves as the brightness measure.
        mean_luma = np.mean(np.array(picture.convert('L')))
        if mean_luma < min_brightness:
            return False

        with open(output_path, "wb") as sink:
            sink.write(reply.content)

    except Exception as e:
        print(f"Ошибка при обработке {url}: {e}")
        return False

    return True

def process_movie(movie_id, movie_data, output_dir, api_key, max_frames=30):
    """Download the frames of one movie into `<output_dir>/<movie_id>/`.

    URLs come from TMDB first; alternate sources are asked only for the
    shortfall. Files are named `frame_NNN.jpg` by URL position, and a file
    that already exists counts as downloaded without a new request.

    Returns:
        A dict with the counters "downloaded", "filtered" and "errors"
        ("errors" is never incremented here).
    """
    target_dir = os.path.join(output_dir, str(movie_id))
    os.makedirs(target_dir, exist_ok=True)

    stats = dict(downloaded=0, filtered=0, errors=0)

    primary_urls = get_movie_frames_from_tmdb(movie_id, api_key, max_frames)
    shortfall = max_frames - len(primary_urls)
    fallback_urls = search_alternate_sources(movie_data, shortfall) if shortfall > 0 else []

    for index, frame_url in enumerate(primary_urls + fallback_urls):
        destination = os.path.join(target_dir, f"frame_{index:03d}.jpg")

        if not os.path.exists(destination):
            # Short random pause between downloads.
            time.sleep(random.uniform(0.2, 0.5))
            accepted = download_and_validate_frame(
                frame_url,
                destination,
                min_width=MIN_WIDTH,
                min_brightness=MIN_BRIGHTNESS,
                min_file_size=MIN_FILE_SIZE,
            )
            outcome = "downloaded" if accepted else "filtered"
        else:
            outcome = "downloaded"

        stats[outcome] += 1

    return stats

def process_all_movies():
    """Run `process_movie` for every entry of `movies_metadata` in a thread pool.

    Returns:
        A tuple `(total_stats, movie_frame_counts)`: the summed counters
        (including how many movies have at least one frame on disk) and a
        mapping of movie id to the number of image files in its directory.
        A movie whose task raised is counted under "errors" and gets no entry
        in the mapping.
    """
    image_suffixes = ('.jpg', '.jpeg', '.png')
    total_stats = dict.fromkeys(("downloaded", "filtered", "errors", "movies_with_frames"), 0)
    movie_frame_counts = {}

    with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_THREADS) as executor:
        # Remember which movie each future belongs to.
        future_to_movie = {}
        for movie_id, movie_data in movies_metadata.items():
            job = executor.submit(
                process_movie,
                movie_id,
                movie_data,
                OUTPUT_DIR,
                TMDB_API_KEY,
                MAX_FRAMES_PER_MOVIE,
            )
            future_to_movie[job] = (movie_id, movie_data)

        finished = concurrent.futures.as_completed(future_to_movie)
        for future in tqdm(finished, total=len(movies_metadata), desc="Обработка фильмов"):
            movie_id, movie_data = future_to_movie[future]

            try:
                stats = future.result()

                for counter in ("downloaded", "filtered", "errors"):
                    total_stats[counter] += stats[counter]

                movie_dir = os.path.join(OUTPUT_DIR, str(movie_id))
                if not os.path.exists(movie_dir):
                    continue

                # Count what is actually on disk rather than trusting the counters.
                frame_count = sum(1 for name in os.listdir(movie_dir) if name.endswith(image_suffixes))
                movie_frame_counts[movie_id] = frame_count

                if frame_count > 0:
                    total_stats["movies_with_frames"] += 1
                    print(f"Скачано {frame_count} кадров для фильма {movie_data.get('title')}")

            except Exception as e:
                print(f"Ошибка при обработке фильма {movie_id}: {e}")
                total_stats["errors"] += 1

    return total_stats, movie_frame_counts

# Run the download stage for every movie in the metadata.
print("Начинаем скачивание кадров из фильмов...")
total_stats, movie_frame_counts = process_all_movies()

# Attach the on-disk frame count to each movie; movies without an entry get 0.
for movie_id in movies_metadata:
    movies_metadata[movie_id].update(frame_count=movie_frame_counts.get(movie_id, 0))

# The enriched metadata is written next to the input file.
updated_metadata_file = os.path.join(os.path.dirname(INPUT_FILE), "movies_with_frames.json")
with open(updated_metadata_file, "w", encoding="utf-8") as f:
    json.dump(movies_metadata, f, ensure_ascii=False, indent=4)

# Overall counters.
print("\nСтатистика скачивания:")
print(f"Всего скачано кадров: {total_stats['downloaded']}")
print(f"Отфильтровано низкокачественных кадров: {total_stats['filtered']}")
print(f"Ошибок при скачивании: {total_stats['errors']}")
print(f"Фильмов с кадрами: {total_stats['movies_with_frames']} из {len(movies_metadata)}")

# Ten movies with the most frames, highest count first.
print("\nФильмы с наибольшим количеством кадров:")
top_movies = sorted(movie_frame_counts.items(), key=lambda item: item[1], reverse=True)[:10]

for movie_id, count in top_movies:
    if movie_id not in movies_metadata:
        continue
    print(f"{movies_metadata[movie_id]['title']}: {count} кадров")

def visualize_random_frames(num_movies=3, frames_per_movie=3):
    """Show a grid of random frames: one row per randomly chosen movie.

    Only movies with at least `frames_per_movie` counted frames are eligible.
    When fewer than `num_movies` such movies exist, a message is printed and
    nothing is drawn.

    Args:
        num_movies: Number of rows (movies) in the grid.
        frames_per_movie: Number of columns (frames per movie).
    """
    import matplotlib.pyplot as plt

    movies_with_frames = [movie_id for movie_id, count in movie_frame_counts.items() if count >= frames_per_movie]

    if not movies_with_frames or len(movies_with_frames) < num_movies:
        print("Недостаточно фильмов с кадрами для визуализации")
        return

    sampled_movies = random.sample(movies_with_frames, num_movies)

    # squeeze=False always yields a 2-D array of axes. Without it a grid with a
    # single row or a single column comes back 1-D (or as a bare Axes for 1x1),
    # and the axes[i, j] indexing used before failed for frames_per_movie == 1.
    fig, axes = plt.subplots(
        num_movies,
        frames_per_movie,
        figsize=(15, 4 * num_movies),
        squeeze=False,
    )

    for i, movie_id in enumerate(sampled_movies):
        movie_dir = os.path.join(OUTPUT_DIR, str(movie_id))
        frame_files = [f for f in os.listdir(movie_dir) if f.endswith(('.jpg', '.jpeg', '.png'))]

        if len(frame_files) < frames_per_movie:
            # The directory changed since counting; hide the unused row.
            for ax in axes[i]:
                ax.axis('off')
            continue

        sampled_frames = random.sample(frame_files, frames_per_movie)

        for j, frame_file in enumerate(sampled_frames):
            frame_path = os.path.join(movie_dir, frame_file)
            img = Image.open(frame_path)

            ax = axes[i, j]
            ax.imshow(img)
            ax.set_title(f"{movies_metadata[movie_id]['title']}\n{frame_file}")
            ax.axis('off')

    plt.tight_layout()
    plt.show()

# Draw the preview only when at least three movies ended up with frames,
# matching the num_movies=3 requested below.
if total_stats["movies_with_frames"] >= 3:
    try:
        visualize_random_frames(num_movies=3, frames_per_movie=3)
    except Exception as e:
        # A plotting failure is reported but does not abort the cell.
        print(f"Ошибка при визуализации кадров: {e}")

### 4. Сбор кадров из дополнительных источников

In [None]:
import os
import json
import time
import random
import requests
from PIL import Image
import numpy as np
from io import BytesIO
from tqdm.notebook import tqdm
import concurrent.futures
import re
from bs4 import BeautifulSoup
from urllib.parse import quote, urljoin
from difflib import SequenceMatcher 

# Configuration of the additional-sources stage.
INPUT_FILE = "movie_data/movies_with_frames.json"  # metadata file path for this stage
OUTPUT_DIR = "movie_data/frames"  # root directory; frames live in one sub-directory per movie id
MAX_FRAMES_PER_MOVIE = 100  # a movie that already has this many frames is skipped
MIN_WIDTH = 500  # frames narrower than this are rejected
MIN_BRIGHTNESS = 30  # frames whose mean grayscale value is below this are rejected
MIN_FILE_SIZE = 20000  # bytes
MAX_THREADS = 5
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
# Read the key from the environment when it is set there, so the secret does
# not have to be stored in the notebook; the placeholder stays as the fallback.
TMDB_API_KEY = os.environ.get("TMDB_API_KEY", "<TMDB API KEY>")

os.makedirs(OUTPUT_DIR, exist_ok=True)

def similar(a, b, threshold=0.6):
    """Decide whether two titles refer to the same thing.

    Both strings are lower-cased, stripped, and relieved of a leading English
    article ("the ", "a ", "an "). They are considered similar when the
    `SequenceMatcher` ratio reaches `threshold` or one non-empty title is
    contained in the other.

    Args:
        a: First title; None is treated as an empty string.
        b: Second title; None is treated as an empty string.
        threshold: Minimum similarity ratio in [0, 1].

    Returns:
        True when the titles are considered similar, otherwise False.
    """
    def _normalise(title):
        title = (title or "").lower().strip()
        for prefix in ("the ", "a ", "an "):
            if title.startswith(prefix):
                title = title[len(prefix):]
        return title

    a = _normalise(a)
    b = _normalise(b)

    similarity = SequenceMatcher(None, a, b).ratio()

    # The empty string is a substring of every string, so an empty title used
    # to match anything; containment only counts between non-empty titles.
    contains = bool(a) and bool(b) and (a in b or b in a)

    return similarity >= threshold or contains

def get_imdb_id(tmdb_id, api_key=None):
    """Look up the IMDb id of a TMDB movie through the `external_ids` endpoint.

    Returns:
        The value of "imdb_id" from the response, or None when no API key is
        given, the status is not 200, or the request raised (the exception is
        printed).
    """
    if api_key:
        endpoint = f"https://api.themoviedb.org/3/movie/{tmdb_id}/external_ids?api_key={api_key}"

        try:
            reply = requests.get(endpoint, timeout=10)
            if reply.status_code == 200:
                return reply.json().get("imdb_id")
        except Exception as e:
            print(f"Ошибка при получении IMDB ID: {e}")

    return None

def load_movie_metadata(file_path):
    """Read the movie metadata JSON from `file_path`.

    Returns:
        The parsed JSON content, or an empty dict when reading or parsing
        fails (the exception is printed).
    """
    print(f"Загрузка метаданных из файла {file_path}")

    try:
        with open(file_path, "r", encoding="utf-8") as handle:
            metadata = json.load(handle)
        print(f"Загружены метаданные для {len(metadata)} фильмов")
    except Exception as e:
        print(f"Ошибка при загрузке метаданных: {e}")
        return {}

    return metadata

def get_frames_from_movie_stills_db(movie_title, year, imdb_id=None, max_frames=15):
    """Scrape still image URLs for a movie from moviestillsdb.com.

    Candidate page URLs are tried in order: an IMDb-id based URL (when the id
    starts with "tt"), "<title>-<year>", "<title>", and finally the site
    search. The first candidate that yields at least one image URL wins.

    Args:
        movie_title: Title used to build the slug and the search query.
        year: Release year appended to the slug and matched in search results.
        imdb_id: Optional IMDb id.
        max_frames: Maximum number of thumbnail links followed per page.

    Returns:
        A list of image URLs; empty when nothing was found. Exceptions are
        printed per candidate URL and do not propagate.
    """
    frame_urls = []
    
    base_url = "https://www.moviestillsdb.com"
    
    # (url, flag) pairs; the flag is unpacked as `is_reliable` below but never read.
    urls_to_try = []
    
    if imdb_id and imdb_id.startswith('tt'):
        urls_to_try.append((f"{base_url}/movies/i{imdb_id}", True))
    
    # Slug: lower-case, spaces to hyphens, everything except word characters and hyphens removed.
    formatted_title = movie_title.lower().replace(' ', '-')
    formatted_title = re.sub(r'[^\w\-]', '', formatted_title)
    
    urls_to_try.append((f"{base_url}/movies/{formatted_title}-{year}", True))
    urls_to_try.append((f"{base_url}/movies/{formatted_title}", False))
    
    search_url = f"{base_url}/search/?query={quote(movie_title)}"
    urls_to_try.append((search_url, False))
    
    headers = {"User-Agent": USER_AGENT}
    
    for url, is_reliable in urls_to_try:
        try:
            response = requests.get(url, headers=headers, timeout=15)
            
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, 'html.parser')
                
                # The search page needs one extra hop: pick the matching result
                # and load its page before looking for stills.
                if url == search_url:
                    search_results = soup.select('.grid-item > a')
                    movie_url = None
                    
                    for result in search_results:
                        result_text = result.get_text(strip=True)
                        
                        # A result qualifies when its text is similar to the title AND contains the year.
                        if similar(result_text, movie_title, 0.6) and str(year) in result_text:
                            movie_url = urljoin(base_url, result['href'])
                            print(f"MovieStillsDB: Нашел совпадение для {movie_title} ({year}): {result_text}")
                            break
                    
                    if movie_url:
                        response = requests.get(movie_url, headers=headers, timeout=15)
                        if response.status_code == 200:
                            soup = BeautifulSoup(response.text, 'html.parser')
                        else:
                            continue
                    else:
                        continue
                
                # Same selector as for search results: on a movie page these links lead to the individual stills.
                image_links = soup.select('.grid-item > a')
                
                # One additional request per thumbnail to find the full-size image.
                for link in image_links[:max_frames]:
                    if 'href' in link.attrs:
                        still_url = urljoin(base_url, link['href'])
                        still_response = requests.get(still_url, headers=headers, timeout=15)
                        
                        if still_response.status_code == 200:
                            still_soup = BeautifulSoup(still_response.text, 'html.parser')
                            
                            img_tag = still_soup.select_one('.largepic img')
                            if img_tag and 'src' in img_tag.attrs:
                                img_url = img_tag['src']
                                # Relative image paths are resolved against the site root.
                                if img_url.startswith('http'):
                                    frame_urls.append(img_url)
                                else:
                                    frame_urls.append(urljoin(base_url, img_url))
                
                # Stop at the first candidate URL that produced any frames.
                if frame_urls:
                    print(f"MovieStillsDB: Нашел {len(frame_urls)} кадров для {movie_title} ({year})")
                    break
        
        except Exception as e:
            print(f"Ошибка при получении кадров из MovieStillsDB для {movie_title} ({year}): {e}")
    
    return frame_urls

def get_frames_from_film_grab(movie_title, year, max_frames=15):
    """Scrape frame URLs for a movie from film-grab.com.

    The site search is queried with the title and a result is chosen in three
    passes: (1) similarity >= 0.8 and the year present in the result title,
    (2) similarity >= 0.7, (3) the first search result, unless its title names
    a year more than 5 years away from `year`.

    Args:
        movie_title: Title used for the search query and for matching.
        year: Release year (string or int) used for matching.
        max_frames: Maximum number of gallery images inspected.

    Returns:
        A list of absolute image URLs; empty when nothing suitable was found.
        Exceptions are printed and do not propagate.
    """
    frame_urls = []

    search_url = f"https://film-grab.com/?s={quote(movie_title)}"

    headers = {"User-Agent": USER_AGENT}

    try:
        response = requests.get(search_url, headers=headers, timeout=10)

        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')

            search_results = soup.select('article.post')

            film_url = None
            best_match_title = None
            found_exact_match = False

            # Pass 1: strong title match that also names the right year.
            for result in search_results:
                title_elem = result.select_one('h2.entry-title a')
                if not title_elem:
                    continue

                result_title = title_elem.text.strip()

                if similar(result_title, movie_title, 0.8) and str(year) in result_title:
                    film_url = title_elem['href']
                    best_match_title = result_title
                    found_exact_match = True
                    print(f"Film-Grab: Нашел хорошее совпадение для {movie_title} ({year}): {result_title}")
                    break

            # Pass 2: weaker title match. Later matches replace earlier ones
            # until one that names the year is found.
            if not found_exact_match:
                for result in search_results:
                    title_elem = result.select_one('h2.entry-title a')
                    if not title_elem:
                        continue

                    result_title = title_elem.text.strip()

                    if similar(result_title, movie_title, 0.7):
                        film_url = title_elem['href']
                        best_match_title = result_title
                        print(f"Film-Grab: Нашел возможное совпадение для {movie_title} ({year}): {result_title}")

                        if str(year) in result_title:
                            break

            # Pass 3: fall back to the first result, guarded by a year tolerance.
            if not film_url and search_results:
                title_elem = search_results[0].select_one('h2.entry-title a')
                if title_elem:
                    result_title = title_elem.text.strip()
                    if str(year) not in result_title:
                        # Non-capturing group: with a capturing group findall()
                        # returns only "19"/"20", which made the tolerance
                        # check below reject every result with another year.
                        found_years = re.findall(r'\b(?:19|20)\d{2}\b', result_title)
                        if found_years:
                            try:
                                wanted_year = int(year)
                            except (TypeError, ValueError):
                                # Year unknown: the result cannot be verified.
                                wanted_year = None

                            if wanted_year is None or abs(int(found_years[0]) - wanted_year) > 5:
                                print(f"Film-Grab: Пропуск результата для {movie_title} ({year}) из-за несоответствия года: {result_title}")
                                return frame_urls

                    film_url = title_elem['href']
                    best_match_title = result_title
                    print(f"Film-Grab: Использую первый результат для {movie_title} ({year}): {result_title}")

            if film_url:
                film_response = requests.get(film_url, headers=headers, timeout=10)

                if film_response.status_code == 200:
                    film_soup = BeautifulSoup(film_response.text, 'html.parser')

                    gallery_images = film_soup.select('div.entry-content img')

                    # Only absolute URLs are kept.
                    for img in gallery_images[:max_frames]:
                        if 'src' in img.attrs:
                            img_url = img['src']
                            if img_url.startswith('http'):
                                frame_urls.append(img_url)

                    if frame_urls:
                        print(f"Film-Grab: Получено {len(frame_urls)} кадров для {best_match_title}")

    except Exception as e:
        print(f"Ошибка при получении кадров из Film-Grab для {movie_title} ({year}): {e}")

    return frame_urls

def get_frames_from_imdb(imdb_id, max_frames=15):
    """Collect image URLs from the IMDb media index page of a title.

    Thumbnail URLs containing `_V1_` are rewritten to end in `_V1_.jpg`,
    dropping whatever followed the marker.

    Returns:
        A list of image URLs; empty when `imdb_id` is falsy, the page could
        not be fetched, or an exception occurred (it is printed).
    """
    frame_urls = []

    if not imdb_id:
        return frame_urls

    gallery_url = f"https://www.imdb.com/title/{imdb_id}/mediaindex"
    request_headers = {"User-Agent": USER_AGENT}

    try:
        page = requests.get(gallery_url, headers=request_headers, timeout=10)

        if page.status_code == 200:
            thumbnails = BeautifulSoup(page.text, 'html.parser').select('div.media_index_thumb_list a img')

            for thumb in thumbnails[:max_frames]:
                if 'src' not in thumb.attrs:
                    continue

                source = thumb['src']
                if '_V1_' in source:
                    # Keep everything before the marker and append a plain suffix.
                    source = f"{source.split('_V1_')[0]}_V1_.jpg"

                frame_urls.append(source)

            if frame_urls:
                print(f"IMDB: Получено {len(frame_urls)} кадров для фильма с ID {imdb_id}")

    except Exception as e:
        print(f"Ошибка при получении кадров из IMDB для фильма {imdb_id}: {e}")

    return frame_urls

def download_and_validate_frame(url, output_path, min_width=500, min_brightness=30, min_file_size=20000):
    """Fetch an image with a browser User-Agent and save it if it passes the checks.

    This definition replaces the earlier function of the same name in the
    notebook; the only difference is the User-Agent header on the request.
    A frame is rejected on a non-200 status, a payload below `min_file_size`
    bytes, a width below `min_width`, or a mean grayscale value below
    `min_brightness`.

    Returns:
        True when the raw bytes were written to `output_path`, otherwise
        False (exceptions are printed and reported as False).
    """
    try:
        reply = requests.get(url, headers={"User-Agent": USER_AGENT}, timeout=10)

        if reply.status_code != 200:
            return False

        raw_bytes = reply.content
        if len(raw_bytes) < min_file_size:
            return False

        picture = Image.open(BytesIO(raw_bytes))

        frame_width, _frame_height = picture.size
        if frame_width < min_width:
            return False

        # Brightness is the mean of the grayscale version of the image.
        grayscale = np.array(picture.convert('L'))
        if np.mean(grayscale) < min_brightness:
            return False

        with open(output_path, "wb") as sink:
            sink.write(raw_bytes)

        return True

    except Exception as e:
        print(f"Ошибка при обработке {url}: {e}")
        return False

def get_next_frame_number(movie_dir):
    """Return the index to use for the next `frame_NNN` file in `movie_dir`.

    Only files ending in .jpg, .jpeg or .png are inspected. The result is one
    more than the highest number found after `frame_`, or 0 when no such file
    exists.
    """
    numbered = (
        re.search(r'frame_(\d+)', name)
        for name in os.listdir(movie_dir)
        if name.endswith(('.jpg', '.jpeg', '.png'))
    )
    indices = [int(found.group(1)) for found in numbered if found]

    return max(indices) + 1 if indices else 0

def process_movie_additional_frames(movie_id, movie_data, output_dir, sources_limit=3):
    """Top up a movie's frame directory from MovieStillsDB, Film-Grab and IMDb.

    The sources are tried in random order (at most `sources_limit` of them)
    until the directory holds `MAX_FRAMES_PER_MOVIE` frames. New files
    continue the existing `frame_NNN.jpg` numbering.

    Args:
        movie_id: TMDB movie id; also the name of the movie's sub-directory.
        movie_data: Metadata dict; "original_title", "year" and "imdb_id" are read.
        output_dir: Root directory of all frame sub-directories.
        sources_limit: Maximum number of sources to try.

    Returns:
        A dict with the counters "downloaded", "filtered" and "errors"
        ("errors" is never incremented here) and the list "sources_used" of
        source names that returned at least one URL.
    """
    movie_dir = os.path.join(output_dir, str(movie_id))
    os.makedirs(movie_dir, exist_ok=True)

    stats = {
        "downloaded": 0,
        "filtered": 0,
        "errors": 0,
        "sources_used": []
    }

    original_title = movie_data.get("original_title", "")
    year = movie_data.get("year", "")
    imdb_id = movie_data.get("imdb_id", "")

    # Resolve the IMDb id through TMDB when the metadata does not carry one.
    if not imdb_id and TMDB_API_KEY:
        imdb_id = get_imdb_id(movie_id, TMDB_API_KEY)

    existing_frames_count = len([f for f in os.listdir(movie_dir) if f.endswith(('.jpg', '.jpeg', '.png'))])

    if existing_frames_count >= MAX_FRAMES_PER_MOVIE:
        return stats

    # Remaining budget and next free index are tracked across ALL sources.
    # Previously the index restarted at the same value for every source, so a
    # later source overwrote the files saved from an earlier one, and the
    # cumulative download counter was compared against a per-source budget.
    frames_needed = MAX_FRAMES_PER_MOVIE - existing_frames_count
    next_frame_num = get_next_frame_number(movie_dir)

    # Each fetcher receives the budget that is still open when it is called.
    sources = [
        ("MovieStillsDB", lambda limit: get_frames_from_movie_stills_db(original_title, year, imdb_id, limit)),
        ("FilmGrab", lambda limit: get_frames_from_film_grab(original_title, year, limit)),
        ("IMDB", lambda limit: get_frames_from_imdb(imdb_id, limit)),
    ]

    random.shuffle(sources)

    for source_name, fetch in sources[:sources_limit]:
        if frames_needed <= 0:
            break

        if source_name == "IMDB" and not imdb_id:
            continue

        frame_urls = fetch(frames_needed)

        if frame_urls:
            stats["sources_used"].append(source_name)

        for url in frame_urls:
            if frames_needed <= 0:
                break

            output_path = os.path.join(movie_dir, f"frame_{next_frame_num:03d}.jpg")

            # Short random pause between downloads.
            time.sleep(random.uniform(0.2, 0.5))

            if download_and_validate_frame(
                url,
                output_path,
                min_width=MIN_WIDTH,
                min_brightness=MIN_BRIGHTNESS,
                min_file_size=MIN_FILE_SIZE
            ):
                stats["downloaded"] += 1
                frames_needed -= 1
                # Advance only after a successful save, so numbering has no gaps.
                next_frame_num += 1
            else:
                stats["filtered"] += 1

    return stats

def process_all_movies_additional_frames(metadata_file, output_dir, max_threads=5, limit_movies=None):
    """Collect additional frames for the movies in a metadata file, in parallel.

    Each movie is handed to ``process_movie_additional_frames`` on a thread
    pool. The per-movie statistics are summed, every movie whose directory
    exists gets its ``frame_count`` refreshed from disk, and the resulting
    metadata is written to ``movies_with_additional_frames.json`` in the same
    directory as ``metadata_file``.

    Args:
        metadata_file: Path passed to ``load_movie_metadata``.
        output_dir: Root directory; a movie's frames are counted in
            ``<output_dir>/<movie_id>``.
        max_threads: Number of worker threads for the executor.
        limit_movies: When truthy and smaller than the number of loaded
            movies, only a randomly chosen subset of that size is processed.

    Returns:
        A ``(total_stats, movies_metadata)`` tuple. ``total_stats`` holds the
        summed ``downloaded`` / ``filtered`` / ``errors`` counters and a list
        of the source names that were used; ``movies_metadata`` is the
        (possibly limited) metadata mapping with updated ``frame_count``.
    """
    movies_metadata = load_movie_metadata(metadata_file)

    # Optionally restrict the run to a random subset of the loaded movies.
    if limit_movies and limit_movies < len(movies_metadata):
        shuffled_ids = list(movies_metadata.keys())
        random.shuffle(shuffled_ids)
        movies_metadata = {
            chosen_id: movies_metadata[chosen_id]
            for chosen_id in shuffled_ids[:limit_movies]
        }
        print(f"Обработка ограничена {limit_movies} фильмами")

    total_stats = {"downloaded": 0, "filtered": 0, "errors": 0, "sources_used": set()}
    image_suffixes = ('.jpg', '.jpeg', '.png')

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as pool:
        # Map each submitted job back to the movie it belongs to.
        pending = {}
        for movie_id, movie_data in movies_metadata.items():
            job = pool.submit(
                process_movie_additional_frames,
                movie_id,
                movie_data,
                output_dir,
            )
            pending[job] = (movie_id, movie_data)

        finished_jobs = tqdm(
            concurrent.futures.as_completed(pending),
            total=len(movies_metadata),
            desc="Дополнительные кадры",
        )
        for job in finished_jobs:
            movie_id, movie_data = pending[job]

            try:
                movie_stats = job.result()

                # Fold this movie's counters into the running totals.
                for counter in ("downloaded", "filtered", "errors"):
                    total_stats[counter] += movie_stats[counter]
                total_stats["sources_used"].update(movie_stats["sources_used"])

                movie_dir = os.path.join(output_dir, str(movie_id))
                if not os.path.exists(movie_dir):
                    continue

                # Recount from disk so frame_count reflects what is actually stored.
                movies_metadata[movie_id]["frame_count"] = sum(
                    1 for entry in os.listdir(movie_dir) if entry.endswith(image_suffixes)
                )

                if movie_stats["downloaded"] > 0:
                    print(f"Добавлено {movie_stats['downloaded']} кадров для фильма {movie_data.get('title')} ({movie_id}) из источников: {', '.join(movie_stats['sources_used'])}")

            except Exception as e:
                # A failure for one movie is reported and counted, not propagated.
                print(f"Ошибка при обработке фильма {movie_id}: {e}")
                total_stats["errors"] += 1

    updated_metadata_file = os.path.join(
        os.path.dirname(metadata_file),
        "movies_with_additional_frames.json",
    )
    with open(updated_metadata_file, "w", encoding="utf-8") as out_file:
        json.dump(movies_metadata, out_file, ensure_ascii=False, indent=4)

    # Convert the set to a list for the returned stats.
    total_stats["sources_used"] = list(total_stats["sources_used"])

    return total_stats, movies_metadata


# Cap on how many movies this run processes (a random subset is chosen).
limit_movies = 10

print("Запуск сбора дополнительных кадров из альтернативных источников...")
print(f"Максимальное количество кадров для каждого фильма: {MAX_FRAMES_PER_MOVIE}")
print(f"Количество параллельных потоков: {MAX_THREADS}")

# Run the parallel collection; returns aggregated stats and the updated metadata.
total_stats, updated_metadata = process_all_movies_additional_frames(
    metadata_file=INPUT_FILE,
    output_dir=OUTPUT_DIR,
    max_threads=MAX_THREADS,
    limit_movies=limit_movies,
)

# Summary of the run.
print("\nСтатистика дополнительных кадров:")
print(f"Всего скачано новых кадров: {total_stats['downloaded']}")
print(f"Отфильтровано низкокачественных кадров: {total_stats['filtered']}")
print(f"Ошибок при скачивании: {total_stats['errors']}")
print(f"Использованы источники: {', '.join(total_stats['sources_used'])}")

print("\nФильмы с наибольшим количеством кадров:")
# (movie_id, frame_count) pairs for every movie that has a recorded count,
# ordered by count descending and trimmed to the ten largest.
top_movies = [
    (mid, meta["frame_count"])
    for mid, meta in updated_metadata.items()
    if "frame_count" in meta
]
top_movies.sort(key=lambda pair: pair[1], reverse=True)
top_movies = top_movies[:10]

for movie_id, count in top_movies:
    if movie_id not in updated_metadata:
        continue
    print(f"{updated_metadata[movie_id]['title']}: {count} кадров")

Запуск сбора дополнительных кадров из альтернативных источников...
Максимальное количество кадров для каждого фильма: 100
Количество параллельных потоков: 5
Загрузка метаданных из файла movie_data/movies_with_frames.json
Загружены метаданные для 500 фильмов
Обработка ограничена 10 фильмами


Дополнительные кадры:   0%|          | 0/10 [00:00<?, ?it/s]

Ошибка при получении IMDB ID: HTTPSConnectionPool(host='api.themoviedb.org', port=443): Max retries exceeded with url: /3/movie/1297763/external_ids?api_key=64cff5eb8f8dc04937d31db59bc01d80 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f4abc4a1fc0>: Failed to establish a new connection: [Errno 111] Connection refused'))
Ошибка при получении IMDB ID: HTTPSConnectionPool(host='api.themoviedb.org', port=443): Max retries exceeded with url: /3/movie/11688/external_ids?api_key=64cff5eb8f8dc04937d31db59bc01d80 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f4abc4a3d00>: Failed to establish a new connection: [Errno 111] Connection refused'))
Ошибка при получении IMDB ID: HTTPSConnectionPool(host='api.themoviedb.org', port=443): Max retries exceeded with url: /3/movie/234121/external_ids?api_key=64cff5eb8f8dc04937d31db59bc01d80 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f4abc4a1720>: Failed 