This script finds GoPro video files in an AWS S3 bucket and, for each drop, concatenates them into a single video named after the "DropID" component of the S3 key.

# Requirements

In [None]:
import getpass
import hashlib
import logging
import os
import subprocess
import threading
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass
from functools import partial
from pathlib import Path
from typing import Iterator, List, Optional, Tuple, Union

import boto3
import pandas as pd
from botocore.exceptions import ClientError

# Root logging setup: INFO and above, every record stamped with the time,
# the emitting logger's name and the level.
_LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)

# Module-level logger used by every class and function in this file.
logger = logging.getLogger(__name__)

@dataclass
class AWSCredentials:
    """Access key pair used to authenticate against AWS.

    The generated dataclass ``__repr__`` would print the secret key in plain
    text (notebook cell output, log lines, tracebacks), so it is replaced by
    one that masks the secret. Equality and construction are unchanged.
    """
    access_key_id: str
    secret_access_key: str

    def __repr__(self) -> str:
        """Return a representation that never reveals the secret key."""
        return (
            f"{type(self).__name__}(access_key_id={self.access_key_id!r}, "
            "secret_access_key='***')"
        )

    @classmethod
    def from_user_input(cls) -> 'AWSCredentials':
        """Securely prompt user for AWS credentials.

        Input is read with ``getpass`` so it is not echoed. Surrounding
        whitespace (common when pasting) is removed because AWS keys never
        contain it.
        """
        access_key = getpass.getpass("Enter AWS Access Key ID: ").strip()
        secret_key = getpass.getpass("Enter AWS Secret Access Key: ").strip()
        return cls(access_key, secret_key)

class S3Client:
    """Small wrapper around a boto3 S3 client.

    Adds credential discovery (explicit object, environment variables, or an
    interactive prompt), filtered object listing and a size-checked download
    with retries. The underlying boto3 client is exposed as ``self.client``.
    """

    def __init__(self, credentials: Optional["AWSCredentials"] = None, max_connections: int = 10):
        """Create the client.

        Args:
            credentials: Key pair to use; when None they are read from the
                environment or prompted for.
            max_connections: Size of the HTTP connection pool.
        """
        self.max_connections = max_connections
        self.client = self._initialize_client(credentials)

    def _initialize_client(self, credentials: Optional["AWSCredentials"]) -> "botocore.client.BaseClient":
        """Initialize S3 client with credentials from env vars, provided credentials, or user input.

        Returns:
            An authenticated boto3 S3 client.

        Raises:
            ClientError: If the test call fails for a reason other than an
                invalid key or signature (those trigger a new prompt).
        """
        session_token = None
        if credentials is None:
            # Try environment variables first
            access_key = os.getenv("AWS_ACCESS_KEY_ID")
            secret_key = os.getenv("AWS_SECRET_ACCESS_KEY")

            if not access_key or not secret_key:
                logger.info("AWS credentials not found in environment variables. Please enter them manually.")
                credentials = AWSCredentials.from_user_input()
            else:
                credentials = AWSCredentials(access_key, secret_key)
                # Temporary (STS/SSO) keys are only valid together with their
                # session token; without it every request is rejected.
                session_token = os.getenv("AWS_SESSION_TOKEN") or None

        try:
            # Configure for better performance and reliability
            config = boto3.session.Config(
                max_pool_connections=self.max_connections,
                retries={
                    'max_attempts': 5,
                    'mode': 'adaptive'
                },
                connect_timeout=120,
                read_timeout=300,
                # Add TCP keepalive
                tcp_keepalive=True
            )

            client = boto3.client(
                "s3",
                aws_access_key_id=credentials.access_key_id,
                aws_secret_access_key=credentials.secret_access_key,
                aws_session_token=session_token,
                config=config
            )
            # Test the credentials by making a simple API call
            client.list_buckets()
            logger.info("Successfully authenticated with AWS")
            return client
        except ClientError as e:
            logger.error("Failed to authenticate with AWS")
            # Use the structured error code rather than searching the message text.
            error_code = e.response.get("Error", {}).get("Code", "")
            if error_code in ("InvalidAccessKeyId", "SignatureDoesNotMatch"):
                logger.error("Invalid credentials provided. Please try again.")
                credentials = AWSCredentials.from_user_input()
                return self._initialize_client(credentials)
            raise

    def list_objects(
        self,
        bucket: str,
        prefix: Union[str, List[str]] = "",
        suffix: Union[str, Tuple[str, ...]] = "",
    ) -> Iterator[dict]:
        """List objects in an S3 bucket with optional prefix and suffix filtering.

        Args:
            bucket: Bucket name.
            prefix: A single key prefix, or several prefixes that are listed
                one after another.
            suffix: A suffix, or a tuple of suffixes, the key must end with
                (passed straight to ``str.endswith``).

        Yields:
            The object dictionaries returned by ``list_objects_v2``.

        Raises:
            ClientError: If a listing request fails.
        """
        paginator = self.client.get_paginator("list_objects_v2")
        prefixes = [prefix] if isinstance(prefix, str) else prefix

        for prefix_item in prefixes:
            try:
                for page in paginator.paginate(Bucket=bucket, Prefix=prefix_item):
                    # Pages without matches carry no "Contents" entry.
                    for obj in page.get("Contents", []):
                        if obj["Key"].endswith(suffix):
                            yield obj
            except ClientError as e:
                logger.error(f"Error listing objects: {e}")
                raise

    def download_file_with_retry(self, bucket: str, key: str, filename: Path, max_retries: int = 3) -> bool:
        """Download a file from S3 with retry logic and integrity checking.

        The local file size is compared with the object's ``ContentLength``;
        a mismatching or failed download is deleted and attempted again.

        Args:
            bucket: Bucket name.
            key: Object key.
            filename: Local destination path.
            max_retries: Maximum number of attempts.

        Returns:
            True when a file of the expected size is on disk, else False.
        """
        for attempt in range(max_retries):
            try:
                logger.info(f"Downloading {key} (attempt {attempt + 1}/{max_retries})")

                # Get object metadata first to check expected size
                response = self.client.head_object(Bucket=bucket, Key=key)
                expected_size = response['ContentLength']

                # Download the file
                self.client.download_file(
                    Bucket=bucket,
                    Key=key,
                    Filename=str(filename)
                )

                # Verify the download
                if filename.exists():
                    actual_size = filename.stat().st_size
                    if actual_size == expected_size:
                        logger.info(f"‚úÖ Successfully downloaded {key} ({actual_size/1024/1024:.2f} MB)")
                        return True
                    else:
                        logger.warning(f"‚ùå Size mismatch for {key}: expected {expected_size}, got {actual_size}")
                        filename.unlink(missing_ok=True)
                else:
                    logger.warning(f"‚ùå Downloaded file {filename} does not exist")

            except Exception as e:
                logger.error(f"‚ùå Download attempt {attempt + 1} failed for {key}: {str(e)}")
                # Never leave a partial file behind for the next attempt.
                filename.unlink(missing_ok=True)

                if attempt < max_retries - 1:
                    wait_time = 2 ** attempt  # Exponential backoff
                    logger.info(f"Waiting {wait_time} seconds before retry...")
                    time.sleep(wait_time)

        logger.error(f"‚ùå Failed to download {key} after {max_retries} attempts")
        return False

class VideoProcessor:
    """Download the videos of each drop from S3, concatenate them and upload the result.

    Files are staged in ``downloaded_movies/``, concatenated with ffmpeg
    (stream copy, no re-encoding) into ``concatenated_videos/<DropID>.mp4``,
    uploaded to ``<SurveyID>/<DropID>/<DropID>.mp4`` and then removed locally.
    """
    MOVIE_EXTENSIONS = {'.wmv', '.mpg', '.mov', '.avi', '.mp4', '.MOV', '.MP4'}

    def __init__(self, s3_client: "S3Client", bucket: str, max_workers: int = 2):
        """Prepare working directories and locate ffmpeg.

        Args:
            s3_client: Wrapper providing ``download_file_with_retry``,
                ``list_objects`` and the raw boto3 client as ``.client``.
            bucket: Bucket holding the videos.
            max_workers: Threads used for parallel downloads.

        Raises:
            RuntimeError: If no ffmpeg executable can be found.
        """
        self.s3_client = s3_client
        self.bucket = bucket
        self.max_workers = max_workers  # Reduced to prevent connection pool exhaustion
        self.download_dir = Path("downloaded_movies")
        self.output_dir = Path("concatenated_videos")

        # Create necessary directories
        self.download_dir.mkdir(exist_ok=True)
        self.output_dir.mkdir(exist_ok=True)

        # Find and verify ffmpeg
        self.ffmpeg_path = self._find_ffmpeg()
        if not self.ffmpeg_path:
            raise RuntimeError(
                "ffmpeg not found. Please install ffmpeg:\n"
                "1. Download from https://github.com/BtbN/FFmpeg-Builds/releases\n"
                "2. Extract the zip file\n"
                "3. Add the bin folder to your system PATH or place ffmpeg.exe in your working directory"
            )

    def _find_ffmpeg(self) -> Optional[str]:
        """Find ffmpeg executable in various locations.

        Returns:
            ``'ffmpeg'`` when it runs from PATH, otherwise the first existing
            candidate path, or None when nothing is found.
        """
        try:
            # Check if ffmpeg is in PATH
            result = subprocess.run(['ffmpeg', '-version'],
                                    capture_output=True,
                                    check=False)
            if result.returncode == 0:
                return 'ffmpeg'
        except OSError:
            # Not on PATH (FileNotFoundError) or not executable: try known locations.
            pass

        # Check common Windows locations
        possible_paths = [
            Path.cwd() / "ffmpeg.exe",  # Current directory
            Path.cwd() / "bin" / "ffmpeg.exe",  # bin subdirectory
            Path(os.getenv('PROGRAMFILES', '')) / "ffmpeg" / "bin" / "ffmpeg.exe",
            Path(os.getenv('PROGRAMFILES(X86)', '')) / "ffmpeg" / "bin" / "ffmpeg.exe",
        ]

        # Add conda environment path if running in conda
        conda_prefix = os.getenv('CONDA_PREFIX')
        if conda_prefix:
            possible_paths.append(Path(conda_prefix) / "Library" / "bin" / "ffmpeg.exe")

        for path in possible_paths:
            if path.exists():
                logger.info(f"Found ffmpeg at: {path}")
                return str(path)

        return None

    def _ffprobe_executable(self) -> str:
        """Return the ffprobe command matching the ffmpeg that was found.

        When ffmpeg was located at an explicit path (i.e. it is not on PATH),
        ffprobe is looked up next to it; otherwise plain ``'ffprobe'`` is used.
        """
        ffmpeg = Path(self.ffmpeg_path)
        sibling = ffmpeg.with_name(ffmpeg.name.replace('ffmpeg', 'ffprobe'))
        if sibling != ffmpeg and sibling.is_file():
            return str(sibling)
        return 'ffprobe'

    def _local_path_for_key(self, key: str) -> Path:
        """Return the local staging path for an S3 key.

        GoPro file names repeat across drops, so the name is prefixed with a
        short hash of the key's parent "folder". Files of the same drop share
        the prefix (their sort order is unchanged) while files of different
        drops can no longer overwrite each other when drops run in parallel.
        """
        key_path = Path(key)
        namespace = hashlib.sha1(key_path.parent.as_posix().encode('utf-8')).hexdigest()[:8]
        return self.download_dir / f"{namespace}_{key_path.name}"

    @staticmethod
    def _concat_list_entry(path: Path) -> str:
        """Format one ``file '...'`` line for ffmpeg's concat list file."""
        # Use forward slashes for cross-platform compatibility
        normalized = str(path.absolute()).replace(chr(92), '/')
        # Inside a single-quoted ffmpeg string a quote is written as '\''
        escaped = normalized.replace("'", "'\\''")
        return f"file '{escaped}'\n"

    def verify_video_file(self, file_path: Path) -> bool:
        """Verify that a video file exists and has non-zero size."""
        try:
            if not file_path.exists():
                logger.error(f"Video file does not exist: {file_path}")
                return False

            size = file_path.stat().st_size
            if size == 0:
                logger.error(f"Video file is empty: {file_path}")
                return False

            logger.info(f"Verified video file: {file_path} (size: {size/1024/1024:.2f} MB)")
            return True

        except Exception as e:
            logger.error(f"Error verifying video file {file_path}: {str(e)}")
            return False

    def verify_video_file_deep(self, file_path: Path) -> bool:
        """Deep verification: ffprobe must report both format and stream information.

        Returns:
            True when ffprobe exits successfully and its JSON output has the
            ``format`` and ``streams`` sections; False on any failure,
            including a 30 second timeout.
        """
        import json

        try:
            # Try to read video metadata using ffprobe (more lightweight than ffmpeg)
            cmd = [
                self._ffprobe_executable(),
                '-v', 'quiet',
                '-print_format', 'json',
                '-show_format',
                '-show_streams',
                str(file_path)
            ]

            result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
            if result.returncode != 0:
                logger.error(f"ffprobe failed for {file_path}: {result.stderr}")
                return False

            # Check if we got valid JSON output
            try:
                data = json.loads(result.stdout)
            except json.JSONDecodeError:
                logger.error(f"Invalid ffprobe output for {file_path}")
                return False

            if 'format' in data and 'streams' in data:
                logger.info(f"‚úÖ Video file verified: {file_path}")
                return True

            logger.error(f"Invalid video format for {file_path}")
            return False

        except subprocess.TimeoutExpired:
            logger.error(f"ffprobe timeout for {file_path}")
            return False
        except Exception as e:
            logger.error(f"Error verifying video file {file_path}: {str(e)}")
            return False

    def _filter_valid_videos(self, video_paths: List[Path]) -> List[Path]:
        """Return the paths that pass deep verification, preserving order."""
        valid_videos = []
        for path in video_paths:
            if self.verify_video_file_deep(path):
                valid_videos.append(path)
            else:
                logger.error(f"Skipping invalid video: {path}")
        return valid_videos

    def concatenate_videos(self, video_paths: List[Path], output_path: Path, verify_videos: bool) -> bool:
        """Concatenate multiple videos with ffmpeg's concat demuxer (stream copy).

        Args:
            video_paths: Input files, in playback order.
            output_path: Destination file (overwritten if present).
            verify_videos: When True, inputs failing deep verification are
                skipped instead of being passed to ffmpeg.

        Returns:
            True when ffmpeg succeeded and produced a non-empty file; False on
            any failure (no exception is propagated).
        """
        list_file = None
        try:
            if verify_videos:
                # Verify all input files exist and are valid
                logger.info(f"Verifying {len(video_paths)} input videos...")
                valid_videos = self._filter_valid_videos(video_paths)

                if not valid_videos:
                    raise ValueError("No valid videos found to concatenate")

                if len(valid_videos) != len(video_paths):
                    logger.warning(f"Only {len(valid_videos)} out of {len(video_paths)} videos are valid")
            else:
                valid_videos = video_paths

            total_input_size = sum(path.stat().st_size for path in valid_videos)
            logger.info(f"Total input size: {total_input_size/1024/1024:.2f} MB")

            # Create a temporary file list for ffmpeg. The name is derived from
            # the output so drops processed in parallel never share a list file.
            list_file = self.download_dir / f"{output_path.stem}_file_list.txt"
            with open(list_file, 'w', encoding='utf-8') as f:
                f.writelines(self._concat_list_entry(path) for path in valid_videos)

            logger.info(f"Created concat list file at {list_file}")

            # '-safe 0' is required because the list contains absolute paths.
            base_cmd = [self.ffmpeg_path, '-y', '-f', 'concat', '-safe', '0', '-i', str(list_file)]

            logger.info("üêå Using CPU (-c copy) for concatenation.")
            codec_cmd = ['-c', 'copy']

            cmd = base_cmd + codec_cmd + [str(output_path)]

            logger.info(f"Running command: {' '.join(cmd)}")

            start_time = time.time()
            try:
                subprocess.run(
                    cmd, capture_output=True, text=True, check=True, encoding='utf-8', timeout=3600
                )
                elapsed_time = time.time() - start_time
                logger.info(f"‚úÖ Success! Concatenation took {elapsed_time:.2f} seconds.")

                # Verify the output file
                if not output_path.exists() or output_path.stat().st_size == 0:
                    logger.error("Output video file is missing or empty")
                    return False

                output_size = output_path.stat().st_size
                logger.info(f"Output file size: {output_size/1024/1024:.2f} MB")
                return True

            except subprocess.CalledProcessError as e:
                logger.error("‚ùå FFmpeg command failed!")
                logger.error(f"Stderr:\n{e.stderr}")
                return False
            except subprocess.TimeoutExpired:
                logger.error("‚ùå FFmpeg command timed out!")
                return False

        except Exception as e:
            logger.error(f"‚ùå Concatenation failed: {str(e)}")
            return False
        finally:
            # Clean up the temporary file list
            if list_file and list_file.exists():
                try:
                    list_file.unlink()
                except Exception as e:
                    logger.error(f"Error cleaning up list file: {str(e)}")

    def _download_single_video(self, key: str, progress_dict: Optional[dict] = None) -> Path:
        """Download a single video file with retry logic.

        Args:
            key: S3 key of the video.
            progress_dict: Optional mapping updated with ``key -> success``.

        Returns:
            Local path of the downloaded file.

        Raises:
            RuntimeError: If the download ultimately fails.
        """
        local_path = self._local_path_for_key(key)
        try:
            success = self.s3_client.download_file_with_retry(self.bucket, key, local_path)
            if progress_dict is not None:
                progress_dict[key] = success
            if success:
                return local_path
            raise RuntimeError(f"Failed to download {key}")
        except Exception as e:
            logger.error(f"Failed to download {key}: {str(e)}")
            if progress_dict is not None:
                progress_dict[key] = False
            raise

    def _download_videos_sequential(self, keys: pd.Series) -> List[Path]:
        """Download all videos for a drop sequentially (more reliable for large files).

        Failed downloads are logged and skipped; the returned list is sorted.
        """
        downloaded_files = []

        for key in keys:
            try:
                downloaded_files.append(self._download_single_video(key))
            except Exception as e:
                # Continue with other files
                logger.error(f"Download failed for {key}: {str(e)}")

        # Sort files by name to ensure correct order
        downloaded_files.sort()
        return downloaded_files

    def _download_videos_parallel(self, keys: pd.Series) -> List[Path]:
        """Download all videos for a drop in parallel.

        Failed downloads are logged and skipped; the returned list is sorted.
        """
        downloaded_files = []
        progress_dict = {}

        # Use ThreadPoolExecutor for parallel downloads
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            # Submit all download tasks
            future_to_key = {
                executor.submit(self._download_single_video, key, progress_dict): key
                for key in keys
            }

            # Process completed downloads
            for future in as_completed(future_to_key):
                key = future_to_key[future]
                try:
                    downloaded_files.append(future.result())
                except Exception as e:
                    logger.error(f"Download failed for {key}: {str(e)}")

        # Sort files by name to ensure correct order
        downloaded_files.sort()
        return downloaded_files

    def _process_single_drop(self, drop_data: pd.DataFrame, delete_originals: bool, test_mode: bool, verify_videos: bool, sequential_download: bool = True) -> None:
        """Process a single drop's worth of videos.

        Downloads the drop's files, optionally verifies them, concatenates the
        usable ones and (unless ``test_mode``) uploads the result. Local files
        are always removed afterwards.

        Originals are deleted from S3 only when ``delete_originals`` is set
        AND every file of the drop made it into the concatenated video;
        otherwise deleting them would destroy footage that exists nowhere else.

        Raises:
            RuntimeError: If nothing could be downloaded, no file is valid, or
                the concatenation fails.
        """
        downloaded_files: List[Path] = []
        output_path: Optional[Path] = None
        expected_count = len(drop_data['Key'])

        try:
            # Download files (sequential is more reliable for large files)
            if sequential_download:
                downloaded_files = self._download_videos_sequential(drop_data['Key'])
            else:
                downloaded_files = self._download_videos_parallel(drop_data['Key'])

            if not downloaded_files:
                raise RuntimeError("No files were successfully downloaded")

            logger.info(f"Processing files in order: {[f.name for f in downloaded_files]}")

            if verify_videos:
                # Quick verification of downloaded files
                present_files = []
                for file_path in downloaded_files:
                    if self.verify_video_file(file_path):
                        present_files.append(file_path)
                    else:
                        logger.error(f"Downloaded file is corrupted: {file_path}")

                # Deep verification happens here (not inside the concatenation)
                # so the exact set of files that end up in the output is known.
                valid_files = self._filter_valid_videos(present_files)

                if not valid_files:
                    raise RuntimeError("No valid video files available for processing")
            else:
                valid_files = downloaded_files

            all_inputs_used = len(valid_files) == expected_count
            if not all_inputs_used:
                logger.warning(
                    f"Only {len(valid_files)} out of {expected_count} videos of this drop will be concatenated"
                )

            # Inputs were already deep-verified above when verification is enabled.
            output_path = self._concatenate_drop_videos(valid_files, drop_data['DropID'].iloc[0], False)

            if not test_mode:
                safe_to_delete = delete_originals and all_inputs_used
                if delete_originals and not safe_to_delete:
                    logger.warning("Keeping original files: the concatenated video does not contain all of them")
                self._upload_and_cleanup(output_path, drop_data, safe_to_delete)
        except Exception as e:
            logger.error(f"Error processing drop: {str(e)}")
            raise
        finally:
            # Ensure cleanup happens even if there's an error
            self._cleanup_files(downloaded_files, output_path)

    def process_gopro_videos(
        self,
        filtered_df: pd.DataFrame,
        delete_originals: bool = False,
        test_mode: bool = False,
        gopro_prefix: str = "GX",
        verify_videos: bool = False,
        parallel_drops: bool = False,
        sequential_download: bool = True
    ) -> None:
        """Process GoPro videos by DropID with optional parallel processing.

        Args:
            filtered_df: Rows with at least 'Key', 'SurveyID', 'DropID' and
                'fileName' columns.
            delete_originals: Delete the source objects after a successful upload.
            test_mode: Concatenate only; skip upload and deletion.
            gopro_prefix: A drop is processed only if all its file names start
                with this prefix.
            verify_videos: Verify downloaded files before concatenating.
            parallel_drops: Process up to two drops at the same time.
            sequential_download: Download a drop's files one at a time.

        A failure in one drop is logged and does not stop the others.
        """
        valid_drops = []

        # Filter valid drops
        for drop_id in filtered_df['DropID'].unique():
            drop_data = filtered_df[filtered_df['DropID'] == drop_id]
            if all(str(name).startswith(gopro_prefix) for name in drop_data['fileName']):
                valid_drops.append((drop_id, drop_data))
            else:
                logger.warning(f"Skipping DropID {drop_id}: Not all videos start with {gopro_prefix}")

        logger.info(f"Processing {len(valid_drops)} valid drops")

        if parallel_drops and len(valid_drops) > 1:
            # Process drops in parallel (careful with resource usage)
            with ThreadPoolExecutor(max_workers=min(2, len(valid_drops))) as executor:
                futures = [
                    (
                        drop_id,
                        executor.submit(
                            self._process_single_drop,
                            drop_data, delete_originals, test_mode, verify_videos, sequential_download
                        ),
                    )
                    for drop_id, drop_data in valid_drops
                ]

                for drop_id, future in futures:
                    try:
                        future.result()
                        logger.info(f"‚úÖ Successfully processed DropID {drop_id}")
                    except Exception as e:
                        logger.error(f"‚ùå Error processing DropID {drop_id}: {str(e)}")
        else:
            # Sequential processing
            for drop_id, drop_data in valid_drops:
                try:
                    logger.info(f"Processing DropID {drop_id}...")
                    self._process_single_drop(drop_data, delete_originals, test_mode, verify_videos, sequential_download)
                    logger.info(f"‚úÖ Successfully processed DropID {drop_id}")
                except Exception as e:
                    logger.error(f"‚ùå Error processing DropID {drop_id}: {str(e)}")

    def get_movies_df(self, prefix: str = "") -> pd.DataFrame:
        """Get DataFrame of movie files in S3 bucket with their sizes.

        Args:
            prefix: Optional prefix to filter S3 objects

        Returns:
            DataFrame with columns 'Key' and 'Size' (in bytes). Both columns
            are present even when no object matches.
        """
        # Get all objects matching the prefix and movie extensions
        objects = self.s3_client.list_objects(
            self.bucket,
            prefix=prefix,
            suffix=tuple(self.MOVIE_EXTENSIONS)
        )

        # Extract both keys and sizes
        movie_data = [
            {
                'Key': obj['Key'],
                'Size': obj['Size']  # Size in bytes
            }
            for obj in objects
        ]

        # Explicit columns keep downstream column access working on an empty result.
        return pd.DataFrame(movie_data, columns=['Key', 'Size'])

    def _concatenate_drop_videos(self, video_paths: List[Path], drop_id: str, verify_videos: bool) -> Path:
        """Concatenate videos for a single drop into ``<output_dir>/<drop_id>.mp4``.

        Raises:
            RuntimeError: If the concatenation fails.
        """
        output_path = self.output_dir / f"{drop_id}.mp4"
        if not self.concatenate_videos(video_paths, output_path, verify_videos):
            raise RuntimeError("Video concatenation failed")
        return output_path

    def _upload_and_cleanup(self, output_path: Path, drop_data: pd.DataFrame, delete_originals: bool) -> None:
        """Upload concatenated video and cleanup originals if requested.

        The object is stored as ``<SurveyID>/<DropID>/<DropID>.mp4``. Originals
        are deleted only after the upload call returned without error.
        """
        survey_id = drop_data['SurveyID'].iloc[0]
        drop_id = drop_data['DropID'].iloc[0]
        new_key = f"{survey_id}/{drop_id}/{drop_id}.mp4"

        try:
            self.s3_client.client.upload_file(
                str(output_path),
                self.bucket,
                new_key
            )

            logger.info(f"Successfully uploaded concatenated video to {new_key}")

            # Delete original files if requested
            if delete_originals:
                for key in drop_data['Key']:
                    self.s3_client.client.delete_object(Bucket=self.bucket, Key=key)
                    logger.info(f"Deleted original file {key}")

        except Exception as e:
            logger.error(f"Error during upload of {new_key}: {str(e)}")
            raise

    @staticmethod
    def _cleanup_files(downloaded_files: List[Path], output_path: Optional[Path]) -> None:
        """Clean up local files; deletion errors are logged, never raised."""
        for file_path in downloaded_files:
            if file_path.exists():
                try:
                    file_path.unlink()
                    logger.info(f"Cleaned up {file_path}")
                except Exception as e:
                    logger.error(f"Error deleting {file_path}: {str(e)}")

        if output_path and output_path.exists():
            try:
                output_path.unlink()
                logger.info(f"Cleaned up {output_path}")
            except Exception as e:
                logger.error(f"Error deleting {output_path}: {str(e)}")


def get_filtered_movies_df(movies_df: pd.DataFrame, gopro_ext: str = "BNP") -> pd.DataFrame:
    """
    Select the GoPro files of drops that still need to be concatenated.

    Keys are expected to look like ``<SurveyID>/<DropID>/<fileName>``. A drop
    is kept only when
      * no file in it is named after the drop itself (``<DropID>.<ext>``),
        which is the name given to an already concatenated video, and
      * it contains more than one distinct GoPro file name.

    The "already concatenated" check looks at every file of the drop, not just
    the GoPro-prefixed ones, because the concatenated file normally does not
    carry the GoPro prefix.

    Args:
        movies_df: DataFrame with Key column containing file paths
        gopro_ext: Prefix used to identify GoPro files

    Returns:
        Filtered DataFrame containing only valid GoPro groups, with the added
        columns 'SurveyID', 'DropID' and 'fileName'
    """
    added_columns = ['SurveyID', 'DropID', 'fileName']
    if movies_df.empty:
        # Nothing to parse; still return the expected columns.
        columns = list(dict.fromkeys([*movies_df.columns, *added_columns]))
        return pd.DataFrame(columns=columns)

    # Split each key once; reindex guarantees three columns even when every
    # key has fewer components (missing parts become NaN).
    key_parts = movies_df['Key'].str.split('/', expand=True).reindex(columns=range(3))
    df = movies_df.assign(
        SurveyID=key_parts[0],
        DropID=key_parts[1],
        fileName=key_parts[2]
    )

    file_names = df['fileName'].fillna('').astype(str)

    # File name without its final extension, whatever its case (.mp4, .MP4, ...)
    stems = file_names.str.rsplit('.', n=1).str[0]

    # DropIDs that already hold a file named after the drop
    already_concatenated = df.loc[stems == df['DropID'], 'DropID'].unique()

    # GoPro files of the remaining drops
    is_gopro = df['fileName'].notna() & file_names.str.startswith(gopro_ext)
    candidates = df[is_gopro & ~df['DropID'].isin(already_concatenated)]

    # Keep only drops with more than one distinct file to concatenate
    unique_counts = candidates.groupby('DropID')['fileName'].nunique()
    drops_to_process = unique_counts[unique_counts > 1].index

    return candidates[candidates['DropID'].isin(drops_to_process)].copy()

# Get info from the GoPro movies and process them

In [None]:
# File-name prefix that identifies GoPro recordings
gopro_ext = "GX"

# S3 access with a 10-connection pool
s3_client = S3Client(max_connections=10)

# Processor for the "marine-buv" bucket, limited to two download threads
processor = VideoProcessor(s3_client, bucket="marine-buv", max_workers=2)

# Inventory of the movies under the "AHE" prefix, then the subset to concatenate
movies_df = processor.get_movies_df(prefix="AHE")
filtered_df = get_filtered_movies_df(movies_df=movies_df, gopro_ext=gopro_ext)
filtered_df

# Most conservative run: one drop at a time, one download at a time,
# verification on so corrupted files are caught, originals kept in S3.
run_options = dict(
    delete_originals=False,
    test_mode=False,
    gopro_prefix=gopro_ext,
    verify_videos=True,
    parallel_drops=False,
    sequential_download=True,
)
processor.process_gopro_videos(filtered_df=filtered_df, **run_options)

In [None]:
def find_already_concatenated_movies_df(movies_df: pd.DataFrame, size_tolerance: float = 0.05) -> pd.DataFrame:
    """
    Find individual video files that should be removed because a concatenated file already exists.

    Files are grouped by DropID (second component of the key). Within a group
    the largest file is assumed to be the concatenated version when its size
    is approximately the sum of all the other files; those other files are
    then returned for removal.

    Groups with fewer than three files are never reported: a concatenated
    video plus its sources needs at least three files, and two files of
    similar size (e.g. two full GoPro chapters) would otherwise be mistaken
    for "concatenated + original".

    Args:
        movies_df: DataFrame with Key column containing file paths and Size column
        size_tolerance: Tolerance for size comparison (default 0.05 = 5%)

    Returns:
        DataFrame containing only individual files that should be removed
        (where concatenated version exists). It has the input columns plus
        'SurveyID', 'DropID' and 'fileName', and is empty when nothing
        qualifies.
    """
    result_columns = list(dict.fromkeys([*movies_df.columns, 'SurveyID', 'DropID', 'fileName']))
    if movies_df.empty:
        print("No files found for removal (no concatenated files detected)")
        return pd.DataFrame(columns=result_columns)

    # Create the SurveyID, DropID, and fileName columns from the Key.
    # reindex guarantees the three columns even for short keys.
    key_parts = movies_df['Key'].str.split('/', expand=True).reindex(columns=range(3))
    df = movies_df.assign(
        SurveyID=key_parts[0],
        DropID=key_parts[1],
        fileName=key_parts[2]
    )

    # Only consider DropIDs with multiple files
    dropid_counts = df.groupby('DropID').size()
    multi_file_dropids = dropid_counts[dropid_counts > 1].index

    print(f"Found {len(multi_file_dropids)} DropIDs with multiple files")

    files_to_remove = []
    concatenated_found = 0

    for drop_id in multi_file_dropids:
        # Sort by size (largest first)
        drop_files = df[df['DropID'] == drop_id].sort_values('Size', ascending=False)

        if len(drop_files) < 3:
            print(f"DropID {drop_id}: only {len(drop_files)} files - too few to contain a concatenated copy, skipping")
            continue

        largest_file = drop_files.iloc[0]
        other_files = drop_files.iloc[1:]

        largest_size = largest_file['Size']
        sum_others_size = other_files['Size'].sum()

        reference_size = max(largest_size, sum_others_size)
        if reference_size <= 0:
            # All files are empty: no meaningful ratio can be computed.
            print(f"DropID {drop_id}: all files are empty, skipping")
            continue

        # Check if largest file size is approximately equal to sum of others
        # Allow for some tolerance due to encoding differences, metadata, etc.
        size_ratio = abs(largest_size - sum_others_size) / reference_size

        if size_ratio <= size_tolerance:
            # Largest file is likely concatenated version of the others
            files_to_remove.extend(other_files.to_dict('records'))
            concatenated_found += 1

            print(f"DropID {drop_id}: Concatenated file ({largest_size/1024/1024:.1f}MB) ‚âà Sum of {len(other_files)} files ({sum_others_size/1024/1024:.1f}MB)")
        else:
            print(f"DropID {drop_id}: Size mismatch - Largest: {largest_size/1024/1024:.1f}MB, Sum others: {sum_others_size/1024/1024:.1f}MB (ratio: {size_ratio:.3f})")

    if not files_to_remove:
        print("No files found for removal (no concatenated files detected)")
        return pd.DataFrame(columns=result_columns)

    result_df = pd.DataFrame(files_to_remove)
    total_size_gb = result_df['Size'].sum() / (1024**3)

    print(f"\nFound {concatenated_found} DropIDs with concatenated files")
    print(f"Total files to remove: {len(result_df)}")
    print(f"Total size to be removed: {total_size_gb:.2f} GB")

    # Show examples
    print("\nExample DropIDs with files to remove:")
    for drop_id in result_df['DropID'].unique()[:3]:
        drop_rows = result_df[result_df['DropID'] == drop_id]
        files = drop_rows['fileName'].tolist()
        sizes = drop_rows['Size'].tolist()
        print(f"  {drop_id}: {len(files)} files, {sum(sizes)/1024/1024:.1f}MB total")
        for file, size in zip(files[:3], sizes[:3]):  # Show first 3 files
            print(f"    - {file} ({size/1024/1024:.1f}MB)")

    return result_df

In [None]:
# Individual files whose drop already holds a concatenated copy (1% size tolerance)
files_to_remove = find_already_concatenated_movies_df(movies_df, size_tolerance=0.01)

# Dry run: only report the candidates, nothing is deleted.
for record in files_to_remove.to_dict('records'):
    size_mb = record['Size'] / 1024 / 1024
    print(f"Safe to remove: {record['Key']} ({size_mb:.1f}MB)")
    # To actually delete, call the underlying boto3 client:
    # s3_client.client.delete_object(Bucket=processor.bucket, Key=record['Key'])