In [1]:
import os
import shutil
import random

# Combine the PDFs of two source folders into one folder holding exactly
# `target_count` files. The selection is deterministic (sorted listing plus a
# fixed seed), so re-running this cell picks the same files instead of piling
# a second, different random sample on top of the first one.

# Define source folders, destination folder and sampling parameters
eacl_folder = 'eacl_1500+'
emnlp_folder = 'emnlp_500'
destination_folder = 'combined_2000'
target_count = 2000  # number of PDFs the destination folder must end up with
sample_seed = 42     # fixed seed so the random selection is reproducible

# Create destination folder if it doesn't exist
if not os.path.exists(destination_folder):
    os.makedirs(destination_folder)
    print(f"Created folder: {destination_folder}")
else:
    print(f"Folder {destination_folder} already exists")

# Collect all PDF files from both folders.
# os.listdir returns entries in arbitrary order, so sort them: a seeded sample
# is only reproducible when it is drawn from a fixed ordering.
eacl_files = sorted(f for f in os.listdir(eacl_folder) if f.lower().endswith('.pdf'))
emnlp_files = sorted(f for f in os.listdir(emnlp_folder) if f.lower().endswith('.pdf'))

print(f"Found {len(eacl_files)} PDFs in {eacl_folder}")
print(f"Found {len(emnlp_files)} PDFs in {emnlp_folder}")

# Check for duplicates between the two folders; the copy in the first folder
# wins, so the duplicate names are dropped from the second list.
duplicates = set(eacl_files) & set(emnlp_files)
if duplicates:
    print(f"Found {len(duplicates)} duplicate filenames between folders")
    emnlp_files = [f for f in emnlp_files if f not in duplicates]

# Combine both lists (no duplicates remain at this point)
all_files = eacl_files + emnlp_files
print(f"Total unique PDF files available: {len(all_files)}")

# Remember where each file lives so the copy loop needs a single dict lookup
# per file instead of scanning the whole eacl list every time.
source_of = {name: eacl_folder for name in eacl_files}
source_of.update({name: emnlp_folder for name in emnlp_files})

# Check if we have enough files
if len(all_files) < target_count:
    print(f"Error: Not enough unique PDF files. Only found {len(all_files)}")
else:
    # Randomly select exactly target_count files if we have more.
    # A private Random instance keeps the global random state untouched.
    if len(all_files) > target_count:
        all_files = random.Random(sample_seed).sample(all_files, target_count)

    # PDFs left over from an earlier run with a different selection would push
    # the destination above target_count, so detect them before copying.
    selected = set(all_files)
    stale_files = [
        f for f in os.listdir(destination_folder)
        if f.lower().endswith('.pdf') and f not in selected
    ]

    if stale_files:
        print(
            f"Error: {destination_folder} already contains {len(stale_files)} PDF files "
            f"that are not part of the current selection. "
            f"Empty the folder and re-run this cell."
        )
    else:
        # Copy files to destination folder
        copied_count = 0
        for filename in all_files:
            source_path = os.path.join(source_of[filename], filename)
            destination_path = os.path.join(destination_folder, filename)
            shutil.copy2(source_path, destination_path)
            copied_count += 1

        # Verify the count
        actual_count = len([f for f in os.listdir(destination_folder) if f.lower().endswith('.pdf')])
        print(f"Copied {copied_count} files to {destination_folder}")
        print(f"Actual count in destination folder: {actual_count}")

        if actual_count == target_count:
            print(f"Success! Exactly {target_count} PDF files were copied.")
        else:
            print(f"Warning: Expected {target_count} files but found {actual_count} in destination folder.")

Created folder: combined_2000
Found 1561 PDFs in eacl_1500+
Found 500 PDFs in emnlp_500
Total unique PDF files available: 2061
Copied 2000 files to combined_2000
Actual count in destination folder: 2000
Success! Exactly 2000 PDF files were copied.


In [3]:
# Divide the 2000 combined PDFs into 4 folders with 500 PDFs each
import os
import random
import shutil

# Split the PDFs of `source_folder` into `folders_to_create` equally sized
# folders below `base_output_dir`. The source files are copied, not moved, and
# the split is deterministic (sorted listing plus a fixed seed) so that
# re-running this cell reproduces the same distribution.

# Source folder containing 2000 files, and the output layout
source_folder = 'combined_2000'
base_output_dir = 'divided_data_4folders'
expected_total = 2000   # number of PDFs expected in source_folder
folders_to_create = 4   # number of equally sized output folders
shuffle_seed = 42       # fixed seed so the split is reproducible

# Create the main output directory if it doesn't exist
if not os.path.exists(base_output_dir):
    os.makedirs(base_output_dir)
    print(f"Created main output directory: {base_output_dir}")

# Get list of PDF files from the source folder.
# os.listdir order is arbitrary; sorting gives the seeded shuffle a fixed
# starting order.
pdf_files = sorted(f for f in os.listdir(source_folder) if f.lower().endswith('.pdf'))

# Verify we have the expected number of files
print(f"Found {len(pdf_files)} PDF files in {source_folder}")

if len(pdf_files) != expected_total:
    print(f"Warning: Expected {expected_total} files but found {len(pdf_files)}")
else:
    # Shuffle the files to ensure random (but reproducible) distribution.
    # A private Random instance keeps the global random state untouched.
    random.Random(shuffle_seed).shuffle(pdf_files)

    # 2000 files / 4 folders = 500 files per folder. Integer division: any
    # remainder would be left unassigned, which cannot happen for 2000 / 4.
    files_per_folder = expected_total // folders_to_create

    # Give every folder the next contiguous slice of the shuffled list
    folder_assignments = []
    for folder_num in range(1, folders_to_create + 1):
        start_idx = (folder_num - 1) * files_per_folder
        end_idx = start_idx + files_per_folder
        folder_path = os.path.join(base_output_dir, f"folder_{folder_num}")
        folder_assignments.append((folder_num, folder_path, pdf_files[start_idx:end_idx]))

    # PDFs left over from an earlier run with a different split would leave a
    # folder with too many files, so detect them before copying anything.
    stale_count = 0
    for folder_num, folder_path, folder_files in folder_assignments:
        if os.path.isdir(folder_path):
            assigned = set(folder_files)
            stale_count += sum(
                1 for f in os.listdir(folder_path)
                if f.lower().endswith('.pdf') and f not in assigned
            )

    if stale_count:
        print(
            f"Error: {base_output_dir} already contains {stale_count} PDF files "
            f"that do not belong to the current split. "
            f"Empty the folder and re-run this cell."
        )
    else:
        folders_created = 0  # counts only folders that did not exist before
        files_copied = 0

        for folder_num, folder_path, folder_files in folder_assignments:
            # Create the numbered folder
            if not os.path.exists(folder_path):
                os.makedirs(folder_path)
                folders_created += 1

            # Copy files to the folder
            for file in folder_files:
                source_path = os.path.join(source_folder, file)
                dest_path = os.path.join(folder_path, file)
                shutil.copy2(source_path, dest_path)
                files_copied += 1

        # Report results
        print(f"Created {folders_created} folders in {base_output_dir}")
        print(f"Copied {files_copied} files (should be {expected_total})")

        # Verify distribution
        folders_with_wrong_count = []
        for folder_num, folder_path, folder_files in folder_assignments:
            file_count = len([f for f in os.listdir(folder_path) if f.lower().endswith('.pdf')])
            if file_count != files_per_folder:
                folders_with_wrong_count.append((folder_num, file_count))

        if folders_with_wrong_count:
            print(f"Warning: {len(folders_with_wrong_count)} folders have incorrect file counts:")
            for folder_num, count in folders_with_wrong_count:
                print(f"  folder_{folder_num}: {count} files")
        else:
            # Report the number of folders checked, not the number newly
            # created, which is 0 when the folders already existed.
            print(f"Success! All {folders_to_create} folders contain exactly {files_per_folder} PDF files each.")

Created main output directory: divided_data_4folders
Found 2000 PDF files in combined_2000
Created 4 folders in divided_data_4folders
Moved 2000 files (should be 2000)
Success! All 4 folders contain exactly 500 PDF files each.


In [5]:
# Helpers for counting the PDF files in a folder and its immediate subfolders
def _scan_pdf_folder(path):
    """Return (pdf_count, sorted_subfolder_names) for the direct contents of path."""
    pdf_count = 0
    subfolder_names = []
    # One scandir pass classifies every entry, instead of a listdir followed
    # by separate isfile/isdir calls per entry.
    with os.scandir(path) as entries:
        for entry in entries:
            if entry.is_dir():
                subfolder_names.append(entry.name)
            elif entry.is_file() and entry.name.lower().endswith('.pdf'):
                pdf_count += 1
    # Sorted so the printed listing does not depend on directory order.
    return pdf_count, sorted(subfolder_names)


def count_files(folder_path):
    """
    Count PDF files in a folder and in its immediate subfolders.

    Only one level of nesting is inspected: PDFs directly inside
    ``folder_path`` and PDFs directly inside each of its subfolders.
    Deeper levels are not visited. A summary is printed as a side effect.

    Args:
        folder_path (str): Path to the folder to count files in

    Returns:
        dict or None: ``None`` if ``folder_path`` does not exist, otherwise a
        dictionary with the keys
            'total_pdfs'       - PDFs in the root plus all immediate subfolders
            'root_pdf_count'   - PDFs directly inside ``folder_path``
            'total_subfolders' - number of immediate subfolders
            'subfolders'       - mapping subfolder name -> PDF count
        If ``folder_path`` exists but is not a directory, all counts are 0.
    """
    if not os.path.exists(folder_path):
        print(f"Error: Folder '{folder_path}' does not exist.")
        return None

    # Every key is initialised up front so the result has the same schema
    # regardless of which branch below runs.
    stats = {
        'total_pdfs': 0,
        'root_pdf_count': 0,
        'total_subfolders': 0,
        'subfolders': {}
    }

    if os.path.isdir(folder_path):
        root_pdf_count, subfolder_names = _scan_pdf_folder(folder_path)
        stats['root_pdf_count'] = root_pdf_count
        stats['total_subfolders'] = len(subfolder_names)

        # Count files in each immediate subfolder (not recursive)
        for subfolder in subfolder_names:
            subfolder_path = os.path.join(folder_path, subfolder)
            stats['subfolders'][subfolder] = _scan_pdf_folder(subfolder_path)[0]

        stats['total_pdfs'] = root_pdf_count + sum(stats['subfolders'].values())
    else:
        # The path exists but is a regular file; report it rather than
        # silently printing a zero summary.
        print(f"Warning: '{folder_path}' is not a directory; no files were counted.")

    # Print summary
    print(f"Summary for folder: {folder_path}")
    print(f"Total PDF files: {stats['total_pdfs']}")

    if stats['root_pdf_count'] > 0:
        print(f"PDF files in root folder: {stats['root_pdf_count']}")

    if stats['total_subfolders'] > 0:
        print(f"Number of subfolders: {stats['total_subfolders']}")

        # If there are 10 or fewer subfolders, show all counts
        if stats['total_subfolders'] <= 10:
            for subfolder, count in stats['subfolders'].items():
                print(f"  - {subfolder}: {count} PDFs")
        else:
            # Otherwise just show summary statistics
            counts = list(stats['subfolders'].values())
            avg_count = sum(counts) / len(counts)
            min_count = min(counts)
            max_count = max(counts)
            print(f"  Average PDFs per subfolder: {avg_count:.1f}")
            print(f"  Min PDFs in a subfolder: {min_count}")
            print(f"  Max PDFs in a subfolder: {max_count}")

            # List any unusual folders: count deviates from the average by
            # more than 20% of the average
            unusual = [(folder, count) for folder, count in stats['subfolders'].items()
                       if abs(count - avg_count) > avg_count * 0.2]
            if unusual:
                print("  Folders with unusual counts:")
                for folder, count in unusual[:5]:  # Show up to 5 unusual folders
                    print(f"    - {folder}: {count} PDFs")

    return stats

# Example usage: print and keep the PDF count summary for one of the divided folders
folder_to_check = 'divided_data_4folders/folder_4'
stats = count_files(folder_to_check)

Summary for folder: divided_data_4folders/folder_4
Total PDF files: 500
PDF files in root folder: 500
