In [2]:
import pandas as pd

In [3]:
def sort_csv_by_image_id(input_file, output_file=None):
    """
    Reorder the rows of a CSV file by comparing image_id values as strings.

    Parameters:
    input_file (str): Path of the CSV file to read
    output_file (str): Path the sorted CSV is written to (optional)
                      When None, the input file is overwritten

    Returns:
    pandas.DataFrame: The sorted rows, keeping their original index labels

    Raises:
    ValueError: If the CSV has no 'image_id' column
    """
    frame = pd.read_csv(input_file)

    # Fail early when there is nothing to sort on
    if 'image_id' not in frame.columns:
        raise ValueError("CSV file does not contain 'image_id' column")

    def _as_text(column):
        # Compare ids as strings so the ordering is lexicographical
        return column.astype(str)

    ordered = frame.sort_values(by='image_id', key=_as_text)

    # Fall back to overwriting the input when no output path was given
    destination = input_file if output_file is None else output_file
    ordered.to_csv(destination, index=False)
    print(f"CSV file sorted and saved to: {destination}")

    return ordered

In [4]:
# Write a sorted copy of the deepseek submission to a separate file and
# preview the first rows of the returned DataFrame.
sorted_df = sort_csv_by_image_id('submission_deepseek.csv', 'submission_deepseek_sorted.csv')
print(sorted_df.head())

CSV file sorted and saved to: submission_deepseek_sorted.csv
   image_id                                         Epithelial  \
18   slide1  1 1 91 1 93 1 1 95 1 1 97 1 1 99 1 1 109 1 1 1...   
3   slide10  1 1 2 1 4 26 1 32 11 1 45 10 1 64 13 1 81 11 1...   
0   slide11  1 1 1 1 11 1 1 21 1 1 27 1 1 86 19 1 107 2 1 1...   
4   slide12  1 1 1 1 4 85 1 90 1 1 92 1 1 94 1 1 104 1 1 10...   
5   slide13  1 1 1 1 11 1 1 261 1 1 263 1 1 265 1 1 267 1 1...   

                                           Lymphocyte  \
18  1 184573 3 1 185085 3 1 185597 3 1 186110 2 1 ...   
3   250 170 7 250 682 5 672 981 1 250 1194 5 389 1...   
0   4 6906 1 4 6908 1 4 7418 2 4 7930 3 4 8442 2 4...   
4   2 18432 1 2 18944 1 2 19456 1 2 19968 1 2 2048...   
5   10 17408 1 10 17920 1 10 18432 1 10 18944 1 10...   

                                           Neutrophil  \
18  7 92491 12 7 92998 1 7 93000 18 7 93506 26 7 9...   
3                                                   0   
0                           

In [5]:
import pandas as pd
import re

# --- Helper function for natural sorting ---
def natural_key(text):
    """
    Build a sort key that compares embedded numbers by value (natural sorting).

    The text is split on runs of digits; digit runs become ints and the
    remaining pieces are lower-cased, so 'slide2' sorts before 'slide10'.
    Because the split pattern has a capture group, the pieces always alternate
    text, number, text, ... and both the first and last piece are text
    (possibly empty).

    Example: 'slide10' -> ['slide', 10, '']

    Parameters:
    text: Value to build the key for. Non-string values are converted with
          str() first.

    Returns:
    list: Alternating lower-cased str pieces and int pieces
    """
    pieces = re.split(r'(\d+)', str(text))
    # Odd positions hold the captured digit runs. Deciding by position rather
    # than str.isdigit() avoids calling int() on characters such as '²', which
    # isdigit() accepts but \d never captures and int() rejects.
    return [
        int(piece) if position % 2 else piece.lower()
        for position, piece in enumerate(pieces)
    ]

# --- Main logic ---
def sort_csv_by_image_id(input_csv, output_csv):
    """
    Sort a CSV file by its 'image_id' column in natural order and save it.

    Natural order compares embedded numbers by value (via natural_key), so
    'slide2' comes before 'slide10'.

    Note: this definition reuses the name of the lexicographic
    sort_csv_by_image_id defined earlier in this notebook and replaces it once
    this cell runs. Unlike that version, output_csv is required and nothing is
    returned.

    Parameters:
    input_csv (str): Path of the CSV file to read
    output_csv (str): Path the sorted CSV is written to

    Raises:
    ValueError: If the CSV has no 'image_id' column
    """
    # Read CSV
    df = pd.read_csv(input_csv)

    # Ensure 'image_id' column exists
    if 'image_id' not in df.columns:
        raise ValueError("CSV must contain an 'image_id' column")

    # Sort using the natural key. Cast to str first so ids that pandas parsed
    # as numbers (or left as NaN) can still be split into key pieces.
    df = df.sort_values(
        by='image_id',
        key=lambda col: col.astype(str).map(natural_key),
    )

    # Save to output file
    df.to_csv(output_csv, index=False)
    print(f"✅ Sorted CSV saved to: {output_csv}")

# --- Example usage ---
# The guard is true when this code runs as a script or as a notebook cell, so
# the call below executes; it is skipped only when the code is imported as a
# module. Reads submission_aditya3.csv and writes the sorted copy alongside it.
if __name__ == "__main__":
    sort_csv_by_image_id("submission_aditya3.csv", "submission_aditya3_sorted.csv")

✅ Sorted CSV saved to: submission_aditya3_sorted.csv
