# Combine Deduplicated Sheets into Excel

This notebook combines all deduplicated CSV sheets into a single Excel file with multiple worksheets.

In [1]:
import pandas as pd
import os

# Paths
# INPUT_DIR is the folder holding the deduplicated per-topic CSV files.
# The combined workbook is written into that same folder, so its path is
# derived from INPUT_DIR instead of repeating the folder name: changing
# INPUT_DIR now moves both together. A literal '/' separator keeps the
# resulting string identical to the previous hard-coded value.
INPUT_DIR = 'deduplicated_sheets'
OUTPUT_EXCEL = f'{INPUT_DIR}/Final_Annotations_Deduplicated.xlsx'

# Echo the configuration so the saved output records which paths were used.
print(f"Input folder: {INPUT_DIR}")
print(f"Output file: {OUTPUT_EXCEL}")

Input folder: deduplicated_sheets
Output file: deduplicated_sheets/Final_Annotations_Deduplicated.xlsx


In [2]:
# Collect the CSV files to combine. Entries whose name starts with '_'
# (e.g. the processing summary) and anything that is not a .csv are skipped.
csv_files = []
for entry in os.listdir(INPUT_DIR):
    if entry.startswith('_') or not entry.endswith('.csv'):
        continue
    csv_files.append(entry)

# Report what was found, in alphabetical order for a stable listing.
print(f"Found {len(csv_files)} CSV files to combine:")
for name in sorted(csv_files):
    print(f"  - {name}")

Found 15 CSV files to combine:
  - caa.csv
  - china.csv
  - congress.csv
  - farm_laws.csv
  - farmers_protests.csv
  - hindu.csv
  - hindutva.csv
  - kashmir.csv
  - kashmiri_pandits.csv
  - modi.csv
  - muslim.csv
  - new_parliament.csv
  - rahulgandhi.csv
  - ram_mandir.csv
  - shaheen_bagh.csv


In [3]:
# Combine all CSVs into one Excel file with multiple sheets.
# Each CSV becomes one worksheet named after the file (underscores -> spaces).
MAX_SHEET_NAME_LEN = 31  # Excel sheet names have a 31 character limit

# Fail early with a clear message: saving a workbook that contains no sheets
# otherwise errors out inside the Excel engine when the writer is closed.
if not csv_files:
    raise ValueError(f"No CSV files to combine in '{INPUT_DIR}'")

# Sheet names already written, lower-cased because Excel treats sheet names
# as case-insensitive when checking for duplicates.
used_sheet_names = set()

with pd.ExcelWriter(OUTPUT_EXCEL, engine='openpyxl') as writer:
    for csv_file in sorted(csv_files):
        # Get sheet name from filename. splitext removes only the trailing
        # extension; str.replace('.csv', '') would also delete a '.csv'
        # that happens to occur in the middle of the name.
        stem = os.path.splitext(csv_file)[0]
        base_name = stem.replace('_', ' ')[:MAX_SHEET_NAME_LEN]

        # Truncation (or '_' vs ' ' in filenames) can map two files to the
        # same sheet name, in which case the second frame would be written
        # over the first. Disambiguate with a numeric suffix while staying
        # within the length limit.
        sheet_name = base_name
        duplicate_no = 2
        while sheet_name.lower() in used_sheet_names:
            suffix = f" ({duplicate_no})"
            sheet_name = base_name[:MAX_SHEET_NAME_LEN - len(suffix)] + suffix
            duplicate_no += 1
        used_sheet_names.add(sheet_name.lower())

        # Load the CSV
        csv_path = os.path.join(INPUT_DIR, csv_file)
        df = pd.read_csv(csv_path)

        # Write to Excel as a new sheet (index=False: no row-number column)
        df.to_excel(writer, sheet_name=sheet_name, index=False)
        print(f"✓ Added sheet: '{sheet_name}' ({len(df)} rows)")

print(f"\n✅ Excel file saved: {OUTPUT_EXCEL}")

✓ Added sheet: 'caa' (150 rows)
✓ Added sheet: 'china' (120 rows)
✓ Added sheet: 'congress' (120 rows)
✓ Added sheet: 'farm laws' (150 rows)
✓ Added sheet: 'farmers protests' (102 rows)
✓ Added sheet: 'hindu' (120 rows)
✓ Added sheet: 'hindutva' (120 rows)
✓ Added sheet: 'kashmir' (120 rows)
✓ Added sheet: 'kashmiri pandits' (94 rows)
✓ Added sheet: 'modi' (150 rows)
✓ Added sheet: 'muslim' (150 rows)
✓ Added sheet: 'new parliament' (150 rows)
✓ Added sheet: 'rahulgandhi' (120 rows)
✓ Added sheet: 'ram mandir' (150 rows)
✓ Added sheet: 'shaheen bagh' (150 rows)

✅ Excel file saved: deduplicated_sheets/Final_Annotations_Deduplicated.xlsx


In [4]:
# Verify the Excel file.
# The workbook is opened once and every sheet is parsed from that single
# handle; the context manager closes the file when done. Calling
# pd.read_excel(path, ...) per sheet would reopen the workbook each time.
total_rows = 0
with pd.ExcelFile(OUTPUT_EXCEL) as xl:
    print(f"\n📊 Excel file contains {len(xl.sheet_names)} sheets:")
    for sheet in xl.sheet_names:
        df = xl.parse(sheet)
        print(f"  - '{sheet}': {len(df)} rows")
        # Accumulate the total here so no sheet has to be read a second time
        total_rows += len(df)

# Report totals
print(f"\nTotal rows across all sheets: {total_rows}")


📊 Excel file contains 15 sheets:
  - 'caa': 150 rows
  - 'china': 120 rows
  - 'congress': 120 rows
  - 'farm laws': 150 rows
  - 'farmers protests': 102 rows
  - 'hindu': 120 rows
  - 'hindutva': 120 rows
  - 'kashmir': 120 rows
  - 'kashmiri pandits': 94 rows
  - 'modi': 150 rows
  - 'muslim': 150 rows
  - 'new parliament': 150 rows
  - 'rahulgandhi': 120 rows
  - 'ram mandir': 150 rows
  - 'shaheen bagh': 150 rows

Total rows across all sheets: 1966
