# Excel to CSV Column Extractor

This notebook provides two helper functions: one lists the sheets and columns of an Excel file, and one reads a sheet and exports selected columns (or all columns) to a CSV file.

In [None]:
from pathlib import Path

import pandas as pd

In [None]:
def xlsx_to_csv_columns(xlsx_path, csv_path, columns=None, sheet_name=0):
    """
    Read one sheet of an Excel file and export selected columns to CSV.

    Progress messages are printed to stdout. Any exception raised while
    reading, validating or writing is printed and then re-raised unchanged.

    Args:
        xlsx_path: Path to input Excel file
        csv_path: Path for output CSV file (missing parent directories are
            created just before the file is written)
        columns: Column name, or iterable of column names, to export in the
            given order (if None, exports all columns)
        sheet_name: Sheet name or index to read (default: 0 for first sheet)

    Returns:
        pd.DataFrame: Exported dataframe

    Raises:
        FileNotFoundError: If input file doesn't exist
        ValueError: If specified columns don't exist, or if sheet_name
            selects more than one sheet (e.g. None or a list)
        KeyError or ValueError: If the sheet doesn't exist; the exact type
            comes from pd.read_excel and depends on the pandas
            version/engine
    """
    xlsx_file = Path(xlsx_path)
    csv_file = Path(csv_path)

    # Validate input file exists
    if not xlsx_file.exists():
        raise FileNotFoundError(f"Excel file not found: {xlsx_file}")

    if columns is not None:
        # A bare string would otherwise be split into single characters by
        # the membership check below; treat it as one column name.
        columns = [columns] if isinstance(columns, str) else list(columns)

    try:
        print(f"Reading Excel file: {xlsx_file}")
        df = pd.read_excel(xlsx_file, sheet_name=sheet_name)

        # read_excel returns a dict of frames when several sheets are
        # requested; this function exports exactly one sheet.
        if not isinstance(df, pd.DataFrame):
            raise ValueError(
                f"sheet_name must select a single sheet, got: {sheet_name!r}"
            )

        print(f"Original data: {len(df)} rows, {len(df.columns)} columns")
        print(f"Available columns: {list(df.columns)}")

        if columns is None:
            selected_df = df.copy()
            print("Exporting all columns")
        else:
            # Report missing names once each, in the order they were
            # requested, so the error message is deterministic.
            available = set(df.columns)
            missing_cols = [
                name for name in dict.fromkeys(columns) if name not in available
            ]
            if missing_cols:
                raise ValueError(f"Columns not found in Excel file: {missing_cols}")

            selected_df = df[columns].copy()
            print(f"Exporting {len(columns)} specified columns: {columns}")

        # Create the output directory only once there is something to write,
        # so a failed read/validation leaves no empty directories behind.
        csv_file.parent.mkdir(parents=True, exist_ok=True)
        selected_df.to_csv(csv_file, index=False)

        print(f"Successfully exported {len(selected_df)} rows, {len(selected_df.columns)} columns")
        print(f"CSV saved to: {csv_file}")

        return selected_df

    except Exception as e:
        print(f"Error processing file: {e}")
        raise

In [None]:
def list_excel_info(xlsx_path):
    """
    Display information about an Excel file (sheets and columns).

    Prints a summary of every sheet to stdout and returns the same data.
    Any exception raised while reading is printed and then re-raised.

    Args:
        xlsx_path: Path to Excel file

    Returns:
        dict: Maps each sheet name to a dict with the keys 'index'
            (position in the workbook), 'rows', 'columns' (list of column
            labels) and 'column_count'

    Raises:
        FileNotFoundError: If the Excel file doesn't exist
    """
    xlsx_file = Path(xlsx_path)

    if not xlsx_file.exists():
        raise FileNotFoundError(f"Excel file not found: {xlsx_file}")

    try:
        sheet_info = {}

        # Open the workbook once and close it when done; every sheet is read
        # from this handle instead of reopening the file by path each time.
        with pd.ExcelFile(xlsx_file) as excel_file:
            sheet_names = excel_file.sheet_names

            print(f"Excel file: {xlsx_file}")
            print(f"Number of sheets: {len(sheet_names)}")
            print("\nSheet information:")

            for i, sheet_name in enumerate(sheet_names):
                df = pd.read_excel(excel_file, sheet_name=sheet_name)
                column_names = list(df.columns)
                sheet_info[sheet_name] = {
                    'index': i,
                    'rows': len(df),
                    'columns': column_names,
                    'column_count': len(column_names),
                }

                print(f"  [{i}] '{sheet_name}': {len(df)} rows, {len(df.columns)} columns")
                print(f"      Columns: {list(df.columns)}")
                print()

        return sheet_info

    except Exception as e:
        print(f"Error reading Excel file: {e}")
        raise

## Usage Examples

### Step 1: Explore Excel file structure

In [None]:
# Inspect the workbook before exporting: list each sheet with its columns
excel_path = Path('data.xlsx')  # Update this path

try:
    sheet_info = list_excel_info(excel_path)
except FileNotFoundError as missing:
    # The workbook path does not exist
    print(f"File error: {missing}")
except Exception as err:
    # Anything else raised while reading the workbook
    print(f"Error: {err}")

### Step 2: Export all columns to CSV

In [None]:
# Write every column of the first sheet to a CSV file
excel_path = Path('data.xlsx')  # Update this path
csv_path = Path('exported_all_columns.csv')

try:
    exported_df = xlsx_to_csv_columns(excel_path, csv_path)
except FileNotFoundError as missing:
    # The workbook path does not exist
    print(f"File error: {missing}")
except Exception as err:
    print(f"Error: {err}")
else:
    # Reached only when the export raised nothing
    print("Export completed successfully!")

### Step 3: Export specific columns to CSV

In [None]:
# Write only the listed columns of the first sheet to a CSV file
excel_path = Path('data.xlsx')  # Update this path
csv_path = Path('exported_selected_columns.csv')
selected_columns = ['column1', 'column2', 'column3']  # Update column names

try:
    exported_df = xlsx_to_csv_columns(excel_path, csv_path, columns=selected_columns)
except FileNotFoundError as missing:
    # The workbook path does not exist
    print(f"File error: {missing}")
except ValueError as bad_columns:
    # Raised by the export when a requested column is absent from the sheet
    print(f"Column error: {bad_columns}")
except Exception as err:
    print(f"Error: {err}")
else:
    # Reached only when the export raised nothing
    print("Export completed successfully!")

### Step 4: Export from specific sheet

In [None]:
# Write selected columns of one particular sheet (given by name or index)
excel_path = Path('data.xlsx')  # Update this path
csv_path = Path('exported_from_sheet.csv')
sheet_name = 'Sheet2'  # Or use index like 1
selected_columns = ['id', 'name', 'value']  # Update column names

try:
    exported_df = xlsx_to_csv_columns(
        excel_path, csv_path, columns=selected_columns, sheet_name=sheet_name
    )
except FileNotFoundError as missing:
    # The workbook path does not exist
    print(f"File error: {missing}")
except KeyError as bad_sheet:
    # NOTE(review): depending on the pandas version a missing sheet may be
    # raised as ValueError instead and would then be reported by the
    # "Column error" branch below - confirm against the installed pandas.
    print(f"Sheet error: {bad_sheet}")
except ValueError as bad_columns:
    print(f"Column error: {bad_columns}")
except Exception as err:
    print(f"Error: {err}")
else:
    # Reached only when the export raised nothing
    print("Export completed successfully!")

## Preview Exported Data

View the first few rows of the exported data:

In [None]:
# Show a short preview, provided an earlier cell produced a non-empty export
if 'exported_df' not in locals() or exported_df.empty:
    print("No exported data available to preview")
else:
    print(f"Exported data shape: {exported_df.shape}")
    print(f"Exported columns: {list(exported_df.columns)}")
    print("\nFirst 5 rows:")
    # head() returns the first five rows by default
    display(exported_df.head())

In [None]:
# Optional sanity check: load the written CSV again and report its size
if 'csv_path' not in locals() or not Path(csv_path).exists():
    print("No CSV file found to verify")
else:
    print(f"Verifying CSV file: {csv_path}")
    csv_df = pd.read_csv(csv_path)
    print(f"CSV file contains {len(csv_df)} rows and {len(csv_df.columns)} columns")
    print(f"CSV columns: {list(csv_df.columns)}")