# In [14]:
import os
import pandas as pd
import re

# Folder holding the source Excel workbooks
directory = 'C:\\Users\\Ronja\\Documents\\Studium\\Semester 4\\DABI2\\Projekt\\Umsatzdaten\\DESTATIS\\Monatsbericht_Tourismus\\'

# Folder that receives the generated CSV files
output_directory = 'C:\\Users\\Ronja\\Documents\\Studium\\Semester 4\\DABI2\\Projekt\\Umsatzdaten\\DESTATIS\\Output\\'

# Make sure the CSV folder is present before anything is written to it
os.makedirs(output_directory, exist_ok=True)

# Per-sheet extraction settings, keyed by sheet name:
#   'row_range' -> (first_row, last_row), both ends inclusive
#   'columns'   -> column letters to pull from the sheet
sheet_specs = {
    '1.8': {
        'row_range': (14, 67),
        'columns': ['A', 'B', 'D', 'F'],
    },
    '1.9': {
        'row_range': (14, 100),
        'columns': ['A', 'B', 'D', 'F'],
    },
    '2.4': {
        'row_range': (13, 29),
        'columns': ['A', 'B', 'E'],
    },
    # Add more sheets and specifications as needed
}

# Helper: translate an Excel column label into a zero-based column position
def _column_index(col):
    """Convert an Excel column label ('A', 'Z', 'AA', ...) to a zero-based index.

    Labels are case-insensitive. Raises ValueError for anything that is not a
    non-empty string of the letters A-Z.
    """
    if not isinstance(col, str) or not col:
        raise ValueError(f"Invalid Excel column label: {col!r}")

    index = 0
    for char in col.upper():
        if not 'A' <= char <= 'Z':
            raise ValueError(f"Invalid Excel column label: {col!r}")
        # Column labels are a bijective base-26 number: A=1 ... Z=26, AA=27
        index = index * 26 + (ord(char) - ord('A') + 1)
    return index - 1


# Function to extract specific rows and columns from a given sheet
def extract_data(sheet, row_range, columns):
    """Extract a rectangular selection of cells from a sheet.

    Args:
        sheet: DataFrame holding the sheet contents.
        row_range: (start_row, end_row) tuple of 1-based row numbers, both
            inclusive. Row 1 is ``sheet.iloc[0]``. The range is clamped to the
            rows that actually exist in ``sheet``.
        columns: Excel-style column labels ('A', 'B', ..., 'AA', ...).

    Returns:
        A DataFrame with one column per requested label (in the given order)
        and one row per extracted sheet row. Cells that could not be read are
        reported on stdout and left as missing values; rows in which no cell
        could be read are omitted.
    """
    start_row, end_row = row_range
    # Clamp both ends: a start below 1 would become a negative iloc index and
    # silently wrap around to the end of the sheet.
    start_row = max(start_row, 1)
    end_row = min(end_row, sheet.shape[0])

    data = []
    for row in range(start_row, end_row + 1):  # end row is inclusive
        row_data = {}
        for col in columns:
            try:
                # iloc is zero-based, row numbers are one-based
                row_data[col] = sheet.iloc[row - 1, _column_index(col)]
            except (IndexError, ValueError) as e:
                # Best effort: report the unreadable cell and keep going
                print(f"Error extracting cell at row {row}, column {col}: {e}")
        if row_data:
            data.append(row_data)

    # Passing `columns` fixes the column order and keeps the requested columns
    # present even when no rows were extracted.
    return pd.DataFrame(data, columns=columns)

# Function to extract the four-digit sequence from the filename
def extract_sequence(filename):
    """Return the first run of four consecutive digits found in ``filename``.

    Returns None when the name contains no such run.
    """
    found = re.search(r'\d{4}', filename)
    if found is None:
        return None
    return found.group(0)

# Iterate over each Excel workbook in the input directory
for filename in os.listdir(directory):
    # Skip anything that is not a workbook, as well as the "~$..." lock files
    # Excel creates next to a workbook while it is open; those cannot be read.
    if not filename.endswith(".xlsx") or filename.startswith("~$"):
        continue

    file_path = os.path.join(directory, filename)
    # None when the filename contains no four-digit run
    sequence = extract_sequence(filename)
    # Strip only the trailing extension; str.replace would also remove any
    # ".xlsx" occurring in the middle of the name.
    base_name = os.path.splitext(filename)[0]

    # The context manager closes the workbook once all sheets are processed,
    # so the file handle is not left open (and the file locked on Windows).
    with pd.ExcelFile(file_path) as xl:
        # Iterate over each sheet in the specifications
        for sheet_name, specs in sheet_specs.items():
            if sheet_name not in xl.sheet_names:
                continue

            # NOTE(review): parse() with its default header setting consumes
            # the first sheet row as column labels, so row 1 as seen by
            # extract_data is the second row of the sheet - confirm that the
            # row ranges in sheet_specs were chosen with this offset in mind.
            sheet = xl.parse(sheet_name)
            df = extract_data(sheet, specs['row_range'], specs['columns'])

            # Add metadata columns to the DataFrame
            df['filename'] = filename
            df['sequence'] = sequence
            df['sheet_name'] = sheet_name

            # Define the output CSV filename and path
            output_filename = f"{sequence}_{base_name}_{sheet_name}.csv"
            output_path = os.path.join(output_directory, output_filename)

            # Save the extracted data to a CSV file
            df.to_csv(output_path, index=False)