# In [1]:
def _numeric_text(value):
    """Return *value* as text with thousands separators (commas) removed."""
    # Non-string inputs (ints, floats, None, ...) are stringified so that the
    # caller's float()/int() call either succeeds or raises ValueError, instead
    # of failing with AttributeError on a missing ``.replace`` method.
    text = value if isinstance(value, str) else str(value)
    return text.replace(',', '')


def clean_data(raw_records):
    """
    Clean messy sales records and return only the valid ones.

    For each record the function:
      1. Parses 'UnitPrice' as float and 'Quantity' as int, tolerating
         comma thousands separators (e.g. '1,500') and values that are
         already numeric.
      2. Rejects records whose unit price is not a finite positive number
         or whose quantity is not positive.
      3. Rejects records whose 'TransactionID' is not a string starting
         with 'T'.
      4. Adds 'TotalRevenue' (quantity * unit price).

    Rejected records are reported via print() and skipped; they never abort
    the whole batch.

    Args:
        raw_records: iterable of mappings with at least the keys
            'UnitPrice', 'Quantity' and 'TransactionID'.

    Returns:
        A list of new dicts (shallow copies of the accepted records) with
        numeric 'UnitPrice' / 'Quantity' and the added 'TotalRevenue'.
        The input records are left unmodified.

    Note:
        A non-integral quantity such as '2.5' (or the float 2.0) fails the
        int conversion and is skipped as a conversion error.
    """
    clean_records = []
    positive_infinity = float('inf')

    for record in raw_records:
        try:
            # 1. Handle numeric formatting issues (commas in numbers like 1,500)
            unit_price = float(_numeric_text(record['UnitPrice']))
            quantity = int(_numeric_text(record['Quantity']))

            # 2. Logical validation. The chained comparison is False for NaN
            #    and for +/-inf, both of which float() happily parses from
            #    text and which would otherwise corrupt revenue totals.
            price_is_valid = 0 < unit_price < positive_infinity
            if not price_is_valid or quantity <= 0:
                print(f"Validation Error: Invalid metrics for Transaction {record.get('TransactionID')}")
                continue

            # 3. ID format validation (TransactionID must start with 'T').
            #    A non-string ID (e.g. None) is treated as an invalid format
            #    rather than crashing on a missing ``startswith``.
            transaction_id = record['TransactionID']
            if not (isinstance(transaction_id, str) and transaction_id.startswith('T')):
                print(f"Validation Error: Invalid ID format for {transaction_id}")
                continue

            # Work on a copy so the caller's raw data is not modified.
            cleaned = dict(record)
            cleaned['UnitPrice'] = unit_price
            cleaned['Quantity'] = quantity

            # Calculate revenue for analysis (Quantity * UnitPrice)
            cleaned['TotalRevenue'] = quantity * unit_price

            clean_records.append(cleaned)

        except (ValueError, KeyError) as e:
            # ValueError: unparseable number; KeyError: required field missing.
            print(f"Skipping record due to conversion error: {e}")
            continue

    return clean_records

def perform_analysis(clean_records):
    """
    Summarise cleaned sales records.

    Reads 'TotalRevenue' and 'Region' from every record and returns a dict
    with three entries:
      - 'total_revenue': running sum of all 'TotalRevenue' values
      - 'region_sales':  mapping of region -> summed revenue for that region
      - 'record_count':  number of records supplied
    """
    # Taken up front, before iterating, so the count reflects the input as given.
    how_many = len(clean_records)

    grand_total = 0
    per_region = {}

    for row in clean_records:
        amount = row['TotalRevenue']
        area = row['Region']

        grand_total += amount

        # First sighting of a region starts from 0, later ones accumulate.
        if area in per_region:
            per_region[area] = per_region[area] + amount
        else:
            per_region[area] = 0 + amount

    return {
        'total_revenue': grand_total,
        'region_sales': per_region,
        'record_count': how_many,
    }