# AWS ML Analysis with Unit Tests

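This notebook demonstrates the data-preparation step of an ML analysis — cleaning tabular data and computing summary statistics with pandas — and verifies each step with unit tests.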

In [None]:
import pandas as pd
import numpy as np
import unittest
from unittest.mock import patch, MagicMock
import sys
from io import StringIO

In [None]:
class DataProcessor:
    """Demonstration helper that cleans tabular data and summarises it."""

    @staticmethod
    def _is_blank(data):
        """Return True when *data* is None or has zero length."""
        # Truthiness is deliberately avoided: bool(DataFrame) raises.
        return data is None or len(data) == 0

    @staticmethod
    def _as_frame(data):
        """Return *data* unchanged if it is a DataFrame, else wrap it in one."""
        if isinstance(data, pd.DataFrame):
            return data
        return pd.DataFrame(data)

    def clean_data(self, data):
        """Drop rows containing nulls, then drop duplicated rows.

        Args:
            data: A DataFrame, or anything ``pd.DataFrame`` accepts.
                ``None`` and zero-length inputs are allowed.

        Returns:
            A new DataFrame without null-containing or duplicated rows.
            ``None`` or zero-length input yields a bare ``pd.DataFrame()``.
        """
        if self._is_blank(data):
            return pd.DataFrame()

        frame = self._as_frame(data)
        without_nulls = frame.dropna()
        return without_nulls.drop_duplicates()

    def calculate_stats(self, data):
        """Summarise the numeric columns of *data*.

        Args:
            data: A DataFrame, or anything ``pd.DataFrame`` accepts.
                ``None`` and zero-length inputs are allowed.

        Returns:
            ``{}`` when the input is ``None``, zero-length, or has no
            numeric columns. Otherwise a dict with:
              * ``'mean'``  - per-column mean of the numeric columns,
              * ``'std'``   - per-column standard deviation (pandas default),
              * ``'count'`` - total number of rows in the frame.
        """
        if self._is_blank(data):
            return {}

        frame = self._as_frame(data)
        numeric_labels = frame.select_dtypes(include=[np.number]).columns
        if len(numeric_labels) == 0:
            return {}

        numeric_part = frame[numeric_labels]
        column_means = numeric_part.mean().to_dict()
        column_stds = numeric_part.std().to_dict()
        return {
            'mean': column_means,
            'std': column_stds,
            'count': len(frame),
        }

In [None]:
class TestDataProcessor(unittest.TestCase):
    """Unit tests for DataProcessor class.

    Exercises ``clean_data`` and ``calculate_stats`` with valid data as
    well as the degenerate inputs they guard against (``None``, empty
    sequences, frames without numeric columns).
    """

    def setUp(self):
        self.processor = DataProcessor()
        # Row 3 has a null in 'A' and row 4 repeats row 1 (A=2, B=5), so
        # cleaning is expected to keep exactly rows 0-2.
        self.sample_data = pd.DataFrame({
            'A': [1, 2, 3, None, 2],
            'B': [4, 5, 6, 7, 5]
        })

    def test_clean_data_removes_nulls_and_duplicates(self):
        result = self.processor.clean_data(self.sample_data)
        self.assertEqual(len(result), 3)  # Should remove null and duplicate
        self.assertFalse(result.isnull().any().any())
        self.assertFalse(result.duplicated().any())
        # Cleaning must return a new frame, not modify the fixture.
        self.assertEqual(len(self.sample_data), 5)

    def test_clean_data_empty_input(self):
        result = self.processor.clean_data([])
        self.assertTrue(result.empty)

    def test_clean_data_none_input(self):
        result = self.processor.clean_data(None)
        self.assertTrue(result.empty)

    def test_calculate_stats_valid_data(self):
        clean_data = self.processor.clean_data(self.sample_data)
        stats = self.processor.calculate_stats(clean_data)

        self.assertIn('mean', stats)
        self.assertIn('std', stats)
        self.assertIn('count', stats)
        self.assertEqual(stats['count'], 3)

        # Verify the computed values, not just the presence of the keys.
        # Cleaned rows are A=[1, 2, 3], B=[4, 5, 6]: means 2 and 5, and a
        # sample standard deviation of 1 for both columns.
        self.assertEqual(set(stats['mean']), {'A', 'B'})
        self.assertEqual(set(stats['std']), {'A', 'B'})
        for column, expected_mean in (('A', 2.0), ('B', 5.0)):
            self.assertAlmostEqual(stats['mean'][column], expected_mean)
            self.assertAlmostEqual(stats['std'][column], 1.0)

    def test_calculate_stats_empty_data(self):
        stats = self.processor.calculate_stats([])
        self.assertEqual(stats, {})

    def test_calculate_stats_none_input(self):
        stats = self.processor.calculate_stats(None)
        self.assertEqual(stats, {})

    def test_calculate_stats_non_numeric_data(self):
        text_data = pd.DataFrame({'text': ['a', 'b', 'c']})
        stats = self.processor.calculate_stats(text_data)
        self.assertEqual(stats, {})

In [None]:
# Run the unit tests
def run_tests():
    """Run the TestDataProcessor suite and print a short report.

    The runner writes its verbose log to an in-memory buffer, which is
    then echoed to stdout followed by the run, failure and error counts.

    Returns:
        True if every test passed, False otherwise.
    """
    loader = unittest.TestLoader()
    suite = loader.loadTestsFromTestCase(TestDataProcessor)

    # Buffer the runner's output so it is printed in one piece.
    log_buffer = StringIO()
    outcome = unittest.TextTestRunner(stream=log_buffer, verbosity=2).run(suite)

    print(log_buffer.getvalue())
    summary_lines = (
        f"\nTests run: {outcome.testsRun}",
        f"Failures: {len(outcome.failures)}",
        f"Errors: {len(outcome.errors)}",
    )
    for line in summary_lines:
        print(line)

    return outcome.wasSuccessful()

# Run the suite and print a one-line verdict
print("\n✅ All tests passed!" if run_tests() else "\n❌ Some tests failed!")

In [None]:
# Demo: run the processor end to end on a small, deliberately messy frame
processor = DataProcessor()

# One row has a missing 'feature1' value and one row is an exact repeat
sample_data = pd.DataFrame(
    {
        'feature1': [1, 2, 3, None, 2, 4],
        'feature2': [10, 20, 30, 40, 20, 50],
        'category': ['A', 'B', 'A', 'C', 'B', 'A'],
    }
)
print("Original data:", sample_data, sep="\n")

# Step 1: drop null-containing and duplicated rows
clean_data = processor.clean_data(sample_data)
print("\nCleaned data:", clean_data, sep="\n")

# Step 2: summarise the numeric columns of the cleaned frame
stats = processor.calculate_stats(clean_data)
print("\nStatistics:", stats, sep="\n")