# COVID Data Analyzer

## Import Libraries

In [24]:
import pandas as pd
import numpy as np
from typing import Optional

## Define CovidDataAnalyzer Class

In [25]:
class CovidDataAnalyzer:
    """
    Load a COVID-19 CSV dataset and run simple exploratory steps on it.

    Every method reports its progress with print(). Problems (missing file,
    no data loaded, missing columns) are printed and signalled by returning
    None rather than by raising.

    Attributes:
        data: DataFrame returned by load_data(), or None if loading failed
        filtered_data: result of the last successful filter_high_cases()
            call, or None if no filter has been applied yet
    """

    def __init__(self, file_path: str) -> None:
        """
        Initialize the CovidDataAnalyzer with data from a CSV file.

        Args:
            file_path (str): Path to the CSV file (read with pd.read_csv)
        """
        self.filtered_data: Optional[pd.DataFrame] = None
        self.data: Optional[pd.DataFrame] = self.load_data(file_path)

    def load_data(self, file_path: str) -> Optional[pd.DataFrame]:
        """
        Load data from a CSV file and return it.

        This method does not assign self.data; __init__ stores the result.

        Args:
            file_path (str): Path to the CSV file

        Returns:
            pd.DataFrame: The loaded dataframe, or None if an error occurred
        """
        try:
            data = pd.read_csv(file_path)
        except FileNotFoundError:
            print(f"Error: File '{file_path}' not found.")
            return None
        except Exception as e:
            # Deliberate best-effort: any other read/parse failure is
            # reported and leaves the analyzer without data.
            print(f"Error loading data: {e}")
            return None

        print(f"Data loaded successfully from {file_path}")
        print(f"Dataset shape: {data.shape}")
        return data

    def describe_data(self) -> None:
        """
        Print the shape, column names, and basic statistics of the dataset.
        Provides insights based on the statistics.

        Prints a notice and returns early if no data is loaded.
        """
        if self.data is None:
            print("No data loaded. Please load data first.")
            return

        n_rows, n_cols = self.data.shape

        print("=" * 80)
        print("DATASET OVERVIEW")
        print("=" * 80)

        # Shape
        print(f"\nDataset Shape: {self.data.shape}")
        print(f"   - Number of rows: {n_rows:,}")
        print(f"   - Number of columns: {n_cols}")

        # Column names and data types
        print("\nColumn Names and Data Types:")
        print("-" * 80)
        for idx, (col, dtype) in enumerate(zip(self.data.columns, self.data.dtypes), 1):
            print(f"   {idx}. {col:<30} ({dtype})")

        # Basic statistics
        print("\nBasic Statistics (Numeric Columns):")
        print("-" * 80)
        print(self.data.describe())

        # Missing values (per column, only columns that have any)
        print("\nMissing Values:")
        print("-" * 80)
        missing = self.data.isnull().sum()
        missing_pct = (missing / len(self.data) * 100).round(2)
        missing_df = pd.DataFrame({
            'Missing Count': missing,
            'Percentage': missing_pct
        })
        print(missing_df[missing_df['Missing Count'] > 0])

        # Data types summary
        print("\nData Types Summary:")
        print("-" * 80)
        print(self.data.dtypes.value_counts())

        # First few rows
        print("\nFirst 5 Rows:")
        print("-" * 80)
        print(self.data.head())

        # Insights
        print("\n" + "=" * 80)
        print("INSIGHTS")
        print("=" * 80)

        insights = []

        # Dataset size insight
        if n_rows > 10000:
            insights.append(f"Large dataset with {n_rows:,} records - sufficient for statistical analysis")
        elif n_rows > 1000:
            insights.append(f"Medium-sized dataset with {n_rows:,} records")
        else:
            insights.append(f"Small dataset with {n_rows:,} records - consider collecting more data")

        # Missing values insight (share of all cells, not of rows)
        total_missing = missing.sum()
        if total_missing > 0:
            missing_pct_total = (total_missing / (n_rows * n_cols) * 100).round(2)
            insights.append(f"Dataset has {total_missing:,} missing values ({missing_pct_total}% of all data)")
            insights.append("    -> Recommend using handle_missing_values() method to address this")
        else:
            insights.append("No missing values detected - data is complete")

        # Numeric columns insight
        numeric_cols = self.data.select_dtypes(include=[np.number]).columns.tolist()
        if numeric_cols:
            insights.append(f"{len(numeric_cols)} numeric columns available for statistical analysis")

        # Categorical columns insight
        categorical_cols = self.data.select_dtypes(include=['object']).columns.tolist()
        if categorical_cols:
            insights.append(f"{len(categorical_cols)} categorical columns available for grouping and filtering")

        # Print insights
        for insight in insights:
            print(f"\n{insight}")

        print("\n" + "=" * 80)

    def _fill_columns(self, columns, fill_value) -> None:
        """Fill nulls in each listed column of self.data and report the count."""
        for col in columns:
            missing_count = self.data[col].isnull().sum()
            if missing_count > 0:
                self.data[col] = self.data[col].fillna(fill_value)
                print(f"   - {col}: filled {missing_count} missing values")

    def handle_missing_values(self) -> Optional[pd.DataFrame]:
        """
        Handle missing values in the dataset (modifies self.data in place):
        - Fill missing numeric values with 0
        - Fill missing categorical (object dtype) values with "Unknown"

        Columns of any other dtype are left untouched; if they still contain
        missing values afterwards, a warning naming them is printed.

        Returns:
            pd.DataFrame: self.data after filling, or None if no data is loaded
        """
        if self.data is None:
            print("No data loaded. Please load data first.")
            return None

        print("=" * 80)
        print("HANDLING MISSING VALUES")
        print("=" * 80)

        # Check for missing values before
        missing_before = self.data.isnull().sum().sum()
        print(f"\nMissing values before: {missing_before:,}")

        if missing_before == 0:
            print("\nNo missing values found. Data is already complete!")
            return self.data

        # Identify numeric and categorical columns
        numeric_cols = self.data.select_dtypes(include=[np.number]).columns.tolist()
        categorical_cols = self.data.select_dtypes(include=['object']).columns.tolist()

        # Fill numeric columns with 0
        if numeric_cols:
            print(f"\nFilling {len(numeric_cols)} numeric columns with 0:")
            self._fill_columns(numeric_cols, 0)

        # Fill categorical columns with "Unknown"
        if categorical_cols:
            print(f"\nFilling {len(categorical_cols)} categorical columns with 'Unknown':")
            self._fill_columns(categorical_cols, "Unknown")

        # Check for missing values after
        remaining_per_col = self.data.isnull().sum()
        missing_after = remaining_per_col.sum()
        print(f"\nMissing values after: {missing_after:,}")
        print(f"Successfully handled {missing_before - missing_after:,} missing values!")
        if missing_after > 0:
            # Only numeric and object columns are filled above, so anything
            # left over lives in a column of another dtype. Say so instead of
            # letting the success line imply the data is complete.
            leftover_cols = remaining_per_col[remaining_per_col > 0].index.tolist()
            print(f"Warning: {missing_after:,} missing values remain in "
                  f"non-numeric, non-object columns: {leftover_cols}")
        print("=" * 80)

        return self.data

    def filter_high_cases(
        self,
        *,
        min_confirmed: float = 100000,
        min_deaths: float = 5000,
    ) -> Optional[pd.DataFrame]:
        """
        Filter the dataset based on multiple conditions:
        - Confirmed cases greater than min_confirmed (default 100,000)
        - Deaths above min_deaths (default 5,000)
        - Country is not "Unknown"

        Saves the filtered data to self.filtered_data

        Args:
            min_confirmed: exclusive lower bound for the 'Confirmed' column
            min_deaths: exclusive lower bound for the 'Deaths' column

        Returns:
            pd.DataFrame: the filtered rows (also stored in
            self.filtered_data), or None if no data is loaded or a required
            column is missing. In the None case self.filtered_data is left
            unchanged.
        """
        if self.data is None:
            print("No data loaded. Please load data first.")
            return None

        # Fail with a readable message rather than a bare KeyError when the
        # dataset does not have the expected schema.
        required_cols = ('Confirmed', 'Deaths', 'Country/Region')
        absent = [col for col in required_cols if col not in self.data.columns]
        if absent:
            print(f"Error: required column(s) missing from dataset: {', '.join(absent)}")
            return None

        print("=" * 80)
        print("FILTERING HIGH CASES")
        print("=" * 80)

        # Apply filters
        print("\nApplying filters:")
        print(f"   - Confirmed cases > {min_confirmed:,}")
        print(f"   - Deaths > {min_deaths:,}")
        print("   - Country/Region != 'Unknown'")

        # NOTE: NaN != 'Unknown' evaluates to True, so rows whose country is
        # still missing are kept unless handle_missing_values() ran first.
        mask = (
            (self.data['Confirmed'] > min_confirmed)
            & (self.data['Deaths'] > min_deaths)
            & (self.data['Country/Region'] != 'Unknown')
        )
        # Explicit copy so later edits to filtered_data never refer back to
        # self.data (and never trigger SettingWithCopyWarning).
        self.filtered_data = self.data[mask].copy()

        print("\nResults:")
        print(f"   - Original dataset: {len(self.data):,} rows")
        print(f"   - Filtered dataset: {len(self.filtered_data):,} rows")
        print(f"   - Rows filtered out: {len(self.data) - len(self.filtered_data):,}")

        if len(self.filtered_data) > 0:
            print("\nFiltered data saved to self.filtered_data")
            print("\nSummary of filtered data:")
            print(f"   - Unique countries: {self.filtered_data['Country/Region'].nunique()}")
            print(f"   - Total confirmed cases: {self.filtered_data['Confirmed'].sum():,}")
            print(f"   - Total deaths: {self.filtered_data['Deaths'].sum():,}")
            print(f"   - Average confirmed cases: {self.filtered_data['Confirmed'].mean():,.0f}")
            print(f"   - Average deaths: {self.filtered_data['Deaths'].mean():,.0f}")
        else:
            print("\nNo records match the filter criteria.")

        print("=" * 80)

        return self.filtered_data

## Test the Data Analyzer

Below is an example of how to use the CovidDataAnalyzer class, followed by a description of the output it produces when run:

**Expected Output:**
- Data loading confirmation with dataset shape
- Dataset overview with statistics and insights
- Missing values handling report
- Filtered data results with summary statistics
- Preview of the first 5 filtered records

In [None]:
# Example usage: load the CSV, summarize it, clean it, then filter it.
analyzer = CovidDataAnalyzer('covid_19_data.csv')

# Describe the data
analyzer.describe_data()

# Handle missing values
analyzer.handle_missing_values()

# Filter high cases
filtered = analyzer.filter_high_cases()

# filter_high_cases() returns None when no data was loaded, so guard the
# preview instead of calling .head() on None.
if filtered is not None:
    print(filtered.head())