# COVID Data Analyzer

## Import Libraries

In [39]:
import pandas as pd
import numpy as np
from typing import Optional

## Define CovidDataAnalyzer Class

In [40]:
class CovidDataAnalyzer:
    """
    Load, describe, clean, filter and summarise a COVID-19 case dataset.

    The methods read the columns 'Country/Region', 'WHO Region', 'Date',
    'Confirmed', 'Deaths', 'Recovered' and 'Active'. The count columns are
    treated as running (cumulative) totals, so global totals are read from
    each country's most recent date instead of being summed over every
    record.

    Attributes:
        data: the DataFrame returned by load_data, or None if loading failed
        filtered_data: result of the most recent successful filter call,
            or None if no filter has succeeded yet
    """

    # Count columns aggregated by the statistics methods.
    _COUNT_COLUMNS = ('Confirmed', 'Deaths', 'Recovered')

    def __init__(self, file_path):
        """
        Initialize the CovidDataAnalyzer with data from a CSV file.

        Args:
            file_path (str): Path to the CSV data file
        """
        self.filtered_data = None
        self.data = self.load_data(file_path)

    def load_data(self, file_path):
        """
        Load data from a CSV file and return it.

        Errors are reported on stdout instead of being raised, so callers
        must check for a None result.

        Args:
            file_path (str): Path to the CSV file

        Returns:
            pd.DataFrame: The loaded dataframe, or None if an error occurred
        """
        try:
            data = pd.read_csv(file_path)
        except FileNotFoundError:
            print(f"Error: File '{file_path}' not found.")
            return None
        except Exception as e:
            print(f"Error loading data: {e}")
            return None

        print(f"Data loaded successfully from {file_path}")
        print(f"Dataset shape: {data.shape}")
        return data

    def _missing_columns(self, *required):
        """Return the names in `required` that are not columns of self.data."""
        return [col for col in required if col not in self.data.columns]

    def _latest_snapshot(self):
        """
        Collapse the dataset to one row per country at its most recent date.

        All rows sharing a country's latest date are summed, so countries
        reported by province contribute every province instead of just one.

        Returns:
            pd.DataFrame: indexed by 'Country/Region' with summed
            'Confirmed', 'Deaths' and 'Recovered' (plus the country's first
            'WHO Region' when that column exists), or None when a required
            column is missing or no date can be parsed.
        """
        if self._missing_columns('Country/Region', 'Date', *self._COUNT_COLUMNS):
            return None

        # Parse into a separate Series so self.data is not modified here.
        dates = pd.to_datetime(self.data['Date'], errors='coerce')
        if dates.isna().all():
            return None

        countries = self.data['Country/Region']
        latest_date = dates.groupby(countries).transform('max')
        # Rows without a country or a parseable date cannot be placed.
        at_latest = countries.notna() & dates.notna() & (dates == latest_date)

        how = {col: 'sum' for col in self._COUNT_COLUMNS}
        if 'WHO Region' in self.data.columns:
            how['WHO Region'] = 'first'
        return self.data[at_latest].groupby('Country/Region').agg(how)

    def describe_data(self):
        """
        Print the shape, column names, and basic statistics of the dataset,
        followed by insights derived from those statistics.

        Insight sections whose columns are absent are skipped or reported
        as unavailable. Returns None.
        """
        if self.data is None:
            print("No data loaded. Please load data first.")
            return

        self._print_overview()

        print("\n" + "=" * 80)
        print("COVID-19 DATA INSIGHTS")
        print("=" * 80)

        for insight in self._build_insights():
            print(insight)

        print("\n" + "=" * 80)

    def _print_overview(self):
        """Print shape, columns, summary statistics, missing values, dtypes and head."""
        data = self.data
        rule = "-" * 80

        print("=" * 80)
        print("DATASET OVERVIEW")
        print("=" * 80)

        # Shape
        print(f"\nDataset Shape: {data.shape}")
        print(f"   - Number of rows: {data.shape[0]:,}")
        print(f"   - Number of columns: {data.shape[1]}")

        # Column names and data types
        print("\nColumn Names and Data Types:")
        print(rule)
        for idx, (col, dtype) in enumerate(data.dtypes.items(), 1):
            print(f"   {idx}. {col:<30} ({dtype})")

        # Basic statistics
        print("\nBasic Statistics (Numeric Columns):")
        print(rule)
        print(data.describe())

        # Missing values (only columns that actually have gaps are listed)
        print("\nMissing Values:")
        print(rule)
        missing = data.isnull().sum()
        missing_df = pd.DataFrame({
            'Missing Count': missing,
            'Percentage': (missing / len(data) * 100).round(2)
        })
        print(missing_df[missing_df['Missing Count'] > 0])

        # Data types summary
        print("\nData Types Summary:")
        print(rule)
        print(data.dtypes.value_counts())

        # First few rows
        print("\nFirst 5 Rows:")
        print(rule)
        print(data.head())

    def _build_insights(self):
        """Return the insight report as a list of printable lines."""
        data = self.data
        row_count, col_count = data.shape

        # 1. Dataset size, geographic and temporal scope
        lines = [
            "1. DATASET SCOPE:",
            f"   Large dataset with {row_count:,} records across {col_count} variables",
        ]

        if not self._missing_columns('Country/Region', 'WHO Region'):
            unique_countries = data['Country/Region'].nunique()
            unique_regions = data['WHO Region'].nunique()
            lines.append(
                f"   Geographic coverage: {unique_countries} countries/territories "
                f"across {unique_regions} WHO regions"
            )

        if 'Date' in data.columns:
            parsed = pd.to_datetime(data['Date'], errors='coerce')
            if not parsed.isna().all():
                date_min = parsed.min()
                date_max = parsed.max()
                span_days = (date_max - date_min).days
                lines.append(
                    f"   Time period: {date_min.date()} to {date_max.date()} ({span_days} days)"
                )

        # 2. Data quality
        lines.append("\n2. DATA QUALITY:")
        missing_by_col = data.isnull().sum()
        total_missing = missing_by_col.sum()
        if total_missing > 0:
            missing_pct_total = (total_missing / (row_count * col_count) * 100).round(2)
            lines.append(f"   Missing data: {total_missing:,} values ({missing_pct_total}% of dataset)")
            # Report the column with the most gaps, measured on this dataset.
            worst_col = missing_by_col.idxmax()
            worst_pct = missing_by_col.max() / row_count * 100
            note = " - expected for country-level data" if worst_col == 'Province/State' else ""
            lines.append(f"   Primary gap: {worst_col} column ({worst_pct:.2f}% missing{note})")
        else:
            lines.append("   No missing values detected")

        if 'Active' in data.columns:
            min_active = data['Active'].min()
            if min_active < 0:
                lines.append(f"   Data anomaly detected: Negative active cases (min: {min_active})")
                lines.append("   This suggests data reporting inconsistencies that may need investigation")

        # 3-5. Impact figures, all taken from the latest per-country snapshot
        lines.append("\n3. GLOBAL IMPACT OVERVIEW:")
        snapshot = self._latest_snapshot()
        if snapshot is None:
            lines.append(
                "   Not available: requires Country/Region, Date, Confirmed, "
                "Deaths and Recovered columns with parseable dates"
            )
        else:
            total_confirmed = snapshot['Confirmed'].sum()
            total_deaths = snapshot['Deaths'].sum()
            total_recovered = snapshot['Recovered'].sum()

            lines.append(f"   Total cumulative cases: {total_confirmed:,}")
            lines.append(f"   Total deaths: {total_deaths:,}")
            lines.append(f"   Total recovered: {total_recovered:,}")

            if total_confirmed > 0:
                global_cfr = (total_deaths / total_confirmed) * 100
                lines.append(f"   Global case fatality rate: {global_cfr:.2f}%")

            lines.append("\n4. MOST AFFECTED COUNTRIES (by latest confirmed cases):")
            top_countries = snapshot[list(self._COUNT_COLUMNS)].nlargest(5, 'Confirmed')
            for rank, (country, row) in enumerate(top_countries.iterrows(), 1):
                cfr = (row['Deaths'] / row['Confirmed'] * 100) if row['Confirmed'] > 0 else 0
                lines.append(
                    f"   {rank}. {country}: {row['Confirmed']:,} cases, "
                    f"{row['Deaths']:,} deaths (CFR: {cfr:.2f}%)"
                )

            if 'WHO Region' in snapshot.columns:
                lines.append("\n5. WHO REGIONAL DISTRIBUTION:")
                regional = (
                    snapshot.groupby('WHO Region')[['Confirmed', 'Deaths']]
                    .sum()
                    .sort_values('Confirmed', ascending=False)
                )
                for region, row in regional.iterrows():
                    share = (row['Confirmed'] / total_confirmed * 100) if total_confirmed > 0 else 0.0
                    lines.append(f"   {region}: {row['Confirmed']:,} cases ({share:.1f}% of global total)")

        # 6. Per-record spread
        if 'Confirmed' in data.columns:
            lines.append("\n6. STATISTICAL CHARACTERISTICS:")
            lines.append(f"   Median confirmed cases per record: {data['Confirmed'].median():,.0f}")
            lines.append(f"   High variance in data (std dev: {data['Confirmed'].std():,.0f}) indicates")
            lines.append("   wide disparity in outbreak severity across regions and time periods")

        return lines

    def handle_missing_values(self):
        """
        Handle missing values in the dataset, modifying self.data in place:
        - Fill missing numeric values with 0
        - Fill missing text (object or string dtype) values with "Unknown"

        Columns of any other dtype are left untouched.

        Returns:
            pd.DataFrame: self.data after filling, or None if no data is loaded
        """
        if self.data is None:
            print("No data loaded. Please load data first.")
            return

        print("=" * 80)
        print("HANDLING MISSING VALUES")
        print("=" * 80)

        missing_before = self.data.isnull().sum().sum()
        print(f"\nMissing values before: {missing_before:,}")

        if missing_before == 0:
            print("\nNo missing values found. Data is already complete!")
            return self.data

        numeric_cols = self.data.select_dtypes(include=[np.number]).columns.tolist()
        # is_string_dtype accepts object columns and dedicated string dtypes,
        # so text columns are found whichever dtype pandas inferred for them.
        categorical_cols = [
            col for col, dtype in self.data.dtypes.items()
            if pd.api.types.is_string_dtype(dtype)
        ]

        # repr() of the fill value yields "0" and "'Unknown'" for the headings.
        for label, columns, fill_value in (
            ("numeric", numeric_cols, 0),
            ("categorical", categorical_cols, "Unknown"),
        ):
            if not columns:
                continue
            print(f"\nFilling {len(columns)} {label} columns with {fill_value!r}:")
            for col in columns:
                missing_count = self.data[col].isnull().sum()
                if missing_count > 0:
                    self.data[col] = self.data[col].fillna(fill_value)
                    print(f"   - {col}: filled {missing_count} missing values")

        missing_after = self.data.isnull().sum().sum()
        print(f"\nMissing values after: {missing_after:,}")
        print(f"Successfully handled {missing_before - missing_after:,} missing values!")
        print("=" * 80)

        return self.data

    def filter_high_cases(self, min_confirmed=100000, min_deaths=5000) -> Optional[pd.DataFrame]:
        """
        Filter the dataset based on multiple conditions:
        - Confirmed cases greater than min_confirmed
        - Deaths above min_deaths
        - Country is not "Unknown"

        Saves the filtered data to self.filtered_data

        Args:
            min_confirmed: exclusive lower bound on 'Confirmed' (default 100,000)
            min_deaths: exclusive lower bound on 'Deaths' (default 5,000)

        Returns:
            pd.DataFrame: the matching rows (possibly empty), or None when
            no data is loaded or a required column is missing
        """
        if self.data is None:
            print("No data loaded. Please load data first.")
            return None

        missing = self._missing_columns('Confirmed', 'Deaths', 'Country/Region')
        if missing:
            print(f"Cannot filter high cases. Missing columns: {', '.join(missing)}")
            return None

        print("=" * 80)
        print("FILTERING HIGH CASES")
        print("=" * 80)

        print("\nApplying filters:")
        print(f"   - Confirmed cases > {min_confirmed:,}")
        print(f"   - Deaths > {min_deaths:,}")
        print("   - Country/Region != 'Unknown'")

        keep = (
            (self.data['Confirmed'] > min_confirmed)
            & (self.data['Deaths'] > min_deaths)
            & (self.data['Country/Region'] != 'Unknown')
        )
        # Copy so later edits to the result never write through to self.data.
        self.filtered_data = self.data[keep].copy()
        matches = self.filtered_data

        print("\nResults:")
        print(f"   - Original dataset: {len(self.data):,} rows")
        print(f"   - Filtered dataset: {len(matches):,} rows")
        print(f"   - Rows filtered out: {len(self.data) - len(matches):,}")

        if len(matches) > 0:
            print("\nFiltered data saved to self.filtered_data")
            print("\nSummary of filtered data:")
            print(f"   - Unique countries: {matches['Country/Region'].nunique()}")
            print(f"   - Total confirmed cases: {matches['Confirmed'].sum():,}")
            print(f"   - Total deaths: {matches['Deaths'].sum():,}")
            print(f"   - Average confirmed cases: {matches['Confirmed'].mean():,.0f}")
            print(f"   - Average deaths: {matches['Deaths'].mean():,.0f}")
        else:
            print("\nNo records match the filter criteria.")

        print("=" * 80)

        return matches

    def filter_by_date_range(self, start_date, end_date) -> Optional[pd.DataFrame]:
        """
        Filter the dataset by a specified date range (both ends inclusive).

        Side effect: if the 'Date' column is not already a datetime column
        it is converted in place on self.data.

        Args:
            start_date: Start date (string like '2020-01-01' or datetime object)
            end_date: End date (string like '2020-12-31' or datetime object)

        Returns:
            pd.DataFrame: Filtered dataframe within the date range, or None
            on failure. On failure self.filtered_data keeps whatever an
            earlier filter stored, so callers should use the return value.

        Saves the filtered data to self.filtered_data
        """
        if self.data is None:
            print("No data loaded. Please load data first.")
            return None

        print("=" * 80)
        print("FILTERING BY DATE RANGE")
        print("=" * 80)

        missing = self._missing_columns('Date', 'Confirmed', 'Deaths')
        if missing:
            print(f"\nCannot filter by date range. Missing columns: {', '.join(missing)}")
            print("=" * 80)
            return None

        try:
            start_dt = pd.to_datetime(start_date)
            end_dt = pd.to_datetime(end_date)
            if pd.isna(start_dt) or pd.isna(end_dt):
                raise ValueError("start_date and end_date must both be valid dates")

            print(f"\nDate range: {start_dt.date()} to {end_dt.date()}")

            # Check for "not yet datetime" rather than "is object" so that
            # string-typed columns are converted too.
            if not pd.api.types.is_datetime64_any_dtype(self.data['Date']):
                print("Converting Date column to datetime format...")
                self.data['Date'] = pd.to_datetime(self.data['Date'])

            in_range = self.data['Date'].between(start_dt, end_dt)
            self.filtered_data = self.data[in_range].copy()
            matches = self.filtered_data

            print("\nResults:")
            print(f"   - Original dataset: {len(self.data):,} rows")
            print(f"   - Filtered dataset: {len(matches):,} rows")
            print(f"   - Rows filtered out: {len(self.data) - len(matches):,}")

            if len(matches) > 0:
                print("\nFiltered data saved to self.filtered_data")
                print("\nDate range in filtered data:")
                print(f"   - Earliest date: {matches['Date'].min().date()}")
                print(f"   - Latest date: {matches['Date'].max().date()}")
                print(f"   - Total confirmed cases: {matches['Confirmed'].sum():,}")
                print(f"   - Total deaths: {matches['Deaths'].sum():,}")
            else:
                print("\nNo records found within the specified date range.")

            print("=" * 80)

            return matches

        except Exception as e:
            print(f"\nError filtering by date range: {e}")
            print("Please ensure dates are in a valid format (e.g., '2020-01-01' or 'YYYY-MM-DD')")
            print("=" * 80)
            return None

    def calculate_global_statistics(self, cumulative=True):
        """
        Calculate global statistics for Confirmed, Deaths, and Recovered cases.
        Uses numpy for calculations and prints the results.

        Args:
            cumulative: when True (default) the count columns are treated as
                running totals and the global totals and rates come from
                each country's most recent date. Pass False to sum every
                record instead (appropriate for per-day counts).

        The per-record averages and standard deviations always use every
        record. Returns None.
        """
        if self.data is None:
            print("No data loaded. Please load data first.")
            return

        missing = self._missing_columns(*self._COUNT_COLUMNS)
        if missing:
            print(f"Cannot calculate statistics. Missing columns: {', '.join(missing)}")
            return

        print("=" * 80)
        print("GLOBAL STATISTICS")
        print("=" * 80)

        snapshot = self._latest_snapshot() if cumulative else None
        if cumulative and snapshot is None:
            print("\nNote: no usable Country/Region and Date columns; totals sum every record.")
        totals_source = self.data if snapshot is None else snapshot

        total_confirmed, total_deaths, total_recovered = (
            np.sum(totals_source[col].values) for col in self._COUNT_COLUMNS
        )

        print("\nGlobal Totals:")
        print(f"   - Total Confirmed Cases: {total_confirmed:,}")
        print(f"   - Total Deaths: {total_deaths:,}")
        print(f"   - Total Recovered: {total_recovered:,}")

        print("\nAdditional Statistics:")

        # Rates are undefined without any confirmed case, so they are skipped.
        if total_confirmed > 0:
            death_rate = (total_deaths / total_confirmed) * 100
            print(f"   - Global Death Rate: {death_rate:.2f}%")
            recovery_rate = (total_recovered / total_confirmed) * 100
            print(f"   - Global Recovery Rate: {recovery_rate:.2f}%")

        avg_confirmed, avg_deaths, avg_recovered = (
            np.mean(self.data[col].values) for col in self._COUNT_COLUMNS
        )

        print("\nAverage per Record:")
        print(f"   - Average Confirmed: {avg_confirmed:,.2f}")
        print(f"   - Average Deaths: {avg_deaths:,.2f}")
        print(f"   - Average Recovered: {avg_recovered:,.2f}")

        # np.std defaults to the population standard deviation (ddof=0).
        std_confirmed, std_deaths, std_recovered = (
            np.std(self.data[col].values) for col in self._COUNT_COLUMNS
        )

        print("\nStandard Deviation:")
        print(f"   - Confirmed: {std_confirmed:,.2f}")
        print(f"   - Deaths: {std_deaths:,.2f}")
        print(f"   - Recovered: {std_recovered:,.2f}")

        print("=" * 80)

    def save_filtered_data(self, filename):
        """
        Save the filtered data to a CSV file (without the index column).

        Nothing is written when no filter result exists or it is empty.
        Write errors are reported on stdout instead of being raised.

        Args:
            filename (str): Name of the CSV file to save (e.g., 'filtered_data.csv')
        """
        if self.filtered_data is None:
            print("No filtered data available. Please filter data first.")
            return

        if len(self.filtered_data) == 0:
            print("Filtered data is empty. Nothing to save.")
            return

        print("=" * 80)
        print("SAVING FILTERED DATA")
        print("=" * 80)

        try:
            self.filtered_data.to_csv(filename, index=False)
        except Exception as e:
            print(f"\nError saving filtered data: {e}")
            print("=" * 80)
            return

        print("\nFiltered data saved successfully!")
        print(f"   - Filename: {filename}")
        print(f"   - Rows saved: {len(self.filtered_data):,}")
        print(f"   - Columns saved: {len(self.filtered_data.columns)}")
        print("=" * 80)

## Test the Data Analyzer

Below is an example of how to use the CovidDataAnalyzer class with sample output:

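**Expected Output:**
- Data loading confirmation with dataset shape
- Dataset overview with statistics and insights
- Missing values handling report
- Filtered data results with summary statistics
- Preview of the first 5 filtered records
- Date-range filter results (March to June 2020) with a preview of the first 5 records
- Global statistics: totals, death and recovery rates, per-record averages and standard deviations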

In [41]:
# Example usage - Part 3: Putting It All Together
# Each filter's return value is used (instead of analyzer.filtered_data) so a
# failed step is skipped rather than previewing or saving an earlier result.

# Step 1: Create the analyzer (this loads the CSV)
analyzer = CovidDataAnalyzer('covid_19_data.csv')

# Step 2: Describe the dataset
analyzer.describe_data()

# Step 3: Handle missing values
analyzer.handle_missing_values()

# Step 4: Apply filter_high_cases and save the filtered data
print("\n")
high_cases = analyzer.filter_high_cases()
if high_cases is not None:
    print(high_cases.head())
    analyzer.save_filtered_data('filtered_high_cases.csv')

# Step 5: Apply filter_by_date_range (March 2020 to June 2020) and save
print("\n")
march_to_june = analyzer.filter_by_date_range('2020-03-01', '2020-06-30')
if march_to_june is not None:
    print(march_to_june.head())
    analyzer.save_filtered_data('filtered_march_to_june_2020.csv')

# Step 6: Calculate and display global statistics
print("\n")
analyzer.calculate_global_statistics()

Data loaded successfully from covid_19_data.csv
Dataset shape: (49068, 10)
DATASET OVERVIEW

Dataset Shape: (49068, 10)
   - Number of rows: 49,068
   - Number of columns: 10

Column Names and Data Types:
--------------------------------------------------------------------------------
   1. Province/State                 (object)
   2. Country/Region                 (object)
   3. Lat                            (float64)
   4. Long                           (float64)
   5. Date                           (object)
   6. Confirmed                      (int64)
   7. Deaths                         (int64)
   8. Recovered                      (int64)
   9. Active                         (int64)
   10. WHO Region                     (object)

Basic Statistics (Numeric Columns):
--------------------------------------------------------------------------------
                Lat          Long     Confirmed         Deaths     Recovered  \
count  49068.000000  49068.000000  4.906800e+04   49068.0