# COVID-19 Global Data Analysis
This notebook analyzes global COVID-19 trends using data from Our World in Data. We'll explore cases, deaths, and vaccination rates across different countries.

In [None]:
# Ensure the requests module is installed.
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install requests

# Import required libraries
import io
import warnings
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from IPython.display import display
from requests.exceptions import RequestException, HTTPError, ConnectionError, Timeout
# Suppress warnings for cleaner output.
# NOTE(review): a blanket 'ignore' also hides deprecation notices from
# pandas/matplotlib; consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

# Set plotting style.
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8*' and the
# old 'seaborn' name was later removed, so pick whichever name this install
# actually provides and fall back to the default style if neither exists.
seaborn_style = next(
    (name for name in ('seaborn-v0_8', 'seaborn') if name in plt.style.available),
    'default',
)
plt.style.use(seaborn_style)
sns.set_palette('husl')

# Define data validation functions
def validate_numeric_column(df, column):
    """Count the entries of ``df[column]`` that are missing or below zero.

    Args:
        df: DataFrame to inspect.
        column: Name of the column to check.

    Returns:
        The number of rows whose value is NaN or negative, or ``0`` when
        ``column`` is not present in ``df``.
    """
    if column not in df.columns:
        return 0

    values = df[column]
    is_invalid = values.isna() | (values < 0)
    return is_invalid.sum()

def _count_negative(df, column):
    """Return how many entries of ``df[column]`` are strictly below zero (0 if the column is absent)."""
    if column not in df.columns:
        return 0
    # NaN < 0 evaluates to False, so missing values are not counted as negative.
    return int((df[column] < 0).sum())


def validate_data(df):
    """Run basic sanity checks on the dataset and summarise the findings.

    Args:
        df: DataFrame to check. If it has a ``date`` column, that column must
            be comparable with ``pd.Timestamp`` (i.e. already parsed).

    Returns:
        dict with the keys:
            ``missing_critical_columns`` - list of required column names that
                are absent from ``df``.
            ``negative_cases`` / ``negative_deaths`` - number of strictly
                negative values in ``total_cases`` / ``total_deaths``.
                Missing values are not counted, so the numbers match the
                key names.
            ``future_dates`` - number of rows dated after the current time
                (0 when there is no ``date`` column).
    """
    critical_columns = ['date', 'location', 'total_cases', 'total_deaths']

    if 'date' in df.columns:
        future_dates = int((df['date'] > pd.Timestamp.now()).sum())
    else:
        future_dates = 0

    return {
        'missing_critical_columns': [col for col in critical_columns if col not in df.columns],
        'negative_cases': _count_negative(df, 'total_cases'),
        'negative_deaths': _count_negative(df, 'total_deaths'),
        'future_dates': future_dates,
    }


## Data Collection and Loading
We'll use the Our World in Data COVID-19 dataset, which provides comprehensive information about COVID-19 cases, deaths, and vaccinations globally.

In [None]:
# Load the dataset with error handling
url = 'https://covid.ourworldindata.org/data/owid-covid-data.csv'
try:
    # Download the CSV. Without a timeout a stalled connection would hang the
    # cell indefinitely; 60 s applies to connecting and to each socket read.
    response = requests.get(url, timeout=60)
    response.raise_for_status()

    # Parse straight from memory instead of round-tripping through a temporary
    # file in the working directory (nothing to clean up afterwards).
    df = pd.read_csv(io.BytesIO(response.content))
    print('Data loaded successfully!')

    # Convert date column to datetime and validate data
    df['date'] = pd.to_datetime(df['date'])
    validation_results = validate_data(df)

    print('\nData Validation Results:')
    for check, result in validation_results.items():
        # Turn keys such as 'missing_critical_columns' into readable headings.
        print(f'\n{check.replace("_", " ").title()}:')
        if isinstance(result, (list, pd.Series)):
            if len(result) > 0:
                print(result)
            else:
                print('None found')
        else:
            print(result)

    # Remove any future dates. The explicit copy makes the filtered frame
    # independent, so the column assignments below do not act on a slice.
    df = df[df['date'] <= pd.Timestamp.now()].copy()

    # Clean negative values
    df['total_cases'] = df['total_cases'].clip(lower=0)
    df['total_deaths'] = df['total_deaths'].clip(lower=0)

    print('\nDataset Info:\n')
    df.info()  # prints the summary itself and returns None
    print('\nFirst few rows of the dataset:\n')
    display(df.head())

except Exception as e:
    # Report the failure, then re-raise so later cells do not run on a
    # missing or half-prepared `df`.
    print(f'Error processing data: {e}')
    raise

## Data Cleaning and Preparation
Let's clean the data and focus on specific countries (Kenya, India, USA) for our analysis.

In [None]:
# Select countries of interest
countries = ['Kenya', 'India', 'United States']

# Sort by location and date so the forward fill below (and any later rolling
# windows) runs in chronological order within each country.
df_selected = (
    df[df['location'].isin(countries)]
    .sort_values(['location', 'date'])
    .copy()
)

# Handle missing values
# Daily counts: a missing report is treated as zero new cases/deaths.
for daily_column in ('new_cases', 'new_deaths'):
    df_selected[daily_column] = df_selected[daily_column].fillna(0)

# Cumulative counts: carry the last known value forward within each country.
# GroupBy.ffill() returns a Series aligned to the original index; the previous
# groupby(...).apply(lambda x: x.ffill()) yields a (location, index)
# MultiIndex on pandas >= 2.0, which cannot be assigned back to the column.
for cumulative_column in ('total_cases', 'total_deaths'):
    df_selected[cumulative_column] = df_selected.groupby('location')[cumulative_column].ffill()

print('Selected countries data shape:', df_selected.shape)


## Trend Analysis
### 1. Total Cases and Deaths Over Time

In [None]:
try:
    # One chart per cumulative metric, drawn in this order:
    # (column to plot, figure title, y-axis label).
    cumulative_charts = [
        ('total_cases', 'Total COVID-19 Cases Over Time', 'Total Cases'),
        ('total_deaths', 'Total COVID-19 Deaths Over Time', 'Total Deaths'),
    ]

    for metric, chart_title, y_label in cumulative_charts:
        plt.figure(figsize=(12, 6))

        # One line per country, using that country's rows only.
        for country in countries:
            country_data = df_selected[df_selected['location'] == country]
            plt.plot(
                country_data['date'],
                country_data[metric],
                label=country,
                marker='o',
                markersize=2,
            )

        plt.title(chart_title, pad=20)
        plt.xlabel('Date')
        plt.ylabel(y_label)
        plt.legend()
        plt.xticks(rotation=45)
        plt.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.show()
except Exception as e:
    print(f'Error plotting cases and deaths: {e}')
    plt.close()  # Clean up any open figures


### 2. Daily New Cases and Deaths

In [None]:
try:
    # Ensure 'new_cases' and 'new_deaths' columns exist and handle missing values
    daily_columns = ['new_cases', 'new_deaths']
    if any(column not in df_selected.columns for column in daily_columns):
        raise KeyError("'new_cases' and 'new_deaths' columns are required for this analysis.")

    for column in daily_columns:
        # Fill missing daily values with 0 so gaps do not blank out the window.
        df_selected[column] = df_selected[column].fillna(0)

        # 7-day moving average computed per country; transform keeps the
        # result aligned with the original rows.
        # NOTE(review): assumes rows are in date order within each location.
        df_selected[f'{column}_ma7'] = (
            df_selected.groupby('location')[column]
            .transform(lambda series: series.rolling(7).mean())
        )

    # (column to plot, figure title, y-axis label). The deaths average was
    # previously computed but never drawn, although this section covers both.
    daily_charts = [
        ('new_cases_ma7', 'Daily New COVID-19 Cases (7-day Moving Average)', 'New Cases'),
        ('new_deaths_ma7', 'Daily New COVID-19 Deaths (7-day Moving Average)', 'New Deaths'),
    ]

    for metric, chart_title, y_label in daily_charts:
        plt.figure(figsize=(12, 6))
        for country in countries:
            country_data = df_selected[df_selected['location'] == country]
            plt.plot(country_data['date'], country_data[metric], label=country, marker='o', markersize=2)

        plt.title(chart_title, pad=20)
        plt.xlabel('Date')
        plt.ylabel(y_label)
        plt.legend()
        plt.xticks(rotation=45)
        plt.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.show()
except Exception as e:
    print(f'Error plotting daily cases and deaths: {e}')
    plt.close()  # Clean up any open figures


### 3. Vaccination Progress
Let's analyze the vaccination progress in our selected countries.

In [None]:
try:
    vaccination_column = 'people_vaccinated_per_hundred'

    # Validate (or derive) the column before creating a figure, so a failure
    # here does not leave an empty figure behind.
    if vaccination_column not in df_selected.columns:
        if 'people_vaccinated' in df_selected.columns and 'population' in df_selected.columns:
            # Calculate people_vaccinated_per_hundred if possible
            df_selected[vaccination_column] = (df_selected['people_vaccinated'] / df_selected['population']) * 100
        else:
            raise ValueError("'people_vaccinated_per_hundred' column is missing and cannot be calculated.")

    plt.figure(figsize=(12, 6))
    for country in countries:
        # Matplotlib breaks a line at every NaN, so keep only the rows that
        # actually carry a value.
        # NOTE(review): presumably vaccination figures are not reported for
        # every date - confirm against the dataset.
        country_data = (
            df_selected[df_selected['location'] == country]
            .dropna(subset=[vaccination_column])
        )
        if not country_data.empty:
            plt.plot(country_data['date'], country_data[vaccination_column], label=country, marker='o', markersize=2)

    plt.title('Vaccination Progress (% of Population)', pad=20)
    plt.xlabel('Date')
    plt.ylabel('People Vaccinated per 100')
    plt.legend()
    plt.xticks(rotation=45)
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

    # Calculate the maximum vaccination percentage for each country
    max_vaccination_percentage = df_selected.groupby('location')[vaccination_column].max()

    # Display the results
    print("Maximum Vaccination Percentage per Country:")
    for country, percentage in max_vaccination_percentage.items():
        if pd.isna(percentage):
            # A country with no reported values would otherwise print "nan%".
            print(f"{country}: no data")
        else:
            print(f"{country}: {percentage:.2f}%")
except Exception as e:
    print(f"Error plotting vaccination progress: {e}")
    plt.close()  # Clean up the figure in case of error


## Key Insights
Based on our analysis, here are the key findings:

1. **Case Progression:**
   - India experienced the highest peak in total cases
   - The United States showed multiple waves of infections
   - Kenya maintained relatively lower case numbers throughout

2. **Death Rates:**
   - The United States recorded the highest number of deaths
   - Death rates varied significantly between countries
   - All countries showed improvement in managing death rates over time

3. **Vaccination Campaign:**
   - The United States led in early vaccination efforts
   - India showed rapid progress in vaccination after initial delays
   - Kenya faced challenges in vaccine rollout

4. **Notable Patterns:**
   - Rising vaccination rates appear to coincide with falling deaths in the charts above (a visual observation only; no correlation was computed in this notebook)
   - Seasonal patterns in case numbers across all countries
   - Varying effectiveness of containment measures