In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')


In [None]:
# Plot appearance: matplotlib style sheet first, seaborn palette second.
# The call order is kept exactly as before.
MPL_STYLE_SHEET = 'fivethirtyeight'
SEABORN_PALETTE = 'Set2'

plt.style.use(MPL_STYLE_SHEET)
sns.set_palette(SEABORN_PALETTE)

In [None]:
# 1. DATA LOADING
# Read the CSV (path is relative to the working directory) into `df`,
# the raw frame that the following cells explore and filter.
COVID_CSV_PATH = 'owid-covid-data.csv'

print("Loading COVID-19 data...")
df = pd.read_csv(COVID_CSV_PATH)

In [None]:
# 2. DATA EXPLORATION
# Text-only overview of the raw frame: its size, a preview, the column list,
# null counts for the headline metrics, the locations present and the date span.
print("\n--- Data Overview ---")
print(f"Dataset shape: {df.shape}")

print("\nFirst 5 rows:")
preview_rows = df.head()
print(preview_rows)

print("\nColumn names:")
for column_name in df.columns:
    print(f"- {column_name}")

# Columns whose null counts are reported here; the name is reused by a later cell.
key_columns = [
    'date', 'location',
    'total_cases', 'new_cases',
    'total_deaths', 'new_deaths',
    'total_vaccinations', 'people_vaccinated', 'people_fully_vaccinated',
]
print("\nMissing values in key columns:")
missing_per_column = df[key_columns].isnull().sum()
print(missing_per_column)

location_values = df['location']
print("\nCountries/regions in the dataset:")
print(location_values.nunique())
print("\nSample of locations:")
print(location_values.unique()[:20])

# min()/max() operate on the column as it currently stands in `df`.
date_values = df['date']
print("\nDate range:")
print(f"First date: {date_values.min()}")
print(f"Last date: {date_values.max()}")

In [None]:
# 3. DATA CLEANING
# Section banner only: prints a header line ahead of the cleaning output.
print("\n--- Data Cleaning ---")

In [None]:
# Convert date to datetime
# Replaces the 'date' column of `df` with its pd.to_datetime() result.
df['date'] = df['date'].pipe(pd.to_datetime)
print("Date column converted to datetime format.")

In [None]:
# Select countries of interest (can be modified as needed)
# `df_selected` is an independent copy holding only rows for these locations.
countries_of_interest = [
    'United States',
    'India',
    'Brazil',
    'United Kingdom',
    'Kenya',
    'South Africa',
    'China',
    'Germany',
    'France',
    'Italy',
]
is_of_interest = df['location'].isin(countries_of_interest)
df_selected = df.loc[is_of_interest].copy()
print(f"Selected {len(countries_of_interest)} countries for detailed analysis.")


In [None]:
# Check for missing values in selected data
# Null count per key column, restricted to the selected countries.
print("\nMissing values in selected countries dataset:")
missing_in_selection = df_selected[key_columns].isna().sum()
print(missing_in_selection)

In [None]:
# Handle missing values for key metrics
# Forward fill for cumulative metrics
# Each location is filled on its own, so one country's running totals never
# carry over into the next country's rows. GroupBy.ffill() is used because the
# older groupby(...).fillna(method='ffill') spelling is deprecated in recent
# pandas releases.
# NOTE(review): assumes rows are already in chronological order within each
# location -- confirm, otherwise sort by date before filling.
cumulative_cols = ['total_cases', 'total_deaths', 'total_vaccinations',
                   'people_vaccinated', 'people_fully_vaccinated']
for col in cumulative_cols:
    if col in df_selected.columns:
        df_selected[col] = df_selected.groupby('location')[col].ffill()


In [None]:
# For new cases/deaths, fill NaN with 0
# Columns that are absent from the frame are skipped.
new_cols = ['new_cases', 'new_deaths']
for daily_col in new_cols:
    if daily_col not in df_selected.columns:
        continue
    df_selected[daily_col] = df_selected[daily_col].fillna(0)

print("Missing values handled.")


In [None]:
# Calculate additional metrics
# Both rates are percentages rounded to two decimals; each is only computed
# when the columns it needs are present.
if {'total_cases', 'total_deaths'}.issubset(df_selected.columns):
    deaths_per_case = df_selected['total_deaths'].div(df_selected['total_cases'])
    df_selected['death_rate'] = deaths_per_case.mul(100).round(2)
    print("Death rate calculated.")

if {'people_vaccinated', 'population'}.issubset(df_selected.columns):
    vaccinated_share = df_selected['people_vaccinated'].div(df_selected['population'])
    df_selected['vaccination_rate'] = vaccinated_share.mul(100).round(2)
    print("Vaccination rate calculated.")

In [None]:
# 4. EXPLORATORY DATA ANALYSIS
# Section banner only: prints a header line ahead of the plotting output.
print("\n--- Exploratory Data Analysis ---")

In [None]:
# Set up visualization environment
# Default size for any figure that does not pass its own figsize.
# This replaces a plt.figure(...) call that opened a blank 14x8 figure which
# was never drawn on or closed.
plt.rcParams['figure.figsize'] = (14, 8)

In [None]:
# 4.1 Total Cases Over Time by Country
# One line of cumulative cases per selected country, saved as a PNG and then
# closed (nothing is displayed inline).
fig, ax = plt.subplots(figsize=(14, 8))
for name in countries_of_interest:
    in_country = df_selected['location'] == name
    ax.plot(
        df_selected.loc[in_country, 'date'],
        df_selected.loc[in_country, 'total_cases'],
        label=name,
    )

ax.set_title('COVID-19 Total Cases Over Time', fontsize=16)
ax.set_xlabel('Date', fontsize=12)
ax.set_ylabel('Total Cases', fontsize=12)
ax.legend()
ax.grid(True, alpha=0.3)
plt.xticks(rotation=45)
fig.tight_layout()
fig.savefig('total_cases_time_series.png')
plt.close(fig)
print("Total cases time series plot created.")


In [None]:
# 4.2 Total Deaths Over Time by Country
# One line of cumulative deaths per selected country, saved as a PNG and then
# closed (nothing is displayed inline).
fig, ax = plt.subplots(figsize=(14, 8))
for name in countries_of_interest:
    in_country = df_selected['location'] == name
    ax.plot(
        df_selected.loc[in_country, 'date'],
        df_selected.loc[in_country, 'total_deaths'],
        label=name,
    )

ax.set_title('COVID-19 Total Deaths Over Time', fontsize=16)
ax.set_xlabel('Date', fontsize=12)
ax.set_ylabel('Total Deaths', fontsize=12)
ax.legend()
ax.grid(True, alpha=0.3)
plt.xticks(rotation=45)
fig.tight_layout()
fig.savefig('total_deaths_time_series.png')
plt.close(fig)
print("Total deaths time series plot created.")

In [None]:
# 4.3 Daily New Cases (7-day rolling average)
# One smoothed line of daily new cases per selected country, saved as a PNG.
fig, ax = plt.subplots(figsize=(14, 8))
for country in countries_of_interest:
    country_data = df_selected[df_selected['location'] == country]
    # Create 7-day rolling average.
    # The result is kept in a local series rather than written back as a new
    # column: `country_data` is a filtered slice of `df_selected`, and assigning
    # onto it is a chained assignment (SettingWithCopyWarning) for a column
    # that was discarded on the next iteration anyway.
    new_cases_smoothed = country_data['new_cases'].rolling(window=7).mean()
    ax.plot(country_data['date'], new_cases_smoothed, label=country)

ax.set_title('COVID-19 Daily New Cases (7-day Rolling Average)', fontsize=16)
ax.set_xlabel('Date', fontsize=12)
ax.set_ylabel('New Cases (7-day Avg)', fontsize=12)
ax.legend()
ax.grid(True, alpha=0.3)
plt.xticks(rotation=45)
fig.tight_layout()
fig.savefig('new_cases_rolling_avg.png')
plt.close(fig)
print("New cases rolling average plot created.")


In [None]:
# 4.4 Death Rate Comparison (latest data)
# Build one row per selected country with its most recent values, then draw a
# horizontal bar chart ordered from highest to lowest death rate.
#
# Rows are ordered by date before grouping so that "last" really means
# "latest", independent of the order of rows in the input file.
# groupby(...).last() keeps the last non-null entry of every column, so the
# values in one output row may originate from different dates.
latest_data = (
    df_selected
    .sort_values('date', kind='stable')
    .groupby('location')
    .last()
    .reset_index()
    .sort_values('death_rate', ascending=False)
)

fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x='death_rate', y='location', data=latest_data, ax=ax)
ax.set_title('COVID-19 Death Rate by Country (Latest Data)', fontsize=16)
ax.set_xlabel('Death Rate (%)', fontsize=12)
ax.set_ylabel('Country', fontsize=12)
ax.grid(True, alpha=0.3)
fig.tight_layout()
fig.savefig('death_rate_comparison.png')
plt.close(fig)
print("Death rate comparison plot created.")

In [None]:
# 5. VACCINATION ANALYSIS
print("\n--- Vaccination Analysis ---")

# 5.1 Vaccination Progress Over Time
# One line of people vaccinated per selected country, saved as a PNG.
# Lines are only drawn when the 'people_vaccinated' column exists; the figure
# is still created, decorated and saved either way.
has_vaccination_column = 'people_vaccinated' in df_selected.columns

fig, ax = plt.subplots(figsize=(14, 8))
for name in countries_of_interest:
    if not has_vaccination_column:
        continue
    in_country = df_selected['location'] == name
    ax.plot(
        df_selected.loc[in_country, 'date'],
        df_selected.loc[in_country, 'people_vaccinated'],
        label=name,
    )

ax.set_title('COVID-19 Vaccination Progress Over Time', fontsize=16)
ax.set_xlabel('Date', fontsize=12)
ax.set_ylabel('People Vaccinated', fontsize=12)
ax.legend()
ax.grid(True, alpha=0.3)
plt.xticks(rotation=45)
fig.tight_layout()
fig.savefig('vaccination_progress.png')
plt.close(fig)
print("Vaccination progress plot created.")

In [None]:
# 5.2 Vaccination Rate Comparison (latest data)
# Horizontal bar chart of the per-country vaccination rate, highest first.
# Skipped entirely when `latest_data` has no 'vaccination_rate' column.
# `latest_data` is left re-sorted by vaccination rate afterwards.
if 'vaccination_rate' in latest_data.columns:
    latest_data = latest_data.sort_values('vaccination_rate', ascending=False)

    fig, ax = plt.subplots(figsize=(12, 8))
    sns.barplot(x='vaccination_rate', y='location', data=latest_data, ax=ax)
    ax.set_title('COVID-19 Vaccination Rate by Country (Latest Data)', fontsize=16)
    ax.set_xlabel('Vaccination Rate (%)', fontsize=12)
    ax.set_ylabel('Country', fontsize=12)
    ax.grid(True, alpha=0.3)
    fig.tight_layout()
    fig.savefig('vaccination_rate_comparison.png')
    plt.close(fig)
    print("Vaccination rate comparison plot created.")

In [None]:
# 6. CHOROPLETH MAP VISUALIZATION
# Section banner only: prints a header line ahead of the map output.
print("\n--- Global Map Visualization ---")

In [None]:
# Get most recent data for each country
# One row per location from the full (unfiltered) frame. Rows are ordered by
# date before grouping so that "last" means "latest" even if the file is not
# in chronological order. groupby(...).last() keeps the last non-null entry of
# every column, so values in one row may originate from different dates.
latest_global_data = (
    df
    .sort_values('date', kind='stable')
    .groupby('location')
    .last()
    .reset_index()
)

In [None]:
# Create world map of total cases
# Requires both the ISO code column (used as the map location key) and the
# metric column; otherwise the cell does nothing. Any failure while building
# or writing the map is reported and swallowed.
cases_map_columns = {'iso_code', 'total_cases'}
if cases_map_columns.issubset(latest_global_data.columns):
    cases_map_options = dict(
        locations="iso_code",
        color="total_cases",
        hover_name="location",
        color_continuous_scale="Viridis",
        title="Global COVID-19 Total Cases",
        projection="natural earth",
    )
    try:
        fig = px.choropleth(latest_global_data, **cases_map_options)
        fig.write_html("covid_global_map.html")
        print("Global choropleth map created and saved as HTML.")
    except Exception as err:
        print(f"Could not create choropleth map: {err}")

In [None]:
# Create world map of vaccination rates
# Requires the ISO code, people_vaccinated and population columns; otherwise
# the cell does nothing. The rate (percent, two decimals) is stored on
# `latest_global_data` as 'vaccination_rate' before the map is drawn. Any
# failure along the way is reported and swallowed.
vax_map_columns = {'iso_code', 'people_vaccinated', 'population'}
if vax_map_columns.issubset(latest_global_data.columns):
    try:
        vaccinated_share = latest_global_data['people_vaccinated'].div(
            latest_global_data['population']
        )
        latest_global_data['vaccination_rate'] = vaccinated_share.mul(100).round(2)

        vax_map_options = dict(
            locations="iso_code",
            color="vaccination_rate",
            hover_name="location",
            color_continuous_scale="Blues",
            title="Global COVID-19 Vaccination Rates (%)",
            projection="natural earth",
        )
        fig = px.choropleth(latest_global_data, **vax_map_options)
        fig.write_html("vaccination_global_map.html")
        print("Global vaccination rate map created and saved as HTML.")
    except Exception as err:
        print(f"Could not create vaccination choropleth map: {err}")

In [None]:
# 7. KEY INSIGHTS
# Section banner only: prints a header line ahead of the summary statistics.
print("\n--- Key Insights ---")

In [None]:
# Calculate key statistics
# The whole summary is kept in ONE cell: a try/except statement cannot span
# notebook cells, and the earlier split left every fragment un-runnable
# (a `try` without `except`, then indented code with no enclosing block).
try:
    # Rank real countries only. 'World' is a location in this dataset (it is
    # queried for the global totals below), so ranking the raw per-location
    # frame would put aggregate rows at the top of every "top 5 countries" list.
    country_rows = latest_global_data[latest_global_data['location'] != 'World']
    if 'continent' in country_rows.columns:
        # NOTE(review): assumes the remaining aggregate rows (continents,
        # income groups, ...) carry no continent value -- confirm against the
        # dataset's codebook.
        country_rows = country_rows[country_rows['continent'].notna()]

    # Countries with highest case counts
    top_cases = country_rows.sort_values('total_cases', ascending=False)[['location', 'total_cases']].head(5)
    print("\nTop 5 countries by total cases:")
    print(top_cases)

    # Countries with highest death counts
    top_deaths = country_rows.sort_values('total_deaths', ascending=False)[['location', 'total_deaths']].head(5)
    print("\nTop 5 countries by total deaths:")
    print(top_deaths)

    # Countries with highest vaccination rates (where data available).
    # The column only exists if the vaccination-map cell computed it.
    if 'vaccination_rate' in country_rows.columns:
        valid_vax = country_rows.dropna(subset=['vaccination_rate'])
        top_vax = valid_vax.sort_values('vaccination_rate', ascending=False)[['location', 'vaccination_rate']].head(5)
        print("\nTop 5 countries by vaccination rate (%):")
        print(top_vax)

    # Global totals: maxima of the 'World' rows, filtered once and reused.
    world_rows = df[df['location'] == 'World']
    global_cases = world_rows['total_cases'].max()
    global_deaths = world_rows['total_deaths'].max()
    global_vaccinations = world_rows['people_vaccinated'].max()

    print("\nGlobal totals from the dataset:")
    print(f"- Total cases: {global_cases:,.0f}")
    print(f"- Total deaths: {global_deaths:,.0f}")
    if not pd.isna(global_vaccinations):
        print(f"- People vaccinated: {global_vaccinations:,.0f}")

    # Calculate average death rate (world deaths as a percentage of world cases)
    world_death_rate = global_deaths / global_cases * 100
    print(f"\nGlobal average death rate: {world_death_rate:.2f}%")

except Exception as e:
    # Best effort: report what went wrong and still print the closing banner.
    print(f"Could not calculate all statistics: {e}")

print("\n--- Analysis Complete ---")
print("All visualizations have been saved to the current directory.")
print("For a comprehensive report, include these visualizations and statistics in your Jupyter notebook with appropriate markdown commentary.")