# In [None]:
# ========== IMPORTS ==========
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import urllib.request
import io
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')  # Suppress warnings for cleaner output

# ========== SETUP ==========
# Apply matplotlib's 'ggplot' style sheet to the figures created below.
plt.style.use('ggplot')
# Select seaborn's "husl" palette.
# NOTE(review): this runs after style.use() and presumably overrides the style
# sheet's own colour cycle — keep the two calls in this order.
sns.set_palette("husl")
pd.set_option('display.max_columns', None)  # Show all columns when printing DataFrames

# ========== DATA LOADING ==========
def load_covid_data(local_path='owid-covid-data.csv', data_sources=None,
                    required_columns=('date', 'location')):
    """Load COVID-19 data, falling back through several sources.

    Sources are tried in this order:

    1. The CSV at ``local_path``.
    2. Each URL in ``data_sources``; the first download that parses and
       contains every column in ``required_columns`` is cached to
       ``local_path`` and returned.
    3. A small synthetic single-country frame, so the rest of the script
       can still run offline.

    Args:
        local_path: Path of the local CSV, also used as the download cache.
        data_sources: Iterable of CSV URLs tried in order. ``None`` selects
            the built-in list of mirrors.
        required_columns: Columns a *downloaded* file must contain to be
            accepted. A local file is returned as-is.

    Returns:
        pandas.DataFrame: the loaded (or synthetic) data.
    """
    from urllib.parse import urlsplit

    # Try local file first
    try:
        df = pd.read_csv(local_path, low_memory=False)
    except FileNotFoundError:
        print("⚠️ Local file not found. Trying online sources...")
    except (pd.errors.EmptyDataError, pd.errors.ParserError) as e:
        # An empty or corrupt cache should not abort the run; re-download.
        print(f"⚠️ Local file '{local_path}' is unreadable ({e}). Trying online sources...")
    else:
        print(f"✅ Data loaded from local file '{local_path}'")
        return df

    if data_sources is None:
        # List of possible data sources (updated mirrors)
        # NOTE(review): the last entry is a different dataset
        # (covid19-open-data) whose columns presumably do not match the OWID
        # schema; the column check below rejects such files instead of
        # caching them under the OWID file name.
        data_sources = [
            "https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv",
            "https://covid.ourworldindata.org/data/owid-covid-data.csv",
            "https://storage.googleapis.com/covid19-open-data/v3/latest/epidemiology.csv"
        ]

    for url in data_sources:
        print(f"🔍 Attempting download from: {url}")
        try:
            with urllib.request.urlopen(url, timeout=15) as response:
                data = response.read()
            df = pd.read_csv(io.BytesIO(data), encoding='utf-8', low_memory=False)
        except Exception as e:
            # Deliberately broad: network, HTTP, decoding and parsing errors
            # all mean "try the next mirror".
            print(f"❌ Failed to download from {url}: {str(e)}")
            continue

        missing = [col for col in required_columns if col not in df.columns]
        if missing:
            print(f"❌ Failed to download from {url}: missing required columns {missing}")
            continue

        # Caching is best-effort: a read-only directory must not throw away
        # data that was downloaded successfully.
        try:
            df.to_csv(local_path, index=False)
        except OSError as e:
            print(f"⚠️ Could not cache data to '{local_path}': {e}")

        print(f"✅ Successfully downloaded from {urlsplit(url).netloc or url}")
        return df

    # If all downloads fail, load minimal sample data
    print("⚠️ All download attempts failed. Loading sample data...")
    sample_dates = pd.date_range(start='2020-01-01', periods=120)
    return pd.DataFrame({
        'date': sample_dates,
        'location': ['Sample Country']*len(sample_dates),
        'total_cases': np.linspace(0, 100000, len(sample_dates)),
        'total_deaths': np.linspace(0, 5000, len(sample_dates)),
        'new_cases': np.random.randint(0, 5000, len(sample_dates)),
        'population': 10000000
    })

# Load the data (local CSV first, then the online mirrors, then sample data)
df = load_covid_data()

# ========== DATA CLEANING ==========
def clean_data(df):
    """Return a cleaned copy of the COVID dataset with derived metrics.

    The input frame is never modified. The result keeps only the key
    columns that are present, parses ``date``, fills missing case/death
    counts with 0 and adds, when their inputs exist:

    * ``death_rate``        = total_deaths / total_cases
    * ``cases_per_million`` = total_cases / population * 1e6
    * ``vaccination_rate``  = people_vaccinated / population * 100

    Rows whose denominator is missing or not positive get NaN for the
    derived metric rather than ``inf``.

    Args:
        df: Raw DataFrame; any subset of the key columns may be present.

    Returns:
        pandas.DataFrame: a new, cleaned frame.
    """
    # Select key columns and countries
    key_columns = [
        'date', 'location', 'continent', 'population',
        'total_cases', 'new_cases', 'total_deaths', 'new_deaths',
        'total_vaccinations', 'people_vaccinated', 'people_fully_vaccinated',
        'icu_patients', 'hosp_patients', 'reproduction_rate'
    ]
    available_cols = [col for col in key_columns if col in df.columns]

    # Copy BEFORE any assignment so the caller's frame is left untouched.
    cleaned = df[available_cols].copy()

    # Convert date column
    if 'date' in cleaned.columns:
        cleaned['date'] = pd.to_datetime(cleaned['date'])

    # Handle missing values
    for col in ['total_cases', 'new_cases', 'total_deaths', 'new_deaths']:
        if col in cleaned.columns:
            cleaned[col] = cleaned[col].fillna(0)

    # Calculate derived metrics
    if all(col in cleaned.columns for col in ['total_deaths', 'total_cases']):
        total_cases = cleaned['total_cases']
        # where() masks zero-case rows to NaN, so x/0 cannot yield inf and
        # later poison aggregates such as the mean.
        cleaned['death_rate'] = cleaned['total_deaths'] / total_cases.where(total_cases > 0)

    if 'population' in cleaned.columns:
        population = cleaned['population']
        population = population.where(population > 0)
        if 'total_cases' in cleaned.columns:
            cleaned['cases_per_million'] = (cleaned['total_cases'] / population) * 1e6
        if 'people_vaccinated' in cleaned.columns:
            cleaned['vaccination_rate'] = (cleaned['people_vaccinated'] / population) * 100

    return cleaned

# Cleaned frame that every later section of the script reads from.
df_clean = clean_data(df)

# ========== ANALYSIS FUNCTIONS ==========
def plot_time_series(df, metric, title, ylabel, countries=None, log_scale=False):
    """Plot ``metric`` over time, one line per country.

    Args:
        df: DataFrame with ``date``, ``location`` and ``metric`` columns.
        metric: Name of the column to plot on the y axis.
        title: Figure title.
        ylabel: Y-axis label.
        countries: Iterable of location names (list, tuple, NumPy array...).
            ``None`` or an empty iterable selects the first five unique
            locations in ``df``.
        log_scale: When true, use a logarithmic y axis.

    Returns:
        None. The figure is shown with ``plt.show()``. If a required column
        is missing, a warning is printed and nothing is drawn.
    """
    missing = [col for col in ('date', 'location', metric) if col not in df.columns]
    if missing:
        print(f"⚠️ Skipping plot '{title}': missing column(s) {missing}")
        return

    # Materialise first: `not countries` raises ValueError for a NumPy array
    # with more than one element, which is what Series.unique() returns.
    selected = [] if countries is None else list(countries)
    if not selected:
        selected = list(df['location'].unique()[:5])

    plt.figure(figsize=(14, 7))
    for country in selected:
        country_data = df[df['location'] == country]
        plt.plot(country_data['date'], country_data[metric], label=country)

    plt.title(title, fontsize=14)
    plt.xlabel('Date', fontsize=12)
    plt.ylabel(ylabel, fontsize=12)
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    if log_scale:
        plt.yscale('log')
    plt.grid(True, which="both", ls="--")
    plt.tight_layout()
    plt.show()

def plot_comparison(df, metric, title, kind='bar', top_n=10):
    """Compare locations by their most recent value of ``metric``.

    For every location the latest row that actually reports ``metric`` is
    used, so locations that did not report on the dataset's final date are
    still included and missing values are never ranked.

    Args:
        df: DataFrame with ``location`` and ``metric`` columns; ``date`` is
            used to pick the latest row per location when present.
        metric: Column to rank and plot.
        title: Figure title.
        kind: ``'bar'`` or ``'pie'``.
        top_n: Number of highest-ranked locations to show.

    Returns:
        None. The figure is shown with ``plt.show()``. Nothing is drawn when
        a required column is missing, ``kind`` is unknown, or no row has a
        value for ``metric``.
    """
    if metric not in df.columns or 'location' not in df.columns:
        return
    if kind not in ('bar', 'pie'):
        print(f"⚠️ Skipping plot '{title}': unsupported kind '{kind}'")
        return

    reported = df.dropna(subset=[metric])
    if 'date' in reported.columns:
        # Latest reported row per location; filtering on the global max date
        # would drop every location without a value on that exact day.
        latest_data = (
            reported.sort_values('date', kind='mergesort')
            .drop_duplicates('location', keep='last')
        )
    else:
        latest_data = reported

    latest_data = latest_data.sort_values(metric, ascending=False).head(top_n)
    if latest_data.empty:
        print(f"⚠️ Skipping plot '{title}': no data available for '{metric}'")
        return

    plt.figure(figsize=(12, 6))
    if kind == 'bar':
        sns.barplot(data=latest_data, x='location', y=metric)
        plt.xticks(rotation=45)
    else:
        plt.pie(latest_data[metric], labels=latest_data['location'], autopct='%1.1f%%')

    plt.title(title, fontsize=14)
    plt.tight_layout()
    plt.show()

# ========== MAIN ANALYSIS ==========
# 1. Show dataset info
# Shape and a preview are always printed; the date-range and location-count
# lines are skipped when df_clean lacks the corresponding column.
print("\n=== DATASET INFO ===")
print(f"Shape: {df_clean.shape}")
if 'date' in df_clean.columns:
    print(f"Date Range: {df_clean['date'].min()} to {df_clean['date'].max()}")
if 'location' in df_clean.columns:
    print(f"Number of locations: {df_clean['location'].nunique()}")
print("\nFirst 5 rows:")
print(df_clean.head())

# 2. Time series plots
if all(col in df_clean.columns for col in ['date', 'location']):
    # Materialise as a plain list: Series.unique() yields an array, and arrays
    # raise ValueError when evaluated in a boolean context (e.g. an
    # emptiness check on the `countries` argument).
    sample_countries = list(df_clean['location'].unique()[:3])

    if 'total_cases' in df_clean.columns:
        plot_time_series(df_clean, 'total_cases', 'Total COVID-19 Cases', 'Total Cases', sample_countries, True)

    if 'new_cases' in df_clean.columns:
        plot_time_series(df_clean, 'new_cases', 'Daily New Cases', 'New Cases', sample_countries)

    if 'death_rate' in df_clean.columns:
        plot_time_series(df_clean, 'death_rate', 'Case Fatality Rate', 'Death Rate', sample_countries)

# 3. Comparative analysis
if 'location' in df_clean.columns:
    if 'total_cases' in df_clean.columns:
        plot_comparison(df_clean, 'total_cases', 'Total Cases by Country')

    if 'cases_per_million' in df_clean.columns:
        plot_comparison(df_clean, 'cases_per_million', 'Cases per Million Population')

    if 'vaccination_rate' in df_clean.columns:
        # Bars, not a pie: per-country rates are not parts of one whole, and a
        # pie's wedge percentages would show each country's share of the
        # summed rates instead of its actual vaccination rate.
        plot_comparison(df_clean, 'vaccination_rate', 'Vaccination Rate (%)')

# 4. Generate insights
def generate_insights(df):
    """Print headline figures derived from the cleaned dataset.

    Each insight is printed only if the columns it needs are present; when
    the columns exist but hold no usable values, a "not available" line is
    printed instead of raising.

    * latest date in the data (``date``)
    * location with the highest ``total_cases`` (needs ``location``)
    * mean of the finite ``death_rate`` values
    * highest ``vaccination_rate`` among each location's most recent
      reported value (needs ``date``)

    Args:
        df: DataFrame as produced by ``clean_data``.

    Returns:
        None. Output goes to stdout.
    """
    print("\n=== KEY INSIGHTS ===")

    if 'date' in df.columns:
        latest_date = pd.to_datetime(df['date']).max()
        # max() of an empty/all-missing column is NaT, which cannot be formatted.
        if pd.notna(latest_date):
            print(f"\nAs of {latest_date.strftime('%B %d, %Y')}:")

    if all(col in df.columns for col in ['total_cases', 'location']):
        # Positional lookup keeps this correct even if index labels repeat.
        cases = df['total_cases'].reset_index(drop=True)
        if cases.notna().any():
            top_row = df.iloc[cases.idxmax()]
            print(f"- Highest total cases: {top_row['location']} ({int(top_row['total_cases']):,})")
        else:
            print("- Case data not available")

    if 'death_rate' in df.columns:
        # Rows with deaths but zero cases produce inf, which would make the
        # mean itself inf; only finite ratios are averaged.
        finite_rates = df['death_rate'].replace([np.inf, -np.inf], np.nan).dropna()
        if finite_rates.empty:
            print("- Case fatality rate not available")
        else:
            print(f"- Average case fatality rate: {finite_rates.mean():.2%}")

    if 'vaccination_rate' in df.columns and 'date' in df.columns:
        reported = df.dropna(subset=['vaccination_rate'])
        if reported.empty:
            print("- Vaccination data not available")
        else:
            if 'location' in reported.columns:
                # Most recent reported value per location; vaccination data is
                # sparse, so the global latest date often has no values.
                reported = (
                    reported.sort_values('date', kind='mergesort')
                    .drop_duplicates('location', keep='last')
                )
            else:
                reported = reported[reported['date'] == reported['date'].max()]
            max_vax = reported['vaccination_rate'].max()
            print(f"- Highest vaccination rate: {max_vax:.1f}% of population")

# Print the headline figures for the cleaned dataset.
generate_insights(df_clean)

# ========== SAVE RESULTS ==========
# Saving is best-effort: any failure is reported on stdout and the script
# still continues to the completion message below.
try:
    df_clean.to_csv('cleaned_covid_data.csv', index=False)
    print("\n💾 Saved cleaned data to 'cleaned_covid_data.csv'")
except Exception as e:
    print(f"\n❌ Could not save results: {str(e)}")

print("\n✅ Analysis complete!")

# ========== OPTIONAL: EXPORT TO PDF ==========
# Optional step: convert the notebook to PDF with nbconvert and report the
# real outcome. os.system() never raises and its exit status was ignored, so
# success used to be printed even when the conversion failed.
import os
import subprocess

try:
    notebook_filename = 'covid_global_tracker.ipynb'
    print(f"\n📄 Attempting to export '{notebook_filename}' to PDF...")
    if not os.path.isfile(notebook_filename):
        raise FileNotFoundError(f"notebook '{notebook_filename}' not found")
    # Argument list (no shell) + check=True: a missing `jupyter` executable
    # raises FileNotFoundError and a non-zero exit raises CalledProcessError.
    subprocess.run(
        ["jupyter", "nbconvert", "--to", "pdf", notebook_filename],
        check=True,
    )
    print("✅ Exported notebook to PDF successfully!")
except (OSError, subprocess.CalledProcessError) as e:
    print(f"❌ PDF export failed: {e}")
