Import the required libraries and set the plotting style.


In [1]:
#IMPORT LIBRARIES
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Plot appearance for the notebook: apply the darkgrid style sheet first,
# then set seaborn's "husl" colour palette on top of it (order kept as-is,
# since both calls write to the global plotting defaults).
plt.style.use("seaborn-v0_8-darkgrid")
sns.set_palette(palette="husl")

# Visible confirmation that the setup cell ran to completion.
print("All libraries imported successfully")

All libraries imported successfully


This cell loads three key COVID-19 datasets (global confirmed cases, deaths, and recoveries) from the Johns Hopkins University CSSE GitHub repository.

In [5]:
# Root of the Johns Hopkins CSSE COVID-19 repository (raw GitHub content).
base_url = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/"

# The three global time-series CSVs sit in the same folder, so the folder is
# spelled once here and every file URL below is derived from it.
time_series_dir = base_url + "csse_covid_19_data/csse_covid_19_time_series/"

# Global confirmed cases
confirmed_url = time_series_dir + "time_series_covid19_confirmed_global.csv"

# Global deaths
deaths_url = time_series_dir + "time_series_covid19_deaths_global.csv"

# Global number of people who recovered
recovered_url = time_series_dir + "time_series_covid19_recovered_global.csv"

# Load all three datasets.
# NOTE: each read_csv call downloads the file over the network, so this cell
# needs internet access every time it is run.
confirmed_df = pd.read_csv(confirmed_url)
deaths_df = pd.read_csv(deaths_url)
recovered_df = pd.read_csv(recovered_url)

# Display the first few rows of each dataset, followed by its shape and its
# first five column names. The header strings are kept exactly as they were
# (including their differing formats) so the printed output is unchanged.
previews = [
    ("       CONFIRMED CASES      ", confirmed_df),
    ("\n=== DEATHS ===", deaths_df),
    ("\n=== RECOVERED ===", recovered_df),
]
for header, frame in previews:
    print(header)
    print(frame.head())
    print(f"\nShape: {frame.shape}")
    print(f"Columns: {frame.columns.tolist()[:5]}...")

       CONFIRMED CASES      
  Province/State Country/Region       Lat       Long  1/22/20  1/23/20  \
0            NaN    Afghanistan  33.93911  67.709953        0        0   
1            NaN        Albania  41.15330  20.168300        0        0   
2            NaN        Algeria  28.03390   1.659600        0        0   
3            NaN        Andorra  42.50630   1.521800        0        0   
4            NaN         Angola -11.20270  17.873900        0        0   

   1/24/20  1/25/20  1/26/20  1/27/20  ...  2/28/23  3/1/23  3/2/23  3/3/23  \
0        0        0        0        0  ...   209322  209340  209358  209362   
1        0        0        0        0  ...   334391  334408  334408  334427   
2        0        0        0        0  ...   271441  271448  271463  271469   
3        0        0        0        0  ...    47866   47875   47875   47875   
4        0        0        0        0  ...   105255  105277  105277  105277   

   3/4/23  3/5/23  3/6/23  3/7/23  3/8/23  3/9/23  

This code systematically analyzes and compares the three COVID-19 datasets (confirmed cases, deaths, and recoveries). It loops through each dataset to print key information including its dimensions, memory usage, sample rows, column structure, missing values, and geographic coverage. This exploration helps understand each dataset's structure, identify data quality issues, and verify that all three datasets are properly loaded and consistent with each other before proceeding with deeper analysis.



In [7]:
# INITIAL DATA EXPLORATION
# Section banner: a blank line, the title, then another blank line.
print("\nINITIAL DATA EXPLORATION\n")


# Label -> DataFrame mapping so one report body runs over every dataset.
datasets = dict(
    [
        ("Confirmed Cases", confirmed_df),
        ("Deaths", deaths_df),
        ("Recovered", recovered_df),
    ]
)

# The loop variables `name` and `df` are deliberately kept: at notebook top
# level they stay defined after the loop finishes.
for name, df in datasets.items():
    # Per-dataset banner, framed by blank lines.
    print(f"\nEXPLORING: {name}\n")

    # Dimensions and deep memory footprint (bytes converted to MB).
    footprint_mb = df.memory_usage(deep=True).sum() / 1024**2
    print(f"\nShape: {df.shape}")
    print(f"Memory usage: {footprint_mb:.2f} MB")

    # Small sample of rows.
    print("\nFirst 3 rows:", df.head(3), sep="\n")

    # Leading column names only; the full list is too long to print.
    print("\nColumns (first 10):", df.columns[:10].tolist(), sep="\n")

    # Null counts per column, limited to the first five columns.
    print("\nMissing values (first 5 columns):", df.isna().sum().head(5), sep="\n")

    # Number of distinct values in the Country/Region column.
    print(f"\nUnique countries: {df['Country/Region'].nunique()}")


INITIAL DATA EXPLORATION


EXPLORING: Confirmed Cases


Shape: (289, 1147)
Memory usage: 2.55 MB

First 3 rows:
  Province/State Country/Region       Lat       Long  1/22/20  1/23/20  \
0            NaN    Afghanistan  33.93911  67.709953        0        0   
1            NaN        Albania  41.15330  20.168300        0        0   
2            NaN        Algeria  28.03390   1.659600        0        0   

   1/24/20  1/25/20  1/26/20  1/27/20  ...  2/28/23  3/1/23  3/2/23  3/3/23  \
0        0        0        0        0  ...   209322  209340  209358  209362   
1        0        0        0        0  ...   334391  334408  334408  334427   
2        0        0        0        0  ...   271441  271448  271463  271469   

   3/4/23  3/5/23  3/6/23  3/7/23  3/8/23  3/9/23  
0  209369  209390  209406  209436  209451  209451  
1  334427  334427  334427  334427  334443  334457  
2  271469  271477  271477  271490  271494  271496  

[3 rows x 1147 columns]

Columns (first 10):
['Province/State', 