# 📊 COVID-19 Data Analysis Project

This notebook analyzes COVID-19 data using the Our World in Data dataset. It includes data loading, cleaning, exploratory analysis, and vaccination progress visualizations.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Enable inline plotting
%matplotlib inline


In [None]:
# Read the Our World in Data CSV (path is relative to the working directory)
df = pd.read_csv(filepath_or_buffer='owid-covid-data.csv')

# Quick look at what was loaded: the column index, then the first five rows
print(df.columns)
df.head(n=5)


In [None]:
# Number of missing entries in each column, worst-affected columns first
missing_per_column = df.isna().sum()
missing_per_column.sort_values(ascending=False)


In [None]:
# Focus on selected countries
countries = ['Kenya', 'United States', 'India']

# Build the cleaned frame in one chain. `.assign` returns a new DataFrame, so
# no column is ever written into a filtered slice of the loaded data.
df = (
    df[df['location'].isin(countries)]
    # Convert date column to real datetimes
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    # Drop rows with missing critical values
    .dropna(subset=['total_cases', 'total_deaths'])
    # Chronological order within each country: the forward-fill below and the
    # row-based rolling average later in the notebook both depend on it.
    .sort_values(['location', 'date'])
)

df = df.assign(
    # Daily counts: a missing value is treated as "nothing reported", i.e. 0.
    new_cases=df['new_cases'].fillna(0),
    new_deaths=df['new_deaths'].fillna(0),
    # total_vaccinations is plotted as a running total, so filling a gap with 0
    # would make the curve drop to zero. Carry the last known value forward per
    # country instead; only rows before a country's first value become 0.
    total_vaccinations=(
        df.groupby('location')['total_vaccinations'].ffill().fillna(0)
    ),
)


In [None]:
# Cumulative confirmed cases, one line per selected country
fig, ax = plt.subplots()
for name in countries:
    subset = df.loc[df['location'] == name]
    ax.plot(subset['date'], subset['total_cases'], label=name)

ax.set_title("Total COVID-19 Cases Over Time")
ax.set_xlabel("Date")
ax.set_ylabel("Total Cases")
ax.legend()
ax.grid(True)
plt.show()


In [None]:
# Death rate here is cumulative deaths divided by cumulative cases, per row
df['death_rate'] = df['total_deaths'].div(df['total_cases'])

# One line per selected country
fig, ax = plt.subplots()
for name in countries:
    subset = df.loc[df['location'] == name]
    ax.plot(subset['date'], subset['death_rate'], label=name)

ax.set_title("COVID-19 Death Rate Over Time")
ax.set_xlabel("Date")
ax.set_ylabel("Death Rate")
ax.legend()
ax.grid(True)
plt.show()


In [None]:
# Cumulative vaccination doses, one line per selected country
fig, ax = plt.subplots()
for name in countries:
    subset = df.loc[df['location'] == name]
    ax.plot(subset['date'], subset['total_vaccinations'], label=name)

ax.set_title("Total Vaccinations Over Time")
ax.set_xlabel("Date")
ax.set_ylabel("Total Vaccinations")
ax.legend()
ax.grid(True)
plt.show()


In [None]:
# Smoothed daily new cases, one line per selected country
fig, ax = plt.subplots(figsize=(10, 6))
for name in countries:
    subset = df.loc[df['location'] == name]
    # Mean over a window of 7 consecutive rows; the window is row-based, so it
    # equals 7 calendar days only when the country has one row per day.
    weekly_mean = subset['new_cases'].rolling(window=7).mean()
    ax.plot(subset['date'], weekly_mean, label=name)

ax.set_title("7-Day Moving Average of New COVID-19 Cases")
ax.set_xlabel("Date")
ax.set_ylabel("New Cases (7-day MA)")
ax.legend()
ax.grid(True)
plt.show()


In [None]:
# Latest available total_cases per country.
# Keep, for each location, the most recent row that actually has a value.
# Filtering on the single global maximum date would drop every country whose
# last usable row is earlier than that date.
latest = (
    df[['iso_code', 'location', 'date', 'total_cases']]
    .dropna()
    .sort_values('date')
    .drop_duplicates(subset='location', keep='last')
    .loc[:, ['iso_code', 'location', 'total_cases']]
)

# NOTE: df was restricted to the selected countries earlier in the notebook,
# so only those countries are coloured on the map.
fig = px.choropleth(latest,
                    locations='iso_code',
                    color='total_cases',
                    hover_name='location',
                    title='Total COVID-19 Cases by Country (Latest)',
                    color_continuous_scale='Reds')
fig.show()


## 📌 Key Insights

- The United States had the highest number of total cases among the selected countries.
- Death rates (cumulative deaths divided by cumulative cases) vary over time and differ between countries.
- Vaccination rollouts were faster in the United States and India than in Kenya.
- Moving average trends help smooth out spikes in daily new cases.
