In [None]:
import pandas as pd

# Load the raw export; `date` is parsed to datetime while reading.
df = pd.read_csv("../data/raw/owid-covid-data.csv", parse_dates=["date"])

# 1. Filter to a few countries
countries = ["United States", "India", "Kenya", "Brazil", "Germany"]

# 2. Build the working subset as a single chain derived from `df`, so that
#    re-running this cell always starts from the untouched raw frame instead
#    of mutating a previous `df_sub` in place.
df_sub = (
    df[df["location"].isin(countries)]
    # Drop rows missing critical values
    .dropna(subset=["date", "total_cases"])
    # Ensure date dtype (a no-op when parse_dates above already converted it)
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))
    .sort_values(["location", "date"])
    # Fill missing daily numbers with zeros. Filling with a constant does not
    # depend on the group, so a plain column-wise fillna replaces the former
    # groupby("location").transform(lambda x: x.fillna(0)) with the same result.
    .fillna({"new_cases": 0, "new_deaths": 0})
)

# Sanity check: both daily columns should report zero nulls.
print(df_sub[["new_cases", "new_deaths"]].isnull().sum())


new_cases     0
new_deaths    0
dtype: int64


In [5]:
# NOTE(review): total_vaccinations is treated here as a running total that is
# not reported on every date (see the OWID codebook). Filling the gaps with 0
# would make the series drop back to zero on non-reporting days and distort
# anything derived from it, so carry the last reported value forward within
# each country instead. Relies on df_sub already being sorted by location and
# date. Rows before a country's first report have nothing to carry forward
# and are set to 0.
df_sub["total_vaccinations"] = (
    df_sub
      .groupby("location")["total_vaccinations"]
      .ffill()
      .fillna(0)
)

In [6]:
# Death rate: total_deaths divided by total_cases, row by row.
deaths_per_case = df_sub["total_deaths"].div(df_sub["total_cases"])
df_sub["death_rate"] = deaths_per_case

# total_vaccinations expressed as a percentage of population.
# NOTE(review): if total_vaccinations counts doses rather than people, this
# value can exceed 100 - confirm against the dataset's codebook.
vaccinations_per_person = df_sub["total_vaccinations"].div(df_sub["population"])
df_sub["pct_vaccinated"] = vaccinations_per_person.mul(100)

# Quick null check on the filled and derived columns
derived_cols = ["total_vaccinations", "death_rate", "pct_vaccinated"]
print(df_sub[derived_cols].isna().sum())


total_vaccinations      0
death_rate            175
pct_vaccinated          0
dtype: int64


In [7]:
# Keep only rows with a strictly positive total_cases so the ratio below
# never divides by zero.
has_cases = df_sub["total_cases"].gt(0)
df_sub = df_sub.loc[has_cases].copy()

# Recompute the death rate on the filtered rows.
df_sub["death_rate"] = df_sub["total_deaths"].div(df_sub["total_cases"])

n_missing_death_rate = df_sub["death_rate"].isna().sum()
print(n_missing_death_rate)   # should be 0


0


In [8]:
# Final null audit across every column that was filled or derived.
audit_cols = [
    "new_cases",
    "new_deaths",
    "total_vaccinations",
    "death_rate",
    "pct_vaccinated",
]
null_counts = df_sub[audit_cols].isna().sum()
print(null_counts)


new_cases             0
new_deaths            0
total_vaccinations    0
death_rate            0
pct_vaccinated        0
dtype: int64


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Rebuild the cleaned subset from the raw CSV in one pass, so this cell can be
# run on its own without relying on state left behind by other cells.
df = pd.read_csv("../data/raw/owid-covid-data.csv", parse_dates=["date"])

countries = ["United States", "India", "Kenya", "Brazil", "Germany"]

# Filter -> drop rows lacking date/total_cases -> enforce datetime -> sort ->
# zero-fill the daily counts. Written as one chain off `df` (no inplace=True),
# so re-running the cell is idempotent.
df_sub = (
    df[df["location"].isin(countries)]
    .dropna(subset=["date", "total_cases"])
    # No-op when parse_dates above already produced datetimes.
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))
    .sort_values(["location", "date"])
    # A constant fill does not depend on the group, so a plain fillna gives
    # the same result as groupby("location").transform(lambda x: x.fillna(0)).
    .fillna({"new_cases": 0, "new_deaths": 0})
)

# NOTE(review): total_vaccinations is treated as a running total with gaps on
# non-reporting days (see the OWID codebook). Carry the last reported value
# forward within each country (rows are sorted by location and date above);
# only rows before the first report fall back to 0.
df_sub["total_vaccinations"] = (
    df_sub.groupby("location")["total_vaccinations"].ffill().fillna(0)
)

# Undefined ratios (e.g. 0 / 0, or a missing total_deaths) become NaN and are
# set to 0 here, so rows with total_cases == 0 are kept rather than dropped.
df_sub["death_rate"] = (df_sub["total_deaths"] / df_sub["total_cases"]).fillna(0)

# total_vaccinations expressed as a percentage of population.
df_sub["pct_vaccinated"] = df_sub["total_vaccinations"] / df_sub["population"] * 100
