# COVID-19 Global Analysis – Data Cleaning & Feature Engineering

This notebook loads the raw Our World in Data (OWID) COVID-19 dataset, builds a cleaned and feature-enriched version of it, and saves the result as a CSV file for later analysis.

In [2]:
import sys
from pathlib import Path

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
%matplotlib inline

plt.rcParams["figure.figsize"] = (12, 6)
plt.rcParams["axes.grid"] = True

In [5]:
# Project root is taken to be the parent of the current working directory.
# NOTE(review): this assumes the kernel is started from a folder directly
# under the project root (e.g. a notebooks/ directory) -- confirm.
ROOT_DIR = Path.cwd().parent

# Put the root on the import path exactly once, so that re-running this cell
# does not pile up duplicate entries. The `src.*` imports below rely on it.
if not sys.path.count(str(ROOT_DIR)):
    sys.path.append(str(ROOT_DIR))

from src.config import COVID_RAW_PATH, PROCESSED_DATA_DIR, COVID_CLEAN_FEATURES_PATH
from src.data_loading import load_covid_data
from src.preprocessing import build_clean_feature_dataset

In [6]:
# Read the raw dataset through the project's loader (called with its defaults).
df_raw = load_covid_data()

# Confirm the load and report the frame's dimensions (rows with a thousands
# separator, columns as a plain count).
print(
    "Raw dataset loaded.",
    f"Raw shape: {df_raw.shape[0]:,} rows x {df_raw.shape[1]} columns",
    sep="\n",
)

Raw dataset loaded.
Raw shape: 166,326 rows x 67 columns


In [7]:
# Run the project's cleaning + feature-engineering pipeline on the raw frame.
# The result is bound to a new name, so df_raw stays available for comparison.
df_clean = build_clean_feature_dataset(df_raw)

# Report the resulting dimensions so they can be compared with the raw shape.
print(
    "Clean + feature-enriched dataset built.",
    f"Clean shape: {df_clean.shape[0]:,} rows x {df_clean.shape[1]} columns",
    sep="\n",
)

# Preview the first rows (rich display as the cell's last expression).
df_clean.head()

Clean + feature-enriched dataset built.
Clean shape: 155,331 rows x 72 columns


Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,human_development_index,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million,new_cases_pm_7d_avg,new_deaths_pm_7d_avg,stringency_index_7d_avg,case_fatality_ratio,vaccination_coverage
0,AFG,Asia,Afghanistan,2020-02-24,5.0,5.0,,,,,...,0.511,,,,,,,,,
1,AFG,Asia,Afghanistan,2020-02-25,5.0,0.0,,,,,...,0.511,,,,,,,,,
2,AFG,Asia,Afghanistan,2020-02-26,5.0,0.0,,,,,...,0.511,,,,,0.042,,8.33,,
3,AFG,Asia,Afghanistan,2020-02-27,5.0,0.0,,,,,...,0.511,,,,,0.0315,,8.33,,
4,AFG,Asia,Afghanistan,2020-02-28,5.0,0.0,,,,,...,0.511,,,,,0.0252,,8.33,,


In [8]:
# Columns to eyeball: row identifiers, the per-million series, and what look
# like the engineered features (7-day averages, fatality ratio, vaccination
# coverage) -- presumably added by build_clean_feature_dataset; verify there.
cols_to_inspect = [
    "location",
    "date",
    "new_cases_per_million",
    "new_cases_pm_7d_avg",
    "new_deaths_per_million",
    "new_deaths_pm_7d_avg",
    "case_fatality_ratio",
    "vaccination_coverage",
]

# Keep only the requested columns that are actually present, preserving the
# order above; absent columns are skipped silently rather than raising.
existing_cols = list(
    filter(lambda name: name in df_clean.columns, cols_to_inspect)
)

# First ten rows, restricted to the columns of interest.
df_clean.head(10)[existing_cols]

Unnamed: 0,location,date,new_cases_per_million,new_cases_pm_7d_avg,new_deaths_per_million,new_deaths_pm_7d_avg,case_fatality_ratio,vaccination_coverage
0,Afghanistan,2020-02-24,0.126,,,,,
1,Afghanistan,2020-02-25,0.0,,,,,
2,Afghanistan,2020-02-26,0.0,0.042,,,,
3,Afghanistan,2020-02-27,0.0,0.0315,,,,
4,Afghanistan,2020-02-28,0.0,0.0252,,,,
5,Afghanistan,2020-02-29,0.0,0.021,,,,
6,Afghanistan,2020-03-01,0.0,0.018,,,,
7,Afghanistan,2020-03-02,0.0,0.0,,,,
8,Afghanistan,2020-03-03,0.0,0.0,,,,
9,Afghanistan,2020-03-04,0.0,0.0,,,,


In [9]:
# Persist the cleaned, feature-enriched dataset as CSV.

# Ensure the processed-data directory exists (no-op when already present).
PROCESSED_DATA_DIR.mkdir(parents=True, exist_ok=True)

output_path = COVID_CLEAN_FEATURES_PATH

# Also create the folder the file is actually written to. Nothing in this
# notebook guarantees that COVID_CLEAN_FEATURES_PATH lives directly inside
# PROCESSED_DATA_DIR, and to_csv does not create missing parent directories.
# When both point at the same folder this second call is a no-op.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)

# index=False: do not write the row index as an extra column.
df_clean.to_csv(output_path, index=False)

print(f"Clean dataset with features saved to: {output_path}")

Clean dataset with features saved to: C:\Users\tcdn\Desktop\Projects\11-covid19-global-analysis\data\processed\covid19_clean_features.csv
