# Data Quality Check

---

In [1]:
# --- Imports ---
import pandas as pd
import numpy as np

In [2]:
# --- Load data ---
# Relative location of the raw CSV; the whole file is read into one DataFrame.
file_path = "../Data/Covid_19_dataset.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

## Quick look
Preview of the first rows to confirm the structure of the dataset.

---


In [4]:
# Show the first five rows to confirm the column layout
df.head(n=5)

Unnamed: 0,index,iso_code,continent,location,date,total_cases,new_cases,total_deaths,new_deaths,hosp_patients,...,people_vaccinated,people_fully_vaccinated,total_boosters,new_vaccinations,population,median_age,gdp_per_capita,life_expectancy,latitude,longitude
0,0,AFG,Asia,Afghanistan,2020-02-24,5.0,5.0,,,,...,,,,,39835428.0,18.6,1803.987,64.83,33.0,65.0
1,1,AFG,Asia,Afghanistan,2020-02-25,5.0,0.0,,,,...,,,,,39835428.0,18.6,1803.987,64.83,33.0,65.0
2,2,AFG,Asia,Afghanistan,2020-02-26,5.0,0.0,,,,...,,,,,39835428.0,18.6,1803.987,64.83,33.0,65.0
3,3,AFG,Asia,Afghanistan,2020-02-27,5.0,0.0,,,,...,,,,,39835428.0,18.6,1803.987,64.83,33.0,65.0
4,4,AFG,Asia,Afghanistan,2020-02-28,5.0,0.0,,,,...,,,,,39835428.0,18.6,1803.987,64.83,33.0,65.0


## Overview
General information and descriptive statistics for all columns.

---

In [9]:
# Summary statistics for every column; include="all" also covers non-numeric columns
df.describe(include="all")

Unnamed: 0,index,iso_code,continent,location,date,total_cases,new_cases,total_deaths,new_deaths,hosp_patients,...,people_vaccinated,people_fully_vaccinated,total_boosters,new_vaccinations,population,median_age,gdp_per_capita,life_expectancy,latitude,longitude
count,180477.0,180477,180477,180477,180477,173493.0,173212.0,155323.0,155095.0,27152.0,...,43851.0,41524.0,19866.0,36940.0,180477.0,159326.0,157994.0,179426.0,180477.0,180477.0
unique,,226,6,226,900,,,,,,...,,,,,,,,,,
top,,ARG,Africa,Argentina,2021-08-29,,,,,,...,,,,,,,,,,
freq,,900,45193,900,225,,,,,,...,,,,,,,,,,
mean,93968.446456,,,,,874822.9,3101.972,17109.81,40.44552,4181.064783,...,21797110.0,18190410.0,7832295.0,270026.4,37426980.0,30.594091,19662.412111,73.598154,19.549747,15.962622
std,54582.966489,,,,,4089686.0,18574.92,66952.45,183.352225,11240.365676,...,84328620.0,71100290.0,26212940.0,1191714.0,144298900.0,9.081572,20679.185182,7.489226,24.2806,68.153682
min,0.0,,,,,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,1.0,0.0,47.0,15.1,661.24,53.28,-51.75,-176.2
25%,47611.0,,,,,2608.0,0.0,91.0,0.0,152.0,...,414761.5,319994.5,36539.0,4007.5,888456.0,22.2,4449.898,69.5,5.0,-15.0
50%,92730.0,,,,,30703.0,54.0,797.0,1.0,764.5,...,2743372.0,2244652.0,994843.5,24938.0,6871547.0,29.9,12951.839,75.05,18.25,18.5
75%,140370.0,,,,,304146.0,733.0,6461.5,12.0,2956.25,...,12328420.0,9682117.0,5457455.0,124954.2,25887040.0,39.1,27936.896,79.19,40.0,47.6581


In [10]:
# Print each column's name, non-null count and dtype
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180477 entries, 0 to 180476
Data columns (total 24 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   index                    180477 non-null  int64  
 1   iso_code                 180477 non-null  object 
 2   continent                180477 non-null  object 
 3   location                 180477 non-null  object 
 4   date                     180477 non-null  object 
 5   total_cases              173493 non-null  float64
 6   new_cases                173212 non-null  float64
 7   total_deaths             155323 non-null  float64
 8   new_deaths               155095 non-null  float64
 9   hosp_patients            27152 non-null   float64
 10  total_tests              78326 non-null   float64
 11  new_tests                74509 non-null   float64
 12  tests_per_case           92600 non-null   float64
 13  total_vaccinations       46366 non-null   float64
 14  peop

## Duplicates
Check for fully duplicated rows.

---


In [6]:
# ## Duplicates
# A row is flagged when all of its columns match an earlier row
full_row_dup_mask = df.duplicated()
dup_rows = full_row_dup_mask.sum()
print("Duplicate rows:", dup_rows)

Duplicate rows: 0


In [11]:
# ## Duplicates by iso_code + date
# Columns expected to identify a row uniquely
key_cols = ["iso_code", "date"]

# Default keep="first": counts only the repeats, not the first occurrence
dup_key = df.duplicated(subset=key_cols).sum()
print("Rows with duplicated iso_code + date:", dup_key)

if dup_key > 0:
    # keep=False flags every member of a duplicated group so they can be compared
    every_key_dup = df[df.duplicated(subset=key_cols, keep=False)]
    display(every_key_dup.sort_values(key_cols))

Rows with duplicated iso_code + date: 0


## Sanity checks

We perform basic logical validations to confirm data consistency:
- Key numeric columns (cases, deaths, hospitalizations, tests, vaccinations, population) contain no negative values.
- Latitude values fall within [-90, 90] and longitude within [-180, 180].
- The `date` column can be safely converted to datetime without errors.


In [12]:
# ## Sanity checks

# Non-negative values check (for key numeric columns)
num_cols = ["total_cases", "new_cases", "total_deaths", "new_deaths",
            "hosp_patients", "total_tests", "new_tests",
            "total_vaccinations", "people_vaccinated", "people_fully_vaccinated",
            "total_boosters", "new_vaccinations", "population"]

# One vectorized comparison for all columns; NaN compares as False, so
# missing values are never counted as negatives.
negative_counts = (df[num_cols] < 0).sum()
for col, negatives in negative_counts.items():
    if negatives > 0:
        print(f"Column {col} has {negatives} negative values")
    else:
        print(f"Column {col} OK (no negatives)")

# Latitude and longitude range
print("Latitude out of range:", ((df["latitude"] < -90) | (df["latitude"] > 90)).sum())
print("Longitude out of range:", ((df["longitude"] < -180) | (df["longitude"] > 180)).sum())

# Date parsing
# Record which dates were present before parsing, so that values that were
# already missing are not reported as unparseable.
date_was_present = df["date"].notna()
# errors="coerce" turns unparseable values into NaT instead of raising.
df["date"] = pd.to_datetime(df["date"], errors="coerce")
print("Invalid dates:", (date_was_present & df["date"].isna()).sum())

Column total_cases OK (no negatives)
Column new_cases OK (no negatives)
Column total_deaths OK (no negatives)
Column new_deaths OK (no negatives)
Column hosp_patients OK (no negatives)
Column total_tests OK (no negatives)
Column new_tests OK (no negatives)
Column total_vaccinations OK (no negatives)
Column people_vaccinated OK (no negatives)
Column people_fully_vaccinated OK (no negatives)
Column total_boosters OK (no negatives)
Column new_vaccinations OK (no negatives)
Column population OK (no negatives)
Latitude out of range: 0
Longitude out of range: 0
Invalid dates: 0


## Completeness
Count missing values per column and express them as a percentage of all rows.

In [7]:
# ## Completeness
total_rows = len(df)

# Per-column tallies of present and missing values
present_counts = df.notna().sum()
missing_counts = df.isna().sum()

completeness = pd.DataFrame({"non_null": present_counts, "null": missing_counts})
# Missing values per column as a percentage of all rows
completeness["null_%"] = (completeness["null"] / total_rows) * 100

print("Total rows:", total_rows)
completeness.sort_values(by="null_%")

Total rows: 180477


Unnamed: 0,non_null,null,null_%
index,180477,0,0.0
iso_code,180477,0,0.0
continent,180477,0,0.0
location,180477,0,0.0
date,180477,0,0.0
latitude,180477,0,0.0
longitude,180477,0,0.0
population,180477,0,0.0
life_expectancy,179426,1051,0.582346
total_cases,173493,6984,3.869745


## Summary
The dataset passed all structural quality checks; the only caveat is missing data:
- No full-row or key (`iso_code + date`) duplicates were found.  
- Missing data is concentrated in testing, hospitalization, and vaccination-related columns.  
- Sanity checks showed no structural issues (no negatives, valid coordinates, valid dates).  

The dataset is ready for further analysis and visualization, provided the missing values in the sparsely populated columns are taken into account.