Step 1: Load the Data in Python

In [1]:
import pandas as pd

# Read the raw vaccination records from disk into a DataFrame
raw_csv_path = 'vaccinations.csv'
data = pd.read_csv(raw_csv_path)

# Preview the first five rows to sanity-check the columns and values
data.head(n=5)

Unnamed: 0,location,iso_code,date,total_vaccinations,people_vaccinated,people_fully_vaccinated,total_boosters,daily_vaccinations_raw,daily_vaccinations,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,total_boosters_per_hundred,daily_vaccinations_per_million,daily_people_vaccinated,daily_people_vaccinated_per_hundred
0,Afghanistan,AFG,2021-02-22,0.0,0.0,,,,,0.0,0.0,,,,,
1,Afghanistan,AFG,2021-02-23,,,,,,1367.0,,,,,33.0,1367.0,0.003
2,Afghanistan,AFG,2021-02-24,,,,,,1367.0,,,,,33.0,1367.0,0.003
3,Afghanistan,AFG,2021-02-25,,,,,,1367.0,,,,,33.0,1367.0,0.003
4,Afghanistan,AFG,2021-02-26,,,,,,1367.0,,,,,33.0,1367.0,0.003


Step 2: Identify and Handle Missing Values

In [3]:
# Check for missing values. The counts are captured before anything is filled
# and shown as the cell's last expression; a bare expression in the middle of
# a cell is evaluated but never displayed by Jupyter.
missing_counts = data.isnull().sum()

# Cumulative columns are running totals: a gap means "not reported that day",
# not "reset to zero". Carry the last reported value forward within each
# location so the totals never appear to drop back to 0.
# NOTE(review): assumes rows are already in chronological order within each
# location (as the first rows of the file suggest) - confirm, or sort by
# ['location', 'date'] before this step.
cumulative_cols = [
    col
    for col in (
        'total_vaccinations',
        'people_vaccinated',
        'people_fully_vaccinated',
        'total_boosters',
        'total_vaccinations_per_hundred',
        'people_vaccinated_per_hundred',
        'people_fully_vaccinated_per_hundred',
        'total_boosters_per_hundred',
    )
    if col in data.columns  # keeps the cell re-runnable if columns were renamed
]
if cumulative_cols:
    data[cumulative_cols] = data.groupby('location')[cumulative_cols].ffill()

# Option 1: Fill the remaining missing values with 0 (if it's logical for your data).
# This covers daily counts and cumulative values that precede a location's
# first reported figure.
data = data.fillna(0)

# Option 2: Drop rows with significant missing data
# data.dropna(subset=['total_vaccinations', 'people_vaccinated', 'daily_vaccinations'], inplace=True)

# Display the per-column missing-value counts measured before filling
missing_counts

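In most cases for vaccination data, you might want to replace missing values with 0, assuming missing data means no vaccinations occurred that day. Note that this assumption only suits daily counts: for cumulative columns such as total_vaccinations, a missing value usually means the figure was not reported, so filling it with 0 makes the running total appear to drop to zero.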

Step 3: Ensure Correct Data Types

In [5]:
# Parse the 'date' strings into proper datetime values and store them back
parsed_dates = pd.to_datetime(data['date'])
data['date'] = parsed_dates

# List the dtype of every column to confirm the conversion
data.dtypes

location                                       object
iso_code                                       object
date                                   datetime64[ns]
total_vaccinations                            float64
people_vaccinated                             float64
people_fully_vaccinated                       float64
total_boosters                                float64
daily_vaccinations_raw                        float64
daily_vaccinations                            float64
total_vaccinations_per_hundred                float64
people_vaccinated_per_hundred                 float64
people_fully_vaccinated_per_hundred           float64
total_boosters_per_hundred                    float64
daily_vaccinations_per_million                float64
daily_people_vaccinated                       float64
daily_people_vaccinated_per_hundred           float64
dtype: object

Step 4: Remove Duplicates

In [7]:
# Remove rows that are exact copies of an earlier row, keeping the first occurrence
data = data.drop_duplicates(keep='first')

Step 5: Normalize Column Names

In [16]:
# Human-readable labels for the headline columns.
# Only these five columns are renamed; every other column keeps its original name.
readable_names = {
    'total_vaccinations': 'Total Vaccinations',
    'people_vaccinated': 'People Vaccinated',
    'people_fully_vaccinated': 'Fully Vaccinated',
    'total_boosters': 'Total Boosters',
    'daily_vaccinations': 'Daily Vaccinations',
}
data = data.rename(columns=readable_names)

Step 6: Save the Cleaned Data

In [21]:
# Write the cleaned frame to a new CSV file, leaving the row index out of the file
cleaned_csv_path = 'cleaned_vaccinations.csv'
data.to_csv(cleaned_csv_path, index=False)

Step 7: Verify Your Cleaned Data

In [24]:
# Preview the first five rows of the cleaned, in-memory frame
data.head(n=5)

Unnamed: 0,location,iso_code,date,Total Vaccinations,People Vaccinated,Fully Vaccinated,Total Boosters,daily_vaccinations_raw,Daily Vaccinations,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,total_boosters_per_hundred,daily_vaccinations_per_million,daily_people_vaccinated,daily_people_vaccinated_per_hundred
0,Afghanistan,AFG,2021-02-22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Afghanistan,AFG,2021-02-23,0.0,0.0,0.0,0.0,0.0,1367.0,0.0,0.0,0.0,0.0,33.0,1367.0,0.003
2,Afghanistan,AFG,2021-02-24,0.0,0.0,0.0,0.0,0.0,1367.0,0.0,0.0,0.0,0.0,33.0,1367.0,0.003
3,Afghanistan,AFG,2021-02-25,0.0,0.0,0.0,0.0,0.0,1367.0,0.0,0.0,0.0,0.0,33.0,1367.0,0.003
4,Afghanistan,AFG,2021-02-26,0.0,0.0,0.0,0.0,0.0,1367.0,0.0,0.0,0.0,0.0,33.0,1367.0,0.003
