In [1]:
# importing dependencies
import pandas as pd
import numpy as np
import scipy as st
import matplotlib.pyplot as plt
import requests
import json
import csv 

In [2]:
# Load the filtered COVID-19 extract from the Resources folder into a DataFrame.
# The rows previewed below are dated from 2020 onward, so the file is not limited to 2020.
csv_path = "Resources/owid-covid-data-filtered.csv"
covid_data = pd.read_csv(filepath_or_buffer=csv_path)
covid_data.head(n=5)

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,population,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
0,AFG,Asia,Afghanistan,01/04/2020,166.0,52.0,17.714,4.0,0.0,0.429,...,,37.746,0.5,64.83,0.511,41128772,,,,
1,AFG,Asia,Afghanistan,01/07/2020,31359.0,121.0,268.286,735.0,2.0,16.714,...,,37.746,0.5,64.83,0.511,41128772,,,,
2,AFG,Asia,Afghanistan,01/10/2020,39268.0,14.0,17.571,1458.0,0.0,1.714,...,,37.746,0.5,64.83,0.511,41128772,,,,
3,AFG,Asia,Afghanistan,01/01/2021,52513.0,183.0,131.143,2201.0,12.0,9.429,...,,37.746,0.5,64.83,0.511,41128772,,,,
4,AFG,Asia,Afghanistan,01/04/2021,56517.0,63.0,41.571,2489.0,5.0,3.143,...,,37.746,0.5,64.83,0.511,41128772,,,,


In [3]:
# Summarise the frame's structure: row count, column names, non-null counts and dtypes.
# The non-null counts show which columns have gaps before missing values are handled.
covid_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3546 entries, 0 to 3545
Data columns (total 67 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   iso_code                                    3546 non-null   object 
 1   continent                                   3378 non-null   object 
 2   location                                    3546 non-null   object 
 3   date                                        3546 non-null   object 
 4   total_cases                                 3281 non-null   float64
 5   new_cases                                   3444 non-null   float64
 6   new_cases_smoothed                          3444 non-null   float64
 7   total_deaths                                3041 non-null   float64
 8   new_deaths                                  3444 non-null   float64
 9   new_deaths_smoothed                         3444 non-null   float64
 10  total_cases_

In [4]:
# Replace missing values with 0 in the numeric columns only, so arithmetic on them
# does not propagate NaN. Text columns are deliberately left untouched: a blanket
# fillna(0) would write the integer 0 into them (e.g. `continent`, which has missing
# entries), producing mixed-type columns and a bogus "0" category when grouping.
# NOTE(review): 0 is a placeholder, not a measurement. A missing smoking or poverty
# figure is not a true zero, so filter these out before computing statistics on them.
numeric_columns = covid_data.select_dtypes(include="number").columns
covid_data_filled = covid_data.fillna({column: 0 for column in numeric_columns})
covid_data_filled.head()

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,population,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
0,AFG,Asia,Afghanistan,01/04/2020,166.0,52.0,17.714,4.0,0.0,0.429,...,0.0,37.746,0.5,64.83,0.511,41128772,0.0,0.0,0.0,0.0
1,AFG,Asia,Afghanistan,01/07/2020,31359.0,121.0,268.286,735.0,2.0,16.714,...,0.0,37.746,0.5,64.83,0.511,41128772,0.0,0.0,0.0,0.0
2,AFG,Asia,Afghanistan,01/10/2020,39268.0,14.0,17.571,1458.0,0.0,1.714,...,0.0,37.746,0.5,64.83,0.511,41128772,0.0,0.0,0.0,0.0
3,AFG,Asia,Afghanistan,01/01/2021,52513.0,183.0,131.143,2201.0,12.0,9.429,...,0.0,37.746,0.5,64.83,0.511,41128772,0.0,0.0,0.0,0.0
4,AFG,Asia,Afghanistan,01/04/2021,56517.0,63.0,41.571,2489.0,5.0,3.143,...,0.0,37.746,0.5,64.83,0.511,41128772,0.0,0.0,0.0,0.0


In [6]:
# Columns retained for the analysis: place/date identifiers, per-capita outcome
# measures, and the demographic, economic and health indicators.
selected_columns = [
    'continent',
    'location',
    'date',
    'total_cases_per_million',
    'total_deaths_per_million',
    'hosp_patients_per_million',
    'total_vaccinations_per_hundred',
    'median_age',
    'aged_65_older',
    'gdp_per_capita',
    'extreme_poverty',
    'diabetes_prevalence',
    'female_smokers',
    'male_smokers',
    'handwashing_facilities',
]

# Narrow the filled frame down to just those columns, in the order listed above.
selected_df = covid_data_filled[selected_columns]

# Preview the first five rows of the subset.
selected_df.head()

Unnamed: 0,continent,location,date,total_cases_per_million,total_deaths_per_million,hosp_patients_per_million,total_vaccinations_per_hundred,median_age,aged_65_older,gdp_per_capita,extreme_poverty,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities
0,Asia,Afghanistan,01/04/2020,4.036,0.097,0.0,0.0,18.6,2.581,1803.987,0.0,9.59,0.0,0.0,37.746
1,Asia,Afghanistan,01/07/2020,762.459,17.871,0.0,0.0,18.6,2.581,1803.987,0.0,9.59,0.0,0.0,37.746
2,Asia,Afghanistan,01/10/2020,954.757,35.45,0.0,0.0,18.6,2.581,1803.987,0.0,9.59,0.0,0.0,37.746
3,Asia,Afghanistan,01/01/2021,1276.795,53.515,0.0,0.0,18.6,2.581,1803.987,0.0,9.59,0.0,0.0,37.746
4,Asia,Afghanistan,01/04/2021,1374.148,60.517,0.0,0.0,18.6,2.581,1803.987,0.0,9.59,0.0,0.0,37.746


In [12]:
# Map each raw snake_case column name to its presentation-style name.
new_column_names = {
    'continent': 'Continent',
    'location': 'Location',
    'date': 'Date',
    'total_cases_per_million': 'Total_Cases_per_Million',
    'total_deaths_per_million': 'Total_Deaths_per_Million',
    'hosp_patients_per_million': 'Hosp_Patients_per_Million',
    'total_vaccinations_per_hundred': 'Total_Vaccinations_per_Hundred',
    'median_age': 'Median_Age',
    'aged_65_older': 'Aged_65_Older',
    'gdp_per_capita': 'GDP_per_Capita',
    'extreme_poverty': 'Extreme_Poverty',
    'diabetes_prevalence': 'Diabetes_Prevalence',
    'female_smokers': 'Female_Smokers',
    'male_smokers': 'Male_Smokers',
    'handwashing_facilities': 'Handwashing_Facilities'
}

# Rename the columns and parse `Date` into real datetimes in the same step, so this
# cell produces a proper date column on a fresh kernel instead of depending on a
# conversion executed elsewhere. The raw strings are day-first (01/04/2020 is
# 1 April 2020), so the format is given explicitly rather than left to pandas'
# month-first default. Re-running is safe: an already-datetime column passes
# through to_datetime unchanged.
cleaned_covid_data = (
    selected_df
    .rename(columns=new_column_names)
    .assign(Date=lambda frame: pd.to_datetime(frame["Date"], format="%d/%m/%Y"))
)

# Display the first ten rows of the renamed DataFrame
cleaned_covid_data.head(10)


Unnamed: 0,Continent,Location,Date,Total_Cases_per_Million,Total_Deaths_per_Million,Hosp_Patients_per_Million,Total_Vaccinations_per_Hundred,Median_Age,Aged_65_Older,GDP_per_Capita,Extreme_Poverty,Diabetes_Prevalence,Female_Smokers,Male_Smokers,Handwashing_Facilities
0,Asia,Afghanistan,2020-04-01,4.036,0.097,0.0,0.0,18.6,2.581,1803.987,0.0,9.59,0.0,0.0,37.746
1,Asia,Afghanistan,2020-07-01,762.459,17.871,0.0,0.0,18.6,2.581,1803.987,0.0,9.59,0.0,0.0,37.746
2,Asia,Afghanistan,2020-10-01,954.757,35.45,0.0,0.0,18.6,2.581,1803.987,0.0,9.59,0.0,0.0,37.746
3,Asia,Afghanistan,2021-01-01,1276.795,53.515,0.0,0.0,18.6,2.581,1803.987,0.0,9.59,0.0,0.0,37.746
4,Asia,Afghanistan,2021-04-01,1374.148,60.517,0.0,0.0,18.6,2.581,1803.987,0.0,9.59,0.0,0.0,37.746
5,Asia,Afghanistan,2021-07-01,2922.917,120.645,0.0,0.0,18.6,2.581,1803.987,0.0,9.59,0.0,0.0,37.746
6,Asia,Afghanistan,2021-10-01,3770.329,175.206,0.0,0.0,18.6,2.581,1803.987,0.0,9.59,0.0,0.0,37.746
7,Asia,Afghanistan,2022-01-01,3843.027,178.853,0.0,0.0,18.6,2.581,1803.987,0.0,9.59,0.0,0.0,37.746
8,Asia,Afghanistan,2022-04-01,4321.719,186.487,0.0,0.0,18.6,2.581,1803.987,0.0,9.59,0.0,0.0,37.746
9,Asia,Afghanistan,2022-07-01,4437.964,187.776,0.0,0.0,18.6,2.581,1803.987,0.0,9.59,0.0,0.0,37.746
