# Exploratory Data Analysis (EDA) for COVID-19 Datasets

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load datasets
# Every dataset is stored as data/<name>.csv, so each path is derived from the
# dataset name instead of spelling every name out twice (key and path).
dataset_names = [
    'Comprehensive_Global_COVID-19_Dataset',
    'Covid19-TestingRecord',
    'Covid19-VariantsFound',
    'Vaccinations_ByCountry_ByManufacturer',
    'Vaccinations_ByCountry',
]
datasets = {name: pd.read_csv(f'data/{name}.csv') for name in dataset_names}

# Display first few rows of each dataset
for name, df in datasets.items():
    print(f'## {name} (First 5 rows)')
    display(df.head())
    print('\n---\n')

## Comprehensive_Global_COVID-19_Dataset (First 5 rows)


Unnamed: 0,S. No.,Country Name,Cases,Deaths,Recovered
0,1.0,Peru,4524748.0,220831.0,4303917.0
1,2.0,Bulgaria,1329266.0,38700.0,1290566.0
2,3.0,Bosnia and Herzegovina,403638.0,16388.0,387250.0
3,4.0,Hungary,2230381.0,49051.0,2181330.0
4,5.0,North Macedonia,350589.0,9977.0,340612.0



---

## Covid19-TestingRecord (First 5 rows)


Unnamed: 0,Entity,ISO code,Date,Source URL,Source label,Notes,Daily change in cumulative total,Cumulative total,Cumulative total per thousand,Daily change in cumulative total per thousand,7-day smoothed daily change,7-day smoothed daily change per thousand,Short-term positive rate,Short-term tests per case
0,Albania - tests performed,ALB,2020-02-25,https://shendetesia.gov.al/koronavirusi-mshms-...,Ministry of Health and Social Protection,,8.0,8.0,0.003,0.003,,,,
1,Albania - tests performed,ALB,2020-02-26,https://shendetesia.gov.al/fond-shtese-per-mas...,Ministry of Health and Social Protection,,5.0,13.0,0.005,0.002,,,,
2,Albania - tests performed,ALB,2020-02-27,https://shendetesia.gov.al/ministria-e-shendet...,Ministry of Health and Social Protection,,4.0,17.0,0.006,0.001,,,,
3,Albania - tests performed,ALB,2020-02-28,http://shendetesia.gov.al/manastirliu-asnje-ra...,Ministry of Health and Social Protection,,1.0,18.0,0.006,0.0,,,,
4,Albania - tests performed,ALB,2020-02-29,https://shendetesia.gov.al/ministria-e-shendet...,Ministry of Health and Social Protection,,8.0,26.0,0.009,0.003,,,,



---

## Covid19-VariantsFound (First 5 rows)


Unnamed: 0,location,date,variant,num_sequences,perc_sequences,num_sequences_total
0,Angola,2020-07-06,Alpha,0,0.0,3
1,Angola,2020-07-06,B.1.1.277,0,0.0,3
2,Angola,2020-07-06,B.1.1.302,0,0.0,3
3,Angola,2020-07-06,B.1.1.519,0,0.0,3
4,Angola,2020-07-06,B.1.160,0,0.0,3



---

## Vaccinations_ByCountry_ByManufacturer (First 5 rows)


Unnamed: 0,location,date,vaccine,total_vaccinations
0,Argentina,2020-12-29,Moderna,2
1,Argentina,2020-12-29,Oxford/AstraZeneca,3
2,Argentina,2020-12-29,Sinopharm/Beijing,1
3,Argentina,2020-12-29,Sputnik V,20481
4,Argentina,2020-12-30,Moderna,2



---

## Vaccinations_ByCountry (First 5 rows)


Unnamed: 0,country,iso_code,date,total_vaccinations,people_vaccinated,people_fully_vaccinated,daily_vaccinations_raw,daily_vaccinations,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,daily_vaccinations_per_million,vaccines,source_name,source_website
0,Afghanistan,AFG,2021-02-22,0.0,0.0,,,,0.0,0.0,,,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi...",World Health Organization,https://covid19.who.int/
1,Afghanistan,AFG,2021-02-23,,,,,1367.0,,,,34.0,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi...",World Health Organization,https://covid19.who.int/
2,Afghanistan,AFG,2021-02-24,,,,,1367.0,,,,34.0,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi...",World Health Organization,https://covid19.who.int/
3,Afghanistan,AFG,2021-02-25,,,,,1367.0,,,,34.0,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi...",World Health Organization,https://covid19.who.int/
4,Afghanistan,AFG,2021-02-26,,,,,1367.0,,,,34.0,"Johnson&Johnson, Oxford/AstraZeneca, Pfizer/Bi...",World Health Organization,https://covid19.who.int/



---



In [3]:
# Number of rows per 'Country Name' in the comprehensive dataset, largest count first.
(
    datasets['Comprehensive_Global_COVID-19_Dataset']
    .groupby('Country Name')
    .size()
    .reset_index(name='Counts')
    .sort_values('Counts', ascending=False)
)

Unnamed: 0,Country Name,Counts
0,Afghanistan,1
1,Albania,1
2,Algeria,1
3,American Samoa,1
4,Andorra,1
...,...,...
225,Vietnam,1
226,Wallis and Futuna,1
227,Yemen,1
228,Zambia,1


In [4]:
# 'Entity' values look like '<country> - <description>'; keep the text before
# the first ' - ' separator and store it as a new 'country' column on the
# testing dataset (added in place, so later cells can use it).
testing_df = datasets['Covid19-TestingRecord']
entity_parts = testing_df['Entity'].str.split(' - ')
testing_df['country'] = entity_parts.str[0]

# Number of rows per derived country, largest count first.
(
    testing_df
    .groupby('country')
    .size()
    .reset_index(name='Counts')
    .sort_values('Counts', ascending=False)
)

Unnamed: 0,country,Counts
60,Italy,1255
102,Poland,1236
22,Canada,1006
81,Mexico,708
3,Argentina,704
...,...,...
21,Cambodia,56
70,Lebanon,48
25,China,2
2,Antigua and Barbuda,1


In [5]:
# Number of rows per 'location' in the variants dataset, largest count first.
variants_by_location = datasets['Covid19-VariantsFound'].groupby('location')
variants_by_location.size().reset_index(name='Counts').sort_values('Counts', ascending=False)

Unnamed: 0,location,Counts
7,Belgium,1080
6,Bangladesh,1080
34,France,1080
77,Netherlands,1080
69,Mexico,1080
...,...,...
31,Ethiopia,384
71,Monaco,360
9,Benin,336
32,Fiji,336


In [6]:
# Number of rows per 'location' in the per-manufacturer vaccination dataset,
# largest count first.
manufacturer_rows = datasets['Vaccinations_ByCountry_ByManufacturer'].groupby('location').size()
manufacturer_rows.reset_index(name='Counts').sort_values('Counts', ascending=False)

Unnamed: 0,location,Counts
11,European Union,4122
0,Argentina,2629
14,Germany,2290
36,South Korea,1985
21,Latvia,1664
7,Czechia,1661
13,France,1652
19,Italy,1639
4,Chile,1493
40,Ukraine,1491


In [7]:
# Number of rows per 'country' in the vaccination dataset, largest count first.
(
    datasets['Vaccinations_ByCountry']
    .groupby('country')
    .size()
    .reset_index(name='Counts')
    .sort_values('Counts', ascending=False)
)

Unnamed: 0,country,Counts
149,Norway,482
110,Latvia,480
54,Denmark,476
212,United States,471
163,Russia,470
...,...,...
24,Bonaire Sint Eustatius and Saba,146
200,Tokelau,114
165,Saint Helena,92
158,Pitcairn,85


In [20]:
# Create a function to check country presence percentage in each dataset
def check_country_presence(country):
    """Return the share of rows (in percent) that match ``country`` in each dataset.

    Reads the module-level ``datasets`` dict. For every dataset the rows whose
    country column equals ``country`` are counted and divided by the total
    number of rows in that dataset.

    Args:
        country: Value compared for equality against each dataset's country column.

    Returns:
        dict with the keys 'Comprehensive', 'Testing', 'Variants',
        'Vacc_Manufacturer' and 'Vaccinations' (percentages), plus 'Mean',
        the arithmetic mean of those five percentages.
    """
    # (result label, key in ``datasets``, column holding the country name)
    sources = (
        ('Comprehensive', 'Comprehensive_Global_COVID-19_Dataset', 'Country Name'),
        ('Testing', 'Covid19-TestingRecord', 'country'),
        ('Variants', 'Covid19-VariantsFound', 'location'),
        ('Vacc_Manufacturer', 'Vaccinations_ByCountry_ByManufacturer', 'location'),
        ('Vaccinations', 'Vaccinations_ByCountry', 'country'),
    )

    presence = {}
    for label, dataset_key, country_column in sources:
        frame = datasets[dataset_key]
        matching_rows = (frame[country_column] == country).sum()
        presence[label] = matching_rows / len(frame) * 100

    # The right-hand side is evaluated before 'Mean' is inserted, so the
    # average covers only the five per-dataset percentages.
    presence['Mean'] = sum(presence.values()) / len(presence)
    return presence

# Create presence data for each country
# Get all unique country names from each dataset. Missing values are dropped
# so that a NaN entry never ends up as a "country" row in the result.
country_columns = [
    ('Comprehensive_Global_COVID-19_Dataset', 'Country Name'),
    ('Covid19-TestingRecord', 'country'),
    ('Covid19-VariantsFound', 'location'),
    ('Vaccinations_ByCountry_ByManufacturer', 'location'),
    ('Vaccinations_ByCountry', 'country'),
]
all_countries = set()
for dataset_key, column in country_columns:
    all_countries.update(datasets[dataset_key][column].dropna().unique())

# Calculate presence data for each country.
# Iterate in sorted order: the iteration order of a set of strings can differ
# between interpreter sessions (hash randomisation), which would otherwise
# shuffle the rows - and therefore the order of tied countries - on every
# kernel restart. key=str keeps the sort from failing on non-string labels.
presence_data = {
    country: check_country_presence(country)
    for country in sorted(all_countries, key=str)
}

# Convert to DataFrame
presence_df = pd.DataFrame.from_dict(presence_data, orient='index')

# Sort by mean presence; the stable sort keeps tied countries in the
# alphabetical order established above.
presence_df = presence_df.sort_values('Mean', ascending=False, kind='mergesort')

# Round all values to 2 decimal places
presence_df = presence_df.round(2)

# Display the results
presence_df

Unnamed: 0,Comprehensive,Testing,Variants,Vacc_Manufacturer,Vaccinations,Mean
European Union,0.42,0.00,0.00,11.57,0.00,2.40
Argentina,0.42,0.95,1.05,7.38,0.53,2.07
Germany,0.42,0.85,1.05,6.43,0.53,1.86
South Korea,0.42,0.88,1.05,5.57,0.46,1.68
Italy,0.42,1.69,1.05,4.60,0.53,1.66
...,...,...,...,...,...,...
Tokelau,0.00,0.00,0.00,0.00,0.13,0.03
Saint Helena,0.00,0.00,0.00,0.00,0.11,0.02
Pitcairn,0.00,0.00,0.00,0.00,0.10,0.02
Falkland Islands,0.00,0.00,0.00,0.00,0.08,0.02


In [21]:
# Create a list of countries to check
countries_to_check = ['Belgium', 'France', 'Ireland', 'Luxembourg', 'Monaco', 'Netherlands', 'United Kingdom']

# Sort presence_df by Mean in descending order (stable sort, so tied rows keep
# their incoming order) and move the country labels into an 'index' column.
ranked_df = presence_df.sort_values('Mean', ascending=False, kind='mergesort').reset_index()

# Rank by the Mean value itself rather than by row position: countries with
# the same Mean share the same rank (1, 2, 2, 4, ...), so the rank no longer
# depends on the arbitrary order of tied rows. Missing means go to the bottom.
ranked_df['Rank'] = (
    ranked_df['Mean']
    .rank(method='min', ascending=False, na_option='bottom')
    .astype(int)
)

# Filter for the specified countries
result = ranked_df[ranked_df['index'].isin(countries_to_check)][['index', 'Mean', 'Rank']]
# Secondary sort on the country name gives tied ranks a fixed display order.
result = result.rename(columns={'index': 'Country'}).sort_values(['Rank', 'Country'])

# Display results
print("Rankings out of", len(presence_df), "countries:")
display(result)

Rankings out of 254 countries:


Unnamed: 0,Country,Mean,Rank
5,France,1.49,6
19,Belgium,0.76,20
22,Netherlands,0.73,23
24,Luxembourg,0.7,25
38,Ireland,0.62,39
47,United Kingdom,0.57,48
138,Monaco,0.24,139


In [9]:
# Summary statistics and missing values analysis
# The tables are shown with display() rather than print(), matching the
# preview cell that calls display(df.head()), so frames are rendered through
# the notebook's rich output instead of being printed as plain text.
for name, df in datasets.items():
    print(f'## {name} Summary Statistics and Missing Values')
    display(df.describe(include='all'))
    print('\nMissing Values:\n')
    # One row per column with its count of null entries.
    display(df.isnull().sum().to_frame('Missing'))
    print('\n---\n')

## Comprehensive_Global_COVID-19_Dataset Summary Statistics and Missing Values
            S. No. Country Name         Cases        Deaths     Recovered
count   230.000000          230  2.300000e+02  2.300000e+02  2.300000e+02
unique         NaN          230           NaN           NaN           NaN
top            NaN         Peru           NaN           NaN           NaN
freq           NaN            1           NaN           NaN           NaN
mean    115.495652          NaN  4.234399e+06  3.626242e+04  4.198137e+06
std      66.533621          NaN  1.663905e+07  1.348604e+05  1.652371e+07
min       1.000000          NaN  1.403000e+03  1.000000e+00  1.395000e+03
25%      58.250000          NaN  3.055700e+04  1.902500e+02  3.024025e+04
50%     115.500000          NaN  2.246960e+05  2.147500e+03  2.227915e+05
75%     172.750000          NaN  1.363680e+06  1.636725e+04  1.353812e+06
max     230.000000          NaN  1.856361e+08  1.261370e+06  1.843747e+08

Missing Values:

S. No.         