In [18]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


In [69]:
# Read both input tables from CSV files in the working directory:
# the COVID-19 case table (a Kaggle dataset, per the original note) first,
# then the vaccinations-by-age table.
data, vaccinations_data = (
    pd.read_csv(csv_path)
    for csv_path in ('covid19_data.csv', 'vaccinationsByAge.csv')
)


In [70]:

# Aggregate the case table to one row per (Country/Region, WHO Region) pair.
# 'Date' is removed *before* aggregating and only numeric columns are summed:
# depending on the pandas version, a plain .sum() either concatenates text
# columns or drops them, and leftover text would break the later
# standardization step.
# NOTE(review): if the count columns are cumulative per date, summing over all
# dates overstates the totals -- confirm against the dataset description.
grouped_data = (
    data.drop(columns='Date')
    .groupby(['Country/Region', 'WHO Region'])
    .sum(numeric_only=True)
)

# Parse the vaccination 'date' column into datetime values.
vaccinations_data['date'] = pd.to_datetime(vaccinations_data['date'])

# Sum people_vaccinated_per_hundred within each age group (result is a Series
# indexed by age_group).
# NOTE(review): this adds up per-hundred rates across every row of an age
# group; the result is a relative score rather than a dose count -- confirm
# this is the intended measure.
grouped_data2 = vaccinations_data.groupby('age_group')['people_vaccinated_per_hundred'].sum()

In [86]:
# Share of every age group in the grand total of the summed
# people_vaccinated_per_hundred values, expressed in percent.
total_doses_sum = grouped_data2.sum()
grouped_data_percentage = grouped_data2.div(total_doses_sum).mul(100)

# The same per-age-group sums, ordered from largest to smallest.
grouped_data_sorted = grouped_data2.sort_values(ascending=False)



In [82]:
# Show the heading and the per-age-group sums, one after the other
print("Total Doses by Age Group:", grouped_data2, sep="\n")


Total Doses by Age Group:
age_group
0-15          0.00
0-17      26974.47
0-19      37228.07
0-4         183.97
0-5         407.06
0-9         263.02
10-14     64240.84
10-19      4644.74
100+      85325.98
12-15     43925.12
12-17     58691.33
12-64     56320.02
15-17    115399.24
16-17      5971.22
16-19     46758.00
16-69         0.45
18-24    210547.54
18-29     78275.28
18-69        66.50
20-29    123590.17
25-34      1834.36
25-49    218363.16
3-11      46155.78
30-39    221726.88
35-44      2522.17
40-49    232155.59
45-54      3556.57
5-11      13769.50
5-9       22736.20
50-54     11788.71
50-59    465331.80
55-59     12720.45
55-64      5310.69
60-64     12895.04
60-69    488841.71
65+       63101.87
65-69     12479.02
65-74      4622.24
70-74     12720.30
70-79    505239.22
75+        4262.48
75-79     13153.04
80+      343382.38
80-89    159251.11
90+       86272.53
90-99     65085.52
Name: people_vaccinated_per_hundred, dtype: float64


In [83]:
# Show the heading (preceded by a blank line) followed by the percentage shares
for printable in ("\nPercentage of Total Doses by Age Group:", grouped_data_percentage):
    print(printable)




Percentage of Total Doses by Age Group:
age_group
0-15      0.000000
0-17      0.676375
0-19      0.933481
0-4       0.004613
0-5       0.010207
0-9       0.006595
10-14     1.610817
10-19     0.116465
100+      2.139519
12-15     1.101407
12-17     1.471665
12-64     1.412205
15-17     2.893596
16-17     0.149726
16-19     1.172441
16-69     0.000011
18-24     5.279406
18-29     1.962725
18-69     0.001667
20-29     3.098980
25-34     0.045996
25-49     5.475380
3-11      1.157340
30-39     5.559724
35-44     0.063243
40-49     5.821220
45-54     0.089180
5-11      0.345265
5-9       0.570102
50-54     0.295598
50-59    11.668033
55-59     0.318961
55-64     0.133164
60-64     0.323339
60-69    12.257535
65+       1.582257
65-69     0.312907
65-74     0.115901
70-74     0.318957
70-79    12.668697
75+       0.106880
75-79     0.329808
80+       8.610193
80-89     3.993166
90+       2.163254
90-99     1.631997
Name: people_vaccinated_per_hundred, dtype: float64


In [84]:
# Show the heading (preceded by a blank line) and the age groups ordered by their sum
print("\nAge Groups Sorted by Total Doses:", grouped_data_sorted, sep="\n")


Age Groups Sorted by Total Doses:
age_group
70-79    505239.22
60-69    488841.71
50-59    465331.80
80+      343382.38
40-49    232155.59
30-39    221726.88
25-49    218363.16
18-24    210547.54
80-89    159251.11
20-29    123590.17
15-17    115399.24
90+       86272.53
100+      85325.98
18-29     78275.28
90-99     65085.52
10-14     64240.84
65+       63101.87
12-17     58691.33
12-64     56320.02
16-19     46758.00
3-11      46155.78
12-15     43925.12
0-19      37228.07
0-17      26974.47
5-9       22736.20
5-11      13769.50
75-79     13153.04
60-64     12895.04
55-59     12720.45
70-74     12720.30
65-69     12479.02
50-54     11788.71
16-17      5971.22
55-64      5310.69
10-19      4644.74
65-74      4622.24
75+        4262.48
45-54      3556.57
35-44      2522.17
25-34      1834.36
0-5         407.06
0-9         263.02
0-4         183.97
18-69        66.50
16-69         0.45
0-15          0.00
Name: people_vaccinated_per_hundred, dtype: float64


In [87]:
# Derive per-country metrics from the aggregated case counts.
grouped_data['ActiveCases'] = grouped_data['Confirmed'] - grouped_data['Recovered'] - grouped_data['Deaths']

# NOTE(review): grouped_data is indexed by (Country/Region, WHO Region), so
# each country group presumably holds a single row and these diffs come out as
# NaN. Real day-over-day figures would have to be derived from the dated rows
# in `data` -- confirm the intended definition before relying on these columns.
grouped_data['NewCases'] = grouped_data.groupby('Country/Region')['Confirmed'].diff()
grouped_data['NewDeaths'] = grouped_data.groupby('Country/Region')['Deaths'].diff()

# Deaths and recoveries as a percentage of confirmed cases.
grouped_data['MortalityRate'] = grouped_data['Deaths'] / grouped_data['Confirmed'] * 100
grouped_data['RecoveryRate'] = grouped_data['Recovered'] / grouped_data['Confirmed'] * 100

# Attach the percentage share as a real column next to the per-age-group sums.
# grouped_data2 starts out as a Series, where item assignment with a new label
# adds one entry instead of a column, so it is promoted to a DataFrame first
# (the isinstance guard keeps this cell safe to re-run).
# grouped_data_percentage is already scaled to percent, so it is not
# multiplied by 100 a second time.
if isinstance(grouped_data2, pd.Series):
    grouped_data2 = grouped_data2.to_frame()
grouped_data2['PercentageVaccinated'] = grouped_data_percentage



In [30]:
# Build the feature matrix for clustering / PCA and standardize it.
# - Columns written back by the later cluster/PCA cell are excluded, so
#   re-running this cell does not feed earlier results back in as features.
# - Only numeric columns are kept; +/-inf (e.g. from a division by zero
#   confirmed cases) is treated as missing.
# - Columns without any value are dropped and remaining gaps are filled with
#   the column mean (which standardizes to 0), because KMeans and PCA reject
#   NaN input.
# Rows are never dropped, so the result stays aligned with grouped_data.
feature_frame = (
    grouped_data.drop(columns=['Cluster', 'PC1', 'PC2'], errors='ignore')
    .select_dtypes(include='number')
    .replace([np.inf, -np.inf], np.nan)
    .dropna(axis=1, how='all')
)
feature_frame = feature_frame.fillna(feature_frame.mean())

scaler = StandardScaler()
scaled_data = scaler.fit_transform(feature_frame)


In [31]:
# Cluster the standardized rows into three groups with K-means.
# n_init is set explicitly: scikit-learn 1.4 changed its default from 10 to
# 'auto', so leaving it out makes the cluster assignment depend on the
# installed version. random_state fixes the centroid initialization.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=0)
kmeans.fit(scaled_data)

# One integer cluster id per row of scaled_data.
labels = kmeans.labels_



In [32]:
# Project the standardized features onto their first two principal components;
# the fitted estimator stays available as `pca`.
pca = PCA(n_components=2)
principal_components = pca.fit_transform(X=scaled_data)


In [33]:
# Store the cluster id of every row, then one column per principal component
grouped_data['Cluster'] = labels
for component_index, column_name in enumerate(('PC1', 'PC2')):
    grouped_data[column_name] = principal_components[:, component_index]

In [17]:
# Bar chart of the ten countries with the largest summed 'Confirmed' value,
# written to bar_plot.png.
top_10_countries = (
    grouped_data.groupby('Country/Region')['Confirmed']
    .sum()
    .nlargest(10)
)
top10_ax = top_10_countries.plot.bar(figsize=(12, 6))
top10_ax.set_xlabel('Country')
top10_ax.set_ylabel('Total Cases')
top10_ax.set_title('Top 10 Countries with the Most Total COVID-19 Cases')
plt.xticks(rotation=45)
top10_ax.figure.tight_layout()
top10_ax.figure.savefig('bar_plot.png')
# Release the figure once it has been written to disk
plt.close(top10_ax.figure)

In [14]:
# Scatter of 'Confirmed' against 'Deaths', one point per row of grouped_data,
# written to scatter_plot.png.
cases_ax = plt.gca()
cases_ax.scatter(grouped_data['Confirmed'], grouped_data['Deaths'])
cases_ax.set_xlabel('Total Cases')
cases_ax.set_ylabel('Total Deaths')
cases_ax.set_title('Total COVID-19 Cases vs. Total Deaths for All Countries')
cases_ax.figure.tight_layout()
cases_ax.figure.savefig('scatter_plot.png')
# Release the figure once it has been written to disk
plt.close(cases_ax.figure)

In [35]:
# Scatter of the two principal components, colored by cluster id,
# written to scatter_plot_PCA.png.
pca_ax = plt.gca()
pca_ax.scatter(
    grouped_data['PC1'],
    grouped_data['PC2'],
    c=grouped_data['Cluster'],
    cmap='viridis',
)
pca_ax.set_xlabel('Principal Component 1')
pca_ax.set_ylabel('Principal Component 2')
pca_ax.set_title('Principal Components Analysis')
pca_ax.figure.savefig('scatter_plot_PCA.png')
# Release the figure once it has been written to disk
plt.close(pca_ax.figure)

In [37]:
# Bar chart of the summed 'Confirmed' value per WHO region,
# written to bar_plot_WHO.png.
total_cases_by_region = grouped_data.groupby('WHO Region')['Confirmed'].sum()
region_ax = total_cases_by_region.plot.bar()
region_ax.set_xlabel('WHO Region')
region_ax.set_ylabel('Total Cases')
region_ax.set_title('Total COVID-19 Cases by WHO Region')
plt.xticks(rotation=45)
region_ax.figure.savefig('bar_plot_WHO.png')
# Release the figure once it has been written to disk
plt.close(region_ax.figure)

In [89]:
! pip install openpyxl


5884.37s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


Defaulting to user installation because normal site-packages is not writeable
Collecting openpyxl
  Downloading openpyxl-3.1.2-py2.py3-none-any.whl (249 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.0/250.0 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.1.0 openpyxl-3.1.2


In [90]:
# Export both result tables, each as a CSV file and as an Excel workbook.
# The index is written as well, because it carries the group labels.
for result_table, csv_target, excel_target in (
    (grouped_data, 'data_analysis_covid19.csv', 'data_analysis_covid19.xlsx'),
    (grouped_data2, 'data_analysis_vaccinations.csv', 'data_analysis_vaccinations.xlsx'),
):
    result_table.to_csv(csv_target, index=True)
    result_table.to_excel(excel_target, index=True)