In [11]:
import os

import pandas as pd
from sklearn.impute import SimpleImputer

# Load the CSV file (owid-covid-data.csv) into a dataframe.
# The file location can be overridden with the OWID_COVID_CSV environment
# variable so the notebook is not tied to a single machine; when the variable
# is unset, the original local path is used.
DATA_PATH = os.environ.get('OWID_COVID_CSV', 'C:/Users/Noor/Desktop/owid-covid-data.csv')
df = pd.read_csv(DATA_PATH)




In [12]:
# Report the dataframe's dimensions as a (rows, columns) tuple
n_rows_cols = df.shape
print("Number of rows and columns:", n_rows_cols)



Number of rows and columns: (309797, 67)


In [13]:
# Preview the data: the first 10 rows, then the last 5 rows.
# sep="\n" puts each label on its own line directly above its table.
print("First 10 rows:", df.head(10), sep="\n")
print("Last 5 rows:", df.tail(5), sep="\n")



First 10 rows:
  iso_code continent     location        date  total_cases  new_cases  \
0      AFG      Asia  Afghanistan  2020-01-03          NaN        0.0   
1      AFG      Asia  Afghanistan  2020-01-04          NaN        0.0   
2      AFG      Asia  Afghanistan  2020-01-05          NaN        0.0   
3      AFG      Asia  Afghanistan  2020-01-06          NaN        0.0   
4      AFG      Asia  Afghanistan  2020-01-07          NaN        0.0   
5      AFG      Asia  Afghanistan  2020-01-08          NaN        0.0   
6      AFG      Asia  Afghanistan  2020-01-09          NaN        0.0   
7      AFG      Asia  Afghanistan  2020-01-10          NaN        0.0   
8      AFG      Asia  Afghanistan  2020-01-11          NaN        0.0   
9      AFG      Asia  Afghanistan  2020-01-12          NaN        0.0   

   new_cases_smoothed  total_deaths  new_deaths  new_deaths_smoothed  ...  \
0                 NaN           NaN         0.0                  NaN  ...   
1                 NaN      

In [14]:
# Print descriptive statistics (count, mean, std, min, quartiles, max) of the data
summary_stats = df.describe()
print("Summary of data:", summary_stats, sep="\n")



Summary of data:
        total_cases     new_cases  new_cases_smoothed  total_deaths  \
count  2.736270e+05  3.009060e+05        2.996420e+05  2.529760e+05   
mean   5.720062e+06  1.078888e+04        1.083173e+04  8.034578e+04   
std    3.574208e+07  1.033029e+05        1.005335e+05  4.152550e+05   
min    1.000000e+00  0.000000e+00        0.000000e+00  1.000000e+00   
25%    6.514500e+03  0.000000e+00        1.143000e+00  1.210000e+02   
50%    6.209400e+04  1.700000e+01        3.828600e+01  1.225000e+03   
75%    6.351960e+05  5.290000e+02        6.352502e+02  1.057925e+04   
max    7.659025e+08  7.460827e+06        6.410668e+06  6.927365e+06   

          new_deaths  new_deaths_smoothed  total_cases_per_million  \
count  301000.000000        299770.000000            273627.000000   
mean       96.218953            96.588991             87308.498902   
std       602.347508           593.003157            137715.424666   
min         0.000000             0.000000                 0.000

In [15]:
# Drop the columns 'new_deaths_smoothed', 'new_cases_per_million' and 'total_cases_per_million'
columns_to_drop = ['new_deaths_smoothed', 'new_cases_per_million', 'total_cases_per_million']
df = df.drop(columns=columns_to_drop)

# Rename 'date' -> 'Date', 'location' -> 'Country', 'continent' -> 'Continent', 'iso_code' -> 'ISO_code'
column_renames = {
    'date': 'Date',
    'location': 'Country',
    'continent': 'Continent',
    'iso_code': 'ISO_code',
}
df = df.rename(columns=column_renames)

# List the distinct continent names (missing continents appear as nan)
continents = df['Continent'].unique()
print("Continent names:", continents, sep="\n")



Continent names:
['Asia' nan 'Europe' 'Africa' 'Oceania' 'North America' 'South America']


In [16]:
# Impute missing values with 0.
# Only the numeric columns are imputed: fitting the imputer on the whole frame
# (which also holds text columns such as 'Country' and 'Date') returns a single
# object-dtype array, so every column loses its numeric dtype and the integer 0
# gets written into text columns.
imputer = SimpleImputer(strategy='constant', fill_value=0)
numeric_cols = df.select_dtypes(include='number').columns
df = df.copy()  # work on a copy so the frame loaded earlier is not modified in place
df[numeric_cols] = imputer.fit_transform(df[numeric_cols])

# Group by 'Date' and 'Country' and calculate the sum of 'total_cases', 'total_deaths', 'total_vaccinations'
df3 = df.groupby(['Date', 'Country']).agg({
    'total_cases': 'sum',
    'total_deaths': 'sum',
    'total_vaccinations': 'sum'
}).reset_index()

# Rows whose total_deaths exceeds the threshold; computed once and reused below
DEATH_THRESHOLD = 1000000
over_threshold = df3[df3['total_deaths'] > DEATH_THRESHOLD]

# Find total countries where total_deaths is greater than 1000000
# NOTE(review): 'Continent' contains missing values, which suggests some
# 'Country' entries may be aggregates rather than individual countries --
# confirm whether those should be excluded from this count.
total_countries = over_threshold['Country'].nunique()
print("Total countries where total deaths is greater than 1000000:", total_countries)

# Find how many dates we have in total where total_deaths is greater than 1000000
total_dates = over_threshold['Date'].nunique()
print("Total dates where total deaths is greater than 1000000:", total_dates)


Total countries where total deaths is greater than 1000000: 10
Total dates where total deaths is greater than 1000000: 974
