## Option 3 - Spread of Covid 19

Look at this document for reference,
https://www.kaggle.com/anandhuh/covid19-in-world-countrieslatest-data

In [1]:
# (1) Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns 

In [2]:
# (2) Import the CSV file
covid19Df = pd.read_csv('./worldwide covid data.csv')

In [3]:
# (3) Explore data (for example, see what is categorial and numerical)
covid19Df.head()

Unnamed: 0,Country,Total Cases,Total Deaths,Total Recovered,Active Cases,Total Cases/1M population,Deaths/1M population,Total Tests,Tests/1M population,Population
0,Afghanistan,156307,7281,128791.0,20235.0,3898,182,774655.0,19319.0,40097200
1,Albania,186222,2937,175451.0,7834.0,64805,1022,1311540.0,456411.0,2873596
2,Algeria,206649,5927,141811.0,58911.0,4602,132,230861.0,5141.0,44907419
3,Andorra,15516,130,15242.0,144.0,200387,1679,193595.0,2500258.0,77430
4,Angola,64487,1713,53376.0,9398.0,1884,50,1092363.0,31915.0,34227629


In [4]:
# Check data types
covid19Df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 196 entries, 0 to 195
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Country                    196 non-null    object 
 1   Total Cases                196 non-null    int64  
 2   Total Deaths               196 non-null    int64  
 3   Total Recovered            194 non-null    float64
 4   Active Cases               194 non-null    float64
 5   Total Cases/1M population  196 non-null    int64  
 6   Deaths/1M population       196 non-null    int64  
 7   Total Tests                192 non-null    float64
 8   Tests/1M population        192 non-null    float64
 9   Population                 196 non-null    int64  
dtypes: float64(4), int64(5), object(1)
memory usage: 15.4+ KB


In [5]:
# (4) Choose the label and features
LabelDf = covid19Df['Deaths/1M population']
featuresDf = covid19Df.drop('Deaths/1M population', axis=1,inplace =False)

In [6]:
# (5) Feature engineer for data that is
#(a) relevant
# Droping irrelevant features 
featuresDf = covid19Df.drop(['Country','Total Cases','Total Deaths','Total Recovered',
                             'Total Tests', 'Population', 'Active Cases'], axis=1,inplace =False)


In [7]:
# (5) Feature engineer for data that is
# (b) unique
# Prove No Duplicates
print("Before Dropping Duplicates : ",covid19Df.shape,'\n')
covid19Df=covid19Df.drop_duplicates()
print("\nAfter Dropping Duplicates : ",covid19Df.shape)

Before Dropping Duplicates :  (196, 10) 


After Dropping Duplicates :  (196, 10)


In [8]:
# (5) Feature engineer for data that is
# (c) correct 
#Make sure no value is negative 
#Finding Negative Values and Capping To 0.
for col in covid19Df.columns:
    if(col=='Country'):
        continue
    for j in range(len(covid19Df)):
        if(covid19Df[col][j]<0):
            covid19Df[col][j]=0

In [9]:
# (5) Feature engineer for data that is
# (d) not missing
# First check the number of NULLS 
covid19Df.isna().sum()

Country                      0
Total Cases                  0
Total Deaths                 0
Total Recovered              2
Active Cases                 2
Total Cases/1M population    0
Deaths/1M population         0
Total Tests                  4
Tests/1M population          4
Population                   0
dtype: int64

In [10]:
covid19Df.fillna(0,inplace=True)

In [11]:
covid19Df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 196 entries, 0 to 195
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Country                    196 non-null    object 
 1   Total Cases                196 non-null    int64  
 2   Total Deaths               196 non-null    int64  
 3   Total Recovered            196 non-null    float64
 4   Active Cases               196 non-null    float64
 5   Total Cases/1M population  196 non-null    int64  
 6   Deaths/1M population       196 non-null    int64  
 7   Total Tests                196 non-null    float64
 8   Tests/1M population        196 non-null    float64
 9   Population                 196 non-null    int64  
dtypes: float64(4), int64(5), object(1)
memory usage: 20.9+ KB


In [12]:
featuresDf.head()

Unnamed: 0,Total Cases/1M population,Deaths/1M population,Tests/1M population
0,3898,182,19319.0
1,64805,1022,456411.0
2,4602,132,5141.0
3,200387,1679,2500258.0
4,1884,50,31915.0


In [16]:
# Covert float64 into int64
covid19Df['Total Recovered'] = covid19Df['Total Recovered'].apply(np.int64)
covid19Df['Active Cases'] = covid19Df['Active Cases'].apply(np.int64)
covid19Df['Total Tests'] = covid19Df['Total Tests'].apply(np.int64)
covid19Df['Tests/1M population'] = covid19Df['Tests/1M population'].apply(np.int64)

covid19Df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 196 entries, 0 to 195
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Country                    196 non-null    object
 1   Total Cases                196 non-null    int64 
 2   Total Deaths               196 non-null    int64 
 3   Total Recovered            196 non-null    int64 
 4   Active Cases               196 non-null    int64 
 5   Total Cases/1M population  196 non-null    int64 
 6   Deaths/1M population       196 non-null    int64 
 7   Total Tests                196 non-null    int64 
 8   Tests/1M population        196 non-null    int64 
 9   Population                 196 non-null    int64 
dtypes: int64(9), object(1)
memory usage: 20.9+ KB


In [14]:
# (6) Confirm data is ready with further exploratory analysis


In [15]:
# (7) Training, Testing (and/or Validation) data split 

# for example, 60/20/20