### **Imports**

In [None]:
import time
start_time = time.time()
import sklearn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import io
import math
import string
import os

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score

import warnings
warnings.filterwarnings('ignore')

### **Dataset Upload**

In [None]:
# NOTE(review): `files` is imported but not used in this cell.
from google.colab import files , drive

# Mount Google Drive at /content/gdrive so files stored there can be read.
drive.mount('/content/gdrive')
# Path of the CSV inside the mounted drive; `dataset` is passed to pd.read_csv in a later cell.
dataset = "/content/gdrive/My Drive/Colab Notebooks/covid_data.csv"

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
"""
#alternative for local file upload (file needs to be downloaded locally)
from google.colab import files
uploaded = files.upload()
df = pd.read_csv(io.BytesIO(uploaded['covid_data.csv']))
"""

"\n#alternative for local file upload (file needs to be downloaded locally)\nfrom google.colab import files\nuploaded = files.upload()\ndf = pd.read_csv(io.BytesIO(uploaded['covid_data.csv']))\n"

### **Data Upload Check**

In [None]:
df = pd.read_csv(dataset) #this line is omitted when the alternative (local upload) path is used

In [None]:
df.shape #(rows, columns) of the loaded frame

(67166, 59)

In [None]:
df.head(5) #preview of the first five rows

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,new_cases_per_million,new_cases_smoothed_per_million,total_deaths_per_million,new_deaths_per_million,new_deaths_smoothed_per_million,reproduction_rate,icu_patients,icu_patients_per_million,hosp_patients,hosp_patients_per_million,weekly_icu_admissions,weekly_icu_admissions_per_million,weekly_hosp_admissions,weekly_hosp_admissions_per_million,new_tests,total_tests,total_tests_per_thousand,new_tests_per_thousand,new_tests_smoothed,new_tests_smoothed_per_thousand,positive_rate,tests_per_case,tests_units,total_vaccinations,people_vaccinated,people_fully_vaccinated,new_vaccinations,new_vaccinations_smoothed,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,new_vaccinations_smoothed_per_million,stringency_index,population,population_density,median_age,aged_65_older,aged_70_older,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index
0,AFG,Asia,Afghanistan,2020-02-24,1.0,1.0,,,,,0.026,0.026,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,8.33,38928341.0,54.422,18.6,2.581,1.337,1803.987,,597.029,9.59,,,37.746,0.5,64.83,0.498
1,AFG,Asia,Afghanistan,2020-02-25,1.0,0.0,,,,,0.026,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,8.33,38928341.0,54.422,18.6,2.581,1.337,1803.987,,597.029,9.59,,,37.746,0.5,64.83,0.498
2,AFG,Asia,Afghanistan,2020-02-26,1.0,0.0,,,,,0.026,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,8.33,38928341.0,54.422,18.6,2.581,1.337,1803.987,,597.029,9.59,,,37.746,0.5,64.83,0.498
3,AFG,Asia,Afghanistan,2020-02-27,1.0,0.0,,,,,0.026,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,8.33,38928341.0,54.422,18.6,2.581,1.337,1803.987,,597.029,9.59,,,37.746,0.5,64.83,0.498
4,AFG,Asia,Afghanistan,2020-02-28,1.0,0.0,,,,,0.026,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,8.33,38928341.0,54.422,18.6,2.581,1.337,1803.987,,597.029,9.59,,,37.746,0.5,64.83,0.498


In [None]:
df.nunique() #number of distinct values in each column

iso_code                                   192
continent                                    6
location                                   200
date                                       403
total_cases                              34313
new_cases                                 9391
new_cases_smoothed                       17775
total_deaths                             12928
new_deaths                                2305
new_deaths_smoothed                       5143
total_cases_per_million                  48425
new_cases_per_million                    27581
new_cases_smoothed_per_million           31234
total_deaths_per_million                 28379
new_deaths_per_million                    5548
new_deaths_smoothed_per_million           6782
reproduction_rate                          356
icu_patients                              1938
icu_patients_per_million                  3956
hosp_patients                             3696
hosp_patients_per_million                 6291
weekly_icu_ad

In [None]:
df.dtypes.value_counts() #how many columns there are of each dtype

float64    54
object      5
dtype: int64

In [None]:
df.dtypes #dtype of every column

iso_code                                  object
continent                                 object
location                                  object
date                                      object
total_cases                              float64
new_cases                                float64
new_cases_smoothed                       float64
total_deaths                             float64
new_deaths                               float64
new_deaths_smoothed                      float64
total_cases_per_million                  float64
new_cases_per_million                    float64
new_cases_smoothed_per_million           float64
total_deaths_per_million                 float64
new_deaths_per_million                   float64
new_deaths_smoothed_per_million          float64
reproduction_rate                        float64
icu_patients                             float64
icu_patients_per_million                 float64
hosp_patients                            float64
hosp_patients_per_mi

### **Data Preprocessing**

In [None]:
# Remove every column whose name carries the `_smoothed` marker.
smoothed_columns = [
    'new_cases_smoothed',
    'new_deaths_smoothed',
    'new_cases_smoothed_per_million',
    'new_deaths_smoothed_per_million',
    'new_tests_smoothed_per_thousand',
    'new_tests_smoothed',
    'new_vaccinations_smoothed',
    'new_vaccinations_smoothed_per_million',
]
df.drop(columns=smoothed_columns, inplace=True)

In [None]:
# Keep only the rows whose continent is Europe.
is_europe = df['continent'] == 'Europe'
df = df[is_europe]

In [None]:
df.continent.value_counts() #sanity check: only Europe remains (entries reduced from 67K to 16K)

Europe    16203
Name: continent, dtype: int64

In [None]:
"""
Loop to select the countries with the most common entries
INPUT: series of countries with number of entries
OUTPUT: list of the most common countries with the number of entries
"""

# Number of locations to keep, ranked by how many rows each location has.
TOP_N_COUNTRIES = 10

# Count the rows per location once (value_counts sorts in descending order) and
# take the leading labels. head() simply returns fewer labels when the frame has
# fewer than TOP_N_COUNTRIES distinct locations, instead of raising IndexError.
location_counts = df.location.value_counts()
theMostCommonCountry_List = [
    str(country) for country in location_counts.head(TOP_N_COUNTRIES).index
]
print(theMostCommonCountry_List)

['France', 'Switzerland', 'Germany', 'Finland', 'Italy', 'United Kingdom', 'Russia', 'Sweden', 'Estonia', 'Spain']


In [None]:
# Keep only the rows that belong to one of the selected most-common countries.
in_selected_countries = df['location'].isin(theMostCommonCountry_List)
df = df[in_selected_countries]

In [None]:
# Renumber the rows from 0 without keeping the old index as a column, and discard
# `continent`, which is no longer needed.
df = df.reset_index(drop=True).drop(columns=['continent'])
df

Unnamed: 0,iso_code,location,date,total_cases,new_cases,total_deaths,new_deaths,total_cases_per_million,new_cases_per_million,total_deaths_per_million,new_deaths_per_million,reproduction_rate,icu_patients,icu_patients_per_million,hosp_patients,hosp_patients_per_million,weekly_icu_admissions,weekly_icu_admissions_per_million,weekly_hosp_admissions,weekly_hosp_admissions_per_million,new_tests,total_tests,total_tests_per_thousand,new_tests_per_thousand,positive_rate,tests_per_case,tests_units,total_vaccinations,people_vaccinated,people_fully_vaccinated,new_vaccinations,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,stringency_index,population,population_density,median_age,aged_65_older,aged_70_older,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index
0,EST,Estonia,2020-02-01,,,,,,,,,,,,1.0,0.754,,,,,,,,,,,,,,,,,,,0.0,1326539.0,31.033,42.7,19.452,13.491,29481.252,0.5,255.569,4.02,24.5,39.3,,4.69,78.74,0.871
1,EST,Estonia,2020-02-02,,,,,,,,,,,,1.0,0.754,,,,,,,,,,,,,,,,,,,0.0,1326539.0,31.033,42.7,19.452,13.491,29481.252,0.5,255.569,4.02,24.5,39.3,,4.69,78.74,0.871
2,EST,Estonia,2020-02-03,,,,,,,,,,,,1.0,0.754,,,,,,,,,,,,,,,,,,,0.0,1326539.0,31.033,42.7,19.452,13.491,29481.252,0.5,255.569,4.02,24.5,39.3,,4.69,78.74,0.871
3,EST,Estonia,2020-02-04,,,,,,,,,,,,1.0,0.754,,,,,,,,,,,,,,,,,,,0.0,1326539.0,31.033,42.7,19.452,13.491,29481.252,0.5,255.569,4.02,24.5,39.3,,4.69,78.74,0.871
4,EST,Estonia,2020-02-05,,,,,,,,,,,,1.0,0.754,,,,,,,,,,,,,,,,,,,0.0,1326539.0,31.033,42.7,19.452,13.491,29481.252,0.5,255.569,4.02,24.5,39.3,,4.69,78.74,0.871
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3742,GBR,United Kingdom,2021-02-02,3863757.0,16906.0,108225.0,1451.0,56915.369,249.035,1594.217,21.374,,3638.0,53.590,31670.0,466.517,,,,,606382.0,71642534.0,1055.336,8.932,0.036,28.1,tests performed,10520433.0,10021471.0,498962.0,376922.0,15.50,14.76,0.73,,67886004.0,272.898,40.8,18.517,12.527,39753.244,0.2,122.137,4.28,20.0,24.7,,2.54,81.32,0.922
3743,GBR,United Kingdom,2021-02-03,3882972.0,19215.0,109547.0,1322.0,57198.418,283.048,1613.691,19.474,,3625.0,53.398,,,,,,,801949.0,72464146.0,1067.439,11.813,0.034,29.5,tests performed,10992444.0,10490487.0,501957.0,472011.0,16.19,15.45,0.74,,67886004.0,272.898,40.8,18.517,12.527,39753.244,0.2,122.137,4.28,20.0,24.7,,2.54,81.32,0.922
3744,GBR,United Kingdom,2021-02-04,3903706.0,20734.0,110462.0,915.0,57503.841,305.424,1627.169,13.478,,,,,,,,,,,,,,,,,11477040.0,10971047.0,505993.0,484596.0,16.91,16.16,0.75,,67886004.0,272.898,40.8,18.517,12.527,39753.244,0.2,122.137,4.28,20.0,24.7,,2.54,81.32,0.922
3745,GBR,United Kingdom,2021-02-05,3922910.0,19204.0,111477.0,1015.0,57786.727,282.886,1642.121,14.952,,,,,,,,,,,,,,,,,11975267.0,11465210.0,510057.0,498227.0,17.64,16.89,0.75,,67886004.0,272.898,40.8,18.517,12.527,39753.244,0.2,122.137,4.28,20.0,24.7,,2.54,81.32,0.922


In [None]:
"""
Loop for checking which features are object type
INPUT: pandas series
OUTPUT: list of objects
"""
# Gather the names of all columns whose dtype is the generic `object` dtype.
list_of_objects = [
    column_name
    for column_name, column_dtype in df.dtypes.items()
    if column_dtype == object
]
print(list_of_objects)

['iso_code', 'location', 'date', 'tests_units']


In [None]:
# Replace the values of each listed column with integer codes; every column gets
# its own freshly fitted encoder.
for text_column in ('iso_code', 'location', 'date'):
    df[text_column] = LabelEncoder().fit_transform(df[text_column])

In [None]:
# Rows without a reported test unit get the explicit label 'Not Tested'.
# The filled Series is assigned back to the column on purpose: calling
# fillna(..., inplace=True) on the Series returned by `df.tests_units` works on
# an intermediate object and is not guaranteed to update `df` (under pandas
# Copy-on-Write it never does).
df['tests_units'] = df['tests_units'].fillna('Not Tested')
df.tests_units.value_counts()

tests performed    2877
Not Tested          605
people tested       265
Name: tests_units, dtype: int64

In [None]:
# Collapse both reporting units into the single category 'Tested'.
tested_units = ['tests performed', 'people tested']
df['tests_units'] = df['tests_units'].replace(tested_units, 'Tested')
df['tests_units'].value_counts()

Tested        3142
Not Tested     605
Name: tests_units, dtype: int64

In [None]:
# Encode the test-unit categories as integer labels.
tests_units_encoder = LabelEncoder()
df['tests_units'] = tests_units_encoder.fit_transform(df['tests_units'])

In [None]:
df.dtypes #re-check the column dtypes after the encoding steps

iso_code                                 int64
location                                 int64
date                                     int64
total_cases                            float64
new_cases                              float64
total_deaths                           float64
new_deaths                             float64
total_cases_per_million                float64
new_cases_per_million                  float64
total_deaths_per_million               float64
new_deaths_per_million                 float64
reproduction_rate                      float64
icu_patients                           float64
icu_patients_per_million               float64
hosp_patients                          float64
hosp_patients_per_million              float64
weekly_icu_admissions                  float64
weekly_icu_admissions_per_million      float64
weekly_hosp_admissions                 float64
weekly_hosp_admissions_per_million     float64
new_tests                              float64
total_tests  

In [None]:
df.location.value_counts() #counts the number of entries per location code

8    380
2    380
3    377
1    375
9    373
5    373
4    373
7    372
6    372
0    372
Name: location, dtype: int64

In [None]:
# Pairwise correlation between the columns of df, rounded to two decimals.
correlation_matrix = df.corr().round(2) 
correlation_matrix.shape #checks size of the matrix

(50, 50)

In [None]:
df.isna().sum() #counts missing (NaN) values per column

iso_code                                  0
location                                  0
date                                      0
total_cases                              58
new_cases                                58
total_deaths                            383
new_deaths                              383
total_cases_per_million                  58
new_cases_per_million                    58
total_deaths_per_million                383
new_deaths_per_million                  383
reproduction_rate                       521
icu_patients                           1448
icu_patients_per_million               1448
hosp_patients                          1735
hosp_patients_per_million              1735
weekly_icu_admissions                  3562
weekly_icu_admissions_per_million      3562
weekly_hosp_admissions                 3452
weekly_hosp_admissions_per_million     3452
new_tests                              1243
total_tests                            1609
total_tests_per_thousand        

In [None]:
# Columns removed from the dataset, listed one per line for easier review.
columns_to_drop = [
    'handwashing_facilities',
    'weekly_icu_admissions',
    'weekly_icu_admissions_per_million',
    'weekly_hosp_admissions',
    'weekly_hosp_admissions_per_million',
    'icu_patients_per_million',
    'hosp_patients_per_million',
    'total_cases_per_million',
    'new_cases_per_million',
    'total_deaths_per_million',
]
df = df.drop(columns=columns_to_drop)

### **Target Dataset Preprocessing**

In [None]:
# Report the maximum, mean and minimum of the stringency index.
stringency = df["stringency_index"]
highest_value = stringency.max()
average_value = stringency.mean()
lowest_value = stringency.min()
print(
    "", highest_value, " = highest restriction value \n",
    average_value, " = average restriction value \n",
    lowest_value, " = lowest restriction value \n",
)

 93.52  = highest restriction value 
 54.220298869143704  = average restriction value 
 0.0  = lowest restriction value 



In [None]:


# Binarise the target in one vectorised step: 1.0 where the stringency index is at
# least the threshold, 0.0 everywhere else. A missing value compares False against
# the threshold, so NaN rows also become 0.0. The result stays a float column so
# that the 1.0 / 0.0 codes can be mapped to labels afterwards.
# Doing this with a single comparison avoids one Series.replace call per row
# (quadratic work) and does not depend on the frame having a 0..n-1 index.
# NOTE(review): rows with a missing stringency index are labelled 0 ("open") — confirm
# that this is intended rather than dropping those rows.
STRINGENCY_THRESHOLD = 53.0
df['stringency_index'] = (df['stringency_index'] >= STRINGENCY_THRESHOLD).astype(float)


In [None]:
#replace floats with string
df.stringency_index  = df.stringency_index.replace(1.0, "lockdown")
df.stringency_index  = df.stringency_index.replace(0.0, "open")

In [None]:
df.stringency_index.value_counts() #number of rows in each target class

lockdown    2202
open        1545
Name: stringency_index, dtype: int64

In [None]:
# Replace every remaining NaN in the frame with 0.
# NOTE(review): this makes "not reported" indistinguishable from a genuine zero in every column — confirm this is intended.
df = df.fillna(0)

In [None]:
# Separate the frame into the feature matrix X and the target vector Y.
target_column = 'stringency_index'
X = df.drop(columns=[target_column])
Y = df[target_column]

### **RFE Feature Selection**

In [None]:
#importing algorithm to select best performing features
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier


rfe_selector = RFE(estimator=DecisionTreeClassifier(), n_features_to_select = 10, step = 1)

rfe_transformed = rfe_selector.fit_transform(X, Y)

cols = list(X.columns)

DecisionTreeClassifier().fit(rfe_transformed,Y)

temp = pd.Series(rfe_selector.support_, index=cols) #support changes rfe selector into numpy array

selected_features = temp[temp==True].index

In [None]:
selected_features #names of the columns kept by RFE

Index(['date', 'total_cases', 'total_deaths', 'new_deaths',
       'reproduction_rate', 'icu_patients', 'hosp_patients',
       'population_density', 'aged_70_older', 'male_smokers'],
      dtype='object')