### **Imports**

In [400]:
import time
start_time = time.time()
import sklearn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import io
import math
import string
import os

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix

import warnings
warnings.filterwarnings('ignore')

### **DataSet** **Upload**

In [401]:
from google.colab import files , drive

#upload for files in google drive
drive.mount('/content/gdrive')
dataset = "/content/gdrive/My Drive/Colab Notebooks/covid_data2.csv"

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [402]:
"""
#alternative for local file upload (file needs to be downloaded locally)
from google.colab import files
uploaded = files.upload()
df = pd.read_csv(io.BytesIO(uploaded['covid_data.csv']))
"""

"\n#alternative for local file upload (file needs to be downloaded locally)\nfrom google.colab import files\nuploaded = files.upload()\ndf = pd.read_csv(io.BytesIO(uploaded['covid_data.csv']))\n"

### **DataUpload check**

In [403]:
df = pd.read_csv(dataset) #this will be omited for alterantive upload

In [404]:
df.shape

(81185, 59)

In [405]:
df.head(5)

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,new_cases_per_million,new_cases_smoothed_per_million,total_deaths_per_million,new_deaths_per_million,new_deaths_smoothed_per_million,reproduction_rate,icu_patients,icu_patients_per_million,hosp_patients,hosp_patients_per_million,weekly_icu_admissions,weekly_icu_admissions_per_million,weekly_hosp_admissions,weekly_hosp_admissions_per_million,new_tests,total_tests,total_tests_per_thousand,new_tests_per_thousand,new_tests_smoothed,new_tests_smoothed_per_thousand,positive_rate,tests_per_case,tests_units,total_vaccinations,people_vaccinated,people_fully_vaccinated,new_vaccinations,new_vaccinations_smoothed,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,new_vaccinations_smoothed_per_million,stringency_index,population,population_density,median_age,aged_65_older,aged_70_older,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index
0,AFG,Asia,Afghanistan,2020-02-24,1.0,1.0,,,,,0.026,0.026,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,8.33,38928341.0,54.422,18.6,2.581,1.337,1803.987,,597.029,9.59,,,37.746,0.5,64.83,0.511
1,AFG,Asia,Afghanistan,2020-02-25,1.0,0.0,,,,,0.026,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,8.33,38928341.0,54.422,18.6,2.581,1.337,1803.987,,597.029,9.59,,,37.746,0.5,64.83,0.511
2,AFG,Asia,Afghanistan,2020-02-26,1.0,0.0,,,,,0.026,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,8.33,38928341.0,54.422,18.6,2.581,1.337,1803.987,,597.029,9.59,,,37.746,0.5,64.83,0.511
3,AFG,Asia,Afghanistan,2020-02-27,1.0,0.0,,,,,0.026,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,8.33,38928341.0,54.422,18.6,2.581,1.337,1803.987,,597.029,9.59,,,37.746,0.5,64.83,0.511
4,AFG,Asia,Afghanistan,2020-02-28,1.0,0.0,,,,,0.026,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,8.33,38928341.0,54.422,18.6,2.581,1.337,1803.987,,597.029,9.59,,,37.746,0.5,64.83,0.511


In [406]:
df.nunique()

iso_code                                   215
continent                                    6
location                                   215
date                                       468
total_cases                              42656
new_cases                                11118
new_cases_smoothed                       21160
total_deaths                             16210
new_deaths                                2643
new_deaths_smoothed                       5907
total_cases_per_million                  61321
new_cases_per_million                    35099
new_cases_smoothed_per_million           39517
total_deaths_per_million                 37676
new_deaths_per_million                    6821
new_deaths_smoothed_per_million           8128
reproduction_rate                          374
icu_patients                              2243
icu_patients_per_million                  4700
hosp_patients                             4353
hosp_patients_per_million                 7621
weekly_icu_ad

In [407]:
df.dtypes.value_counts()

float64    54
object      5
dtype: int64

In [408]:
df.dtypes

iso_code                                  object
continent                                 object
location                                  object
date                                      object
total_cases                              float64
new_cases                                float64
new_cases_smoothed                       float64
total_deaths                             float64
new_deaths                               float64
new_deaths_smoothed                      float64
total_cases_per_million                  float64
new_cases_per_million                    float64
new_cases_smoothed_per_million           float64
total_deaths_per_million                 float64
new_deaths_per_million                   float64
new_deaths_smoothed_per_million          float64
reproduction_rate                        float64
icu_patients                             float64
icu_patients_per_million                 float64
hosp_patients                            float64
hosp_patients_per_mi

### **Data Preprocessing**

In [409]:
df.drop(['new_cases_smoothed','new_deaths_smoothed','new_cases_smoothed_per_million','new_deaths_smoothed_per_million',
         'new_tests_smoothed_per_thousand','new_tests_smoothed', 'new_vaccinations_smoothed', 'new_vaccinations_smoothed_per_million'], axis=1, inplace=True)

In [410]:
df = df[df.continent == 'Europe'] #filters europe only entries

In [411]:
df.continent.value_counts() #reduced entries from 67K to 16K

Europe    19536
Name: continent, dtype: int64

In [412]:
"""
Loop to select the countries with the most common entries
INPUT: series of countries with number of entries
OUTPUT: list of the most common countries with the number of entries
"""

i = 0
theMostCommonCountry_List = []
while i < 10:
  part_index = str(df.location.value_counts().index[i])
  theMostCommonCountry_List.append( part_index )
  i = i + 1
print(theMostCommonCountry_List)

['France', 'Germany', 'Finland', 'Russia', 'Italy', 'United Kingdom', 'Sweden', 'Spain', 'Denmark', 'Slovenia']


In [413]:
df = df[df.location.isin(theMostCommonCountry_List)]  #filters entries by list of common countries

In [414]:
df = df.reset_index()   #resets index
df.drop(columns=['index','continent'], inplace=True) #drops old index column and continent becasue it is not needed
df

Unnamed: 0,iso_code,location,date,total_cases,new_cases,total_deaths,new_deaths,total_cases_per_million,new_cases_per_million,total_deaths_per_million,new_deaths_per_million,reproduction_rate,icu_patients,icu_patients_per_million,hosp_patients,hosp_patients_per_million,weekly_icu_admissions,weekly_icu_admissions_per_million,weekly_hosp_admissions,weekly_hosp_admissions_per_million,new_tests,total_tests,total_tests_per_thousand,new_tests_per_thousand,positive_rate,tests_per_case,tests_units,total_vaccinations,people_vaccinated,people_fully_vaccinated,new_vaccinations,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,stringency_index,population,population_density,median_age,aged_65_older,aged_70_older,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index
0,DNK,Denmark,2020-02-02,,,,,,,,,,,,,,,,,,1.0,1.0,0.0,0.0,,,tests performed,,,,,,,,0.00,5792203.0,136.520,42.3,19.677,12.325,46682.515,0.2,114.767,6.41,19.3,18.8,,2.50,80.90,0.940
1,DNK,Denmark,2020-02-03,,,,,,,,,,,,,,,,,,,,,,,,tests performed,,,,,,,,0.00,5792203.0,136.520,42.3,19.677,12.325,46682.515,0.2,114.767,6.41,19.3,18.8,,2.50,80.90,0.940
2,DNK,Denmark,2020-02-04,,,,,,,,,,,,,,,,,,,,,,,,tests performed,,,,,,,,0.00,5792203.0,136.520,42.3,19.677,12.325,46682.515,0.2,114.767,6.41,19.3,18.8,,2.50,80.90,0.940
3,DNK,Denmark,2020-02-05,,,,,,,,,,,,,,,,,,,,,,,,tests performed,,,,,,,,0.00,5792203.0,136.520,42.3,19.677,12.325,46682.515,0.2,114.767,6.41,19.3,18.8,,2.50,80.90,0.940
4,DNK,Denmark,2020-02-06,,,,,,,,,,,,,,,,,,,,,,,,tests performed,,,,,,,,0.00,5792203.0,136.520,42.3,19.677,12.325,46682.515,0.2,114.767,6.41,19.3,18.8,,2.50,80.90,0.940
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4382,GBR,United Kingdom,2021-04-08,4384954.0,3124.0,127224.0,53.0,64592.902,46.018,1874.083,0.781,0.69,,,,,,,,,,,,,,,,38444540.0,31903366.0,6541174.0,545511.0,56.63,47.00,9.64,75.93,67886004.0,272.898,40.8,18.517,12.527,39753.244,0.2,122.137,4.28,20.0,24.7,,2.54,81.32,0.932
4383,GBR,United Kingdom,2021-04-09,4380167.0,-4787.0,127284.0,60.0,64522.387,-70.515,1874.967,0.884,0.69,,,,,,,,,,,,,,,,39001554.0,32010244.0,6991310.0,557014.0,57.45,47.15,10.30,75.93,67886004.0,272.898,40.8,18.517,12.527,39753.244,0.2,122.137,4.28,20.0,24.7,,2.54,81.32,0.932
4384,GBR,United Kingdom,2021-04-10,4382880.0,2713.0,127324.0,40.0,64562.351,39.964,1875.556,0.589,0.69,,,,,,,,,,,,,,,,39587893.0,32121353.0,7466540.0,586339.0,58.32,47.32,11.00,75.93,67886004.0,272.898,40.8,18.517,12.527,39753.244,0.2,122.137,4.28,20.0,24.7,,2.54,81.32,0.932
4385,GBR,United Kingdom,2021-04-11,4384610.0,1730.0,127331.0,7.0,64587.835,25.484,1875.659,0.103,,,,,,,,,,,,,,,,,,,,,,,,75.93,67886004.0,272.898,40.8,18.517,12.527,39753.244,0.2,122.137,4.28,20.0,24.7,,2.54,81.32,0.932


In [415]:
"""
Loop for checking which features are object type
INPUT: pandas series
OUTPUT: list of objects
"""
list_of_objects = []
x=0
while x < len(df.columns):
  if df.dtypes.values[x] == object:
    list_of_objects.append(df.dtypes.index[x])
  x = x + 1
print(list_of_objects)

['iso_code', 'location', 'date', 'tests_units']


In [416]:
df.iso_code = LabelEncoder().fit_transform(df.iso_code)
df.location = LabelEncoder().fit_transform(df.location)
df.date = LabelEncoder().fit_transform(df.date)

In [417]:
df.tests_units.fillna('Not Tested', inplace=True)
df.tests_units.value_counts()

tests performed    3458
Not Tested          601
people tested       328
Name: tests_units, dtype: int64

In [418]:
df.tests_units = df.tests_units.replace(['tests performed', 'people tested'], 'Tested')
df.tests_units.value_counts()

Tested        3786
Not Tested     601
Name: tests_units, dtype: int64

In [419]:
df.tests_units = LabelEncoder().fit_transform(df.tests_units)

In [420]:
df.dtypes

iso_code                                 int64
location                                 int64
date                                     int64
total_cases                            float64
new_cases                              float64
total_deaths                           float64
new_deaths                             float64
total_cases_per_million                float64
new_cases_per_million                  float64
total_deaths_per_million               float64
new_deaths_per_million                 float64
reproduction_rate                      float64
icu_patients                           float64
icu_patients_per_million               float64
hosp_patients                          float64
hosp_patients_per_million              float64
weekly_icu_admissions                  float64
weekly_icu_admissions_per_million      float64
weekly_hosp_admissions                 float64
weekly_hosp_admissions_per_million     float64
new_tests                              float64
total_tests  

In [421]:
df.location.value_counts() #counts amount of entrires 

2    445
3    442
1    440
9    438
5    438
4    438
7    437
8    437
6    436
0    436
Name: location, dtype: int64

In [422]:
correlation_matrix = df.corr().round(2) 
correlation_matrix.shape #checks size of the matrix

(50, 50)

In [423]:
df.isna().sum() #counts NA values

iso_code                                  0
location                                  0
date                                      0
total_cases                              57
new_cases                                57
total_deaths                            371
new_deaths                              371
total_cases_per_million                  57
new_cases_per_million                    57
total_deaths_per_million                371
new_deaths_per_million                  371
reproduction_rate                       385
icu_patients                           1320
icu_patients_per_million               1320
hosp_patients                          1694
hosp_patients_per_million              1694
weekly_icu_admissions                  4156
weekly_icu_admissions_per_million      4156
weekly_hosp_admissions                 3982
weekly_hosp_admissions_per_million     3982
new_tests                              1420
total_tests                            1887
total_tests_per_thousand        

In [424]:
df = df.drop(columns=['handwashing_facilities','weekly_icu_admissions',
                      'weekly_icu_admissions_per_million', 'weekly_hosp_admissions', 
                      'weekly_hosp_admissions_per_million', 'icu_patients_per_million',
                      'hosp_patients_per_million', 'total_cases_per_million', 'new_cases_per_million',
                      'total_deaths_per_million', 
                      ]) #drops those columns from the dataset

### **Target Dataset Preprocessing**

In [425]:
print("", df["stringency_index"].max(), " = highest restriction value \n",
      df["stringency_index"].mean(), " = average restriction value \n",
      df["stringency_index"].min(), " = lowest restriction value \n")

 93.52  = highest restriction value 
 58.438722132842564  = average restriction value 
 0.0  = lowest restriction value 



In [426]:


#loop for changing all the values smaller than 53 to 0 
i = 0
while i < len(df): 
  if df.stringency_index[i] < 53.0:
    df.stringency_index  = df.stringency_index.replace(df.stringency_index[i], 0)

  elif math.isnan(df.stringency_index[i]) == True:
    df.stringency_index  = df.stringency_index.replace((df.stringency_index[i]), 0)

  i = i + 1
#loop for changning all the values bigger than 53 to 1
i=0
while i < len(df):
  if df.stringency_index[i] >= 53.0:
    df.stringency_index  = df.stringency_index.replace(df.stringency_index[i], 1)
  i = i + 1


In [427]:
#replace floats with string
df.stringency_index  = df.stringency_index.replace(1.0, "lockdown")
df.stringency_index  = df.stringency_index.replace(0.0, "open")

In [428]:
df.stringency_index.value_counts()

lockdown    2816
open        1571
Name: stringency_index, dtype: int64

In [429]:
df = df.fillna(0)

In [430]:
Y = df.stringency_index
X = df.drop(columns=['stringency_index'])

### **RFE Feature Selection**

In [431]:
#importing algorithm to select best performing features
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier


rfe_selector = RFE(estimator=DecisionTreeClassifier(), n_features_to_select = 10, step = 1)

rfe_transformed = rfe_selector.fit_transform(X, Y)

cols = list(X.columns)

DecisionTreeClassifier().fit(rfe_transformed,Y)

temp = pd.Series(rfe_selector.support_, index=cols) #support changes rfe selector into numpy array

selected_features = temp[temp==True].index

In [432]:
selected_features

Index(['date', 'total_cases', 'total_deaths', 'new_deaths_per_million',
       'reproduction_rate', 'icu_patients', 'hosp_patients', 'positive_rate',
       'aged_65_older', 'male_smokers'],
      dtype='object')

### **Building Model**

In [433]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, plot_confusion_matrix, confusion_matrix,classification_report
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

In [434]:
X = df.filter(items=selected_features)
X

Unnamed: 0,date,total_cases,total_deaths,new_deaths_per_million,reproduction_rate,icu_patients,hosp_patients,positive_rate,aged_65_older,male_smokers
0,9,0.0,0.0,0.000,0.00,0.0,0.0,0.0,19.677,18.8
1,10,0.0,0.0,0.000,0.00,0.0,0.0,0.0,19.677,18.8
2,11,0.0,0.0,0.000,0.00,0.0,0.0,0.0,19.677,18.8
3,12,0.0,0.0,0.000,0.00,0.0,0.0,0.0,19.677,18.8
4,13,0.0,0.0,0.000,0.00,0.0,0.0,0.0,19.677,18.8
...,...,...,...,...,...,...,...,...,...,...
4382,440,4384954.0,127224.0,0.781,0.69,0.0,0.0,0.0,18.517,24.7
4383,441,4380167.0,127284.0,0.884,0.69,0.0,0.0,0.0,18.517,24.7
4384,442,4382880.0,127324.0,0.589,0.69,0.0,0.0,0.0,18.517,24.7
4385,443,4384610.0,127331.0,0.103,0.00,0.0,0.0,0.0,18.517,24.7


In [435]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.3, random_state=3)

In [436]:
X_train.shape

(3070, 10)

In [437]:
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()

X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

In [438]:
performanceList = []
accuracylist = []

In [439]:
#1
gnb = GaussianNB()

model_gnb = gnb.fit(X_train, Y_train)    #fitting the model
y_pred = gnb.fit(X_train, Y_train).predict(X_test)    #predicting Gaussian NB the target

print("Training accuracy: {:,.4f}".format(model_gnb.score(X_train, Y_train)))
print("Testing accuracy: {:,.4f}".format(model_gnb.score(X_test, Y_test)))

print("Total points: ", X_test.shape[0], "\nMiss matched points: ", (Y_test != y_pred).sum(), "\nAccuracy: {:.4f}".format(accuracy_score(Y_test, y_pred)))

performanceList.append(model_gnb)
accuracylist.append(accuracy_score(Y_test, y_test_pred))

Training accuracy: 0.7003
Testing accuracy: 0.7191
Total points:  1317 
Miss matched points:  370 
Accuracy: 0.7191


In [440]:
#2
bnb = BernoulliNB()

model_bnb = bnb.fit(X_train, Y_train)    #fitting Bernoulli NB
y_pred = bnb.fit(X_train, Y_train).predict(X_test)    #predicting Bernoulli NB the target

print("Training accuracy: {:,.4f}".format(model_bnb.score(X_train, Y_train)))
print("Testing accuracy: {:,.4f}".format(model_bnb.score(X_test, Y_test)))

print("Total points: ", X_test.shape[0], "\nMiss matched points: ", (Y_test != y_pred).sum(), "\nAccuracy: {:.4f}".format(accuracy_score(Y_test, y_pred)))

performanceList.append(model_bnb)
accuracylist.append(accuracy_score(Y_test, y_test_pred))

Training accuracy: 0.6775
Testing accuracy: 0.6955
Total points:  1317 
Miss matched points:  401 
Accuracy: 0.6955


In [441]:
#3
logistic = LogisticRegression()

model_log = logistic.fit(X_train, Y_train) #logistic regression is fitted
y_test_pred = model_log.predict(X_test) #predicting Logistic Regression's target

print("Training accuracy: {:,.4f}".format(model_log.score(X_train, Y_train)))
print("Testing accuracy: {:,.4f}".format(model_log.score(X_test, Y_test)))

print("Total points: ", X_test.shape[0], "\nMiss matched points: ", (Y_test != y_test_pred).sum(), "\nAccuracy: {:.4f}".format(accuracy_score(Y_test, y_test_pred)))

performanceList.append(model_log)
accuracylist.append(accuracy_score(Y_test, y_test_pred))

Training accuracy: 0.7840
Testing accuracy: 0.7677
Total points:  1317 
Miss matched points:  306 
Accuracy: 0.7677


In [442]:
#4
svm = SVC()

model_SVM = svm.fit(X_train, Y_train)
y_test_pred = model_SVM.predict(X_test)

print("Training accuracy: {:,.4f}".format(model_SVM.score(X_train, Y_train)))
print("Testing accuracy: {:,.4f}".format(model_SVM.score(X_test, Y_test)))

print("Total points: ", X_test.shape[0], "\nMiss matched points: ", (Y_test != y_test_pred).sum(), "\nAccuracy: {:.4f}".format(accuracy_score(Y_test, y_test_pred)))

performanceList.append(model_SVM)
accuracylist.append(accuracy_score(Y_test, y_test_pred))

Training accuracy: 0.9264
Testing accuracy: 0.9127
Total points:  1317 
Miss matched points:  115 
Accuracy: 0.9127


In [450]:
#5
knn = KNeighborsClassifier(n_neighbors=10,p=1)

model_knn = knn.fit(X_train, Y_train)
y_test_pred = model_knn.predict(X_test)

print("Training accuracy: {:,.4f}".format(model_knn.score(X_train, Y_train)))
print("Testing accuracy: {:,.4f}".format(model_knn.score(X_test, Y_test)))

print("Total points: ", X_test.shape[0], "\nMiss matched points: ", (Y_test != y_test_pred).sum(), "\nAccuracy: {:.4f}".format(accuracy_score(Y_test, y_test_pred)))

performanceList.append(model_knn)
accuracylist.append(accuracy_score(Y_test, y_test_pred))

Training accuracy: 1.0000
Testing accuracy: 0.9856
Total points:  1317 
Miss matched points:  19 
Accuracy: 0.9856


In [444]:
#6
dTree = DecisionTreeClassifier()

model_dTree = dTree.fit(X_train, Y_train)
y_test_pred = model_dTree.predict(X_test)

print("Training accuracy: {:,.4f}".format(model_dTree.score(X_train, Y_train)))
print("Testing accuracy: {:,.4f}".format(model_dTree.score(X_test, Y_test)))

print("Total points: ", X_test.shape[0], "\nMiss matched points: ", (Y_test != y_test_pred).sum(), "\nAccuracy: {:.4f}".format(accuracy_score(Y_test, y_test_pred)))

performanceList.append(model_dTree)
accuracylist.append(accuracy_score(Y_test, y_test_pred))

Training accuracy: 1.0000
Testing accuracy: 0.9894
Total points:  1317 
Miss matched points:  14 
Accuracy: 0.9894


In [445]:
#7
rForest = RandomForestClassifier()

model_rForest = rForest.fit(X_train, Y_train)
y_test_pred = model_rForest.predict(X_test)

print("Training accuracy: {:,.4f}".format(model_rForest.score(X_train, Y_train)))
print("Testing accuracy: {:,.4f}".format(model_rForest.score(X_test, Y_test)))

print("Total points: ", X_test.shape[0], "\nMiss matched points: ", (Y_test != y_test_pred).sum(), "\nAccuracy: {:.4f}".format(accuracy_score(Y_test, y_test_pred)))

performanceList.append(model_rForest)
accuracylist.append(accuracy_score(Y_test, y_test_pred))

Training accuracy: 1.0000
Testing accuracy: 0.9924
Total points:  1317 
Miss matched points:  10 
Accuracy: 0.9924


In [446]:
#8
adaB = AdaBoostClassifier()

model_adaB = adaB.fit(X_train, Y_train)
y_test_pred = model_adaB.predict(X_test)

print("Training accuracy: {:,.4f}".format(model_adaB.score(X_train, Y_train)))
print("Testing accuracy: {:,.4f}".format(model_adaB.score(X_test, Y_test)))

print("Total points: ", X_test.shape[0], "\nMiss matched points: ", (Y_test != y_test_pred).sum(), "\nAccuracy: {:.4f}".format(accuracy_score(Y_test, y_test_pred)))

performanceList.append(model_adaB)
accuracylist.append(accuracy_score(Y_test, y_test_pred))

Training accuracy: 0.9609
Testing accuracy: 0.9598
Total points:  1317 
Miss matched points:  53 
Accuracy: 0.9598


### **Cross Validation**

In [447]:
"""
Loop for performing cross validation on every model.
INPUT: List of models 
OUTPUT: List of models with mean accuracy score after cross validation
"""
i=0 
print("  Algorithm    :  CV value      Original value       Difference")
while i < len(performanceList):
  clf = cross_val_score(performanceList[i], X, Y, cv=10, scoring='accuracy')
  clf_rounded = clf.mean().round(4)
  al = accuracylist[i].round(4)
  if (al - clf_rounded > 0):
    difference = "-"+str((al - clf_rounded).round(4))
  else :
    difference = "+"+str((al - clf_rounded).round(4))
  print("Algorithm ", [i+1], ": ", clf_rounded,"      ", al, "             ", difference) 
  i=i+1

  Algorithm    :  CV value      Original value       Difference
Algorithm  [1] :  0.5685        0.9514               -0.3829
Algorithm  [2] :  0.6878        0.9514               -0.2636
Algorithm  [3] :  0.6187        0.7677               -0.149
Algorithm  [4] :  0.7016        0.9127               -0.2111
Algorithm  [5] :  0.6321        0.9696               -0.3375
Algorithm  [6] :  0.6992        0.9894               -0.2902
Algorithm  [7] :  0.7314        0.9924               -0.261
Algorithm  [8] :  0.7197        0.9598               -0.2401


In [449]:
from sklearn.model_selection import GridSearchCV

print("\n")

param_grid={'n_neighbors': [3,5,11,19],
            'weights': ['uniform', 'distance'],
            'metric': ['euclidean', 'manhattan'],
            'algorithm':['ball_tree','kd_tree','brute']}

grid = GridSearchCV(KNeighborsClassifier(), param_grid, verbose=1, cv=10,n_jobs=-1)

grid_results = grid.fit(X_train,Y_train)

print("Best estimator: ",grid_results.best_estimator_)
print("Best parameters: ",grid_results.best_params_)

print("Best score:", grid_results.best_score_)



Fitting 10 folds for each of 48 candidates, totalling 480 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 148 tasks      | elapsed:    4.0s


Best estimator:  KNeighborsClassifier(algorithm='ball_tree', leaf_size=30, metric='manhattan',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='distance')
Best parameters:  {'algorithm': 'ball_tree', 'metric': 'manhattan', 'n_neighbors': 3, 'weights': 'distance'}
Best score: 0.9817589576547231


[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed:   10.9s finished
