In [99]:
from google.colab import files
uploaded = files.upload()


In [100]:
import pandas as pd
import numpy as np
import io
from sklearn import preprocessing
import matplotlib.pyplot as plt 
plt.rc("font", size=14)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import seaborn as sns
import statsmodels.formula.api as smf
from sklearn.feature_selection import RFE
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, confusion_matrix, mean_squared_error, auc, roc_curve
pd.set_option("display.max_rows", None, "display.max_columns", None)
import math

from scipy.stats import pointbiserialr
from scipy.stats import pearsonr
from scipy.stats import chi2_contingency
from datetime import datetime

In [101]:
df = pd.read_csv(io.BytesIO(uploaded['covid_with_y_values.csv']), usecols=range(3,30)) #remove 1st to third column
df.head()

Unnamed: 0,Date,school_closures,international_travel_controls,restriction_gatherings,cancel_public_events,public_information_campaigns,restrictions_internal_movements,income_support,facial_coverings,vaccination_policy,testing_policy,contact_tracing,new_cases_smoothed_per_million,new_deaths_smoothed_per_million,new_vaccinations_smoothed_per_million,population,population_density,median_age,aged_65_older,gdp_per_capita,cardiovasc_death_rate,diabetes_prevalence,hospital_beds_per_thousand,life_expectancy,human_development_index,international_travel_controls_binary,probability_open
0,1/6/2020,3,3,4,2,2,2,0,1,0,1,1,16.796,0.169,0,38928341,54.422,18.6,2.581,1803.987,597.029,9.59,0.5,64.83,0.511,0,0.923077
1,8/6/2020,3,3,4,2,2,2,0,3,0,1,1,18.962,0.393,0,38928341,54.422,18.6,2.581,1803.987,597.029,9.59,0.5,64.83,0.511,0,1.0
2,15/6/2020,3,1,4,2,2,2,0,3,0,1,1,16.921,0.4,0,38928341,54.422,18.6,2.581,1803.987,597.029,9.59,0.5,64.83,0.511,1,1.0
3,22/6/2020,3,1,4,2,2,2,0,3,0,1,1,13.27,0.422,0,38928341,54.422,18.6,2.581,1803.987,597.029,9.59,0.5,64.83,0.511,1,1.0
4,29/6/2020,3,1,4,2,2,2,0,3,0,1,1,7.637,0.525,0,38928341,54.422,18.6,2.581,1803.987,597.029,9.59,0.5,64.83,0.511,1,1.0


## Select NA values for prediction and evaluation

In [102]:
data_topredict = pd.read_csv(io.BytesIO(uploaded['covid_with_y_values.csv'])) 

In [103]:
data_topredict = data_topredict[data_topredict["probability_open"].isna()] #to be used once model is done

In [104]:
data_topredict.head()

Unnamed: 0.1,Unnamed: 0,Entity,Code,Date,school_closures,international_travel_controls,restriction_gatherings,cancel_public_events,public_information_campaigns,restrictions_internal_movements,income_support,facial_coverings,vaccination_policy,testing_policy,contact_tracing,new_cases_smoothed_per_million,new_deaths_smoothed_per_million,new_vaccinations_smoothed_per_million,population,population_density,median_age,aged_65_older,gdp_per_capita,cardiovasc_death_rate,diabetes_prevalence,hospital_beds_per_thousand,life_expectancy,human_development_index,international_travel_controls_binary,probability_open
45,45,Afghanistan,AFG,12/4/2021,0,1,0,2,1,0,0,2,2,3,0,1.927,0.077,169,38928341,54.422,18.6,2.581,1803.987,597.029,9.59,0.5,64.83,0.511,1,
46,46,Afghanistan,AFG,19/4/2021,0,1,0,2,1,0,0,2,2,3,0,2.407,0.062,206,38928341,54.422,18.6,2.581,1803.987,597.029,9.59,0.5,64.83,0.511,1,
47,47,Afghanistan,AFG,26/4/2021,0,1,0,2,1,0,0,2,2,3,0,4.099,0.169,292,38928341,54.422,18.6,2.581,1803.987,597.029,9.59,0.5,64.83,0.511,1,
48,48,Afghanistan,AFG,3/5/2021,0,1,0,2,1,0,0,2,2,3,0,4.716,0.183,358,38928341,54.422,18.6,2.581,1803.987,597.029,9.59,0.5,64.83,0.511,1,
49,49,Afghanistan,AFG,10/5/2021,0,1,0,2,1,0,0,3,2,3,1,6.47,0.206,358,38928341,54.422,18.6,2.581,1803.987,597.029,9.59,0.5,64.83,0.511,1,


## Give data label for probability > 0.5

In [105]:
df.dropna(inplace = True) #remove all the "to_predict"

In [106]:
conditions = [(df["probability_open"] < 0.5), (df["probability_open"]>= 0.5)]
values = [0, 1]

df["label"] = np.select(conditions, values) #if else condition for label

In [107]:
df.label.value_counts() #check for unbalanced dataset

0    3771
1    2754
Name: label, dtype: int64

## Dataset preparation for model training

In [108]:
allcols = df.columns.tolist()
feature_cols = allcols.copy()
feature_cols.remove("probability_open") #remove due to multicollinearity
feature_cols.remove("Date") #remove date which is not a feature in the model

print(feature_cols)

['school_closures', 'international_travel_controls', 'restriction_gatherings', 'cancel_public_events', 'public_information_campaigns', 'restrictions_internal_movements', 'income_support', 'facial_coverings', 'vaccination_policy', 'testing_policy', 'contact_tracing', 'new_cases_smoothed_per_million', 'new_deaths_smoothed_per_million', 'new_vaccinations_smoothed_per_million', 'population', 'population_density', 'median_age', 'aged_65_older', 'gdp_per_capita', 'cardiovasc_death_rate', 'diabetes_prevalence', 'hospital_beds_per_thousand', 'life_expectancy', 'human_development_index', 'international_travel_controls_binary', 'label']


In [109]:
y = df["label"] #get response variable
x = df[feature_cols] #get features

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 3, stratify = y) #try to balance the dataset more

In [110]:

z = df[feature_cols] #get features
train=z.sample(frac=0.8,random_state=200) #random state is a seed value
test=z.drop(train.index)
train.head()

Unnamed: 0,school_closures,international_travel_controls,restriction_gatherings,cancel_public_events,public_information_campaigns,restrictions_internal_movements,income_support,facial_coverings,vaccination_policy,testing_policy,contact_tracing,new_cases_smoothed_per_million,new_deaths_smoothed_per_million,new_vaccinations_smoothed_per_million,population,population_density,median_age,aged_65_older,gdp_per_capita,cardiovasc_death_rate,diabetes_prevalence,hospital_beds_per_thousand,life_expectancy,human_development_index,international_travel_controls_binary,label
298,1,3,2,1,2,0,2,2,0,2,2,12.927,0.032,0,9006400,106.749,44.4,19.202,45436.686,145.183,6.35,7.37,81.54,0.922,0,0
6218,1,3,0,0,2,0,2,3,0,2,1,20.606,0.359,0,37846605,124.027,41.8,16.763,27216.445,227.331,5.91,6.62,78.73,0.88,0,0
6567,1,3,0,1,2,0,2,3,0,2,2,0.0,0.0,0,98340,208.354,36.2,8.606,26382.287,242.648,10.55,3.6,73.4,0.796,0,0
1692,3,4,3,2,2,2,2,2,0,2,2,101.268,3.228,0,19116209,24.282,35.4,11.087,22767.037,127.993,8.46,2.11,80.18,0.851,0,0
3908,2,3,3,2,2,2,2,4,0,3,2,87.706,1.139,0,8655541,402.606,30.6,11.733,33132.32,93.32,6.74,2.99,82.97,0.919,0,0


In [111]:
y_train.value_counts() #training dataset is balanced at approximately 42% for y=1

0    2639
1    1928
Name: label, dtype: int64

In [112]:
y_test.value_counts() #test dataset is balanced at approximately 42% for y=1

0    1132
1     826
Name: label, dtype: int64

## Initial Model

In [113]:
all_predictors = "+".join(train.drop(['label'],axis=1).columns)
print(all_predictors)

school_closures+international_travel_controls+restriction_gatherings+cancel_public_events+public_information_campaigns+restrictions_internal_movements+income_support+facial_coverings+vaccination_policy+testing_policy+contact_tracing+new_cases_smoothed_per_million+new_deaths_smoothed_per_million+new_vaccinations_smoothed_per_million+population+population_density+median_age+aged_65_older+gdp_per_capita+cardiovasc_death_rate+diabetes_prevalence+hospital_beds_per_thousand+life_expectancy+human_development_index+international_travel_controls_binary


In [114]:

f = 'label ~ '+all_predictors
logitfit = smf.logit(formula = f, data = train).fit()
print(logitfit.summary())

Optimization terminated successfully.
         Current function value: 0.394297
         Iterations 8
                           Logit Regression Results                           
Dep. Variable:                  label   No. Observations:                 5220
Model:                          Logit   Df Residuals:                     5194
Method:                           MLE   Df Model:                           25
Date:                Wed, 15 Sep 2021   Pseudo R-squ.:                  0.4222
Time:                        21:12:26   Log-Likelihood:                -2058.2
converged:                       True   LL-Null:                       -3562.4
Covariance Type:            nonrobust   LLR p-value:                     0.000
                                            coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------------------
Intercept                                 8.0251      

In [115]:
y_pred = np.array(logitfit.predict(train) > 0.5, dtype=float)
y_test = np.array(train['label'], dtype=float)
cm = pd.DataFrame(confusion_matrix(y_pred, y_test))
cm.columns = ['True Y=0','True Y=1']
cm.index = ['Predicted Y=0','Predicted Y=1']
display(cm)
specificity = cm.iloc[0, 0]/(cm.iloc[0, 0] + cm.iloc[1, 0])
print("Accuracy (Testing Set):",round(accuracy_score(y_test, y_pred),4))
print('Precision (Testing Set):', round(precision_score(y_test, y_pred),4))
print('Specificity (Testing Set):', round(specificity,4))
print('Sensitivity (Testing Set):',round(recall_score(y_test,y_pred),4))
print('F1-Score (Testing Set):', round(f1_score(y_test,y_pred),4))
print('AUC (Testing Set):',round(roc_auc_score(y_test, y_pred),4))

Unnamed: 0,True Y=0,True Y=1
Predicted Y=0,2683,519
Predicted Y=1,308,1710


Accuracy (Testing Set): 0.8416
Precision (Testing Set): 0.8474
Specificity (Testing Set): 0.897
Sensitivity (Testing Set): 0.7672
F1-Score (Testing Set): 0.8053
AUC (Testing Set): 0.8321


In [116]:
y_pred = np.array(logitfit.predict(test) > 0.5, dtype=float)
y_test = np.array(test['label'], dtype=float)
cm = pd.DataFrame(confusion_matrix(y_pred, y_test))
cm.columns = ['True Y=0','True Y=1']
cm.index = ['Predicted Y=0','Predicted Y=1']
display(cm)
specificity = cm.iloc[0, 0]/(cm.iloc[0, 0] + cm.iloc[1, 0])
print("Accuracy (Testing Set):",round(accuracy_score(y_test, y_pred),4))
print('Precision (Testing Set):', round(precision_score(y_test, y_pred),4))
print('Specificity (Testing Set):', round(specificity,4))
print('Sensitivity (Testing Set):',round(recall_score(y_test,y_pred),4))
print('F1-Score (Testing Set):', round(f1_score(y_test,y_pred),4))
print('AUC (Testing Set):',round(roc_auc_score(y_test, y_pred),4))

Unnamed: 0,True Y=0,True Y=1
Predicted Y=0,694,106
Predicted Y=1,86,419


Accuracy (Testing Set): 0.8529
Precision (Testing Set): 0.8297
Specificity (Testing Set): 0.8897
Sensitivity (Testing Set): 0.7981
F1-Score (Testing Set): 0.8136
AUC (Testing Set): 0.8439


# After dropping international_travel_controls

In [117]:
train = train.drop(columns = 'international_travel_controls')
test = test.drop(columns = 'international_travel_controls')

all_predictors = "+".join(train.drop(['label'],axis=1).columns)
print(all_predictors)

f = 'label ~ '+all_predictors
logitfit = smf.logit(formula = f, data = train).fit()
print(logitfit.summary())

y_pred = np.array(logitfit.predict(train) > 0.5, dtype=float)
y_test = np.array(train['label'], dtype=float)
cm = pd.DataFrame(confusion_matrix(y_pred, y_test))
cm.columns = ['True Y=0','True Y=1']
cm.index = ['Predicted Y=0','Predicted Y=1']
display(cm)
specificity = cm.iloc[0, 0]/(cm.iloc[0, 0] + cm.iloc[1, 0])
print("Accuracy (Testing Set):",round(accuracy_score(y_test, y_pred),4))
print('Precision (Testing Set):', round(precision_score(y_test, y_pred),4))
print('Specificity (Testing Set):', round(specificity,4))
print('Sensitivity (Testing Set):',round(recall_score(y_test,y_pred),4))
print('F1-Score (Testing Set):', round(f1_score(y_test,y_pred),4))
print('AUC (Testing Set):',round(roc_auc_score(y_test, y_pred),4))

y_pred = np.array(logitfit.predict(test) > 0.5, dtype=float)
y_test = np.array(test['label'], dtype=float)
cm = pd.DataFrame(confusion_matrix(y_pred, y_test))
cm.columns = ['True Y=0','True Y=1']
cm.index = ['Predicted Y=0','Predicted Y=1']
display(cm)
specificity = cm.iloc[0, 0]/(cm.iloc[0, 0] + cm.iloc[1, 0])
print("Accuracy (Testing Set):",round(accuracy_score(y_test, y_pred),4))
print('Precision (Testing Set):', round(precision_score(y_test, y_pred),4))
print('Specificity (Testing Set):', round(specificity,4))
print('Sensitivity (Testing Set):',round(recall_score(y_test,y_pred),4))
print('F1-Score (Testing Set):', round(f1_score(y_test,y_pred),4))
print('AUC (Testing Set):',round(roc_auc_score(y_test, y_pred),4))

school_closures+restriction_gatherings+cancel_public_events+public_information_campaigns+restrictions_internal_movements+income_support+facial_coverings+vaccination_policy+testing_policy+contact_tracing+new_cases_smoothed_per_million+new_deaths_smoothed_per_million+new_vaccinations_smoothed_per_million+population+population_density+median_age+aged_65_older+gdp_per_capita+cardiovasc_death_rate+diabetes_prevalence+hospital_beds_per_thousand+life_expectancy+human_development_index+international_travel_controls_binary
Optimization terminated successfully.
         Current function value: 0.394298
         Iterations 8
                           Logit Regression Results                           
Dep. Variable:                  label   No. Observations:                 5220
Model:                          Logit   Df Residuals:                     5195
Method:                           MLE   Df Model:                           24
Date:                Wed, 15 Sep 2021   Pseudo R-squ.:        

Unnamed: 0,True Y=0,True Y=1
Predicted Y=0,2683,519
Predicted Y=1,308,1710


Accuracy (Testing Set): 0.8416
Precision (Testing Set): 0.8474
Specificity (Testing Set): 0.897
Sensitivity (Testing Set): 0.7672
F1-Score (Testing Set): 0.8053
AUC (Testing Set): 0.8321


Unnamed: 0,True Y=0,True Y=1
Predicted Y=0,694,106
Predicted Y=1,86,419


Accuracy (Testing Set): 0.8529
Precision (Testing Set): 0.8297
Specificity (Testing Set): 0.8897
Sensitivity (Testing Set): 0.7981
F1-Score (Testing Set): 0.8136
AUC (Testing Set): 0.8439


# removing new_vaccinations_smoothed_per_million

In [118]:
train = train.drop(columns = 'new_vaccinations_smoothed_per_million')
test = test.drop(columns = 'new_vaccinations_smoothed_per_million')

all_predictors = "+".join(train.drop(['label'],axis=1).columns)
print(all_predictors)

f = 'label ~ '+all_predictors
logitfit = smf.logit(formula = f, data = train).fit()
print(logitfit.summary())

y_pred = np.array(logitfit.predict(train) > 0.5, dtype=float)
y_test = np.array(train['label'], dtype=float)
cm = pd.DataFrame(confusion_matrix(y_pred, y_test))
cm.columns = ['True Y=0','True Y=1']
cm.index = ['Predicted Y=0','Predicted Y=1']
display(cm)
specificity = cm.iloc[0, 0]/(cm.iloc[0, 0] + cm.iloc[1, 0])
print("Accuracy (Testing Set):",round(accuracy_score(y_test, y_pred),4))
print('Precision (Testing Set):', round(precision_score(y_test, y_pred),4))
print('Specificity (Testing Set):', round(specificity,4))
print('Sensitivity (Testing Set):',round(recall_score(y_test,y_pred),4))
print('F1-Score (Testing Set):', round(f1_score(y_test,y_pred),4))
print('AUC (Testing Set):',round(roc_auc_score(y_test, y_pred),4))

y_pred = np.array(logitfit.predict(test) > 0.5, dtype=float)
y_test = np.array(test['label'], dtype=float)
cm = pd.DataFrame(confusion_matrix(y_pred, y_test))
cm.columns = ['True Y=0','True Y=1']
cm.index = ['Predicted Y=0','Predicted Y=1']
display(cm)
specificity = cm.iloc[0, 0]/(cm.iloc[0, 0] + cm.iloc[1, 0])
print("Accuracy (Testing Set):",round(accuracy_score(y_test, y_pred),4))
print('Precision (Testing Set):', round(precision_score(y_test, y_pred),4))
print('Specificity (Testing Set):', round(specificity,4))
print('Sensitivity (Testing Set):',round(recall_score(y_test,y_pred),4))
print('F1-Score (Testing Set):', round(f1_score(y_test,y_pred),4))
print('AUC (Testing Set):',round(roc_auc_score(y_test, y_pred),4))

school_closures+restriction_gatherings+cancel_public_events+public_information_campaigns+restrictions_internal_movements+income_support+facial_coverings+vaccination_policy+testing_policy+contact_tracing+new_cases_smoothed_per_million+new_deaths_smoothed_per_million+population+population_density+median_age+aged_65_older+gdp_per_capita+cardiovasc_death_rate+diabetes_prevalence+hospital_beds_per_thousand+life_expectancy+human_development_index+international_travel_controls_binary
Optimization terminated successfully.
         Current function value: 0.394309
         Iterations 8
                           Logit Regression Results                           
Dep. Variable:                  label   No. Observations:                 5220
Model:                          Logit   Df Residuals:                     5196
Method:                           MLE   Df Model:                           23
Date:                Wed, 15 Sep 2021   Pseudo R-squ.:                  0.4222
Time:                

Unnamed: 0,True Y=0,True Y=1
Predicted Y=0,2683,519
Predicted Y=1,308,1710


Accuracy (Testing Set): 0.8416
Precision (Testing Set): 0.8474
Specificity (Testing Set): 0.897
Sensitivity (Testing Set): 0.7672
F1-Score (Testing Set): 0.8053
AUC (Testing Set): 0.8321


Unnamed: 0,True Y=0,True Y=1
Predicted Y=0,694,106
Predicted Y=1,86,419


Accuracy (Testing Set): 0.8529
Precision (Testing Set): 0.8297
Specificity (Testing Set): 0.8897
Sensitivity (Testing Set): 0.7981
F1-Score (Testing Set): 0.8136
AUC (Testing Set): 0.8439


# removing gdp_per_capita

In [119]:
train = train.drop(columns = 'gdp_per_capita')
test = test.drop(columns = 'gdp_per_capita')

all_predictors = "+".join(train.drop(['label'],axis=1).columns)
print(all_predictors)

f = 'label ~ '+all_predictors
logitfit = smf.logit(formula = f, data = train).fit()
print(logitfit.summary())

y_pred = np.array(logitfit.predict(train) > 0.5, dtype=float)
y_test = np.array(train['label'], dtype=float)
cm = pd.DataFrame(confusion_matrix(y_pred, y_test))
cm.columns = ['True Y=0','True Y=1']
cm.index = ['Predicted Y=0','Predicted Y=1']
display(cm)
specificity = cm.iloc[0, 0]/(cm.iloc[0, 0] + cm.iloc[1, 0])
print("Accuracy (Testing Set):",round(accuracy_score(y_test, y_pred),4))
print('Precision (Testing Set):', round(precision_score(y_test, y_pred),4))
print('Specificity (Testing Set):', round(specificity,4))
print('Sensitivity (Testing Set):',round(recall_score(y_test,y_pred),4))
print('F1-Score (Testing Set):', round(f1_score(y_test,y_pred),4))
print('AUC (Testing Set):',round(roc_auc_score(y_test, y_pred),4))

y_pred = np.array(logitfit.predict(test) > 0.5, dtype=float)
y_test = np.array(test['label'], dtype=float)
cm = pd.DataFrame(confusion_matrix(y_pred, y_test))
cm.columns = ['True Y=0','True Y=1']
cm.index = ['Predicted Y=0','Predicted Y=1']
display(cm)
specificity = cm.iloc[0, 0]/(cm.iloc[0, 0] + cm.iloc[1, 0])
print("Accuracy (Testing Set):",round(accuracy_score(y_test, y_pred),4))
print('Precision (Testing Set):', round(precision_score(y_test, y_pred),4))
print('Specificity (Testing Set):', round(specificity,4))
print('Sensitivity (Testing Set):',round(recall_score(y_test,y_pred),4))
print('F1-Score (Testing Set):', round(f1_score(y_test,y_pred),4))
print('AUC (Testing Set):',round(roc_auc_score(y_test, y_pred),4))

school_closures+restriction_gatherings+cancel_public_events+public_information_campaigns+restrictions_internal_movements+income_support+facial_coverings+vaccination_policy+testing_policy+contact_tracing+new_cases_smoothed_per_million+new_deaths_smoothed_per_million+population+population_density+median_age+aged_65_older+cardiovasc_death_rate+diabetes_prevalence+hospital_beds_per_thousand+life_expectancy+human_development_index+international_travel_controls_binary
Optimization terminated successfully.
         Current function value: 0.394327
         Iterations 8
                           Logit Regression Results                           
Dep. Variable:                  label   No. Observations:                 5220
Model:                          Logit   Df Residuals:                     5197
Method:                           MLE   Df Model:                           22
Date:                Wed, 15 Sep 2021   Pseudo R-squ.:                  0.4222
Time:                        21:12:2

Unnamed: 0,True Y=0,True Y=1
Predicted Y=0,2682,515
Predicted Y=1,309,1714


Accuracy (Testing Set): 0.8421
Precision (Testing Set): 0.8473
Specificity (Testing Set): 0.8967
Sensitivity (Testing Set): 0.769
F1-Score (Testing Set): 0.8062
AUC (Testing Set): 0.8328


Unnamed: 0,True Y=0,True Y=1
Predicted Y=0,694,104
Predicted Y=1,86,421


Accuracy (Testing Set): 0.8544
Precision (Testing Set): 0.8304
Specificity (Testing Set): 0.8897
Sensitivity (Testing Set): 0.8019
F1-Score (Testing Set): 0.8159
AUC (Testing Set): 0.8458


# removing population_density

In [120]:
train = train.drop(columns = 'population_density')
test = test.drop(columns = 'population_density')

all_predictors = "+".join(train.drop(['label'],axis=1).columns)
print(all_predictors)

f = 'label ~ '+all_predictors
logitfit = smf.logit(formula = f, data = train).fit()
print(logitfit.summary())

y_pred = np.array(logitfit.predict(train) > 0.5, dtype=float)
y_test = np.array(train['label'], dtype=float)
cm = pd.DataFrame(confusion_matrix(y_pred, y_test))
cm.columns = ['True Y=0','True Y=1']
cm.index = ['Predicted Y=0','Predicted Y=1']
display(cm)
specificity = cm.iloc[0, 0]/(cm.iloc[0, 0] + cm.iloc[1, 0])
print("Accuracy (Testing Set):",round(accuracy_score(y_test, y_pred),4))
print('Precision (Testing Set):', round(precision_score(y_test, y_pred),4))
print('Specificity (Testing Set):', round(specificity,4))
print('Sensitivity (Testing Set):',round(recall_score(y_test,y_pred),4))
print('F1-Score (Testing Set):', round(f1_score(y_test,y_pred),4))
print('AUC (Testing Set):',round(roc_auc_score(y_test, y_pred),4))

y_pred = np.array(logitfit.predict(test) > 0.5, dtype=float)
y_test = np.array(test['label'], dtype=float)
cm = pd.DataFrame(confusion_matrix(y_pred, y_test))
cm.columns = ['True Y=0','True Y=1']
cm.index = ['Predicted Y=0','Predicted Y=1']
display(cm)
specificity = cm.iloc[0, 0]/(cm.iloc[0, 0] + cm.iloc[1, 0])
print("Accuracy (Testing Set):",round(accuracy_score(y_test, y_pred),4))
print('Precision (Testing Set):', round(precision_score(y_test, y_pred),4))
print('Specificity (Testing Set):', round(specificity,4))
print('Sensitivity (Testing Set):',round(recall_score(y_test,y_pred),4))
print('F1-Score (Testing Set):', round(f1_score(y_test,y_pred),4))
print('AUC (Testing Set):',round(roc_auc_score(y_test, y_pred),4))

school_closures+restriction_gatherings+cancel_public_events+public_information_campaigns+restrictions_internal_movements+income_support+facial_coverings+vaccination_policy+testing_policy+contact_tracing+new_cases_smoothed_per_million+new_deaths_smoothed_per_million+population+median_age+aged_65_older+cardiovasc_death_rate+diabetes_prevalence+hospital_beds_per_thousand+life_expectancy+human_development_index+international_travel_controls_binary
Optimization terminated successfully.
         Current function value: 0.394327
         Iterations 8
                           Logit Regression Results                           
Dep. Variable:                  label   No. Observations:                 5220
Model:                          Logit   Df Residuals:                     5198
Method:                           MLE   Df Model:                           21
Date:                Wed, 15 Sep 2021   Pseudo R-squ.:                  0.4222
Time:                        21:12:27   Log-Likelihood:

Unnamed: 0,True Y=0,True Y=1
Predicted Y=0,2682,515
Predicted Y=1,309,1714


Accuracy (Testing Set): 0.8421
Precision (Testing Set): 0.8473
Specificity (Testing Set): 0.8967
Sensitivity (Testing Set): 0.769
F1-Score (Testing Set): 0.8062
AUC (Testing Set): 0.8328


Unnamed: 0,True Y=0,True Y=1
Predicted Y=0,694,104
Predicted Y=1,86,421


Accuracy (Testing Set): 0.8544
Precision (Testing Set): 0.8304
Specificity (Testing Set): 0.8897
Sensitivity (Testing Set): 0.8019
F1-Score (Testing Set): 0.8159
AUC (Testing Set): 0.8458


# removing aged_65_older

In [121]:
train = train.drop(columns = 'aged_65_older')
test = test.drop(columns = 'aged_65_older')

all_predictors = "+".join(train.drop(['label'],axis=1).columns)
print(all_predictors)

f = 'label ~ '+all_predictors
logitfit = smf.logit(formula = f, data = train).fit()
print(logitfit.summary())

y_pred = np.array(logitfit.predict(train) > 0.5, dtype=float)
y_test = np.array(train['label'], dtype=float)
cm = pd.DataFrame(confusion_matrix(y_pred, y_test))
cm.columns = ['True Y=0','True Y=1']
cm.index = ['Predicted Y=0','Predicted Y=1']
display(cm)
specificity = cm.iloc[0, 0]/(cm.iloc[0, 0] + cm.iloc[1, 0])
print("Accuracy (Testing Set):",round(accuracy_score(y_test, y_pred),4))
print('Precision (Testing Set):', round(precision_score(y_test, y_pred),4))
print('Specificity (Testing Set):', round(specificity,4))
print('Sensitivity (Testing Set):',round(recall_score(y_test,y_pred),4))
print('F1-Score (Testing Set):', round(f1_score(y_test,y_pred),4))
print('AUC (Testing Set):',round(roc_auc_score(y_test, y_pred),4))

y_pred = np.array(logitfit.predict(test) > 0.5, dtype=float)
y_test = np.array(test['label'], dtype=float)
cm = pd.DataFrame(confusion_matrix(y_pred, y_test))
cm.columns = ['True Y=0','True Y=1']
cm.index = ['Predicted Y=0','Predicted Y=1']
display(cm)
specificity = cm.iloc[0, 0]/(cm.iloc[0, 0] + cm.iloc[1, 0])
print("Accuracy (Testing Set):",round(accuracy_score(y_test, y_pred),4))
print('Precision (Testing Set):', round(precision_score(y_test, y_pred),4))
print('Specificity (Testing Set):', round(specificity,4))
print('Sensitivity (Testing Set):',round(recall_score(y_test,y_pred),4))
print('F1-Score (Testing Set):', round(f1_score(y_test,y_pred),4))
print('AUC (Testing Set):',round(roc_auc_score(y_test, y_pred),4))

school_closures+restriction_gatherings+cancel_public_events+public_information_campaigns+restrictions_internal_movements+income_support+facial_coverings+vaccination_policy+testing_policy+contact_tracing+new_cases_smoothed_per_million+new_deaths_smoothed_per_million+population+median_age+cardiovasc_death_rate+diabetes_prevalence+hospital_beds_per_thousand+life_expectancy+human_development_index+international_travel_controls_binary
Optimization terminated successfully.
         Current function value: 0.394337
         Iterations 8
                           Logit Regression Results                           
Dep. Variable:                  label   No. Observations:                 5220
Model:                          Logit   Df Residuals:                     5199
Method:                           MLE   Df Model:                           20
Date:                Wed, 15 Sep 2021   Pseudo R-squ.:                  0.4222
Time:                        21:12:28   Log-Likelihood:              

Unnamed: 0,True Y=0,True Y=1
Predicted Y=0,2682,516
Predicted Y=1,309,1713


Accuracy (Testing Set): 0.842
Precision (Testing Set): 0.8472
Specificity (Testing Set): 0.8967
Sensitivity (Testing Set): 0.7685
F1-Score (Testing Set): 0.8059
AUC (Testing Set): 0.8326


Unnamed: 0,True Y=0,True Y=1
Predicted Y=0,694,104
Predicted Y=1,86,421


Accuracy (Testing Set): 0.8544
Precision (Testing Set): 0.8304
Specificity (Testing Set): 0.8897
Sensitivity (Testing Set): 0.8019
F1-Score (Testing Set): 0.8159
AUC (Testing Set): 0.8458


# removing vaccination_policy

In [122]:
train = train.drop(columns = 'vaccination_policy')
test = test.drop(columns = 'vaccination_policy')

all_predictors = "+".join(train.drop(['label'],axis=1).columns)
print(all_predictors)

f = 'label ~ '+all_predictors
logitfit = smf.logit(formula = f, data = train).fit()
print(logitfit.summary())

y_pred = np.array(logitfit.predict(train) > 0.5, dtype=float)
y_test = np.array(train['label'], dtype=float)
cm = pd.DataFrame(confusion_matrix(y_pred, y_test))
cm.columns = ['True Y=0','True Y=1']
cm.index = ['Predicted Y=0','Predicted Y=1']
display(cm)
specificity = cm.iloc[0, 0]/(cm.iloc[0, 0] + cm.iloc[1, 0])
print("Accuracy (Testing Set):",round(accuracy_score(y_test, y_pred),4))
print('Precision (Testing Set):', round(precision_score(y_test, y_pred),4))
print('Specificity (Testing Set):', round(specificity,4))
print('Sensitivity (Testing Set):',round(recall_score(y_test,y_pred),4))
print('F1-Score (Testing Set):', round(f1_score(y_test,y_pred),4))
print('AUC (Testing Set):',round(roc_auc_score(y_test, y_pred),4))

y_pred = np.array(logitfit.predict(test) > 0.5, dtype=float)
y_test = np.array(test['label'], dtype=float)
cm = pd.DataFrame(confusion_matrix(y_pred, y_test))
cm.columns = ['True Y=0','True Y=1']
cm.index = ['Predicted Y=0','Predicted Y=1']
display(cm)
specificity = cm.iloc[0, 0]/(cm.iloc[0, 0] + cm.iloc[1, 0])
print("Accuracy (Testing Set):",round(accuracy_score(y_test, y_pred),4))
print('Precision (Testing Set):', round(precision_score(y_test, y_pred),4))
print('Specificity (Testing Set):', round(specificity,4))
print('Sensitivity (Testing Set):',round(recall_score(y_test,y_pred),4))
print('F1-Score (Testing Set):', round(f1_score(y_test,y_pred),4))
print('AUC (Testing Set):',round(roc_auc_score(y_test, y_pred),4))

school_closures+restriction_gatherings+cancel_public_events+public_information_campaigns+restrictions_internal_movements+income_support+facial_coverings+testing_policy+contact_tracing+new_cases_smoothed_per_million+new_deaths_smoothed_per_million+population+median_age+cardiovasc_death_rate+diabetes_prevalence+hospital_beds_per_thousand+life_expectancy+human_development_index+international_travel_controls_binary
Optimization terminated successfully.
         Current function value: 0.394511
         Iterations 8
                           Logit Regression Results                           
Dep. Variable:                  label   No. Observations:                 5220
Model:                          Logit   Df Residuals:                     5200
Method:                           MLE   Df Model:                           19
Date:                Wed, 15 Sep 2021   Pseudo R-squ.:                  0.4219
Time:                        21:12:28   Log-Likelihood:                -2059.3
converged

Unnamed: 0,True Y=0,True Y=1
Predicted Y=0,2684,519
Predicted Y=1,307,1710


Accuracy (Testing Set): 0.8418
Precision (Testing Set): 0.8478
Specificity (Testing Set): 0.8974
Sensitivity (Testing Set): 0.7672
F1-Score (Testing Set): 0.8055
AUC (Testing Set): 0.8323


Unnamed: 0,True Y=0,True Y=1
Predicted Y=0,694,105
Predicted Y=1,86,420


Accuracy (Testing Set): 0.8536
Precision (Testing Set): 0.83
Specificity (Testing Set): 0.8897
Sensitivity (Testing Set): 0.8
F1-Score (Testing Set): 0.8147
AUC (Testing Set): 0.8449


# removing restriction_gatherings

In [123]:
train = train.drop(columns = 'restriction_gatherings')
test = test.drop(columns = 'restriction_gatherings')

all_predictors = "+".join(train.drop(['label'],axis=1).columns)
print(all_predictors)

f = 'label ~ '+all_predictors
logitfit = smf.logit(formula = f, data = train).fit()
print(logitfit.summary())

y_pred = np.array(logitfit.predict(train) > 0.5, dtype=float)
y_test = np.array(train['label'], dtype=float)
cm = pd.DataFrame(confusion_matrix(y_pred, y_test))
cm.columns = ['True Y=0','True Y=1']
cm.index = ['Predicted Y=0','Predicted Y=1']
display(cm)
specificity = cm.iloc[0, 0]/(cm.iloc[0, 0] + cm.iloc[1, 0])
print("Accuracy (Testing Set):",round(accuracy_score(y_test, y_pred),4))
print('Precision (Testing Set):', round(precision_score(y_test, y_pred),4))
print('Specificity (Testing Set):', round(specificity,4))
print('Sensitivity (Testing Set):',round(recall_score(y_test,y_pred),4))
print('F1-Score (Testing Set):', round(f1_score(y_test,y_pred),4))
print('AUC (Testing Set):',round(roc_auc_score(y_test, y_pred),4))

y_pred = np.array(logitfit.predict(test) > 0.5, dtype=float)
y_test = np.array(test['label'], dtype=float)
cm = pd.DataFrame(confusion_matrix(y_pred, y_test))
cm.columns = ['True Y=0','True Y=1']
cm.index = ['Predicted Y=0','Predicted Y=1']
display(cm)
specificity = cm.iloc[0, 0]/(cm.iloc[0, 0] + cm.iloc[1, 0])
print("Accuracy (Testing Set):",round(accuracy_score(y_test, y_pred),4))
print('Precision (Testing Set):', round(precision_score(y_test, y_pred),4))
print('Specificity (Testing Set):', round(specificity,4))
print('Sensitivity (Testing Set):',round(recall_score(y_test,y_pred),4))
print('F1-Score (Testing Set):', round(f1_score(y_test,y_pred),4))
print('AUC (Testing Set):',round(roc_auc_score(y_test, y_pred),4))

school_closures+cancel_public_events+public_information_campaigns+restrictions_internal_movements+income_support+facial_coverings+testing_policy+contact_tracing+new_cases_smoothed_per_million+new_deaths_smoothed_per_million+population+median_age+cardiovasc_death_rate+diabetes_prevalence+hospital_beds_per_thousand+life_expectancy+human_development_index+international_travel_controls_binary
Optimization terminated successfully.
         Current function value: 0.394673
         Iterations 8
                           Logit Regression Results                           
Dep. Variable:                  label   No. Observations:                 5220
Model:                          Logit   Df Residuals:                     5201
Method:                           MLE   Df Model:                           18
Date:                Wed, 15 Sep 2021   Pseudo R-squ.:                  0.4217
Time:                        21:12:28   Log-Likelihood:                -2060.2
converged:                      

Unnamed: 0,True Y=0,True Y=1
Predicted Y=0,2684,512
Predicted Y=1,307,1717


Accuracy (Testing Set): 0.8431
Precision (Testing Set): 0.8483
Specificity (Testing Set): 0.8974
Sensitivity (Testing Set): 0.7703
F1-Score (Testing Set): 0.8074
AUC (Testing Set): 0.8338


Unnamed: 0,True Y=0,True Y=1
Predicted Y=0,694,104
Predicted Y=1,86,421


Accuracy (Testing Set): 0.8544
Precision (Testing Set): 0.8304
Specificity (Testing Set): 0.8897
Sensitivity (Testing Set): 0.8019
F1-Score (Testing Set): 0.8159
AUC (Testing Set): 0.8458


# removing new_deaths_smoothed_per_million

In [124]:
train = train.drop(columns = 'new_deaths_smoothed_per_million')
test = test.drop(columns = 'new_deaths_smoothed_per_million')

all_predictors = "+".join(train.drop(['label'],axis=1).columns)
print(all_predictors)

f = 'label ~ '+all_predictors
logitfit = smf.logit(formula = f, data = train).fit()
print(logitfit.summary())

y_pred = np.array(logitfit.predict(train) > 0.5, dtype=float)
y_test = np.array(train['label'], dtype=float)
cm = pd.DataFrame(confusion_matrix(y_pred, y_test))
cm.columns = ['True Y=0','True Y=1']
cm.index = ['Predicted Y=0','Predicted Y=1']
display(cm)
specificity = cm.iloc[0, 0]/(cm.iloc[0, 0] + cm.iloc[1, 0])
print("Accuracy (Testing Set):",round(accuracy_score(y_test, y_pred),4))
print('Precision (Testing Set):', round(precision_score(y_test, y_pred),4))
print('Specificity (Testing Set):', round(specificity,4))
print('Sensitivity (Testing Set):',round(recall_score(y_test,y_pred),4))
print('F1-Score (Testing Set):', round(f1_score(y_test,y_pred),4))
print('AUC (Testing Set):',round(roc_auc_score(y_test, y_pred),4))

y_pred = np.array(logitfit.predict(test) > 0.5, dtype=float)
y_test = np.array(test['label'], dtype=float)
cm = pd.DataFrame(confusion_matrix(y_pred, y_test))
cm.columns = ['True Y=0','True Y=1']
cm.index = ['Predicted Y=0','Predicted Y=1']
display(cm)
specificity = cm.iloc[0, 0]/(cm.iloc[0, 0] + cm.iloc[1, 0])
print("Accuracy (Testing Set):",round(accuracy_score(y_test, y_pred),4))
print('Precision (Testing Set):', round(precision_score(y_test, y_pred),4))
print('Specificity (Testing Set):', round(specificity,4))
print('Sensitivity (Testing Set):',round(recall_score(y_test,y_pred),4))
print('F1-Score (Testing Set):', round(f1_score(y_test,y_pred),4))
print('AUC (Testing Set):',round(roc_auc_score(y_test, y_pred),4))

school_closures+cancel_public_events+public_information_campaigns+restrictions_internal_movements+income_support+facial_coverings+testing_policy+contact_tracing+new_cases_smoothed_per_million+population+median_age+cardiovasc_death_rate+diabetes_prevalence+hospital_beds_per_thousand+life_expectancy+human_development_index+international_travel_controls_binary
Optimization terminated successfully.
         Current function value: 0.394969
         Iterations 8
                           Logit Regression Results                           
Dep. Variable:                  label   No. Observations:                 5220
Model:                          Logit   Df Residuals:                     5202
Method:                           MLE   Df Model:                           17
Date:                Wed, 15 Sep 2021   Pseudo R-squ.:                  0.4213
Time:                        21:12:29   Log-Likelihood:                -2061.7
converged:                       True   LL-Null:                

Unnamed: 0,True Y=0,True Y=1
Predicted Y=0,2683,513
Predicted Y=1,308,1716


Accuracy (Testing Set): 0.8427
Precision (Testing Set): 0.8478
Specificity (Testing Set): 0.897
Sensitivity (Testing Set): 0.7699
F1-Score (Testing Set): 0.807
AUC (Testing Set): 0.8334


Unnamed: 0,True Y=0,True Y=1
Predicted Y=0,694,104
Predicted Y=1,86,421


Accuracy (Testing Set): 0.8544
Precision (Testing Set): 0.8304
Specificity (Testing Set): 0.8897
Sensitivity (Testing Set): 0.8019
F1-Score (Testing Set): 0.8159
AUC (Testing Set): 0.8458


# removing hospital_beds_per_thousand

In [125]:
train = train.drop(columns = 'hospital_beds_per_thousand')
test = test.drop(columns = 'hospital_beds_per_thousand')

all_predictors = "+".join(train.drop(['label'],axis=1).columns)
print(all_predictors)

f = 'label ~ '+all_predictors
logitfit = smf.logit(formula = f, data = train).fit()
print(logitfit.summary())

y_pred = np.array(logitfit.predict(train) > 0.5, dtype=float)
y_test = np.array(train['label'], dtype=float)
cm = pd.DataFrame(confusion_matrix(y_pred, y_test))
cm.columns = ['True Y=0','True Y=1']
cm.index = ['Predicted Y=0','Predicted Y=1']
display(cm)
specificity = cm.iloc[0, 0]/(cm.iloc[0, 0] + cm.iloc[1, 0])
print("Accuracy (Testing Set):",round(accuracy_score(y_test, y_pred),4))
print('Precision (Testing Set):', round(precision_score(y_test, y_pred),4))
print('Specificity (Testing Set):', round(specificity,4))
print('Sensitivity (Testing Set):',round(recall_score(y_test,y_pred),4))
print('F1-Score (Testing Set):', round(f1_score(y_test,y_pred),4))
print('AUC (Testing Set):',round(roc_auc_score(y_test, y_pred),4))

y_pred = np.array(logitfit.predict(test) > 0.5, dtype=float)
y_test = np.array(test['label'], dtype=float)
cm = pd.DataFrame(confusion_matrix(y_pred, y_test))
cm.columns = ['True Y=0','True Y=1']
cm.index = ['Predicted Y=0','Predicted Y=1']
display(cm)
specificity = cm.iloc[0, 0]/(cm.iloc[0, 0] + cm.iloc[1, 0])
print("Accuracy (Testing Set):",round(accuracy_score(y_test, y_pred),4))
print('Precision (Testing Set):', round(precision_score(y_test, y_pred),4))
print('Specificity (Testing Set):', round(specificity,4))
print('Sensitivity (Testing Set):',round(recall_score(y_test,y_pred),4))
print('F1-Score (Testing Set):', round(f1_score(y_test,y_pred),4))
print('AUC (Testing Set):',round(roc_auc_score(y_test, y_pred),4))

school_closures+cancel_public_events+public_information_campaigns+restrictions_internal_movements+income_support+facial_coverings+testing_policy+contact_tracing+new_cases_smoothed_per_million+population+median_age+cardiovasc_death_rate+diabetes_prevalence+life_expectancy+human_development_index+international_travel_controls_binary
Optimization terminated successfully.
         Current function value: 0.395284
         Iterations 8
                           Logit Regression Results                           
Dep. Variable:                  label   No. Observations:                 5220
Model:                          Logit   Df Residuals:                     5203
Method:                           MLE   Df Model:                           16
Date:                Wed, 15 Sep 2021   Pseudo R-squ.:                  0.4208
Time:                        21:12:29   Log-Likelihood:                -2063.4
converged:                       True   LL-Null:                       -3562.4
Covariance T

Unnamed: 0,True Y=0,True Y=1
Predicted Y=0,2682,515
Predicted Y=1,309,1714


Accuracy (Testing Set): 0.8421
Precision (Testing Set): 0.8473
Specificity (Testing Set): 0.8967
Sensitivity (Testing Set): 0.769
F1-Score (Testing Set): 0.8062
AUC (Testing Set): 0.8328


Unnamed: 0,True Y=0,True Y=1
Predicted Y=0,694,105
Predicted Y=1,86,420


Accuracy (Testing Set): 0.8536
Precision (Testing Set): 0.83
Specificity (Testing Set): 0.8897
Sensitivity (Testing Set): 0.8
F1-Score (Testing Set): 0.8147
AUC (Testing Set): 0.8449


# removing cancel_public_events

In [126]:
train = train.drop(columns = 'cancel_public_events')
test = test.drop(columns = 'cancel_public_events')

all_predictors = "+".join(train.drop(['label'],axis=1).columns)
print(all_predictors)

f = 'label ~ '+all_predictors
logitfit = smf.logit(formula = f, data = train).fit()
print(logitfit.summary())

y_pred = np.array(logitfit.predict(train) > 0.5, dtype=float)
y_test = np.array(train['label'], dtype=float)
cm = pd.DataFrame(confusion_matrix(y_pred, y_test))
cm.columns = ['True Y=0','True Y=1']
cm.index = ['Predicted Y=0','Predicted Y=1']
display(cm)
specificity = cm.iloc[0, 0]/(cm.iloc[0, 0] + cm.iloc[1, 0])
print("Accuracy (Testing Set):",round(accuracy_score(y_test, y_pred),4))
print('Precision (Testing Set):', round(precision_score(y_test, y_pred),4))
print('Specificity (Testing Set):', round(specificity,4))
print('Sensitivity (Testing Set):',round(recall_score(y_test,y_pred),4))
print('F1-Score (Testing Set):', round(f1_score(y_test,y_pred),4))
print('AUC (Testing Set):',round(roc_auc_score(y_test, y_pred),4))

y_pred = np.array(logitfit.predict(test) > 0.5, dtype=float)
y_test = np.array(test['label'], dtype=float)
cm = pd.DataFrame(confusion_matrix(y_pred, y_test))
cm.columns = ['True Y=0','True Y=1']
cm.index = ['Predicted Y=0','Predicted Y=1']
display(cm)
specificity = cm.iloc[0, 0]/(cm.iloc[0, 0] + cm.iloc[1, 0])
print("Accuracy (Testing Set):",round(accuracy_score(y_test, y_pred),4))
print('Precision (Testing Set):', round(precision_score(y_test, y_pred),4))
print('Specificity (Testing Set):', round(specificity,4))
print('Sensitivity (Testing Set):',round(recall_score(y_test,y_pred),4))
print('F1-Score (Testing Set):', round(f1_score(y_test,y_pred),4))
print('AUC (Testing Set):',round(roc_auc_score(y_test, y_pred),4))

school_closures+public_information_campaigns+restrictions_internal_movements+income_support+facial_coverings+testing_policy+contact_tracing+new_cases_smoothed_per_million+population+median_age+cardiovasc_death_rate+diabetes_prevalence+life_expectancy+human_development_index+international_travel_controls_binary
Optimization terminated successfully.
         Current function value: 0.395619
         Iterations 8
                           Logit Regression Results                           
Dep. Variable:                  label   No. Observations:                 5220
Model:                          Logit   Df Residuals:                     5204
Method:                           MLE   Df Model:                           15
Date:                Wed, 15 Sep 2021   Pseudo R-squ.:                  0.4203
Time:                        21:12:30   Log-Likelihood:                -2065.1
converged:                       True   LL-Null:                       -3562.4
Covariance Type:            nonro

Unnamed: 0,True Y=0,True Y=1
Predicted Y=0,2684,524
Predicted Y=1,307,1705


Accuracy (Testing Set): 0.8408
Precision (Testing Set): 0.8474
Specificity (Testing Set): 0.8974
Sensitivity (Testing Set): 0.7649
F1-Score (Testing Set): 0.8041
AUC (Testing Set): 0.8311


Unnamed: 0,True Y=0,True Y=1
Predicted Y=0,694,105
Predicted Y=1,86,420


Accuracy (Testing Set): 0.8536
Precision (Testing Set): 0.83
Specificity (Testing Set): 0.8897
Sensitivity (Testing Set): 0.8
F1-Score (Testing Set): 0.8147
AUC (Testing Set): 0.8449


# removing median_age and final prediction

In [127]:
train = train.drop(columns = 'median_age')
test = test.drop(columns = 'median_age')

all_predictors = "+".join(train.drop(['label'],axis=1).columns)
print(all_predictors)

f = 'label ~ '+all_predictors
logitfit = smf.logit(formula = f, data = train).fit()
print(logitfit.summary())

y_pred = np.array(logitfit.predict(train) > 0.5, dtype=float)
y_test = np.array(train['label'], dtype=float)
cm = pd.DataFrame(confusion_matrix(y_pred, y_test))
cm.columns = ['True Y=0','True Y=1']
cm.index = ['Predicted Y=0','Predicted Y=1']
display(cm)
specificity = cm.iloc[0, 0]/(cm.iloc[0, 0] + cm.iloc[1, 0])
print("Accuracy (Testing Set):",round(accuracy_score(y_test, y_pred),4))
print('Precision (Testing Set):', round(precision_score(y_test, y_pred),4))
print('Specificity (Testing Set):', round(specificity,4))
print('Sensitivity (Testing Set):',round(recall_score(y_test,y_pred),4))
print('F1-Score (Testing Set):', round(f1_score(y_test,y_pred),4))
print('AUC (Testing Set):',round(roc_auc_score(y_test, y_pred),4))

y_pred = np.array(logitfit.predict(test) > 0.5, dtype=float)
y_test = np.array(test['label'], dtype=float)
cm = pd.DataFrame(confusion_matrix(y_pred, y_test))
cm.columns = ['True Y=0','True Y=1']
cm.index = ['Predicted Y=0','Predicted Y=1']
display(cm)
specificity = cm.iloc[0, 0]/(cm.iloc[0, 0] + cm.iloc[1, 0])
print("Accuracy (Testing Set):",round(accuracy_score(y_test, y_pred),4))
print('Precision (Testing Set):', round(precision_score(y_test, y_pred),4))
print('Specificity (Testing Set):', round(specificity,4))
print('Sensitivity (Testing Set):',round(recall_score(y_test,y_pred),4))
print('F1-Score (Testing Set):', round(f1_score(y_test,y_pred),4))
print('AUC (Testing Set):',round(roc_auc_score(y_test, y_pred),4))

school_closures+public_information_campaigns+restrictions_internal_movements+income_support+facial_coverings+testing_policy+contact_tracing+new_cases_smoothed_per_million+population+cardiovasc_death_rate+diabetes_prevalence+life_expectancy+human_development_index+international_travel_controls_binary
Optimization terminated successfully.
         Current function value: 0.395891
         Iterations 8
                           Logit Regression Results                           
Dep. Variable:                  label   No. Observations:                 5220
Model:                          Logit   Df Residuals:                     5205
Method:                           MLE   Df Model:                           14
Date:                Wed, 15 Sep 2021   Pseudo R-squ.:                  0.4199
Time:                        21:12:30   Log-Likelihood:                -2066.6
converged:                       True   LL-Null:                       -3562.4
Covariance Type:            nonrobust   LLR 

Unnamed: 0,True Y=0,True Y=1
Predicted Y=0,2683,506
Predicted Y=1,308,1723


Accuracy (Testing Set): 0.8441
Precision (Testing Set): 0.8484
Specificity (Testing Set): 0.897
Sensitivity (Testing Set): 0.773
F1-Score (Testing Set): 0.8089
AUC (Testing Set): 0.835


Unnamed: 0,True Y=0,True Y=1
Predicted Y=0,694,104
Predicted Y=1,86,421


Accuracy (Testing Set): 0.8544
Precision (Testing Set): 0.8304
Specificity (Testing Set): 0.8897
Sensitivity (Testing Set): 0.8019
F1-Score (Testing Set): 0.8159
AUC (Testing Set): 0.8458
