In [79]:
import numpy as np
import matplotlib.pyplot as plt
from utils import data
import os
import sklearn
import numpy as np
from sklearn.neighbors import (
    KNeighborsClassifier,
    DistanceMetric
)
import json

In [82]:
BASE_PATH = 'COVID-19/csse_covid_19_data/'
MIN_CASES = 1000

In [83]:
confirmed = os.path.join(
    BASE_PATH, 
    'csse_covid_19_time_series',
    'time_series_covid19_confirmed_global.csv')
confirmed = data.load_csv_data(confirmed)
features = []
targets = []

for val in np.unique(confirmed["Country/Region"]):
    df = data.filter_by_attribute(
        confirmed, "Country/Region", val)
    cases, labels = data.get_cases_chronologically(df)
    features.append(cases)
    targets.append(labels)

features = np.concatenate(features, axis=0)
targets = np.concatenate(targets, axis=0)

In [84]:
features

array([[0, 0, 0, ..., 71838, 72977, 74026],
       [0, 0, 0, ..., 132315, 132337, 132351],
       [0, 0, 0, ..., 128913, 129218, 129640],
       ...,
       [0, 0, 0, ..., 6742, 6751, 6759],
       [0, 0, 0, ..., 95263, 95821, 96563],
       [0, 0, 0, ..., 38961, 38998, 39031]], dtype=object)

In [85]:
targets_adj = targets[:,1]

<h1>Stochastic Gradient Descent</h1>

In [86]:
from sklearn.linear_model import SGDClassifier

In [87]:
model = SGDClassifier(loss="hinge", penalty="l2", max_iter=200000)
model = model.fit(features,targets_adj)



In [90]:
pred=model.predict(features)

In [91]:
np.sum(targets_adj==pred)/len(targets_adj)

0.007246376811594203

<h1>Decision Trees</h1>

In [92]:
from sklearn import tree
model = tree.DecisionTreeClassifier()
model = model.fit(features,targets_adj)

In [93]:
pred = model.predict(features)

In [94]:
np.sum(targets_adj==pred)/len(targets_adj)

1.0

Now slightly perturb input features with some random noise and see if decision trees are still accurate

In [95]:
r=features.shape[0]
c=features.shape[1]
pert = np.random.rand(r,c)*2 -1
features_perturbed = features+pert

In [96]:
pred = model.predict(features_perturbed)
np.sum(targets_adj==pred)/len(targets_adj)

0.7028985507246377

<h1>Clustering Countries by Covid-19 Data</h1>

In [97]:
from sklearn.cluster import KMeans

<h2>Clustering with 3 clusters</h2>

In [98]:
clusters = KMeans(n_clusters=3, random_state=0).fit(features)
clust=clusters.labels_

In [99]:
targets_adj[np.where(clust==1)]

array(['Brazil', 'India'], dtype=object)

In [100]:
targets_adj[np.where(clust==2)]

array(['US'], dtype=object)

In [101]:
targets_adj[np.where(clust==0)]

array(['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola',
       'Antigua and Barbuda', 'Argentina', 'Armenia', 'Australia',
       'Australia', 'Australia', 'Australia', 'Australia', 'Australia',
       'Australia', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas',
       'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium',
       'Belize', 'Benin', 'Bhutan', 'Bolivia', 'Bosnia and Herzegovina',
       'Botswana', 'Brunei', 'Bulgaria', 'Burkina Faso', 'Burma',
       'Burundi', 'Cabo Verde', 'Cambodia', 'Cameroon', 'Canada',
       'Canada', 'Canada', 'Canada', 'Canada', 'Canada', 'Canada',
       'Canada', 'Canada', 'Canada', 'Canada', 'Canada', 'Canada',
       'Canada', 'Canada', 'Canada', 'Central African Republic', 'Chad',
       'Chile', 'China', 'China', 'China', 'China', 'China', 'China',
       'China', 'China', 'China', 'China', 'China', 'China', 'China',
       'China', 'China', 'China', 'China', 'China', 'China', 'China',
       'China', 'China', 'China', 'Chin

That's an interesting result and one that makes sense. Brazil and India are clutered together due to their still-increasing high numbers of positive cases. US is clustered by itself because it had extremely high case numbers but is rapidly decreasing in new cases due to successful vaccination efforts. The rest of the countries are then grouped together.

<h2>Clustering with 5 clusters</h2>

In [102]:
clusters = KMeans(n_clusters=5, random_state=0).fit(features)
clust=clusters.labels_

In [103]:
targets_adj[np.where(clust==1)]

array(['Brazil', 'India'], dtype=object)

In [104]:
targets_adj[np.where(clust==2)]

array(['US'], dtype=object)

In [105]:
targets_adj[np.where(clust==3)]

array(['Argentina', 'Colombia', 'France', 'Germany', 'Italy', 'Russia',
       'Spain', 'Turkey', 'United Kingdom'], dtype=object)

In [106]:
targets_adj[np.where(clust==4)]

array(['Bangladesh', 'Belgium', 'Chile', 'Czechia', 'Indonesia', 'Iran',
       'Iraq', 'Israel', 'Mexico', 'Netherlands', 'Pakistan', 'Peru',
       'Philippines', 'Poland', 'Portugal', 'Romania', 'South Africa',
       'Sweden', 'Ukraine'], dtype=object)

<h3>Clustering with 7 clusters</h3>

In [107]:
clusters = KMeans(n_clusters=7, random_state=0).fit(features)
clust=clusters.labels_

In [108]:
targets_adj[np.where(clust==1)]

array(['India'], dtype=object)

In [109]:
targets_adj[np.where(clust==2)]

array(['US'], dtype=object)

In [110]:
targets_adj[np.where(clust==3)]

array(['Argentina', 'Colombia', 'Germany', 'Iran', 'Mexico', 'Peru',
       'Poland', 'South Africa', 'Ukraine'], dtype=object)

In [111]:
targets_adj[np.where(clust==4)]

array(['Brazil'], dtype=object)

In [112]:
targets_adj[np.where(clust==5)]

array(['France', 'Italy', 'Russia', 'Spain', 'Turkey', 'United Kingdom'],
      dtype=object)

In [113]:
targets_adj[np.where(clust==6)]

array(['Austria', 'Bangladesh', 'Belgium', 'Chile', 'Czechia', 'Hungary',
       'Indonesia', 'Iraq', 'Israel', 'Japan', 'Jordan', 'Morocco',
       'Netherlands', 'Pakistan', 'Philippines', 'Portugal', 'Romania',
       'Saudi Arabia', 'Serbia', 'Sweden', 'Switzerland',
       'United Arab Emirates'], dtype=object)

In [114]:
targets_adj[np.where(clust==0)]

array(['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola',
       'Antigua and Barbuda', 'Armenia', 'Australia', 'Australia',
       'Australia', 'Australia', 'Australia', 'Australia', 'Australia',
       'Australia', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Barbados',
       'Belarus', 'Belize', 'Benin', 'Bhutan', 'Bolivia',
       'Bosnia and Herzegovina', 'Botswana', 'Brunei', 'Bulgaria',
       'Burkina Faso', 'Burma', 'Burundi', 'Cabo Verde', 'Cambodia',
       'Cameroon', 'Canada', 'Canada', 'Canada', 'Canada', 'Canada',
       'Canada', 'Canada', 'Canada', 'Canada', 'Canada', 'Canada',
       'Canada', 'Canada', 'Canada', 'Canada', 'Canada',
       'Central African Republic', 'Chad', 'China', 'China', 'China',
       'China', 'China', 'China', 'China', 'China', 'China', 'China',
       'China', 'China', 'China', 'China', 'China', 'China', 'China',
       'China', 'China', 'China', 'China', 'China', 'China', 'China',
       'China', 'China', 'China', 'China', 'China', 'China', 

<h1>Prediction Classification for US States</h1>

<h2>Decision Trees</h2>

In [115]:
from sklearn.cluster import KMeans

In [116]:
confirmed_st = os.path.join(
    BASE_PATH, 
    'csse_covid_19_time_series',
    'time_series_covid19_confirmed_US.csv')
confirmed_st = data.load_csv_data(confirmed_st)
features_st = []
targets_st = []

for val in np.unique(confirmed_st["Combined_Key"]):
    df = data.filter_by_attribute(
        confirmed_st, "Combined_Key", val)
    cases, labels = data.get_cases_chronologically_st(df)
    features_st.append(cases)
    targets_st.append(labels)

features_st = np.concatenate(features_st, axis=0)
targets_st = np.concatenate(targets_st, axis=0)

In [117]:
targets_st_adj = targets_st[:,6]

In [118]:
from sklearn import tree
model = tree.DecisionTreeClassifier()
model = model.fit(features_st,targets_st_adj)

In [119]:
pred=model.predict(features_st)

In [120]:
np.sum(targets_st_adj==pred)/len(targets_st_adj)

0.9838420107719928

<h2>Stochastic Gradient Descent</h2>

In [121]:
from sklearn.linear_model import SGDClassifier

In [122]:
model = SGDClassifier(loss="hinge", penalty="l2", max_iter=200000)
model = model.fit(features_st,targets_st_adj)

In [123]:
pred=model.predict(features_st)

In [124]:
np.sum(targets_st_adj==pred)/len(targets_st_adj)

0.06223818073010173