In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from countryinfo import CountryInfo
import numpy as np
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
# Notebook-wide plotting defaults: 12x8 figures first, then seaborn's default
# theme. The order is kept deliberately, since sns.set() also edits rcParams.
plt.rc('figure', figsize=[12, 8])
sns.set()

In [None]:
# Data source: https://www.kaggle.com/imdevskp/corona-virus-report
# Download the CSV and place it in ../data before running this cell.
# The Date column is converted to datetime64[ns] as part of the load, so the
# frame is never mutated after it has been created.
df = pd.read_csv('../data/covid_19_clean_complete.csv').astype({'Date': 'datetime64[ns]'})

In [None]:
# Country/Region values to analyse, in plotting order. Names with no matching
# rows in the dataset are skipped by the loops in the cells below.
south_america = [
    'Argentina',
    'Uruguay',
    'Chile',
    'Bolivia',
    'Paraguay',
    'Brazil',
    'Ecuador',
    'Colombia',
    'Venezuela',
    'Peru',
    'Guyana',
    'Suriname',
    'French Guiana',
]

# Simple Linear Regression

In [None]:
# Fit one straight line per country to its non-zero confirmed-case counts and
# extend that line two days past the last fitted day. The fitted models are
# kept in `regressions`, keyed by country name.
regressions = {}
for c in south_america:
    df_c = df[df['Country/Region'] == c].sort_values('Date')
    y = df_c.Confirmed.to_numpy()
    X = np.arange(y.shape[0]).reshape(-1, 1)

    # don't model zeroes
    # A boolean mask keeps X and y aligned row by row. Slicing the tail of X by
    # the number of non-zero values selects *all* of X when that number is 0
    # (X[-0:] == X[0:]) and misaligns the pairs if a zero follows the first case.
    has_cases = y > 0
    if not has_cases.any():
        # Either the country is absent from the dataset or it has no
        # confirmed cases yet: there is nothing to fit or plot.
        continue
    X_fit, y_fit = X[has_cases], y[has_cases]

    # Fitted days plus the next two days, so the line doubles as a short forecast.
    last_day = X_fit.max()
    X_pred = np.append(X_fit, [last_day + 1, last_day + 2]).reshape(-1, 1)

    regressions[c] = LinearRegression().fit(X_fit, y_fit)
    Y_pred = regressions[c].predict(X_pred)

    plt.title(c)
    plt.plot(X, y, marker='x', color='cornflowerblue', linewidth=2,
             label="ground truth")
    plt.plot(X_pred, Y_pred, color='red', linewidth=2,
             label="linear fit")
    plt.legend()
    # Same axis labels as the other model cells so every figure stands alone.
    plt.ylabel('Confirmed cases')
    plt.xlabel('Days since 2020-01-22')
    plt.show()


# Polynomial Regression

In [None]:
# For every country, overlay Ridge-regularised polynomial fits of degree 2 to 5
# on the observed confirmed-case counts, save the figure as a PNG and show it.
colors = ['teal', 'yellowgreen', 'gold', 'red']  # one colour per degree
for c in south_america:
    country_rows = df[df['Country/Region'] == c].sort_values('Date')
    y = country_rows.Confirmed.to_numpy()
    n_days = y.shape[0]
    if n_days == 0:
        # No rows for this country in the dataset: nothing to draw.
        continue
    # Day index 0..n_days-1 as a single-feature column for scikit-learn.
    X = np.arange(n_days).reshape(-1, 1)

    plt.title(c)
    plt.scatter(X, y, color='cornflowerblue', linewidth=2, label="ground truth")
    for count, degree in enumerate(range(2, 6)):
        model = make_pipeline(PolynomialFeatures(degree), Ridge())
        # Fit and evaluate on the same days: this draws the in-sample curve.
        y_plot = model.fit(X, y).predict(X)
        plt.plot(
            X,
            y_plot,
            color=colors[count],
            linewidth=2,
            label=f'Ridge Reg {degree} degrees',
            alpha=0.5,
        )
    plt.legend(loc='upper left')
    plt.xlabel('Days since 2020-01-22')
    plt.ylabel('Confirmed cases')
    plt.savefig(f'{c}-RidgeModels.png')
    plt.show()

# Average growth rate

In [None]:
def _average_growth_factor(confirmed):
    """Return the average day-over-day growth factor of a case-count series.

    The average is the geometric mean of ``confirmed[i] / confirmed[i - 1]``
    over every consecutive pair in which both values are positive, because
    daily growth compounds multiplicatively. Pairs start at ``i = 1`` so the
    first day is never compared against the last one through index ``-1``.

    Parameters
    ----------
    confirmed : array-like of numbers
        Case counts ordered by date.

    Returns
    -------
    float or None
        The growth factor (1.0 means no growth), or ``None`` when no
        consecutive pair of positive counts exists.
    """
    confirmed = np.asarray(confirmed, dtype=float)
    previous, current = confirmed[:-1], confirmed[1:]
    usable = (previous > 0) & (current > 0)
    if not usable.any():
        return None
    ratios = current[usable] / previous[usable]
    return float(np.exp(np.log(ratios).mean()))


# For every country, estimate the average daily growth factor and draw the
# resulting exponential curve from the first day with cases up to one day
# past the last observation.
# NOTE(review): the draft of this cell fitted LogisticRegression on the raw
# counts. That estimator is a classifier and would treat each count as a class
# label, so it is replaced by the growth-rate model named in the heading.
for c in south_america:
    df_c = df[df['Country/Region'] == c].sort_values('Date')
    y = df_c.Confirmed.to_numpy()
    growth_rate = _average_growth_factor(y)
    if growth_rate is None:
        # Country missing from the dataset, or too few non-zero days.
        continue

    X = np.arange(y.shape[0]).reshape(-1, 1)
    # One extra day beyond the data, used as the forecast horizon.
    X_predict = np.arange(y.shape[0] + 1).reshape(-1, 1)

    # Compound the average factor forward from the first day with cases;
    # days before that are drawn as zero.
    first_case = int(np.flatnonzero(y > 0)[0])
    elapsed = X_predict.ravel() - first_case
    y_plot = np.where(
        elapsed >= 0,
        y[first_case] * growth_rate ** np.maximum(elapsed, 0),
        0.0,
    )

    plt.title(c)
    plt.scatter(X, y, color='cornflowerblue', linewidth=2,
                label="ground truth")
    plt.plot(X_predict, y_plot, color='red', linewidth=2,
             label=f'Average growth x{growth_rate:.3f}/day', alpha=0.5)
    plt.legend(loc='upper left')
    plt.ylabel('Confirmed cases')
    plt.xlabel('Days since 2020-01-22')
    plt.show()