In [1]:
# import all required modules
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge

In [2]:
# Download the OWID COVID-19 CSV from GitHub into a DataFrame and preview it.
OWID_COVID_CSV = 'https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv'
covid_master = pd.read_csv(OWID_COVID_CSV)
covid_master.head()

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,total_deaths,new_deaths,total_cases_per_million,new_cases_per_million,...,aged_70_older,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy
0,ABW,North America,Aruba,2020-03-13,2.0,2.0,0.0,0.0,18.733,18.733,...,7.452,35973.781,,,11.62,,,,,76.29
1,ABW,North America,Aruba,2020-03-20,4.0,2.0,0.0,0.0,37.465,18.733,...,7.452,35973.781,,,11.62,,,,,76.29
2,ABW,North America,Aruba,2020-03-24,12.0,8.0,0.0,0.0,112.395,74.93,...,7.452,35973.781,,,11.62,,,,,76.29
3,ABW,North America,Aruba,2020-03-25,17.0,5.0,0.0,0.0,159.227,46.831,...,7.452,35973.781,,,11.62,,,,,76.29
4,ABW,North America,Aruba,2020-03-26,19.0,2.0,0.0,0.0,177.959,18.733,...,7.452,35973.781,,,11.62,,,,,76.29


In [3]:
# Work on an independent (deep) copy so the downloaded frame stays untouched.
covid_df = covid_master.copy(deep=True)

def day_of_year(date_val):
    """Return the ordinal day within its year (1 = 1 January) of ``date_val``.

    ``date_val`` is a date-like object such as ``datetime.date``,
    ``datetime.datetime`` or ``pandas.Timestamp``. The result is a plain ``int``.
    """
    return date_val.timetuple().tm_yday

# Reduce the working copy to the USA rows inside the 2020 modelling window.
ANALYSIS_YEAR = 2020  # calendar year that the day-of-year feature refers to

covid_df = covid_df.loc[:, ["iso_code", "date", "total_cases", "total_deaths"]]  # remove unwanted columns

# Parse the dates once and keep the parsed values around: after the column is
# replaced by the day of the year, the calendar year is no longer recoverable
# from it, so the year filter below has to use the parsed timestamps.
report_dates = pd.to_datetime(covid_df['date'])
covid_df = covid_df.assign(date=report_dates.dt.dayofyear)  # convert date to day of the year

# Day-of-year alone cannot tell years apart, so pin the calendar year
# explicitly; otherwise rows from every year in the dataset that fall on
# days 61-299 would be mixed into the same window.
in_window = (
    (report_dates.dt.year == ANALYSIS_YEAR)
    & (covid_df['date'] > 60)
    & (covid_df['date'] < 300)
    & (covid_df['iso_code'] == 'USA')  # select data for selected country
)
covid_df = covid_df.loc[in_window, :]
covid_df.head()

Unnamed: 0,iso_code,date,total_cases,total_deaths
33644,USA,61,69.0,1.0
33645,USA,62,89.0,2.0
33646,USA,63,103.0,6.0
33647,USA,64,125.0,9.0
33648,USA,65,159.0,11.0


In [4]:
# Model inputs for the cases model: the `date` column is the single feature
# and `total_cases` is the target.
X = covid_df[['date']]
y = covid_df[['total_cases']].to_numpy()  # one-column selection -> 2-D array of shape (n, 1)

print(X.shape, y.shape)

(163, 1) (163, 1)


In [5]:
# Hold out a test set; the fixed random_state keeps the split reproducible.
# test_size=0.25 is scikit-learn's default, spelled out here for the reader.
# NOTE(review): rows are shuffled before splitting, so the test score measures
# fit inside the observed date range, not forecasting ahead — confirm intended.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [6]:
# Standardise the feature and the target with separate StandardScaler objects.
# Each scaler is fitted on the training split only and then applied as-is to
# the test split.
X_scaler = StandardScaler()
y_scaler = StandardScaler()

X_train_scaled = X_scaler.fit_transform(X_train)
y_train_scaled = y_scaler.fit_transform(y_train)

X_test_scaled = X_scaler.transform(X_test)
y_test_scaled = y_scaler.transform(y_test)

In [7]:
# Fit a ridge regression on the scaled training data and evaluate it on the
# scaled test split (both metrics are therefore in scaled units).
ridge = Ridge(alpha=0.01)
ridge.fit(X_train_scaled, y_train_scaled)

predictions = ridge.predict(X_test_scaled)

r2 = ridge.score(X_test_scaled, y_test_scaled)
MSE = mean_squared_error(y_test_scaled, predictions)

print(f"MSE: {MSE}, R2: {r2}")

MSE: 0.05433917222937187, R2: 0.9559916909655009


In [8]:
# Forecast total cases for a single day of the year with the fitted model.
TARGET_DAY = 360  # day of the year to predict for

# Query with a one-row DataFrame that carries the same column names the scaler
# was fitted on; passing a bare nested list makes scikit-learn warn that the
# input lacks the feature names seen during fit.
x_new = X_scaler.transform(pd.DataFrame([[TARGET_DAY]], columns=X_train.columns))
y_new = ridge.predict(x_new)
y_inversed = y_scaler.inverse_transform(y_new)  # undo the target scaling
predicted_cases = int(round(y_inversed[0][0]))  # a count: report it as a plain int
print(predicted_cases)

8199587.0


In [9]:
## Predict Deaths

# Model inputs for the deaths model: the `date` column is the single feature
# and `total_deaths` is the target.
X = covid_df[['date']]
y = covid_df[['total_deaths']].to_numpy()  # one-column selection -> 2-D array of shape (n, 1)

print(X.shape, y.shape)

(163, 1) (163, 1)


In [10]:
# Hold out a test set; the fixed random_state keeps the split reproducible.
# test_size=0.25 is scikit-learn's default, spelled out here for the reader.
# NOTE(review): rows are shuffled before splitting, so the test score measures
# fit inside the observed date range, not forecasting ahead — confirm intended.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [11]:
# Standardise the feature and the target with separate StandardScaler objects.
# Each scaler is fitted on the training split only and then applied as-is to
# the test split.
X_scaler = StandardScaler()
y_scaler = StandardScaler()

X_train_scaled = X_scaler.fit_transform(X_train)
y_train_scaled = y_scaler.fit_transform(y_train)

X_test_scaled = X_scaler.transform(X_test)
y_test_scaled = y_scaler.transform(y_test)

In [12]:
# Fit a ridge regression on the scaled training data and evaluate it on the
# scaled test split (both metrics are therefore in scaled units).
ridge = Ridge(alpha=0.01)
ridge.fit(X_train_scaled, y_train_scaled)

predictions = ridge.predict(X_test_scaled)

r2 = ridge.score(X_test_scaled, y_test_scaled)
MSE = mean_squared_error(y_test_scaled, predictions)

print(f"MSE: {MSE}, R2: {r2}")

MSE: 0.03525635304545096, R2: 0.9695966281134236


In [13]:
# Forecast total deaths for a single day of the year with the fitted model.
TARGET_DAY = 360  # day of the year to predict for

# Query with a one-row DataFrame that carries the same column names the scaler
# was fitted on; passing a bare nested list makes scikit-learn warn that the
# input lacks the feature names seen during fit.
x_new = X_scaler.transform(pd.DataFrame([[TARGET_DAY]], columns=X_train.columns))
y_new = ridge.predict(x_new)
y_inversed = y_scaler.inverse_transform(y_new)  # undo the target scaling
predicted_deaths = int(round(y_inversed[0][0]))  # a count: report it as a plain int
print(predicted_deaths)

329480.0
