In [146]:
# import all required modules
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge

In [69]:
# Read the csv file into a pandas DataFrame
covid_master = pd.read_csv('https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv')
covid_master.head()

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,total_deaths,new_deaths,total_cases_per_million,new_cases_per_million,...,aged_70_older,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy
0,AFG,Asia,Afghanistan,2019-12-31,0.0,0.0,0.0,0.0,0.0,0.0,...,1.337,1803.987,,597.029,9.59,,,37.746,0.5,64.83
1,AFG,Asia,Afghanistan,2020-01-01,0.0,0.0,0.0,0.0,0.0,0.0,...,1.337,1803.987,,597.029,9.59,,,37.746,0.5,64.83
2,AFG,Asia,Afghanistan,2020-01-02,0.0,0.0,0.0,0.0,0.0,0.0,...,1.337,1803.987,,597.029,9.59,,,37.746,0.5,64.83
3,AFG,Asia,Afghanistan,2020-01-03,0.0,0.0,0.0,0.0,0.0,0.0,...,1.337,1803.987,,597.029,9.59,,,37.746,0.5,64.83
4,AFG,Asia,Afghanistan,2020-01-04,0.0,0.0,0.0,0.0,0.0,0.0,...,1.337,1803.987,,597.029,9.59,,,37.746,0.5,64.83


In [139]:
covid_df = covid_master.copy()

def day_of_year(date_val):
    return int(date_val.strftime("%j"))

covid_df = covid_df.loc[:,["iso_code","date","total_cases"]] # remove unwanted columns
covid_df['date'] = pd.to_datetime(covid_df['date']).apply(day_of_year) # convert date to day of the year
covid_df = covid_df.loc[(covid_df['date'] > 100) & (covid_df['date'] < 300 ), :] # filter out data from previous year
covid_df = covid_df.loc[(covid_df['iso_code'] == 'USA'), :] # select data for selected country
covid_df.head()

Unnamed: 0,iso_code,date,total_cases
33843,USA,101,466033.0
33844,USA,102,501560.0
33845,USA,103,529951.0
33846,USA,104,557571.0
33847,USA,105,582594.0


In [140]:
# Assign X (data) and y (target)

X = covid_df[['date']]
y = covid_df['total_cases'].values.reshape(-1, 1)

print(X.shape, y.shape)

(122, 1) (122, 1)


In [141]:
# Split the data into training and testing

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [150]:
# Create a StandardScater model and fit it to the training data
# Transform the training and testing data using the X_scaler and y_scaler models
X_scaler = StandardScaler().fit(X_train)
y_scaler = StandardScaler().fit(y_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
y_train_scaled = y_scaler.transform(y_train)
y_test_scaled = y_scaler.transform(y_test)

In [151]:
# Ridge model

ridge = Ridge(alpha=.01).fit(X_train_scaled, y_train_scaled)
predictions = ridge.predict(X_test_scaled)

MSE = mean_squared_error(y_test_scaled, predictions)
r2 = ridge.score(X_test_scaled, y_test_scaled)

print(f"MSE: {MSE}, R2: {r2}")

MSE: 0.04357095868990408, R2: 0.9590496156993541


In [166]:
x_new = X_scaler.transform([[360]])
y_new = ridge.predict(x_new)
y_inversed = y_scaler.inverse_transform(y_new)
predicted_y = round(y_inversed[0][0]) 
print(predicted_y)

9071683.0
