##### This is the data repository for the 2019 Novel Coronavirus Visual Dashboard operated by the Johns Hopkins University Center for Systems Science and Engineering (JHU CSSE).
**Dataset: Novel Corona Virus 2019 Dataset**
**Dataset Description:**
- SNo: Serial Number
- ObservationDate: Observation date in mm/dd/yyyy
- Province/State: Province or State
- Country/Region: Country or region
- Last Update: Last update date time in UTC
- Confirmed: Cumulative number of confirmed cases
- Deaths: Cumulative number of deaths
- Recovered: Cumulative number of recovered cases
**Infected (derived): Confirmed - Recovered - Deaths**

In [5]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression as LinReg
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import LabelEncoder

In [6]:
# Read the dataset once. `data` is kept untouched as the raw reference, while
# `df` is an independent working copy that the following cells transform.
data = pd.read_csv('covid_19_data.csv')
df = data.copy(deep=True)

In [7]:
# Preview the first rows of the working copy to check the raw column layout.
df.head()

Unnamed: 0,SNo,ObservationDate,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered
0,1,01/22/2020,Anhui,Mainland China,1/22/2020 17:00,1.0,0.0,0.0
1,2,01/22/2020,Beijing,Mainland China,1/22/2020 17:00,14.0,0.0,0.0
2,3,01/22/2020,Chongqing,Mainland China,1/22/2020 17:00,6.0,0.0,0.0
3,4,01/22/2020,Fujian,Mainland China,1/22/2020 17:00,1.0,0.0,0.0
4,5,01/22/2020,Gansu,Mainland China,1/22/2020 17:00,0.0,0.0,0.0


In [17]:
# Normalise the headers: lower-case every column name, then swap the
# slash/space separated names for plain identifiers.
columns = df.columns
df.columns = list(map(str.lower, columns))
df = df.rename(
    columns={
        'province/state': 'province',
        'country/region': 'country',
        'last update': 'last_update',
    }
)

In [19]:
# Only the observation date and the three case counters are used from here on,
# so the serial number, location and last-update columns are removed.
df = df.drop(columns=['sno', 'province', 'country', 'last_update'])

In [23]:
# Preview the frame after the column clean-up.
df.head()

Unnamed: 0,observationdate,confirmed,deaths,recovered
0,01/22/2020,1.0,0.0,0.0
1,01/22/2020,14.0,0.0,0.0
2,01/22/2020,6.0,0.0,0.0
3,01/22/2020,1.0,0.0,0.0
4,01/22/2020,0.0,0.0,0.0


In [29]:
# Collapse the per-province rows into one row per observation date by summing
# the case counters, then turn the date back into a regular column.
df = (
    df.groupby('observationdate')
      .sum()
      .reset_index()
)

In [31]:
# Show only the first ten daily totals rather than dumping the whole frame.
df.head(10)

Unnamed: 0,observationdate,confirmed,deaths,recovered
0,01/22/2020,555.0,17.0,28.0
1,01/23/2020,653.0,18.0,30.0
2,01/24/2020,941.0,26.0,36.0
3,01/25/2020,1438.0,42.0,39.0
4,01/26/2020,2118.0,56.0,52.0
5,01/27/2020,2927.0,82.0,61.0
6,01/28/2020,5578.0,131.0,107.0
7,01/29/2020,6165.0,133.0,126.0
8,01/30/2020,8235.0,171.0,143.0
9,01/31/2020,9925.0,213.0,222.0


In [37]:
# Inspect dtypes and non-null counts before converting the date column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57 entries, 0 to 56
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   observationdate  57 non-null     object 
 1   confirmed        57 non-null     float64
 2   deaths           57 non-null     float64
 3   recovered        57 non-null     float64
dtypes: float64(3), object(1)
memory usage: 1.9+ KB


In [39]:
# The dataset documents ObservationDate as mm/dd/yyyy. Passing the format
# explicitly keeps parsing deterministic (no per-value inference) and makes a
# value in any other layout raise instead of being silently reinterpreted.
df['observationdate'] = pd.to_datetime(df['observationdate'], format='%m/%d/%Y')

In [41]:
# Re-check the dtypes to confirm the date column was converted.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57 entries, 0 to 56
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   observationdate  57 non-null     datetime64[ns]
 1   confirmed        57 non-null     float64       
 2   deaths           57 non-null     float64       
 3   recovered        57 non-null     float64       
dtypes: datetime64[ns](1), float64(3)
memory usage: 1.9 KB


In [43]:
def substraction(observationdate, min_ob_date=None):
    """Return the 1-based day number of ``observationdate``.

    Parameters
    ----------
    observationdate : timestamp-like
        Date to convert. Subtracting ``min_ob_date`` from it must yield an
        object exposing a ``.days`` attribute (e.g. a ``Timedelta``).
    min_ob_date : timestamp-like, optional
        Reference date that is counted as day 1. When omitted, the earliest
        value of the notebook-level ``df['observationdate']`` is used, which
        is the original behaviour. Passing it explicitly avoids rescanning
        the column on every call and removes the dependency on the global
        frame.

    Returns
    -------
    int
        Whole days between the reference date and ``observationdate``, plus
        one, so the reference date itself maps to 1.
    """
    if min_ob_date is None:
        # Backward-compatible fallback: look the reference date up in the
        # notebook's global frame.
        min_ob_date = df['observationdate'].min()
    return (observationdate - min_ob_date).days + 1

In [48]:
# Day number relative to the earliest observation (earliest date -> 1).
# Computed as one vectorised datetime subtraction instead of a row-wise
# apply, so the column minimum is evaluated once rather than once per row.
df['days'] = (df['observationdate'] - df['observationdate'].min()).dt.days + 1

In [52]:
# The date is now represented by the numeric `days` column, so the original
# datetime column is no longer needed.
df = df.drop(columns='observationdate')

In [54]:
# Preview the frame with the new `days` column.
df.head()

Unnamed: 0,confirmed,deaths,recovered,days
0,555.0,17.0,28.0,1
1,653.0,18.0,30.0,2
2,941.0,26.0,36.0,3
3,1438.0,42.0,39.0,4
4,2118.0,56.0,52.0,5


In [56]:
# Infected = confirmed - recovered - deaths
df['infected'] = df['confirmed'].sub(df['recovered']).sub(df['deaths'])

In [58]:
# Preview the frame with the derived `infected` column.
df.head()

Unnamed: 0,confirmed,deaths,recovered,days,infected
0,555.0,17.0,28.0,1,510.0
1,653.0,18.0,30.0,2,605.0
2,941.0,26.0,36.0,3,879.0
3,1438.0,42.0,39.0,4,1357.0
4,2118.0,56.0,52.0,5,2010.0


##### Confirmed Prediction

In [71]:
from sklearn.model_selection import train_test_split

# Single feature (day number) against the cumulative confirmed count.
X, y = df['days'], df['confirmed']

# 80/20 hold-out split with a fixed seed for reproducibility.
# NOTE(review): shuffle=True interleaves held-out days with training days, so
# the scores measure fit within the observed range rather than forecasting of
# later days -- confirm this is the intended evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=101,
)

In [89]:
# scikit-learn expects a 2-D feature matrix of shape (n_samples, n_features).
# Convert to a plain ndarray first so the result does not depend on how
# np.reshape treats a pandas Series; np.asarray also accepts an ndarray, so
# re-running this cell is harmless.
X_train = np.asarray(X_train).reshape(-1, 1)
X_test = np.asarray(X_test).reshape(-1, 1)

In [91]:
from sklearn.linear_model import LinearRegression as LinReg

# Fit a linear regression of confirmed cases on the day number.
# `fit` returns the estimator itself, so `linmodel` and `model` are the same
# fitted object.
model = LinReg()
model.fit(X_train, y_train)
linmodel = model

In [95]:
# Predict confirmed cases for the held-out day numbers.
y_pred = linmodel.predict(X_test)

In [107]:
from sklearn.metrics import mean_absolute_error

# Hold-out sample count and number of predictors, used by the adjusted-R2
# correction.
n, p = X_test.shape

# Compute every hold-out metric first ...
R2 = r2_score(y_test, y_pred)
adj_r2 = 1 - (1 - R2) * (n - 1) / (n - p - 1)
MSE = mean_squared_error(y_test, y_pred)
RMSE = np.sqrt(MSE)
MAE = mean_absolute_error(y_test, y_pred)

# ... then report them in a fixed order.
print('R2 =', R2)
print('Adjusted R2 =', adj_r2)
print('MSE =', MSE)
print("RMSE =", RMSE)
print('MAE =', MAE)

R2 = 0.9081493592042216
Adjusted R2 = 0.8989642951246437
MSE = 181528979.18849102
RMSE = 13473.269060940296
MAE = 9892.347140961212


##### Deaths Prediction

In [110]:
from sklearn.model_selection import train_test_split

# Single feature (day number) against the cumulative death count.
X, y = df['days'], df['deaths']

# 80/20 hold-out split with a fixed seed for reproducibility.
# NOTE(review): shuffle=True interleaves held-out days with training days, so
# the scores measure fit within the observed range rather than forecasting of
# later days -- confirm this is the intended evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=101,
)

In [112]:
# scikit-learn expects a 2-D feature matrix of shape (n_samples, n_features).
# Convert to a plain ndarray first so the result does not depend on how
# np.reshape treats a pandas Series; np.asarray also accepts an ndarray, so
# re-running this cell is harmless.
X_train = np.asarray(X_train).reshape(-1, 1)
X_test = np.asarray(X_test).reshape(-1, 1)

In [114]:
from sklearn.linear_model import LinearRegression as LinReg

# Fit a linear regression of deaths on the day number.
# `fit` returns the estimator itself, so `linmodel` and `model` are the same
# fitted object.
model = LinReg()
model.fit(X_train, y_train)
linmodel = model

In [116]:
# Predict deaths for the held-out day numbers.
y_pred = linmodel.predict(X_test)

In [118]:
from sklearn.metrics import mean_absolute_error

# Hold-out sample count and number of predictors, used by the adjusted-R2
# correction.
n, p = X_test.shape

# Compute every hold-out metric first ...
R2 = r2_score(y_test, y_pred)
adj_r2 = 1 - (1 - R2) * (n - 1) / (n - p - 1)
MSE = mean_squared_error(y_test, y_pred)
RMSE = np.sqrt(MSE)
MAE = mean_absolute_error(y_test, y_pred)

# ... then report them in a fixed order.
print('R2 =', R2)
print('Adjusted R2 =', adj_r2)
print('MSE =', MSE)
print("RMSE =", RMSE)
print('MAE =', MAE)

R2 = 0.862565303753617
Adjusted R2 = 0.8488218341289786
MSE = 468237.2156059365
RMSE = 684.2786096364086
MAE = 551.8960403393711


##### Recovered Prediction

In [121]:
from sklearn.model_selection import train_test_split

# Single feature (day number) against the cumulative recovered count.
X, y = df['days'], df['recovered']

# 80/20 hold-out split with a fixed seed for reproducibility.
# NOTE(review): shuffle=True interleaves held-out days with training days, so
# the scores measure fit within the observed range rather than forecasting of
# later days -- confirm this is the intended evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=101,
)

In [123]:
# scikit-learn expects a 2-D feature matrix of shape (n_samples, n_features).
# Convert to a plain ndarray first so the result does not depend on how
# np.reshape treats a pandas Series; np.asarray also accepts an ndarray, so
# re-running this cell is harmless.
X_train = np.asarray(X_train).reshape(-1, 1)
X_test = np.asarray(X_test).reshape(-1, 1)

In [125]:
from sklearn.linear_model import LinearRegression as LinReg

# Fit a linear regression of recovered cases on the day number.
# `fit` returns the estimator itself, so `linmodel` and `model` are the same
# fitted object.
model = LinReg()
model.fit(X_train, y_train)
linmodel = model

In [127]:
# Predict recovered cases for the held-out day numbers.
y_pred = linmodel.predict(X_test)

In [129]:
from sklearn.metrics import mean_absolute_error

# Hold-out sample count and number of predictors, used by the adjusted-R2
# correction.
n, p = X_test.shape

# Compute every hold-out metric first ...
R2 = r2_score(y_test, y_pred)
adj_r2 = 1 - (1 - R2) * (n - 1) / (n - p - 1)
MSE = mean_squared_error(y_test, y_pred)
RMSE = np.sqrt(MSE)
MAE = mean_absolute_error(y_test, y_pred)

# ... then report them in a fixed order.
print('R2 =', R2)
print('Adjusted R2 =', adj_r2)
print('MSE =', MSE)
print("RMSE =", RMSE)
print('MAE =', MAE)

R2 = 0.8707546732091535
Adjusted R2 = 0.8578301405300688
MSE = 90873569.84641379
RMSE = 9532.762970220847
MAE = 8788.989756344141


##### Infected Prediction

In [132]:
from sklearn.model_selection import train_test_split

# Single feature (day number) against the derived infected count
# (confirmed - recovered - deaths).
X, y = df['days'], df['infected']

# 80/20 hold-out split with a fixed seed for reproducibility.
# NOTE(review): shuffle=True interleaves held-out days with training days, so
# the scores measure fit within the observed range rather than forecasting of
# later days -- confirm this is the intended evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=101,
)

In [134]:
# scikit-learn expects a 2-D feature matrix of shape (n_samples, n_features).
# Convert to a plain ndarray first so the result does not depend on how
# np.reshape treats a pandas Series; np.asarray also accepts an ndarray, so
# re-running this cell is harmless.
X_train = np.asarray(X_train).reshape(-1, 1)
X_test = np.asarray(X_test).reshape(-1, 1)

In [136]:
from sklearn.linear_model import LinearRegression as LinReg

# Fit a linear regression of the infected count on the day number.
# `fit` returns the estimator itself, so `linmodel` and `model` are the same
# fitted object.
model = LinReg()
model.fit(X_train, y_train)
linmodel = model

In [138]:
# Predict the infected count for the held-out day numbers.
y_pred = linmodel.predict(X_test)

In [140]:
from sklearn.metrics import mean_absolute_error

# Hold-out sample count and number of predictors, used by the adjusted-R2
# correction.
n, p = X_test.shape

# Compute every hold-out metric first ...
R2 = r2_score(y_test, y_pred)
adj_r2 = 1 - (1 - R2) * (n - 1) / (n - p - 1)
MSE = mean_squared_error(y_test, y_pred)
RMSE = np.sqrt(MSE)
MAE = mean_absolute_error(y_test, y_pred)

# ... then report them in a fixed order.
print('R2 =', R2)
print('Adjusted R2 =', adj_r2)
print('MSE =', MSE)
print("RMSE =", RMSE)
print('MAE =', MAE)

R2 = 0.47729610527090516
Adjusted R2 = 0.42502571579799564
MSE = 237295853.22212484
RMSE = 15404.410187414669
MAE = 13951.213674016863
