In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import datetime as dt
from datetime import timedelta
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error,r2_score
# Module-level feature scaler instance.
# NOTE(review): `std` is not referenced by any of the cells visible here — confirm it is still needed.
std = StandardScaler()




In [2]:
# Load the raw observations; the CSV is expected in the notebook's working directory.
covid = pd.read_csv(filepath_or_buffer=r"covid_19_data.csv")
covid

Unnamed: 0,SNo,ObservationDate,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered
0,1,01/22/2020,Anhui,Mainland China,1/22/2020 17:00,1.0,0.0,0.0
1,2,01/22/2020,Beijing,Mainland China,1/22/2020 17:00,14.0,0.0,0.0
2,3,01/22/2020,Chongqing,Mainland China,1/22/2020 17:00,6.0,0.0,0.0
3,4,01/22/2020,Fujian,Mainland China,1/22/2020 17:00,1.0,0.0,0.0
4,5,01/22/2020,Gansu,Mainland China,1/22/2020 17:00,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
156287,156288,11/15/2020,Zaporizhia Oblast,Ukraine,2020-11-16 05:25:57,18484.0,164.0,3021.0
156288,156289,11/15/2020,Zeeland,Netherlands,2020-11-16 05:25:57,5041.0,86.0,0.0
156289,156290,11/15/2020,Zhejiang,Mainland China,2020-11-16 05:25:57,1291.0,1.0,1279.0
156290,156291,11/15/2020,Zhytomyr Oblast,Ukraine,2020-11-16 05:25:57,22225.0,368.0,12266.0


In [3]:
# Quick structural overview: dimensions, missing values per column, column dtypes.
null_counts = covid.isnull().sum()
column_types = covid.dtypes

print("Shape of the dataset: ", covid.shape)
print("Checking for null values:\n", null_counts)
print("Checking Data-type of each column:\n", column_types)

Shape of the dataset:  (156292, 8)
Checking for null values:
 SNo                    0
ObservationDate        0
Province/State     44316
Country/Region         0
Last Update            0
Confirmed              0
Deaths                 0
Recovered              0
dtype: int64
Checking Data-type of each column:
 SNo                  int64
ObservationDate     object
Province/State      object
Country/Region      object
Last Update         object
Confirmed          float64
Deaths             float64
Recovered          float64
dtype: object


In [4]:
# "SNo" is of no use for the analysis and "Province/State" is missing in
# 44,316 of the 156,292 rows, so both columns are removed.
# The frame is re-assigned (no inplace=True) and errors="ignore" is passed so that
# re-running this cell does not raise KeyError once the columns are already gone.
covid = covid.drop(columns=["SNo", "Province/State"], errors="ignore")

In [5]:
# Parse the "ObservationDate" strings into pandas datetime values
covid["ObservationDate"] = covid["ObservationDate"].pipe(pd.to_datetime)

In [6]:
# Sum the case counters for every (country, observation date) pair.
grouped_country = (
    covid
    .groupby(["Country/Region", "ObservationDate"])[["Confirmed", "Recovered", "Deaths"]]
    .sum()
)

In [7]:
# Active cases = confirmed cases that are neither recovered nor deceased.
grouped_country["Active Cases"] = (
    grouped_country["Confirmed"]
    - grouped_country["Recovered"]
    - grouped_country["Deaths"]
)

# The logarithm is only defined for strictly positive counts. Zero counts used to
# become -inf and negative "Active Cases" became NaN (with the RuntimeWarning hidden
# by the global warnings filter). Non-positive counts are masked first so that every
# undefined value is consistently NaN, which pandas skips in aggregations.
confirmed_counts = grouped_country["Confirmed"]
active_counts = grouped_country["Active Cases"]
grouped_country["log_confirmed"] = np.log(confirmed_counts.where(confirmed_counts > 0))
grouped_country["log_active"] = np.log(active_counts.where(active_counts > 0))

In [8]:
# Worldwide totals per observation date (summed over every country/region)
datewise = (
    covid
    .groupby(["ObservationDate"])[["Confirmed", "Recovered", "Deaths"]]
    .sum()
)
# Elapsed time since the earliest observation date, stored as a timedelta
first_observation = datewise.index.min()
datewise["Days Since"] = datewise.index - first_observation

In [9]:
# Headline worldwide figures taken from the most recent observation date.
# Only the three numeric counters are selected so the row keeps a float dtype
# ("Days Since" is a timedelta column at this point).
case_totals = datewise[["Confirmed", "Recovered", "Deaths"]]
latest_totals = case_totals.iloc[-1]
# Change between the last two observation dates in the data.
daily_change = latest_totals - case_totals.iloc[-2]

print("Basic Information")
# Fixed typo in the message ("Totol" -> "Total").
print("Total number of countries with Disease Spread: ", covid["Country/Region"].nunique(dropna=False))
print("Total number of Confirmed Cases around the World: ", latest_totals["Confirmed"])
print("Total number of Recovered Cases around the World: ", latest_totals["Recovered"])
print("Total number of Deaths Cases around the World: ", latest_totals["Deaths"])
print("Total number of Active Cases around the World: ", (latest_totals["Confirmed"] - latest_totals["Recovered"] - latest_totals["Deaths"]))
print("Total number of Closed Cases around the World: ", latest_totals["Recovered"] + latest_totals["Deaths"])
print("Number of Confirmed Cases in last 24 hours: ", daily_change["Confirmed"])
print("Number of Recovered Cases in last 24 hours: ", daily_change["Recovered"])
print("Number of Death Cases in last 24 hours: ", daily_change["Deaths"])

Basic Information
Totol number of countries with Disease Spread:  226
Total number of Confirmed Cases around the World:  54370186.0
Total number of Recovered Cases around the World:  34955148.0
Total number of Deaths Cases around the World:  1317139.0
Total number of Active Cases around the World:  18097899.0
Total number of Closed Cases around the World:  36272287.0
Number of Confirmed Cases in last 24 hours:  443028.0
Number of Recovered Cases in last 24 hours:  228730.0
Number of Death Cases in last 24 hours:  5947.0


In [10]:
# Daily worldwide active cases: confirmed minus recovered minus deaths.
active_cases = datewise["Confirmed"] - datewise["Recovered"] - datewise["Deaths"]
fig = px.bar(x=datewise.index, y=active_cases)
fig.update_layout(
    title="Distribution of Number of Active Cases",
    xaxis_title="Date",
    yaxis_title="Number of Cases",
)
fig.show()

In [11]:
# Restrict to India and total the counters per observation date.
india_data = covid.loc[covid["Country/Region"] == "India"]
datewise_india = (
    india_data
    .groupby(["ObservationDate"])[["Confirmed", "Recovered", "Deaths"]]
    .sum()
)

# Figures for the most recent observation date.
india_latest = datewise_india.iloc[-1]
print(india_latest)
print("Total Active Cases: ", india_latest["Confirmed"] - india_latest["Recovered"] - india_latest["Deaths"])
print("Total Closed Cases: ", india_latest["Recovered"] + india_latest["Deaths"])

Confirmed    8845127.0
Recovered    8249579.0
Deaths        130070.0
Name: 2020-11-15 00:00:00, dtype: float64
Total Active Cases:  465478.0
Total Closed Cases:  8379649.0


In [12]:
# One line+marker trace per case type for India, in the order
# Confirmed -> Recovered -> Deaths.
fig = go.Figure(
    data=[
        go.Scatter(
            x=datewise_india.index,
            y=datewise_india[case_column],
            mode='lines+markers',
            name=trace_name,
        )
        for case_column, trace_name in (
            ("Confirmed", 'Confirmed Cases'),
            ("Recovered", 'Recovered Cases'),
            ("Deaths", 'Death Cases'),
        )
    ]
)
fig.update_layout(
    title="Growth of different types of cases in India",
    xaxis_title="Date",
    yaxis_title="Number of Cases",
    legend={"x": 0, "y": 1, "traceorder": "normal"},
)
fig.show()

<h4>Linear Regression Model</h4>

In [13]:
# Recompute "Days Since" as a whole number of days elapsed since the first
# observation date, so it can be used as a numeric model feature.
elapsed = pd.Series(datewise.index - datewise.index[0], index=datewise.index)
datewise["Days Since"] = elapsed.dt.days

In [14]:
# Feature matrix (one column: days since first observation) and target vector
# (worldwide confirmed cases).
# NOTE(review): `input` shadows the Python builtin of the same name; the name is kept
# because later cells read it.
input = datewise["Days Since"].to_numpy(copy=True).reshape(-1, 1)
output = datewise["Confirmed"].to_numpy(copy=True)

# Hold out 25% of the days for evaluation; random_state=0 makes the split repeatable.
# NOTE(review): train_test_split shuffles by default, so test days are interleaved
# with training days. For a forecasting use-case a chronological split
# (shuffle=False) may be more appropriate — confirm intent.
split_parts = train_test_split(input, output, test_size=0.25, random_state=0)
x_train, x_test, y_train, y_test = split_parts

In [15]:
# Ordinary least-squares fit of confirmed cases against days since first observation.
lin_reg = LinearRegression()
lin_reg.fit(X=x_train, y=y_train)

In [16]:
# Score the linear model on the held-out days.
prediction_linreg = lin_reg.predict(x_test)
rmse_linreg = np.sqrt(mean_squared_error(y_true=y_test, y_pred=prediction_linreg))
r2_linreg = r2_score(y_true=y_test, y_pred=prediction_linreg)

print("R2 Score - Linear Regression:", r2_linreg * 100)
print("RMSE - Linear Regression:", rmse_linreg)

R2 Score - Linear Regression: 88.15714920002794
RMSE - Linear Regression: 5666466.980406367


In [17]:
# Overlay the fitted straight line on the complete observed series
full_prediction_lr = lin_reg.predict(input)
fig = go.Figure(
    data=[
        go.Scatter(x=datewise.index, y=datewise["Confirmed"],
                   mode='lines+markers', name='Actual'),
        go.Scatter(x=datewise.index, y=full_prediction_lr,
                   mode='lines', name='Linear Regression', line={"dash": 'dot'}),
    ]
)
fig.update_layout(
    title="Linear Regression Fit",
    xaxis_title="Date",
    yaxis_title="Confirmed Cases",
)
fig.show()


<h4>Polynomial Regression</h4>

In [18]:
# Expand the single "days" feature into polynomial terms up to degree 4.
# The transformer is fitted on the training split only and then applied to both splits.
poly = PolynomialFeatures(degree=4)
poly.fit(x_train)
x_train_poly = poly.transform(x_train)
x_test_poly = poly.transform(x_test)

In [19]:
# Linear regression on the polynomial feature expansion (i.e. polynomial regression).
lin_reg_poly = LinearRegression()
lin_reg_poly.fit(X=x_train_poly, y=y_train)

In [20]:
# Score the polynomial model on the held-out days.
prediction_poly = lin_reg_poly.predict(x_test_poly)
rmse_poly = np.sqrt(mean_squared_error(y_true=y_test, y_pred=prediction_poly))
r2_poly = r2_score(y_true=y_test, y_pred=prediction_poly)

print("R2 Score - Polynomial Regression:", r2_poly * 100)
print("RMSE - Polynomial Regression:", rmse_poly)

R2 Score - Polynomial Regression: 99.92389583315222
RMSE - Polynomial Regression: 454243.21746884676


In [21]:
# Overlay the polynomial fit on the complete observed series
full_poly_input = poly.transform(input)
prediction_poly_full = lin_reg_poly.predict(full_poly_input)
fig = go.Figure(
    data=[
        go.Scatter(x=datewise.index, y=datewise["Confirmed"],
                   mode='lines+markers', name='Actual'),
        go.Scatter(x=datewise.index, y=prediction_poly_full,
                   mode='lines', name='Polynomial Regression', line={"dash": 'dot'}),
    ]
)
fig.update_layout(
    title="Polynomial Regression Fit",
    xaxis_title="Date",
    yaxis_title="Confirmed Cases",
)
fig.show()

In [22]:
# Forecast the 17 days that follow the last observed day with the polynomial model.
# poly.transform is used instead of poly.fit_transform: the feature expansion was
# already fitted on the training split, and a prediction step should not re-fit a
# preprocessing object. All horizons are transformed and predicted in one call
# instead of one call per day.
last_observed_day = datewise["Days Since"].max()
future_days_poly = (last_observed_day + np.arange(1, 18)).reshape(-1, 1)
new_prediction_poly = list(lin_reg_poly.predict(poly.transform(future_days_poly)))

<h4>Support Vector Machine Model</h4>

In [23]:
# Support Vector Regression with an RBF kernel, trained on the raw day index.
# NOTE(review): the feature is passed unscaled even though a StandardScaler (`std`)
# is created at the top of the notebook — confirm whether scaling was intended.
svm = SVR(kernel='rbf', C=1e6, gamma=0.01, epsilon=1e3)
svm.fit(X=x_train, y=y_train)

In [24]:
# Score the SVR model on the held-out days.
prediction_svm = svm.predict(x_test)
rmse_svm = np.sqrt(mean_squared_error(y_true=y_test, y_pred=prediction_svm))
r2_svm = r2_score(y_true=y_test, y_pred=prediction_svm)

print("R2 Score - SVR:", r2_svm * 100)
print("RMSE - SVR:", rmse_svm)

R2 Score - SVR: 60.66894010072688
RMSE - SVR: 10326474.05985089


In [25]:
# Overlay the SVR predictions on the complete observed series
full_prediction_svm = svm.predict(input)
fig = go.Figure(
    data=[
        go.Scatter(x=datewise.index, y=datewise["Confirmed"],
                   mode='lines+markers', name='Actual'),
        go.Scatter(x=datewise.index, y=full_prediction_svm,
                   mode='lines', name='SVR', line={"dash": 'dot'}),
    ]
)
fig.update_layout(
    title="Support Vector Regression Fit",
    xaxis_title="Date",
    yaxis_title="Confirmed Cases",
)
fig.show()

In [26]:
# Forecast horizon: the 17 days that follow the last observation date.
forecast_steps = range(1, 18)

# Calendar dates of the forecast days.
new_date = [datewise.index[-1] + timedelta(days=step) for step in forecast_steps]

# Each future day as a (1, 1) feature array; each model predicts one day at a time.
future_day_inputs = [
    np.array(datewise["Days Since"].max() + step).reshape(-1, 1)
    for step in forecast_steps
]
new_prediction_lr = [lin_reg.predict(day_input)[0] for day_input in future_day_inputs]
new_prediction_svm = [svm.predict(day_input)[0] for day_input in future_day_inputs]

In [27]:
# Show floats with six decimals instead of scientific notation.
pd.set_option('display.float_format', lambda value: '%.6f' % value)

# One row per forecast date, one column per model.
prediction_columns = [
    "Dates",
    "Linear Regression Prediction",
    "Polynonmial Regression Prediction",
    "SVM Prediction",
]
forecast_rows = zip(new_date, new_prediction_lr, new_prediction_poly, new_prediction_svm)
model_predictions = pd.DataFrame(forecast_rows, columns=prediction_columns)
model_predictions.head()


Unnamed: 0,Dates,Linear Regression Prediction,Polynonmial Regression Prediction,SVM Prediction
0,2020-11-16,39729163.298633,53159182.899801,15945263.932642
1,2020-11-17,39895928.487485,53622377.289184,15388677.883655
2,2020-11-18,40062693.676336,54088352.038301,14846917.443666
3,2020-11-19,40229458.865188,54557119.51196,14329546.117028
4,2020-11-20,40396224.05404,55028692.123943,13845118.645202


In [28]:
# Side-by-side summary of the hold-out R2 (as a percentage) and RMSE of the three models
model_scores = [
    ("Linear Regression", r2_linreg * 100, rmse_linreg),
    ("Polynomial Regression (Deg 4)", r2_poly * 100, rmse_poly),
    ("Support Vector Regression", r2_svm * 100, rmse_svm),
]
comparison_df = pd.DataFrame(model_scores, columns=["Model", "R2 Score (%)", "RMSE"])

# Print the comparison table
print("\nModel Performance Comparison:")
print(comparison_df)



Model Performance Comparison:
                           Model  R2 Score (%)            RMSE
0              Linear Regression     88.157149  5666466.980406
1  Polynomial Regression (Deg 4)     99.923896   454243.217469
2      Support Vector Regression     60.668940 10326474.059851
