In [None]:
# Import Libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
# Get the list of all the pre-installed libraries
!pip list >> requirements.txt

# DATA UNDERSTANDING

In [None]:
# Load the raw observations into a DataFrame
df = pd.read_csv("../input/novel-corona-virus-2019-dataset/covid_19_data.csv")

# Report the table dimensions and the number of distinct countries it covers
num_rows, num_cols = df.shape

print(" ")
print("Total no. of rows & cols: ", num_rows, num_cols)
print("No. of countries:", df["Country/Region"].nunique())
print("--------------------------------------")
df.head()

In [None]:
# Histogram of each numeric column of the raw data, as a first look at the value distributions
df.hist()

# DATA CLEANING

In [None]:
# OBSERVATIONS FROM THE RESULT BELOW:
# 1. min() shows that there are negative numbers in the case counts => those rows are to be removed
df.describe()

In [None]:
# OBSERVATIONS FROM THE RESULT BELOW:
# 2. Province/State has fewer non-null entries than the other columns, so the gaps
#    have to be filled with an empty string ''
# 3. ObservationDate is a "string"; it has to be converted to a date to make it uniform
# 4. Unnecessary columns to drop - SNo, Last Update
# 5. Columns #5, 6, 7 should be stored as integers rather than floating point
df.info()

In [None]:
"""
Making the above corrections
"""
count_cols = ["Confirmed", "Deaths", "Recovered"]

# Work on a deep copy so the raw frame `df` stays untouched
df_proc = df.copy(deep=True)

# 2. Filling null entries.
#    This runs BEFORE the negative-value filter: `NaN >= 0` evaluates to False, so
#    filtering first would silently drop every row with a missing count and the
#    fill with 0 would never have anything left to fill.
df_proc["Province/State"] = df_proc["Province/State"].fillna("")
df_proc[count_cols] = df_proc[count_cols].fillna(0)

# 1. Remove rows with negative numbers in any of the count columns.
#    `.copy()` makes the filtered frame independent of the unfiltered one, so the
#    column assignments below cannot trigger SettingWithCopyWarning.
cond = (df_proc[count_cols] >= 0).all(axis=1)
df_proc = df_proc[cond].copy()

# 3. Converting to standard date format
df_proc["ObservationDate"] = pd.to_datetime(df_proc["ObservationDate"])

# 4. Dropping unnecessary columns
# 5. Convert the count columns to integers
# Finally renumber the rows 0..n-1; drop=True discards the old labels directly
# instead of materialising them as an "index" column that has to be dropped again.
df_proc = (
    df_proc.drop(columns=["SNo", "Last Update"])
    .astype({col: "int" for col in count_cols})
    .reset_index(drop=True)
)

In [None]:
print("HENCE, WE CAN SEE ALL CHANGES BELOW")
print("------------------------------------")
print(" ")
# DataFrame.info() prints its report itself and returns None, so it must not be
# wrapped in display(): that rendered a stray "None" underneath the report.
df_proc.info()
print(" ")
display(df_proc.describe())
print(" ")
display(df_proc.head())

# ANALYSIS

Q1) What is the worldwide impact of the virus, country-wise?

In [None]:
# Confirmed, deaths and recovered cases for each country.
# The counts in this dataset are cumulative per observation date (each row is the
# running total for a region on that day, per the dataset description -- confirm
# against the data card). Summing a country's rows across all dates would therefore
# count the same cases once per day. Use only the most recent snapshot and add up
# its provinces to get one row per country.
latest_date = df_proc["ObservationDate"].max()
latest_snapshot = df_proc[df_proc["ObservationDate"] == latest_date]

temp = (
    latest_snapshot.groupby("Country/Region")[["Confirmed", "Deaths", "Recovered"]]
    .sum()
    .reset_index()
)
temp

Q2) What are currently the top 10 countries having the most active cases and hence needing the most aid right now?

In [None]:
# Active cases are what remains of the confirmed cases once deaths and recoveries
# are taken out; rank the countries by that figure, largest first
temp["Active"] = temp["Confirmed"].sub(temp["Deaths"]).sub(temp["Recovered"])
temp_sort = temp.sort_values(by="Active", ascending=False).reset_index(drop=True)
temp_sort.head(10)

Code to create stacked bar chart for top 10 countries with highest number of active cases.

In [None]:
# Stacked bar chart of the 10 countries with the most active cases.
# Only the three mutually exclusive components are stacked: Active + Recovered +
# Deaths adds up to Confirmed (Active is defined as Confirmed - Deaths - Recovered),
# so the total height of each bar IS the confirmed count. Stacking "Confirmed"
# underneath its own components made every bar twice as tall as the real total.
top10 = temp_sort.head(10)
components = [("Active", "g"), ("Recovered", "y"), ("Deaths", "b")]

fig, ax = plt.subplots(figsize=(18, 5))
bottom = np.zeros(len(top10))  # running top edge of the stack for each bar
for column, color in components:
    ax.bar(top10["Country/Region"], top10[column], bottom=bottom, color=color, label=column)
    bottom = bottom + top10[column].to_numpy()
ax.set_xlabel("Countries")
ax.set_ylabel("No. of Cases")
ax.set_title("Top 10 countries with highest no. of active cases")
ax.legend()
plt.show()

Q3. Looking at India in more detail - how has the virus affected the largest democracy in the world?

In [None]:
# Keep only the rows whose country is India; reset_index() keeps the previous row
# labels in an "index" column
is_india = df_proc["Country/Region"] == "India"
df_India = df_proc.loc[is_india].reset_index()
df_India

In [None]:
# Confirmed, deaths and recovered cases per Indian state.
# The counts are cumulative per observation date (per the dataset description --
# confirm against the data card), so summing a state's rows across all dates would
# count the same cases once per day. Use India's most recent snapshot instead.
# The rows are selected from df_proc rather than from df_India so that this cell
# can be re-run safely even though it rebinds df_India to the aggregated table.
india_rows = df_proc[df_proc["Country/Region"] == "India"]
india_latest = india_rows[
    india_rows["ObservationDate"] == india_rows["ObservationDate"].max()
]

df_India = (
    india_latest.groupby("Province/State")[["Confirmed", "Deaths", "Recovered"]]
    .sum()
    .reset_index()  # to make states a regular column of the table
)
df_India

In [None]:
# Active cases per state = Confirmed - Deaths - Recovered
df_India["Active"] = df_India["Confirmed"].sub(df_India["Deaths"]).sub(df_India["Recovered"])
df_India

In [None]:
# Order the states from most to fewest active cases; the previous row labels are
# kept in an "index" column by reset_index()
df_India_sort = df_India.sort_values(by="Active", ascending=False).reset_index(drop=False)
df_India_sort

In [None]:
# Drop the "index" column left behind by reset_index().
# Rebinding the name (instead of inplace=True) together with errors="ignore" keeps
# this cell idempotent: running it a second time no longer raises KeyError because
# the column is already gone.
df_India_sort = df_India_sort.drop(columns=["index"], errors="ignore")
df_India_sort

Code to create stacked bar chart for top 10 states with highest no. of active cases

In [None]:
# Stacked bar chart of the 10 Indian states with the most active cases.
# Only the three mutually exclusive components are stacked: Active + Recovered +
# Deaths adds up to Confirmed (Active is defined as Confirmed - Deaths - Recovered),
# so the total height of each bar IS the confirmed count. Stacking "Confirmed"
# underneath its own components made every bar twice as tall as the real total.
top10_states = df_India_sort.head(10)
components = [("Active", "g"), ("Recovered", "y"), ("Deaths", "b")]

fig, ax = plt.subplots(figsize=(18, 5))
bottom = np.zeros(len(top10_states))  # running top edge of the stack for each bar
for column, color in components:
    ax.bar(
        top10_states["Province/State"],
        top10_states[column],
        bottom=bottom,
        color=color,
        label=column,
    )
    bottom = bottom + top10_states[column].to_numpy()
ax.set_xlabel("Indian States")
ax.set_ylabel("No. of Cases")
ax.set_title("Top 10 states with highest no. of active cases")
ax.legend()
plt.show()

Q4. As a continuation of point 3 - what has the trend been in the worst-hit state in India?


In [None]:
# Keep only the rows whose Province/State is Maharashtra; reset_index() keeps the
# previous row labels in an "index" column
is_maharashtra = df_proc["Province/State"] == "Maharashtra"
df_Maha = df_proc.loc[is_maharashtra].reset_index()
df_Maha

In [None]:
# Sort Maharashtra's rows chronologically.
# ObservationDate was already converted to datetime in the cleaning step, so it is
# not parsed a second time here; that also avoids modifying df_Maha after it has
# been displayed.
# drop=True discards the previous row labels instead of adding yet another leftover
# column ("level_0") next to the "index" column df_Maha already carries.
df_Maha_sort = df_Maha.sort_values("ObservationDate", ascending=True).reset_index(drop=True)
df_Maha_sort

In [None]:
# Trend of confirmed, recovered and death counts in Maharashtra.
# The x axis is the real number of days elapsed since the first recorded
# observation, computed from the dates. Row positions (0, 1, 2, ...) only equal
# elapsed days when there is exactly one row per day and no day is missing.
days_elapsed = (
    df_Maha_sort["ObservationDate"] - df_Maha_sort["ObservationDate"].min()
).dt.days

fig, ax = plt.subplots()
for column, color in (("Confirmed", "blue"), ("Recovered", "green"), ("Deaths", "red")):
    ax.scatter(days_elapsed, df_Maha_sort[column], color=color, label=column)
ax.set_title("Maharashtra")
ax.set_xlabel("Days since the first suspect")
ax.set_ylabel("Number of cases")
ax.legend()
plt.show()

# MODELING

Q5) Based on the history of the rate of growth of cases, can we provide some projections on how many hospital beds/oxygen tanks will be required tomorrow?

In [None]:
# importing libraries
import seaborn as sns
from sklearn.model_selection import train_test_split

In [None]:
# Total confirmed, deaths and recovered cases for each observation date
df_proc = df_proc.groupby("ObservationDate")[["Confirmed", "Deaths", "Recovered"]].sum()
# Derive the active cases and order the rows by date, oldest first
df_proc = df_proc.assign(
    Active=df_proc["Confirmed"] - df_proc["Deaths"] - df_proc["Recovered"]
).sort_values(by=["ObservationDate"], ascending=True)
df_proc.head()

In [None]:
# Feature Engineering: start from the active-case count per observation date
data = df_proc.loc[:, ["Active"]]
data.head()

In [None]:
# Preview of the "Active" series shifted down by one row (the lag-1 feature)
print(data["Active"].shift(periods=1))

In [None]:
# Feature Engineering: describe each date by the active-case counts 1, 3 and 5
# rows (observation dates) earlier, stored in columns "-1", "-3" and "-5".
# The lag columns are added with assign(), which builds a new frame. Writing them
# with .loc onto the df_proc[["Active"]] selection can trigger
# SettingWithCopyWarning, because that selection is derived from df_proc.
lagged = {f"-{lag}": df_proc["Active"].shift(lag) for lag in (1, 3, 5)}
data = df_proc[["Active"]].assign(**lagged)

# dropping NAs: the first 5 rows have no complete history, so their lags are NaN
data = data.dropna()
data.head()

In [None]:
# Chronological hold-out split: the first int(n / 1.2) rows (about 5/6 of the data)
# train the model and the remaining, most recent rows test it.
# The test set has to be everything AFTER the training rows. Taking
# tail(int(n / 1.2)) made the test set as large as the training set and overlap it
# almost entirely, so the model was being scored on rows it had been fitted on.
# NOTE(review): the hold-out now has n - int(n / 1.2) rows; any later cell that
# indexes fixed positions of y_true / y_pred needs at least that many rows.
split_at = int(data.shape[0] / 1.2)
train, test = data.iloc[:split_at], data.iloc[split_at:]

X_train, y_train = train.drop(["Active"], axis=1), train["Active"]
X_test, y_test = test.drop(["Active"], axis=1), test["Active"]

In [None]:
# Inspect the training features (every column of `data` except the "Active" target)
X_train

In [None]:
# Optional feature scaling, kept for reference but intentionally disabled.
# It is written as real comments rather than as a bare string literal: a string that
# is the only expression in a cell gets echoed back as that cell's output.
# NOTE(review): enabling this turns X_train / X_test into NumPy arrays, which breaks
# the later cell that calls X_test.values.
#
# from sklearn.preprocessing import StandardScaler
#
# sc = StandardScaler()
# X_train = sc.fit_transform(X_train)
# X_test = sc.transform(X_test)
#
# X_train

In [None]:
def regression_results(y_true, y_pred):
    """Print the R^2 score and the mean squared error of a set of predictions.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values, in the same order as ``y_true``.

    Returns:
        None. Both metrics are printed, rounded to 4 decimal places.
    """
    # Imported here so the function is self-contained: it previously relied on a
    # global `metrics` name that is only imported in a later cell, so calling it
    # before that cell had run raised NameError.
    from sklearn import metrics

    mse = metrics.mean_squared_error(y_true, y_pred)
    r2 = metrics.r2_score(y_true, y_pred)
    print("r2: ", round(r2, 4))
    print("MSE: ", round(mse, 4))

# RANDOM FOREST REGRESSOR

In [None]:
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor

# Random forest with 1000 trees and a fixed random_state, fitted on the training rows
model = RandomForestRegressor(random_state=42, n_estimators=1000)
model.fit(X_train, y_train)

In [None]:
# Predict the test rows and print the r2 / MSE of those predictions
y_true = y_test.to_numpy()
y_pred = model.predict(X_test)
regression_results(y_true, y_pred)

In [None]:
# Spot-check the first two test rows: features, actual value, predicted value
print(X_test.values[:2], y_true[:2], y_pred[:2])

In [None]:
# Compare predictions with the actual values over test samples 1..24 (the same
# window as before).
# The two series used to be swapped: the list named and labelled "Predicted" was
# filled from y_true and the "Actual" one from y_pred, so the legend was wrong.
# Slicing (instead of indexing positions one by one) also avoids an IndexError when
# the test set has fewer rows than the window.
window = slice(1, 25)
o_true = y_true[window]
o_pred = y_pred[window]

fig, ax = plt.subplots(figsize=(9, 5))
ax.plot(o_pred, "g", label="Predicted no. of active cases")
ax.plot(o_true, "b", label="Actual no. of active cases")
ax.set_xlabel("No. of samples")
ax.set_ylabel("No. of active cases")
ax.legend()
plt.show()