# Scraping, Analyzing and Visualizing Covid-19 Data for the World and India

In [1]:
import requests
from requests.exceptions import HTTPError
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [2]:
# Source page for the worldwide and per-country Covid-19 table.
url_covid19_world = "https://www.worldometers.info/coronavirus/"
# Source page for the India summary and the per-state table.
url_covid19_India = "https://www.mohfw.gov.in/"

In [161]:
class Covid19Tracker:
    """Scrape and hold Covid-19 statistics for the world and for India.

    The scrape methods download a page, parse its HTML tables into pandas
    dataframes stored on the instance, and display them. The getters return
    whatever was scraped last, or ``None`` if nothing has been scraped yet.
    """

    # Seconds to wait for a server before giving up on a request; without a
    # timeout ``requests.get`` may block indefinitely on an unresponsive host.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        from datetime import date
        # pandas dataframe for processing and visualization
        self.df_world_summary = None
        self.df_all_countries = None
        #Todo: Set _processed to TRUE only after the retrieved dataset is processed and cleaned up
        #self._processed = False
        #_today to indicate last updated time
        self._today = date.today()
        self.dict_india_summary = {}
        self.df_india_summary = None
        self.df_indian_states = None

    def get_world_summary(self):
        """Return the one-row worldwide summary dataframe (``None`` before scraping)."""
        return self.df_world_summary

    def get_country_data(self):
        """Return the per-country dataframe (``None`` before scraping)."""
        return self.df_all_countries

    def __set_total_cases_india(self):
        """Store India's total case count from the per-country dataframe.

        The row is located by country name rather than by a fixed row
        position, because a fixed position silently points at another country
        (or raises ``KeyError``) as soon as the table's ordering changes.
        If India is not present, ``total_cases`` is left unset.
        """
        countries = self.df_all_countries
        india_cases = countries.loc[countries['Country'] == 'India', 'Total_Cases']
        if india_cases.empty:
            print("India was not found in the scraped country table; total cases left unset")
            return
        self.dict_india_summary['total_cases'] = int(india_cases.iloc[0])

    def get_total_cases_india(self):
        """Return India's total case count, or ``None`` if it has not been set."""
        return self.dict_india_summary.get('total_cases')

    def get_indian_states_data(self):
        """Return the per-state dataframe for India (``None`` before scraping)."""
        return self.df_indian_states

    def _fetch_soup(self, url):
        """Download *url* and return the parsed page, or ``None`` if the request failed.

        Request failures are reported with ``print`` and are not re-raised.
        """
        try:
            response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            # If the response was successful, no Exception will be raised
            response.raise_for_status()
        except HTTPError as http_err:
            print(f'HTTP error occurred: {http_err}')
            return None
        except Exception as err:
            print(f'Other error occurred: {err}')
            return None
        return BeautifulSoup(response.text, "html.parser")

    @staticmethod
    def _parse_count(text):
        """Convert scraped text such as ``' 1,234 '`` to an int; blank text counts as 0."""
        return int(text.strip().replace(',', '') or 0)

    def scrape_world_data(self):
        """Scrape the worldwide summary and the per-country table.

        Populates ``df_world_summary`` and ``df_all_countries``, records
        India's total case count, and displays both dataframes. Returns
        without changing any state if the page could not be downloaded.
        """
        soup = self._fetch_soup(url_covid19_world)
        if soup is None:
            return
        #display(soup.title)

        # Processing the required data from the first table that is retrieved
        coronatable = soup.find_all("table")[0]
        table_rows = coronatable.find_all("tr")

        # Extracting worldwide parameters: cases, deaths, recovered, active, critical.
        # NOTE(review): row 8 and the cell positions below are tied to the page
        # layout this scraper was written against -- confirm if the site changes.
        col = table_rows[8].find_all("td")
        dict_world_summary = {
            'total_cases': self._parse_count(col[1].text),
            'total_deaths': self._parse_count(col[3].text),
            'recovered': self._parse_count(col[5].text),
            'active': self._parse_count(col[6].text),
            'critical': self._parse_count(col[7].text),
        }
        self.df_world_summary = pd.DataFrame([dict_world_summary])

        # displaying data as shown below in the order of columns mentioned
        #Todo : need to display Source Last updated on date
        print("___________________________________________________________________________________")
        print("Covid-19 updates for Worldwide")
        print("___________________________________________________________________________________")
        display(self.df_world_summary[['total_cases','total_deaths','recovered','active','critical']])

        # Extracting each country's parameters: cases, deaths, recovered, active, critical.
        # The leading 9 and trailing 8 rows are skipped, as in the summary lookup above.
        records = []
        for row in table_rows[9:-8]:
            cells = row.find_all("td")
            name = cells[0].text.strip()
            # Counts are kept as text with thousands separators removed.
            counts = [cells[i].text.strip().replace(',', '') for i in (1, 3, 5, 6, 7)]
            records.append([name, *counts])

        self.df_all_countries = pd.DataFrame(
            records,
            columns=["Country", "Total_Cases", "Total_Deaths", "Recovered", "Active", "Critical"],
        )

        # Todo: Following tasks : work in progress
        # 1. Handle missing data by analyzing covid-19 time series dataset. Currently replacing with 0.
        # 2. Making the dataframe ready for visualization
        self.df_all_countries.replace(r'^\s*$', np.nan, regex=True, inplace=True)
        self.df_all_countries.fillna(0, inplace=True)

        #Set total cases for India
        self.__set_total_cases_india()

        display(self.df_all_countries)

    def scrape_india_data(self):
        """Scrape the India summary and the per-state table.

        Populates ``dict_india_summary``, ``df_india_summary`` and
        ``df_indian_states`` and displays both dataframes. Returns without
        changing any state if the page could not be downloaded.

        Note: ``total_cases`` in the summary is only present if
        ``scrape_world_data`` was called beforehand, since that method is
        what records it.
        """
        soup = self._fetch_soup(url_covid19_India)
        if soup is None:
            return
        #display(soup.title)

        # Each summary figure sits in the <strong> of an <li> with a colour class.
        summary_fields = (
            ('active_cases', "bg-blue"),
            ('total_cases_cured', "bg-green"),
            ('total_deaths', "bg-red"),
        )
        for key, css_class in summary_fields:
            figure = soup.find("li", class_=css_class).strong.text
            self.dict_india_summary[key] = self._parse_count(figure)

        self.df_india_summary = pd.DataFrame([self.dict_india_summary])

        print("___________________________________________________________________________________")
        print("Covid-19 updates for India")
        print("___________________________________________________________________________________")
        display(self.df_india_summary)

        # Get data for the Indian states.
        # NOTE(review): the header row and the last four rows are skipped, and
        # cells 1-4 are read -- tied to the page layout, confirm if it changes.
        india_states_table = soup.find("table", class_="table table-striped")
        state_records = []
        for row in india_states_table.find_all("tr")[1:-4]:
            cells = row.find_all("td")
            state_records.append([cells[i].text.strip() for i in (1, 2, 3, 4)])

        self.df_indian_states = pd.DataFrame(
            state_records,
            columns=["States", "Total_Cases", "Cured", "Deaths"],
        )

        display(self.df_indian_states)

In [162]:
"""
***********************************************
******************** Demo  ********************
***********************************************
"""
# Create a tracker, then scrape and display the worldwide and per-country figures.
# `covid_obj` is reused by the following cell for the India data.
covid_obj = Covid19Tracker()
covid_obj.scrape_world_data()


___________________________________________________________________________________
Covid-19 updates for Worldwide
___________________________________________________________________________________


Unnamed: 0,total_cases,total_deaths,recovered,active,critical
0,3566111,248285,1154014,2163812,50046


Unnamed: 0,Country,Total_Cases,Total_Deaths,Recovered,Active,Critical
0,USA,1188122,68598,178263,941261,16139
1,Spain,247122,25264,148558,73300,2386
2,Italy,210717,28884,81654,100179,1501
3,UK,186599,28446,,157809,1559
4,France,168693,24895,50784,93014,3819
5,Germany,165664,6866,130600,28198,1979
6,Russia,134687,1280,16639,116768,2300
7,Turkey,126045,3397,63151,59497,1424
8,Brazil,101826,7051,42991,51784,8318
9,Iran,97424,6203,78422,12799,2690


In [163]:
# Scrape and display the India summary and per-state figures, using the
# tracker created in the previous cell.
covid_obj.scrape_india_data()

___________________________________________________________________________________
Covid-19 updates for India
___________________________________________________________________________________


Unnamed: 0,active_cases,total_cases,total_cases_cured,total_deaths
0,28070,40571,10886,1306


Unnamed: 0,States,Total_Cases,Cured,Deaths
0,Andaman and Nicobar Islands,33,17,0
1,Andhra Pradesh,1583,488,33
2,Arunachal Pradesh,1,1,0
3,Assam,43,32,1
4,Bihar,482,117,4
5,Chandigarh,94,19,0
6,Chhattisgarh,43,36,0
7,Delhi,4122,1256,64
8,Goa,7,7,0
9,Gujarat,5055,896,262
