In [1]:
import re
from datetime import datetime

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
class Scraper():
    """Download a web page and turn its first ``wikitable`` into DataFrames.

    ``Extract`` reads the table into a wide frame with one row per table row
    that has exactly ``DATA_CELLS`` ``<td>`` cells.  ``Transform`` reshapes
    that frame into a long, date-indexed frame with one row per date and
    province.

    Attributes set by ``__init__``: ``url``, ``request`` (the HTTP response),
    ``website`` (response text), ``soup`` (parsed document) and ``table``
    (the first ``<table class="wikitable">``, or ``None`` if there is none).
    """

    # Table layout this scraper is written against.
    HEADER_CELLS = 16  # leading <th> cells used as column titles
    DATA_CELLS = 18    # <td> cells in a row that counts as a data row
    YEAR = 2020        # title of the date column, and the year of every date

    # Bracketed reference markers such as "[5]" or "[a]".
    _REFERENCE_MARKER = re.compile(r'\[[^\]]*\]')

    def __init__(self, url, timeout=30):
        """Fetch ``url`` and locate the table to scrape.

        Parameters
        ----------
        url : str
            Address of the page that contains the table.
        timeout : float, optional
            Seconds to wait for the server before giving up, so that a stalled
            connection cannot hang the notebook indefinitely.

        Raises
        ------
        requests.HTTPError
            If the server answers with an error status.
        """
        self.url = url
        self.request = requests.get(self.url, timeout=timeout)
        # Fail here rather than later while parsing an error page.
        self.request.raise_for_status()
        self.website = self.request.text

        # Name the parser explicitly: otherwise BeautifulSoup picks whichever
        # parser happens to be installed and emits a warning.
        self.soup = BeautifulSoup(self.website, 'html.parser')
        self.table = self.soup.find('table', {'class': 'wikitable'})

    def _clean_cell(self, text):
        """Normalise the text of one table cell.

        * Reference markers (``[5]``, ``[a]``) are removed wherever they occur,
          so ``'181[5]'`` and ``'[a]181'`` both become ``'181'``.  A cell that
          held only markers becomes ``''``.
        * ``'?'`` becomes ``np.nan``.
        * An empty cell becomes the integer ``0``.
        * Anything else is returned with whitespace runs collapsed to ``', '``.
        """
        had_marker = ']' in text
        if had_marker:
            text = self._REFERENCE_MARKER.sub('', text)
        value = ', '.join(text.split())

        if had_marker:
            return value
        if value == '?':
            return np.nan
        return value if value else 0

    def Extract(self):
        """Read the table into a wide DataFrame.

        Column names come from the first ``HEADER_CELLS`` ``<th>`` cells, with
        'New cases' and 'New deaths' inserted at positions 11 and 13.  Only
        rows with exactly ``DATA_CELLS`` ``<td>`` cells are kept; their cells
        are normalised by ``_clean_cell``.

        Returns
        -------
        pandas.DataFrame
            One row per kept table row, with rows that are entirely missing
            dropped.  The frame before that final drop is stored on ``self.df``.

        Raises
        ------
        ValueError
            If no table was found, the table has too few header cells, or no
            row has the expected number of cells.
        """
        if self.table is None:
            raise ValueError("No table with class 'wikitable' was found on the page.")

        header_cells = self.table.find_all('th')
        if len(header_cells) < self.HEADER_CELLS:
            raise ValueError(
                'Expected at least {} header cells, found {}.'.format(
                    self.HEADER_CELLS, len(header_cells)
                )
            )

        self.col_names = [
            ', '.join(cell.text.split())
            for cell in header_cells[:self.HEADER_CELLS]
        ]
        # These two columns have data cells but no title among the header
        # cells read above, so they are named here.
        self.col_names.insert(11, 'New cases')
        self.col_names.insert(13, 'New deaths')

        # Query the rows once instead of once per cell.
        table_rows = self.table.find_all('tr')
        self.columns = self.DATA_CELLS
        self.rows = len(table_rows)

        records = []
        for table_row in table_rows:
            cells = table_row.find_all('td')
            # Header rows and other rows of a different width are skipped.
            if len(cells) == self.DATA_CELLS:
                records.append([self._clean_cell(cell.text) for cell in cells])

        if not records:
            raise ValueError(
                'No table row has exactly {} data cells.'.format(self.DATA_CELLS)
            )

        self.df = pd.DataFrame(records, columns=self.col_names)

        return self.df.dropna(how='all')

    def Transform(self, extract):
        """Reshape the wide frame from ``Extract`` into a long, typed frame.

        Parameters
        ----------
        extract : pandas.DataFrame
            Frame returned by ``Extract``.  It is not modified.

        Returns
        -------
        pandas.DataFrame
            Indexed by 'Date' (datetime), one row per date and province, with
            columns 'Province', 'Cases by Province', 'New cases',
            'Total cases', 'Total tested', 'New deaths', 'Total deaths' and
            'Recovered'.  Also stored on ``self.df``.

        Raises
        ------
        ValueError
            If a date does not match month-day, or one of the integer columns
            holds a value that cannot be converted (including missing values).
        """
        self.extract = extract
        year = str(self.YEAR)

        renamed = self.extract.rename(
            columns={
                year: 'Date',
                'EC': 'Eastern Cape',
                'FS': 'Freestate',
                'GP': 'Gauteng',
                'KZN': 'KwaZulu Natal',
                'LP': 'Limpopo',
                'MP': 'Mpumalanga',
                'NW': 'North West',
                'NC': 'Northern Cape',
                'WC': 'Western Cape',
                'un': 'Unknown',
                'Confirmed': 'Total cases',
                'Deaths': 'Total deaths',
                'Rec': 'Recovered',
                'Agtests': 'Total tested'
            }
        ).drop(columns=['Ref'])

        # Every column that is not an id variable is a province; stack them.
        long_df = pd.melt(
            renamed,
            id_vars=[
                'Date', 'New cases', 'Total cases', 'New deaths',
                'Total deaths', 'Recovered', 'Total tested'
            ],
            var_name='Province',
            value_name='Cases by Province'
        )

        # The table stores dates as month-day only; prepend the year.
        long_df['Date'] = pd.to_datetime(
            year + '-' + long_df['Date'], format='%Y-%m-%d'
        )
        long_df = long_df.set_index('Date')

        # Kept as an attribute because earlier versions exposed it.
        self.columns = [
            'Province', 'Cases by Province', 'New cases', 'Total cases',
            'Total tested', 'New deaths', 'Total deaths', 'Recovered'
        ]

        self.df = long_df.loc[:, self.columns].astype(
            {
                'Province': 'category',
                # float, because province cells may be missing ('?').
                'Cases by Province': 'float',
                'New cases': 'int',
                'Total cases': 'int',
                'Total tested': 'int',
                'New deaths': 'int',
                'Total deaths': 'int',
                'Recovered': 'int'
            }
        )

        return self.df

In [3]:
# Page holding the table to scrape; constructing the Scraper downloads and
# parses it immediately.
url = 'https://en.wikipedia.org/wiki/2020_coronavirus_pandemic_in_South_Africa#Cases'
scraper = Scraper(url)

In [10]:
# Read the scraped table into a wide DataFrame and preview its first rows.
data = scraper.Extract()
data.head()

Unnamed: 0,2020,EC,FS,GP,KZN,LP,MP,NW,NC,WC,un,New cases,Confirmed,New deaths,Deaths,Rec,Agtests,Ref
0,03-04,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,181,
1,03-05,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,
2,03-06,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0.0
3,03-07,0,0,1,0,0,0,0,0,0,0,1,2,0,0,0,0,
4,03-08,0,0,0,1,0,0,0,0,0,0,1,3,0,0,0,0,


In [11]:
# Reshape the wide frame into the long, date-indexed frame and preview it.
df = scraper.Transform(data)
df.head()

Unnamed: 0_level_0,Province,Cases by Province,New cases,Total cases,Total tested,New deaths,Total deaths,Recovered
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-03-04,Eastern Cape,0.0,0,0,181,0,0,0
2020-03-05,Eastern Cape,0.0,1,1,0,0,0,0
2020-03-06,Eastern Cape,0.0,0,1,0,0,0,0
2020-03-07,Eastern Cape,0.0,1,2,0,0,0,0
2020-03-08,Eastern Cape,0.0,1,3,0,0,0,0


In [12]:
# Save the transformed frame as CSV; the relative path resolves against the
# current working directory.
df.to_csv('covid19_data.csv')