# Capstone Project Backup: Analysis of Confirmed COVID-19 Cases in Singapore

## Importing Libraries

In [501]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

import re
import requests
import urllib.request
import time
from bs4 import BeautifulSoup
from urllib.request import urlopen

# Apply matplotlib's 'fivethirtyeight' style sheet to the figures drawn in this notebook.
plt.style.use('fivethirtyeight')

# Ask the inline backend for high-resolution ("retina") figures and show plots inside
# the notebook output.
%config InlineBackend.figure_format = 'retina'
%matplotlib inline

## Data Collection

In [502]:
# Wikipedia article whose case table is scraped in the following cells.
url = 'https://en.wikipedia.org/wiki/2020_coronavirus_pandemic_in_Singapore'
# Open the page inside a `with` block so the HTTP response is always closed, even if
# parsing fails. BeautifulSoup consumes the response while it is still open.
with urlopen(url) as html:
    soup = BeautifulSoup(html, 'html.parser')

In [503]:
# Narrow `soup` down to the <table> element matching the class string below.
# NOTE(review): `soup` is rebound to the result, so re-running this cell without
# re-running the download cell first searches inside the table instead of the page.
soup = soup.find("table", {"class":"wikitable sortable mw-collapsible mw-collapsed"})

In [504]:
# Narrow `soup` further to the table's <tbody> element.
# NOTE(review): `soup` is rebound again here; this assumes the previous cell found a
# table — confirm if the page layout changes.
soup = soup.find("tbody")

In [505]:
# Collect the whitespace-stripped text of every <td> cell as one flat list.
cell_texts = []
for cell in soup.find_all('td'):
    cell_texts.append(cell.text.strip())
# Discard the final cell (presumably not part of a case row — TODO confirm).
data = cell_texts[:-1]

In [506]:
# Show the first row of the Wikipedia table, i.e. the first 11 cell strings in `data`
data[0:11]

['1',
 '23 January 2020',
 'Discharged on 19 February 2020',
 '66',
 'Male',
 'China',
 'Imported',
 "Shangri-La's Rasa Sentosa Resort & Spa, South West",
 'Singapore General Hospital',
 'Yes',
 '[26][27]']

In [507]:
# Column names in the left-to-right order of the cells in one table row.
CASE_COLUMNS = [
    'case_num',
    'date_announced',
    'date_discharged',
    'age',
    'gender',
    'nationality',
    'case_relation',
    'place_of_stay',
    'hospitals_visited',
    'visited_affected_areas',
    'source',
]
n_cols = len(CASE_COLUMNS)

# `data` is a flat list of cell strings. Cut it into consecutive chunks of `n_cols`
# cells and pair each chunk with the column names. Trailing cells that do not fill a
# complete row are ignored, exactly as before.
confirmed_cases = [
    dict(zip(CASE_COLUMNS, data[start:start + n_cols]))
    for start in range(0, len(data) - n_cols + 1, n_cols)
]

# Passing `columns=` keeps the expected columns even when no rows were scraped, so
# later cells that index columns by name do not fail with a KeyError.
covid = pd.DataFrame(confirmed_cases, columns=CASE_COLUMNS)

In [508]:
# Preview the first rows of the freshly built frame.
covid.head()

Unnamed: 0,case_num,date_announced,date_discharged,age,gender,nationality,case_relation,place_of_stay,hospitals_visited,visited_affected_areas,source
0,1,23 January 2020,Discharged on 19 February 2020,66,Male,China,Imported,"Shangri-La's Rasa Sentosa Resort & Spa, South ...",Singapore General Hospital,Yes,[26][27]
1,2,24 January 2020,Discharged on 7 February 2020,53,Female,China,Imported,"J8 Hotel, Central",National Centre for Infectious Diseases,Yes,[28][29]
2,3,24 January 2020,Discharged on 21 February 2020,37,Male,China,ImportedSon of case 1[30],"Shangri-La's Rasa Sentosa Resort & Spa, South ...",Singapore General Hospital,Yes,[28][31]
3,4,26 January 2020,Discharged on 12 February 2020,36,Male,China,Imported,"Village Hotel Sentosa, South West",Sengkang General Hospital,Yes,[32][33]
4,5,27 January 2020,Discharged on 18 February 2020,56,Female,China,Imported,"Home at Ceylon Road, South East",National Centre for Infectious Diseases,Yes,[34][35]


In [509]:
# Show the last row of the frame, i.e. the final case that was parsed.
covid.tail(1)

Unnamed: 0,case_num,date_announced,date_discharged,age,gender,nationality,case_relation,place_of_stay,hospitals_visited,visited_affected_areas,source
431,432,21 March 2020,,53,Female,United Kingdom,Imported,Visited Indonesia,National Centre for Infectious Diseases,Yes,[83][92]


## Data Cleaning & Munging

In [510]:
# Inspect column dtypes and non-null counts before any cleaning is applied.
covid.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 432 entries, 0 to 431
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   case_num                432 non-null    object
 1   date_announced          432 non-null    object
 2   date_discharged         432 non-null    object
 3   age                     432 non-null    object
 4   gender                  432 non-null    object
 5   nationality             432 non-null    object
 6   case_relation           432 non-null    object
 7   place_of_stay           432 non-null    object
 8   hospitals_visited       432 non-null    object
 9   visited_affected_areas  432 non-null    object
 10  source                  432 non-null    object
dtypes: object(11)
memory usage: 37.2+ KB


In [511]:
# Case numbers arrive as strings; store them as integers.
covid = covid.astype({'case_num': int})

In [512]:
# Parse the announcement date strings into pandas datetimes.
covid = covid.assign(date_announced=pd.to_datetime(covid['date_announced']))

In [513]:
# Matches "<digits> <word> <digits>", e.g. "19 February 2020". A raw string is used so
# that \d and \w reach the regex engine instead of being read as (invalid) string escapes.
_DATE_PATTERN = re.compile(r"(\d+)[ ](\w+)[ ](\d+)")


def extract_date(string):
    """Return the first "<digits> <word> <digits>" substring found in ``string``.

    Parameters
    ----------
    string : str
        Free text that may contain a date, e.g. "Discharged on 19 February 2020".

    Returns
    -------
    str
        The matched text (e.g. "19 February 2020"), or an empty string when
        ``string`` contains no such pattern.
    """
    match = _DATE_PATTERN.search(string)
    # No match is an expected outcome, so test for it directly rather than catching
    # the AttributeError raised by calling .group() on None.
    if match is None:
        return ""
    return match.group(0)

In [514]:
# Keep only the date portion of each discharge note (an empty string when there is
# none), then parse the result into datetimes.
discharge_text = covid['date_discharged'].map(extract_date)
covid['date_discharged'] = pd.to_datetime(discharge_text)

In [515]:
covid = covid.astype({'age': float}) # note that we convert to float as there is a case of age = '0.5'

In [516]:
# Encode gender as a boolean flag with a single vectorised comparison.
# Only the exact string "Male" yields True; every other value becomes False.
covid['is_male'] = covid['gender'] == "Male"
# The text column is now redundant. Rebind to the reduced frame instead of mutating
# it in place.
covid = covid.drop(columns='gender')

In [517]:
# Count how often each distinct place_of_stay value occurs.
covid['place_of_stay'].value_counts()

Visited United Kingdom                                          54
                                                                38
Visited United States                                           17
Visited Indonesia                                               13
Visited Philippines                                              8
                                                                ..
Home at Tampines Street 24, North East                           1
Visited France and SpainHome at Jalan Haji Salam, South East     1
Worked at SingtelHome at Stirling Road, Central                  1
Visited IndonesiaHome at Kim Tian Road, Central                  1
Home at Crawford Lane, Central                                   1
Name: place_of_stay, Length: 247, dtype: int64

In [518]:
# Label entries whose place_of_stay is an empty string as "Unspecified".
stay = covid['place_of_stay']
covid['place_of_stay'] = stay.mask(stay == "", "Unspecified")

In [519]:
# Recount place_of_stay values after the empty-string replacement.
covid['place_of_stay'].value_counts()

Visited United Kingdom                                                                    54
Unspecified                                                                               38
Visited United States                                                                     17
Visited Indonesia                                                                         13
Visited Philippines                                                                        8
                                                                                          ..
Home at Senja Road, North West                                                             1
Visited Croatia, Austria                                                                   1
Visited FranceWorked at INSEAD Asia CampusHome at Dover Rise, South West                   1
Student at The Orange Tree Preschool @ Choa Chu KangHome at Teck Whye Lane, South West     1
Home at Wilkinson Road, South East                                    

In [520]:
# Count how often each distinct visited_affected_areas value occurs.
covid['visited_affected_areas'].value_counts()

No     257
Yes    145
        30
Name: visited_affected_areas, dtype: int64

In [521]:
# Label entries whose visited_affected_areas is an empty string as "Unspecified".
visited = covid['visited_affected_areas']
covid['visited_affected_areas'] = visited.mask(visited == "", "Unspecified")

In [522]:
# Recount visited_affected_areas values after the empty-string replacement.
covid['visited_affected_areas'].value_counts()

No             257
Yes            145
Unspecified     30
Name: visited_affected_areas, dtype: int64

In [523]:
# Reorder columns to mirror the wikipedia table
# Put the columns back in the left-to-right order of the Wikipedia table, with
# is_male taking the slot that the dropped gender column used to occupy.
column_order = [
    'case_num',
    'date_announced',
    'date_discharged',
    'age',
    'is_male',
    'nationality',
    'case_relation',
    'place_of_stay',
    'hospitals_visited',
    'visited_affected_areas',
    'source',
]
covid = covid[column_order]

In [524]:
# Re-inspect dtypes and non-null counts now that the cleaning steps have run.
covid.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 432 entries, 0 to 431
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   case_num                432 non-null    int32         
 1   date_announced          432 non-null    datetime64[ns]
 2   date_discharged         142 non-null    datetime64[ns]
 3   age                     432 non-null    float64       
 4   is_male                 432 non-null    bool          
 5   nationality             432 non-null    object        
 6   case_relation           432 non-null    object        
 7   place_of_stay           432 non-null    object        
 8   hospitals_visited       432 non-null    object        
 9   visited_affected_areas  432 non-null    object        
 10  source                  432 non-null    object        
dtypes: bool(1), datetime64[ns](2), float64(1), int32(1), object(6)
memory usage: 32.6+ KB


## Exploratory Data Analysis

In [525]:
# Summary statistics for every column, numeric and non-numeric alike.
covid.describe(include="all")

Unnamed: 0,case_num,date_announced,date_discharged,age,is_male,nationality,case_relation,place_of_stay,hospitals_visited,visited_affected_areas,source
count,432.0,432,142,432.0,432,432,432,432,432,432,432
unique,,55,38,,2,28,161,247,32,3,169
top,,2020-03-21 00:00:00,2020-03-21 00:00:00,,True,Singapore,Imported,Visited United Kingdom,National Centre for Infectious Diseases,No,[83][92]
freq,,47,11,,261,276,175,54,268,257,47
first,,2020-01-23 00:00:00,2020-02-04 00:00:00,,,,,,,,
last,,2020-03-21 00:00:00,2020-03-21 00:00:00,,,,,,,,
mean,216.5,,,44.327546,,,,,,,
std,124.851912,,,16.42256,,,,,,,
min,1.0,,,0.5,,,,,,,
25%,108.75,,,30.0,,,,,,,


In [526]:
# Number of cases per nationality value.
covid['nationality'].value_counts()

Singapore         276
Singapore PR       29
China              21
Indonesia          16
Philippines        14
United Kingdom     12
Malaysia           10
Australia           8
United States       6
Germany             6
France              5
Bangladesh          5
Netherlands         3
India               3
Switzerland         3
Sweden              2
Spain               2
Colombia            1
Ireland             1
Brazil              1
Canada              1
Italy               1
Myanmar             1
Japan               1
New Zealand         1
Thailand            1
Belgium             1
Russia              1
Name: nationality, dtype: int64

In [527]:
# Number of cases per place_of_stay value.
covid['place_of_stay'].value_counts()

Visited United Kingdom                                                                    54
Unspecified                                                                               38
Visited United States                                                                     17
Visited Indonesia                                                                         13
Visited Philippines                                                                        8
                                                                                          ..
Home at Senja Road, North West                                                             1
Visited Croatia, Austria                                                                   1
Visited FranceWorked at INSEAD Asia CampusHome at Dover Rise, South West                   1
Student at The Orange Tree Preschool @ Choa Chu KangHome at Teck Whye Lane, South West     1
Home at Wilkinson Road, South East                                    

In [528]:
# Number of cases per hospitals_visited value.
covid['hospitals_visited'].value_counts()

National Centre for Infectious Diseases                                                268
Singapore General Hospital                                                              27
Ng Teng Fong General Hospital                                                           25
National University Hospital                                                            22
Sengkang General Hospital                                                               16
Changi General Hospital                                                                 12
Khoo Teck Puat Hospital                                                                 11
KK Women's and Children's Hospital                                                       7
Tan Tock Seng HospitalNational Centre for Infectious Diseases                            7
Gleneagles HospitalNational Centre for Infectious Diseases                               4
Alexandra Hospital                                                                       3