# Capstone Project Backup: Analysis of Confirmed COVID-19 Cases in Singapore

## Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

import re
import requests
import urllib.request
import time
from bs4 import BeautifulSoup
from urllib.request import urlopen

plt.style.use('fivethirtyeight')

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

  import pandas.util.testing as tm


## Data Collection

In [2]:
# Fetch the live Wikipedia article and parse it. Re-running this cell scrapes
# whatever the page contains at that moment, so results may differ from the
# outputs saved in this notebook.
url = 'https://en.wikipedia.org/wiki/2020_coronavirus_pandemic_in_Singapore'
# Use the response as a context manager so the HTTP connection is closed as
# soon as BeautifulSoup has consumed the body (it was previously left open).
with urlopen(url) as html:
    soup = BeautifulSoup(html, 'html.parser')

In [3]:
# Narrow `soup` to the first <table> matching this class string.
# NOTE: `soup` is rebound here, so this cell is not safe to re-run on its own;
# re-run the fetch cell first.
soup = soup.find("table", class_="wikitable sortable mw-collapsible mw-collapsed")

In [4]:
# Descend to the table's first <tbody>; tag-name attribute access is
# BeautifulSoup shorthand for find("tbody").
soup = soup.tbody

In [5]:
# Collect the stripped text of every <td> cell as one flat list, then discard
# the final cell (presumably a trailing non-data cell - TODO confirm against
# the page).
# NOTE(review): `.text` joins nested fragments with no separator, which is
# likely why fused values such as 'ImportedSon of case 1[28]' show up in the
# frame below; consider get_text(separator=...) if those need to be split.
data = [td.text.strip() for td in soup.find_all('td')][:-1]

In [6]:
# Show first row of the wikipedia table
data[0:11]

['1',
 '23 January 2020',
 'Discharged on 19 February 2020',
 '66',
 'Male',
 'China',
 'Imported',
 "Shangri-La's Rasa Sentosa Resort & Spa, South West",
 'Singapore General Hospital',
 'Yes',
 '[24][25]']

In [7]:
# Column names in the order the cells appear within each table row; the
# first-row preview above shows 11 cells per row, matching these 11 names.
CASE_COLUMNS = [
    'case_num',
    'date_announced',
    'date_discharged',
    'age',
    'gender',
    'nationality',
    'case_relation',
    'place_of_stay',
    'hospitals_visited',
    'visited_affected_areas',
    'source',
]
row_width = len(CASE_COLUMNS)

# Slice the flat cell list into consecutive full rows of `row_width` cells.
# A trailing partial row (fewer than `row_width` cells) is ignored, exactly as
# the previous int(len(data)/11) loop bound did.
confirmed_cases = [
    dict(zip(CASE_COLUMNS, data[start:start + row_width]))
    for start in range(0, len(data) - row_width + 1, row_width)
]

# Passing `columns` keeps the expected schema even if no rows were scraped,
# so later cells fail on missing data rather than on missing columns.
covid = pd.DataFrame(confirmed_cases, columns=CASE_COLUMNS)

In [8]:
covid.head()

Unnamed: 0,case_num,date_announced,date_discharged,age,gender,nationality,case_relation,place_of_stay,hospitals_visited,visited_affected_areas,source
0,1,23 January 2020,Discharged on 19 February 2020,66,Male,China,Imported,"Shangri-La's Rasa Sentosa Resort & Spa, South ...",Singapore General Hospital,Yes,[24][25]
1,2,24 January 2020,Discharged on 7 February 2020,53,Female,China,Imported,"J8 Hotel, Central",National Centre for Infectious Diseases,Yes,[26][27]
2,3,24 January 2020,Discharged on 21 February 2020,37,Male,China,ImportedSon of case 1[28],"Shangri-La's Rasa Sentosa Resort & Spa, South ...",Singapore General Hospital,Yes,[26][29]
3,4,26 January 2020,Discharged on 12 February 2020,36,Male,China,Imported,"Village Hotel Sentosa, South West",Sengkang General Hospital,Yes,[30][31]
4,5,27 January 2020,Discharged on 18 February 2020,56,Female,China,Imported,"Home at Ceylon Road, South East",National Centre for Infectious Diseases,Yes,[32][33]


In [9]:
covid.tail(1)

Unnamed: 0,case_num,date_announced,date_discharged,age,gender,nationality,case_relation,place_of_stay,hospitals_visited,visited_affected_areas,source
508,509,23 March 2020,,53,Male,Singapore,ImportedContact of case 212,Visited Indonesia,National Centre for Infectious Diseases,Yes,[82][94]


## Data Cleaning & Munging

In [10]:
covid.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 509 entries, 0 to 508
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   case_num                509 non-null    object
 1   date_announced          509 non-null    object
 2   date_discharged         509 non-null    object
 3   age                     509 non-null    object
 4   gender                  509 non-null    object
 5   nationality             509 non-null    object
 6   case_relation           509 non-null    object
 7   place_of_stay           509 non-null    object
 8   hospitals_visited       509 non-null    object
 9   visited_affected_areas  509 non-null    object
 10  source                  509 non-null    object
dtypes: object(11)
memory usage: 43.9+ KB


In [11]:
# Case numbers were scraped as text; cast the column to the platform integer.
covid = covid.astype({'case_num': int})

In [12]:
# Parse announcement dates (e.g. '23 January 2020') into datetime64 values.
covid = covid.assign(date_announced=pd.to_datetime(covid['date_announced']))

In [13]:
def extract_date(string):
    """Return the first '<digits> <word> <digits>' date found in ``string``.

    Intended for values such as 'Discharged on 19 February 2020', from which
    '19 February 2020' is returned.

    Parameters
    ----------
    string : str or None
        Text to search. Missing values (``None`` or float NaN) are treated as
        containing no date.

    Returns
    -------
    str
        The matched date text, or ``""`` when nothing matches or the value is
        missing.

    Raises
    ------
    TypeError
        If ``string`` is any other non-string type (raised by ``re.search``),
        so accidental misuse, e.g. on an already converted column, is not
        silently hidden.
    """
    # `x != x` is only true for NaN; pandas uses float NaN for missing cells.
    if string is None or (isinstance(string, float) and string != string):
        return ""
    # Raw string: '\d' and '\w' are invalid escapes in a normal string literal.
    match = re.search(r"(\d+)[ ](\w+)[ ](\d+)", string)
    return match.group(0) if match else ""

In [14]:
# Pull the date text out of each discharge note and parse it in one step.
# extract_date returns "" when there is no date, which to_datetime turns into
# NaT (the saved info() output shows 154 of 509 rows with a discharge date).
covid['date_discharged'] = pd.to_datetime(covid['date_discharged'].map(extract_date))

In [15]:
# Ages are cast to float rather than int because one case has age '0.5'.
covid = covid.astype({'age': float})

In [16]:
# Replace the text `gender` column with a boolean `is_male` flag.
# Any value other than exactly "Male" (including a blank) becomes False.
covid = (
    covid
    .assign(is_male=covid['gender'].eq("Male"))
    .drop(columns='gender')
)

In [17]:
covid['place_of_stay'].value_counts()

Visited United Kingdom                                                                     94
                                                                                           48
Visited United States                                                                      22
Visited Indonesia                                                                          18
Visited Malaysia                                                                           10
                                                                                           ..
Worked at 56 Senang Crescent, South EastHome at Jurong West Street 64, South West           1
Visited MalaysiaWorked at Masjid Al-MuttaqinHome at Bukit Panjang Ring Road, North West     1
Home at Upper Bukit Timah Road, North West                                                  1
Home at Sembawang Drive, North West                                                         1
Visited France and SpainHome at Jalan Haji Salam, South East

In [18]:
# Label blank place-of-stay entries explicitly instead of leaving them empty.
covid['place_of_stay'] = covid['place_of_stay'].replace({"": "Unspecified"})

In [19]:
covid['place_of_stay'].value_counts()

Visited United Kingdom                                                                     94
Unspecified                                                                                48
Visited United States                                                                      22
Visited Indonesia                                                                          18
Visited Malaysia                                                                           10
                                                                                           ..
Visited Germany, United States, France, Monaco                                              1
Worked at 56 Senang Crescent, South EastHome at Jurong West Street 64, South West           1
Visited MalaysiaWorked at Masjid Al-MuttaqinHome at Bukit Panjang Ring Road, North West     1
Home at Upper Bukit Timah Road, North West                                                  1
Visited PhilippinesWorked at dnata CargoHome at Flora Drive,

In [20]:
covid['visited_affected_areas'].value_counts()

No     284
Yes    201
        24
Name: visited_affected_areas, dtype: int64

In [21]:
# Label blank answers explicitly so they appear as their own category.
covid['visited_affected_areas'] = covid['visited_affected_areas'].replace({"": "Unspecified"})

In [22]:
covid['visited_affected_areas'].value_counts()

No             284
Yes            201
Unspecified     24
Name: visited_affected_areas, dtype: int64

In [23]:
# Put the columns back in the Wikipedia table's order, with `is_male`
# occupying the slot of the original gender column.
column_order = [
    'case_num',
    'date_announced',
    'date_discharged',
    'age',
    'is_male',
    'nationality',
    'case_relation',
    'place_of_stay',
    'hospitals_visited',
    'visited_affected_areas',
    'source',
]
covid = covid[column_order]

In [24]:
covid.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 509 entries, 0 to 508
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   case_num                509 non-null    int32         
 1   date_announced          509 non-null    datetime64[ns]
 2   date_discharged         154 non-null    datetime64[ns]
 3   age                     509 non-null    float64       
 4   is_male                 509 non-null    bool          
 5   nationality             509 non-null    object        
 6   case_relation           509 non-null    object        
 7   place_of_stay           509 non-null    object        
 8   hospitals_visited       509 non-null    object        
 9   visited_affected_areas  509 non-null    object        
 10  source                  509 non-null    object        
dtypes: bool(1), datetime64[ns](2), float64(1), int32(1), object(6)
memory usage: 38.4+ KB


## Exploratory Data Analysis

In [25]:
covid.describe(include="all")

Unnamed: 0,case_num,date_announced,date_discharged,age,is_male,nationality,case_relation,place_of_stay,hospitals_visited,visited_affected_areas,source
count,509.0,509,154,509.0,509,509,509,509,509,509,509
unique,,57,40,,2,29,171,257,32,3,183
top,,2020-03-23 00:00:00,2020-03-21 00:00:00,,True,Singapore,Imported,Visited United Kingdom,National Centre for Infectious Diseases,No,[82][94]
freq,,54,11,,306,335,234,94,329,284,54
first,,2020-01-23 00:00:00,2020-02-04 00:00:00,,,,,,,,
last,,2020-03-23 00:00:00,2020-03-23 00:00:00,,,,,,,,
mean,255.0,,,43.111002,,,,,,,
std,147.07991,,,16.60169,,,,,,,
min,1.0,,,0.5,,,,,,,
25%,128.0,,,29.0,,,,,,,


In [26]:
covid['nationality'].value_counts()

Singapore         335
Singapore PR       34
China              21
Indonesia          19
Philippines        14
United Kingdom     13
Malaysia           10
Australia           9
United States       7
India               6
Germany             6
Bangladesh          5
France              5
Switzerland         3
Netherlands         3
Sweden              2
Spain               2
New Zealand         2
Ireland             2
Myanmar             2
Russia              1
Canada              1
Thailand            1
Japan               1
Belgium             1
Colombia            1
Denmark             1
Brazil              1
Italy               1
Name: nationality, dtype: int64

In [27]:
covid['place_of_stay'].value_counts()

Visited United Kingdom                                                                     94
Unspecified                                                                                48
Visited United States                                                                      22
Visited Indonesia                                                                          18
Visited Malaysia                                                                           10
                                                                                           ..
Visited Germany, United States, France, Monaco                                              1
Worked at 56 Senang Crescent, South EastHome at Jurong West Street 64, South West           1
Visited MalaysiaWorked at Masjid Al-MuttaqinHome at Bukit Panjang Ring Road, North West     1
Home at Upper Bukit Timah Road, North West                                                  1
Visited PhilippinesWorked at dnata CargoHome at Flora Drive,

In [28]:
covid['hospitals_visited'].value_counts()

National Centre for Infectious Diseases                                                329
Singapore General Hospital                                                              29
Ng Teng Fong General Hospital                                                           27
National University Hospital                                                            26
Sengkang General Hospital                                                               17
Changi General Hospital                                                                 15
Khoo Teck Puat Hospital                                                                 13
KK Women's and Children's Hospital                                                       7
Tan Tock Seng HospitalNational Centre for Infectious Diseases                            7
Alexandra Hospital                                                                       5
Gleneagles HospitalNational Centre for Infectious Diseases                               4

## Data Export

In [29]:
# Write the cleaned cases to covid.csv in the working directory. The row index
# is written as the first (unnamed) column, which is to_csv's default.
covid.to_csv(path_or_buf="covid.csv", index=True)