In [1]:
# Import dependencies
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter


In [2]:
# Load the sample COVID case dataset from the Datasource folder into a
# DataFrame, then preview its first rows.
file_path = Path('../Datasource/sample_covid_dataset.csv')
cases_df = pd.read_csv(filepath_or_buffer=file_path)
cases_df.head(5)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,Unnamed: 0.1.1.1,_id,Row_ID,Accurate_Episode_Date,Case_Reported_Date,Test_Reported_Date,Specimen_Date,...,Outcome1,Outbreak_Related,Reporting_PHU_ID,Reporting_PHU,Reporting_PHU_Address,Reporting_PHU_City,Reporting_PHU_Postal_Code,Reporting_PHU_Website,Reporting_PHU_Latitude,Reporting_PHU_Longitude
0,0,9,9,353345,353346,353346,2020-12-14T00:00:00,2020-12-20T00:00:00,2020-12-20T00:00:00,2020-12-16T00:00:00,...,Resolved,Yes,3895,Toronto Public Health,"277 Victoria Street, 5th Floor",Toronto,M5B 1W2,www.toronto.ca/community-people/health-wellnes...,43.656591,-79.379358
1,1,15,15,35074,35075,35075,2020-11-08T00:00:00,2020-11-09T00:00:00,2020-11-09T00:00:00,2020-11-08T00:00:00,...,Resolved,Yes,3895,Toronto Public Health,"277 Victoria Street, 5th Floor",Toronto,M5B 1W2,www.toronto.ca/community-people/health-wellnes...,43.656591,-79.379358
2,2,16,16,131940,131941,131941,2021-02-03T00:00:00,2021-02-05T00:00:00,2021-02-05T00:00:00,2021-02-03T00:00:00,...,Resolved,Yes,2246,Niagara Region Public Health Department,1815 Sir Isaac Brock Way,Thorold,L2V 4T7,www.niagararegion.ca/health,43.116537,-79.24122
3,3,24,24,205768,205769,205769,2020-12-24T00:00:00,2020-12-29T00:00:00,2020-12-29T00:00:00,2020-11-26T00:00:00,...,Resolved,Yes,3895,Toronto Public Health,"277 Victoria Street, 5th Floor",Toronto,M5B 1W2,www.toronto.ca/community-people/health-wellnes...,43.656591,-79.379358
4,4,26,26,387017,387018,387018,2021-01-21T00:00:00,2021-01-23T00:00:00,2021-01-23T00:00:00,2021-01-21T00:00:00,...,Resolved,Yes,2246,Niagara Region Public Health Department,1815 Sir Isaac Brock Way,Thorold,L2V 4T7,www.niagararegion.ca/health,43.116537,-79.24122


# Cleaning Data 

In [3]:
# List the column labels of the raw frame to decide which ones to keep

cases_df.columns


Index(['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', 'Unnamed: 0.1.1.1',
       '_id', 'Row_ID', 'Accurate_Episode_Date', 'Case_Reported_Date',
       'Test_Reported_Date', 'Specimen_Date', 'Age_Group', 'Client_Gender',
       'Case_AcquisitionInfo', 'Outcome1', 'Outbreak_Related',
       'Reporting_PHU_ID', 'Reporting_PHU', 'Reporting_PHU_Address',
       'Reporting_PHU_City', 'Reporting_PHU_Postal_Code',
       'Reporting_PHU_Website', 'Reporting_PHU_Latitude',
       'Reporting_PHU_Longitude'],
      dtype='object')

In [4]:
# Remove every column that is not part of the PHU location table, leaving
# only the PHU id, name, address and latitude/longitude.
columns_to_drop = [
    'Unnamed: 0',
    'Unnamed: 0.1',
    'Unnamed: 0.1.1',
    'Unnamed: 0.1.1.1',
    '_id',
    'Row_ID',
    'Age_Group',
    'Accurate_Episode_Date',
    'Case_Reported_Date',
    'Test_Reported_Date',
    'Specimen_Date',
    'Case_AcquisitionInfo',
    'Outcome1',
    'Outbreak_Related',
    'Reporting_PHU_City',
    'Reporting_PHU_Postal_Code',
    'Reporting_PHU_Website',
    'Client_Gender',
]

# drop() returns a new frame, so cases_df itself is left unchanged.
cleaned_cases_df = cases_df.drop(labels=columns_to_drop, axis='columns')


In [5]:
# Confirm which columns remain after the drop
cleaned_cases_df.columns

Index(['Reporting_PHU_ID', 'Reporting_PHU', 'Reporting_PHU_Address',
       'Reporting_PHU_Latitude', 'Reporting_PHU_Longitude'],
      dtype='object')

In [6]:
# Shorten the PHU id column name.
# Only 'Reporting_PHU_ID' is mapped: 'Accurate_Episode_Date' is listed in
# columns_to_drop and is no longer present in this frame, so a mapping for
# it would be silently ignored by rename().
cleaned_cases_df = cleaned_cases_df.rename(columns={'Reporting_PHU_ID': 'PHU_id'})
cleaned_cases_df.columns

Index(['PHU_id', 'Reporting_PHU', 'Reporting_PHU_Address',
       'Reporting_PHU_Latitude', 'Reporting_PHU_Longitude'],
      dtype='object')

In [7]:
# Display the trimmed frame; at this point the same PHU still appears on many rows
cleaned_cases_df

Unnamed: 0,PHU_id,Reporting_PHU,Reporting_PHU_Address,Reporting_PHU_Latitude,Reporting_PHU_Longitude
0,3895,Toronto Public Health,"277 Victoria Street, 5th Floor",43.656591,-79.379358
1,3895,Toronto Public Health,"277 Victoria Street, 5th Floor",43.656591,-79.379358
2,2246,Niagara Region Public Health Department,1815 Sir Isaac Brock Way,43.116537,-79.241220
3,3895,Toronto Public Health,"277 Victoria Street, 5th Floor",43.656591,-79.379358
4,2246,Niagara Region Public Health Department,1815 Sir Isaac Brock Way,43.116537,-79.241220
...,...,...,...,...,...
13519,2253,Peel Public Health,7120 Hurontario Street,43.647471,-79.708893
13520,2261,Sudbury & District Health Unit,1300 Paris Street,46.466092,-80.998059
13521,3895,Toronto Public Health,"277 Victoria Street, 5th Floor",43.656591,-79.379358
13522,2253,Peel Public Health,7120 Hurontario Street,43.647471,-79.708893


In [8]:
# Keep the first occurrence of each distinct row (all columns compared).
# The result is rebound instead of using inplace=True so the frame object
# displayed by the previous cell is not mutated behind the reader's back.
cleaned_cases_df = cleaned_cases_df.drop_duplicates(keep='first')

In [9]:
# Display the de-duplicated frame: one row per distinct combination of the remaining columns
cleaned_cases_df

Unnamed: 0,PHU_id,Reporting_PHU,Reporting_PHU_Address,Reporting_PHU_Latitude,Reporting_PHU_Longitude
0,3895,Toronto Public Health,"277 Victoria Street, 5th Floor",43.656591,-79.379358
2,2246,Niagara Region Public Health Department,1815 Sir Isaac Brock Way,43.116537,-79.24122
5,2253,Peel Public Health,7120 Hurontario Street,43.647471,-79.708893
6,2270,York Region Public Health Services,17250 Yonge Street,44.048023,-79.480239
7,2265,"Region of Waterloo, Public Health",99 Regina Street South,43.462876,-80.520913
8,2251,Ottawa Public Health,100 Constellation Drive,45.345665,-75.763912
10,2236,Halton Region Health Department,1151 Bronte Road,43.413997,-79.744796
13,2230,Durham Region Health Department,605 Rossland Road East,43.898605,-78.940341
16,2266,Wellington-Dufferin-Guelph Public Health,160 Chancellors Way,43.524881,-80.233743
19,2268,Windsor-Essex County Health Unit,1005 Ouellette Avenue,42.308796,-83.03367


In [10]:
# Save the PHU location table to CSV.
# index=False: after drop_duplicates the row labels are just leftover
# positions from the case table (0, 2, 5, ...). Writing them would add an
# unnamed leading column that shows up as 'Unnamed: 0' when the file is
# read back with pd.read_csv.
cleaned_cases_df.to_csv('../Datasource/PHU_locations.csv', index=False)