In [1]:
import pandas as pd
import numpy as np

from datetime import datetime, timedelta

In [2]:
# Load the raw case file and keep only the rows used by this notebook.
cases = (
    pd.read_csv("conposcovidloc.csv")
    # Episode dates from 2020-07-01 (inclusive) up to 2020-11-01 (exclusive).
    # NOTE(review): the comparison is done on the values as read from the CSV
    # (no date parsing), so it presumably relies on ISO 'YYYY-MM-DD' strings
    # in this column -- confirm against the file.
    .loc[
        lambda frame: (frame['Accurate_Episode_Date'] >= '2020-07-01')
        & (frame['Accurate_Episode_Date'] < '2020-11-01')
    ]
    # Restrict to the reporting public-health-unit cities of interest.
    .loc[
        lambda frame: frame['Reporting_PHU_City'].isin(
            ['Mississauga', 'Oakville', 'Newmarket', 'Whitby', 'Toronto', 'Ottawa']
        )
    ]
)

### Create the dimension for Patient

#### Subset the columns for Patient 

In [3]:
# Build the patient dimension: pick the patient-level columns from `cases`,
# renumber the rows from 0, and switch to the dimension's column names.
Patient_dimension = (
    cases[['Row_ID', 'Client_Gender', 'Age_Group', 'Case_AcquisitionInfo', 'Outbreak_Related']]
    .reset_index(drop=True)
    .rename(
        columns={
            'Row_ID': 'Patient_ID',
            'Client_Gender': 'Gender',
            'Case_AcquisitionInfo': 'Acquisition_Group',
            # 'Age_Group' and 'Outbreak_Related' keep their names.
        }
    )
)

#### Make a community spread class in Acquisition Group

In [4]:
 # A ‘case with an epidemiological link’ is a case that has either been exposed to a confirmed case, 
 # or has had the same exposure as a confirmed case (e.g. eaten the same food, stayed in the same hotel, etc).
 # https://deputyprimeminister.gov.mt/en/health-promotion/idpcu/Pages/casedefinition.aspx
# Fold the two "no identified link" categories into a single community-spread
# class ('CS').
# The result is assigned back to the column rather than calling
# replace(..., inplace=True) on the selected Series: that chained in-place
# form triggers a warning on recent pandas 2.x releases and, with
# Copy-on-Write enabled, leaves Patient_dimension unmodified.
Patient_dimension['Acquisition_Group'] = Patient_dimension['Acquisition_Group'].replace(
    {'NO KNOWN EPI LINK': 'CS', 'UNSPECIFIED EPI LINK': 'CS'}
)
# Display the remaining categories as a quick sanity check.
Patient_dimension.Acquisition_Group.unique()

array(['CC', 'OB', 'CS', 'TRAVEL', 'MISSING INFORMATION'], dtype=object)

#### Make Outbreak_Related column boolean and remove NaN values

In [5]:
# Encode Outbreak_Related as a single integer flag: 1 where the source value is
# 'Yes', 0 otherwise (missing values included).
# One assignment with one dtype avoids two problems:
#   * mixing the string '1' (for 'Yes') with the integer 0 (for NaN) in the
#     same column;
#   * chained in-place calls on a column selection, which do not update the
#     DataFrame when pandas Copy-on-Write is enabled.
# NOTE(review): any non-null value other than 'Yes' is also mapped to 0 --
# confirm the source column only ever holds 'Yes' or NaN.
Patient_dimension['Outbreak_Related'] = (
    Patient_dimension['Outbreak_Related'].eq('Yes').astype(int)
)

In [6]:
# Prepend a surrogate key column numbered 0..n-1 in the current row order.
Patient_dimension.insert(
    loc=0,
    column="Patient_surrogate_key",
    value=np.arange(len(Patient_dimension)),
)

In [7]:
# Write the patient dimension to disk with a header row and without the
# DataFrame index.
Patient_dimension.to_csv(
    'Patient_dimension.csv',
    header=True,
    index=False,
)


### Create the PHU dimension

#### Subset the columns for PHU_Location

In [8]:
# Build the PHU location dimension: take the reporting-PHU columns from
# `cases`, keep the first occurrence of each distinct row (renumbered from 0),
# and switch to the dimension's column names.
PHU_Location_dimension = (
    cases[
        [
            'Reporting_PHU_ID',
            'Reporting_PHU',
            'Reporting_PHU_Address',
            'Reporting_PHU_City',
            'Reporting_PHU_Postal_Code',
            'Reporting_PHU_Website',
            'Reporting_PHU_Latitude',
            'Reporting_PHU_Longitude',
        ]
    ]
    .drop_duplicates(ignore_index=True)
    .rename(
        columns={
            'Reporting_PHU_ID': 'PHU_Location_ID',
            'Reporting_PHU': 'PHU_Name',
            'Reporting_PHU_Address': 'Address',
            'Reporting_PHU_City': 'City',
            'Reporting_PHU_Postal_Code': 'Postal_Code',
            'Reporting_PHU_Website': 'URL',
            'Reporting_PHU_Latitude': 'Latitude',
            'Reporting_PHU_Longitude': 'Longitude',
        }
    )
)

#### Add Province and PHU_Location_surrogate_key

In [9]:
# Add a constant Province column at position 5 (directly after Postal_Code in
# the column order set when the dimension was built).
PHU_Location_dimension.insert(loc=5, column='Province', value='ON')

# Prepend a surrogate key column numbered 0..n-1 in the current row order.
PHU_Location_dimension.insert(
    loc=0,
    column="PHU_Location_surrogate_key",
    value=np.arange(len(PHU_Location_dimension)),
)

In [10]:
# Write the PHU location dimension to disk with a header row and without the
# DataFrame index.
PHU_Location_dimension.to_csv(
    'PHU_Location_dimension.csv',
    header=True,
    index=False,
)