# Collecting and cleaning the data in the Bayesian project

In [1]:
import numpy as np
import pandas as pd
import datetime
import os

## Parameters which should be set manually

In [2]:
# Regions under study. Every list below is index-aligned with this one:
# entry i of each list belongs to regions[i].
regions = [
    'Pirkanmaa',
    'Uusimaa',
    'Southwest Finland',
    'Skåne County',
    'Stockholm County',
    'Västra Götaland County',
    'Oslo',
    'Vestland',
    'Viken',
]

# Date of the first weekly sample of the investigated period, per region.
start_dates = [
    datetime.datetime(2020, 9, 27),   # Pirkanmaa
    datetime.datetime(2020, 9, 13),   # Uusimaa
    datetime.datetime(2020, 9, 27),   # Southwest Finland
    datetime.datetime(2020, 10, 11),  # Skåne County
    datetime.datetime(2020, 9, 13),   # Stockholm County
    datetime.datetime(2020, 10, 18),  # Västra Götaland County
    datetime.datetime(2020, 10, 18),  # Oslo
    datetime.datetime(2020, 10, 18),  # Vestland
    datetime.datetime(2020, 10, 18),  # Viken
]

# Date of the last week of the investigated period, per region.
end_dates = [
    datetime.datetime(2020, 11, 29),  # Pirkanmaa
    datetime.datetime(2020, 11, 15),  # Uusimaa
    datetime.datetime(2020, 11, 29),  # Southwest Finland
    datetime.datetime(2020, 12, 13),  # Skåne County
    datetime.datetime(2020, 11, 15),  # Stockholm County
    datetime.datetime(2020, 12, 20),  # Västra Götaland County
    datetime.datetime(2020, 12, 20),  # Oslo
    datetime.datetime(2020, 12, 20),  # Vestland
    datetime.datetime(2020, 12, 20),  # Viken
]

# Weekly infection counts: one inner list per region, one entry per week
# counted from that region's start date.
infections = [
    [39, 94, 68, 62, 94, 67, 55, 117, 210, 223],                   # Pirkanmaa
    [161, 374, 522, 733, 708, 689, 717, 856, 922, 1428],           # Uusimaa
    [41, 80, 87, 76, 146, 158, 146, 117, 135, 247],                # Southwest Finland
    [486, 1021, 1682, 2706, 3857, 3799, 5325, 5420, 8683, 9580],   # Skåne County
    [380, 757, 1097, 1326, 1567, 2020, 3679, 6570, 8853, 9117],    # Stockholm County
    [892, 2137, 3121, 4481, 3930, 4523, 4511, 5433, 6249, 9691],   # Västra Götaland County
    [327, 643, 934, 1008, 1315, 973, 784, 677, 651, 648],          # Oslo
    [185, 392, 646, 742, 637, 251, 139, 93, 91, 182],              # Vestland
    [225, 701, 1091, 1296, 1323, 1348, 1118, 1203, 1143, 1095],    # Viken
]

# Population of each region.
populations = [
    515095,   # Pirkanmaa
    1671000,  # Uusimaa
    478582,   # Southwest Finland
    1340000,  # Skåne County
    2344000,  # Stockholm County
    1710000,  # Västra Götaland County
    634293,   # Oslo
    636500,   # Vestland
    1213354,  # Viken
]

# Length in days of the rolling-mean window applied to the traffic data;
# also the spacing in days between two consecutive samples.
w = 7

## Dataframes

In [3]:
# Column schema of the per-region table. It is only used to create an empty
# dataframe whose columns already carry the intended names and dtypes.
dtypes_regions = np.dtype([
          ('region', str),                                       # region name
          ('population', int),                                   # population of the region
          # The unit is spelled out: pandas stores timestamps as datetime64[ns]
          # and rejects a unit-less datetime64 dtype in e.g. Series(dtype=...).
          ('critical_week_start_infections', 'datetime64[ns]'),  # sunday of the week
          ('critical_week_end_infections', 'datetime64[ns]')     # sunday of the week
    ])

# Zero-length structured array -> empty dataframe with one column per field.
data_regions = np.empty(0, dtype=dtypes_regions)
df_regions = pd.DataFrame(data_regions)

In [4]:
# Column schema of the weekly table (one row per region and sampled week).
# It is only used to create an empty dataframe with the intended columns.
# Each traffic_* column holds the w-day rolling mean of the corresponding
# mobility series, truncated to an integer, minus the value of the region's
# first sampled week, divided by 100.
dtypes_investigated_time_period = np.dtype([
          ('region', str),  # region name
          # Date of the weekly sample: start date + a multiple of w days.
          # The unit is spelled out because pandas stores datetime64[ns] and
          # rejects a unit-less datetime64 dtype in e.g. Series(dtype=...).
          ('last_sunday_of_the_week', 'datetime64[ns]'),
          ('infections_week_sum', int), # infections reported for that week
          ('traffic_retail_average', float), # retail and recreation traffic
          ('traffic_supermarket_average', float), # grocery and pharmacy traffic
          ('traffic_parks_average', float),  # park traffic
          ('traffic_transit_stations_average', float), # transit station traffic
          ('traffic_workplaces_average', float), # workplace traffic
          ('traffic_residential_average', float), # residential traffic
    ])

# Zero-length structured array -> empty dataframe with one column per field.
data_investigated_time_period = np.empty(0, dtype=dtypes_investigated_time_period)
df_investigated_time_period = pd.DataFrame(data_investigated_time_period)

## Download different datasets and fill dataframes

### Update df_regions

In [5]:
# Number of configured regions; used as the loop bound when filling the dataframes.
num_regions = len(regions)

def append_df_regions(df_regions, region, start_date, end_date, population):
    """Return a copy of ``df_regions`` extended by one row for ``region``.

    The input dataframe is not modified.

    Parameters
    ----------
    df_regions : pandas.DataFrame
        Table to extend.
    region : str
        Value for the ``region`` column.
    start_date, end_date : datetime.datetime
        Values for ``critical_week_start_infections`` and
        ``critical_week_end_infections``.
    population : int
        Value for the ``population`` column.

    Returns
    -------
    pandas.DataFrame
        New dataframe with a fresh 0..n-1 index and the extra row last.
    """
    new_row = pd.DataFrame([{
        'region': region,
        'population': population,
        'critical_week_start_infections': start_date,
        'critical_week_end_infections': end_date,
    }])

    if len(df_regions.index) == 0:
        # Nothing to concatenate with: keep the declared column order (plus
        # any column only the new row has) and let the row define the dtypes.
        columns = list(df_regions.columns)
        columns += [col for col in new_row.columns if col not in columns]
        return new_row.reindex(columns=columns)

    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    return pd.concat([df_regions, new_row], ignore_index=True)


# Add one row per configured region (the parameter lists are index-aligned),
# then display the resulting table.
for name, first_week, last_week, inhabitants in zip(regions, start_dates, end_dates, populations):
    df_regions = append_df_regions(df_regions, name, first_week, last_week, inhabitants)

df_regions

Unnamed: 0,region,population,critical_week_start_infections,critical_week_end_infections
0,Pirkanmaa,515095,2020-09-27,2020-11-29
1,Uusimaa,1671000,2020-09-13,2020-11-15
2,Southwest Finland,478582,2020-09-27,2020-11-29
3,Skåne County,1340000,2020-10-11,2020-12-13
4,Stockholm County,2344000,2020-09-13,2020-11-15
5,Västra Götaland County,1710000,2020-10-18,2020-12-20
6,Oslo,634293,2020-10-18,2020-12-20
7,Vestland,636500,2020-10-18,2020-12-20
8,Viken,1213354,2020-10-18,2020-12-20


### Update df_investigated_time_period

In [6]:
# Mobility-report column -> name of the smoothed, normalised column built from it.
_TRAFFIC_COLUMNS = {
    'retail_and_recreation_percent_change_from_baseline': 'traffic_retail_average',
    'grocery_and_pharmacy_percent_change_from_baseline': 'traffic_supermarket_average',
    'parks_percent_change_from_baseline': 'traffic_parks_average',
    'transit_stations_percent_change_from_baseline': 'traffic_transit_stations_average',
    'workplaces_percent_change_from_baseline': 'traffic_workplaces_average',
    'residential_percent_change_from_baseline': 'traffic_residential_average',
}

_MOBILITY_REPORT_URL = 'https://www.gstatic.com/covid19/mobility/Global_Mobility_Report.csv'

# url -> downloaded dataframe, so the report is fetched once per session
# instead of once per region.
_mobility_report_cache = {}


def _load_mobility_report(url=_MOBILITY_REPORT_URL):
    """Return the mobility report at ``url``, downloading it on first use only."""
    if url not in _mobility_report_cache:
        # low_memory=False parses each column in one pass, which avoids the
        # chunk-wise dtype inference behind pandas' mixed-type DtypeWarning.
        _mobility_report_cache[url] = pd.read_csv(url, low_memory=False)
    return _mobility_report_cache[url]


def append_df_investigated_time_period(df_investigated_time_period, region, start_date, end_date,
                                       infections, w, mobility_data=None):
    """Return the weekly table extended by one row per entry of ``infections``.

    Row ``j`` describes the date ``start_date + j * w`` days. Each traffic
    column holds the ``w``-day rolling mean of the matching mobility series,
    truncated to an integer, minus the value of the first sampled date,
    divided by 100. The input dataframe is not modified.

    Parameters
    ----------
    df_investigated_time_period : pandas.DataFrame
        Table to extend.
    region : str
        Value matched against the report's ``sub_region_1`` column; only rows
        without a ``sub_region_2`` value are used.
    start_date : datetime.datetime
        Date of the first sample.
    end_date : datetime.datetime
        Not used; kept so existing callers keep working. The covered period is
        determined by ``start_date``, ``w`` and ``len(infections)``.
    infections : sequence of int
        Weekly infection counts, one per sample.
    w : int
        Rolling-window length in days and spacing in days between samples.
    mobility_data : pandas.DataFrame, optional
        Already loaded mobility report (``date`` as 'YYYY-MM-DD' strings).
        When omitted, the report is downloaded once and cached.

    Returns
    -------
    pandas.DataFrame
        The extended table; ``df_investigated_time_period`` itself when
        ``infections`` is empty.

    Raises
    ------
    ValueError
        If the report has no rows for ``region`` or fewer samples than
        ``len(infections)`` in the requested period.
    """
    len_interval = len(infections)
    if len_interval == 0:
        return df_investigated_time_period
    dates = [start_date + datetime.timedelta(days=w * x) for x in range(len_interval)]

    if mobility_data is None:
        mobility_data = _load_mobility_report()

    # Work on a copy so the new columns never end up in the shared report and
    # no chained-assignment warning is raised.
    is_region = mobility_data['sub_region_1'] == region
    has_no_sub_region = mobility_data['sub_region_2'].isnull()
    data = mobility_data[is_region & has_no_sub_region].copy()
    if data.empty:
        raise ValueError('no mobility data found for region {!r}'.format(region))

    for source_column, target_column in _TRAFFIC_COLUMNS.items():
        smoothed = data[source_column].rolling(window=w).mean()
        # The first w-1 rolling means are NaN (as is any window containing a
        # gap): fill forward, then backward. np.int64 truncates toward zero,
        # which keeps the results identical to earlier runs.
        data[target_column] = smoothed.ffill().bfill().apply(np.int64)

    # Dates are compared as 'YYYY-MM-DD' strings, which sort chronologically.
    first_day = start_date.strftime('%Y-%m-%d')
    last_day = dates[-1].strftime('%Y-%m-%d')
    in_period = (data['date'] >= first_day) & (data['date'] <= last_day)
    # Every w-th row, starting at the first row of the period.
    samples = data[in_period].iloc[::w, :]
    if len(samples) < len_interval:
        raise ValueError(
            'mobility data for region {!r} has {} samples between {} and {}, expected {}'.format(
                region, len(samples), first_day, last_day, len_interval))

    # Express every series relative to its first sample, as a fraction.
    relative_traffic = {}
    for target_column in _TRAFFIC_COLUMNS.values():
        values = samples[target_column].tolist()
        relative_traffic[target_column] = [(x - values[0]) / 100 for x in values]

    rows = []
    for j, (date, infections_week_sum) in enumerate(zip(dates, infections)):
        row = {
            'region': region,
            'last_sunday_of_the_week': date,
            'infections_week_sum': infections_week_sum,
        }
        for target_column in _TRAFFIC_COLUMNS.values():
            row[target_column] = relative_traffic[target_column][j]
        rows.append(row)
    new_rows = pd.DataFrame(rows)

    if len(df_investigated_time_period.index) == 0:
        # Nothing to concatenate with: keep the declared column order (plus
        # any column only the new rows have).
        columns = list(df_investigated_time_period.columns)
        columns += [col for col in new_rows.columns if col not in columns]
        return new_rows.reindex(columns=columns)

    # One concat per call replaces the per-row DataFrame.append, which was
    # removed in pandas 2.0.
    return pd.concat([df_investigated_time_period, new_rows], ignore_index=True)

# Add the weekly rows of every configured region (the parameter lists are
# index-aligned), then display the resulting table.
for name, first_week, last_week, weekly_counts in zip(regions, start_dates, end_dates, infections):
    df_investigated_time_period = append_df_investigated_time_period(
        df_investigated_time_period, name, first_week, last_week, weekly_counts, w)

df_investigated_time_period

  if (await self.run_code(code, result,  async_=asy)):


Unnamed: 0,region,last_sunday_of_the_week,infections_week_sum,traffic_retail_average,traffic_supermarket_average,traffic_parks_average,traffic_transit_stations_average,traffic_workplaces_average,traffic_residential_average
0,Pirkanmaa,2020-09-27,39,0.00,0.00,0.00,0.00,0.00,0.00
1,Pirkanmaa,2020-10-04,94,0.01,0.01,-0.31,-0.03,0.00,0.01
2,Pirkanmaa,2020-10-11,68,-0.02,-0.01,-0.57,-0.06,0.00,0.02
3,Pirkanmaa,2020-10-18,62,0.00,-0.02,-0.36,-0.11,-0.19,0.04
4,Pirkanmaa,2020-10-25,94,-0.03,-0.02,-0.86,-0.08,0.00,0.04
...,...,...,...,...,...,...,...,...,...
85,Viken,2020-11-22,1348,-0.12,-0.05,-0.53,-0.15,-0.08,0.06
86,Viken,2020-11-29,1118,-0.04,-0.02,-0.52,-0.15,-0.08,0.06
87,Viken,2020-12-06,1203,-0.04,0.07,-0.70,-0.15,-0.06,0.06
88,Viken,2020-12-13,1143,0.01,0.15,-0.75,-0.14,-0.05,0.05


## Export the created dataframes

In [7]:
# Write the per-region table to CSV without the row index.
# Make sure the directory below is the one the files should be exported to.
output_file = os.path.join(
    '~/Documents/Covid19TrafficAnalysis',
    'DfRegions.csv',
)
df_regions.to_csv(path_or_buf=output_file, index=False)

In [8]:
# Write the weekly table to CSV without the row index.
output_file = os.path.join(
    '~/Documents/Covid19TrafficAnalysis',
    'DfInvestigatedTimePeriod.csv',
)
df_investigated_time_period.to_csv(path_or_buf=output_file, index=False)