<a href="https://colab.research.google.com/github/w-oke/covid_reproduction/blob/main/covid_google_2_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Google COVID-19 public datasets / BigQuery

Google is curating and making available a set of "[COVID-19 public datasets](https://cloud.google.com/blog/products/data-analytics/publicly-available-covid-19-data-for-analytics)" that include global data about the COVID-19 pandemic. The data, their ETL code, and information about sources are available in a [GitHub repository](https://github.com/GoogleCloudPlatform/covid-19-open-data/).


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import urllib.request
import pickle

In [2]:
# The Google dataset has no R number, so the test positivity rate
# (new_confirmed / new_tested) may be used in its place.
# Most of the data will still have to be normalized and scaled.
df_link = (
    'https://github.com/w-oke/covid_reproduction/raw/main/'
    'covid_google_df.parquet'
)
df = pd.read_parquet(path=df_link)

In [3]:
# Print a structural summary of the frame: index range, column names,
# non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 84040 entries, 0 to 84732
Data columns (total 52 columns):
 #   Column                                 Non-Null Count  Dtype         
---  ------                                 --------------  -----         
 0   new_tested                             84040 non-null  int64         
 1   new_confirmed                          84040 non-null  int64         
 2   location_key                           84040 non-null  object        
 3   place_id                               84040 non-null  object        
 4   wikidata_id                            84040 non-null  object        
 5   country_code                           84040 non-null  object        
 6   subregion1_code                        36431 non-null  object        
 7   subregion1_name                        36431 non-null  object        
 8   date                                   84040 non-null  datetime64[ns]
 9   population                             84040 non-null  float6

In [4]:
# Preview the first rows to sanity-check column names and value formats.
df.head()

Unnamed: 0,new_tested,new_confirmed,location_key,place_id,wikidata_id,country_code,subregion1_code,subregion1_name,date,population,population_age_00_09,population_age_10_19,population_age_20_29,population_age_30_39,population_age_40_49,population_age_50_59,population_age_60_69,population_age_70_79,population_age_80_and_older,area_sq_km,cumulative_persons_vaccinated,cumulative_persons_fully_vaccinated,cumulative_vaccine_doses_administered,mobility_retail_and_recreation,mobility_grocery_and_pharmacy,mobility_parks,mobility_transit_stations,mobility_workplaces,mobility_residential,stringency_index,average_temperature_celsius,rainfall_mm,snowfall_mm,school_closing,workplace_closing,cancel_public_events,restrictions_on_gatherings,public_transport_closing,stay_at_home_requirements,restrictions_on_internal_movement,international_travel_controls,income_support,debt_relief,fiscal_measures,international_support,public_information_campaigns,testing_policy,contact_tracing,emergency_investment_in_healthcare,investment_in_vaccines,facial_coverings,vaccination_policy
0,992,49,BA,ChIJ16k3xxWiSxMRDOm3QwPi920,Q225,BA,,,2020-04-26,3280815.0,295212.0,346275.0,403272.0,458385.0,447738.0,500182.0,463795.0,242498.0,123458.0,51210.0,,,,-66,-34,-10,-49,-40,9,90.74,13.027778,0.072571,,3,3.0,2.0,4.0,2.0,2.0,2.0,3.0,1.0,1.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,3.0,0.0
1,2781,501,BA,ChIJ16k3xxWiSxMRDOm3QwPi920,Q225,BA,,,2020-12-31,3280815.0,295212.0,346275.0,403272.0,458385.0,447738.0,500182.0,463795.0,242498.0,123458.0,51210.0,,,,-3,48,29,-5,-28,0,42.59,2.888889,24.60625,25.4,1,2.0,1.0,3.0,0.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,3.0,0.0
2,2451,279,BA,ChIJ16k3xxWiSxMRDOm3QwPi920,Q225,BA,,,2021-02-11,3280815.0,295212.0,346275.0,403272.0,458385.0,447738.0,500182.0,463795.0,242498.0,123458.0,51210.0,0.0,,0.0,-17,5,-25,-23,-11,-5,42.59,,,,1,2.0,1.0,3.0,0.0,2.0,0.0,1.0,1.0,1.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,3.0,1.0
3,3596,1077,BA,ChIJ16k3xxWiSxMRDOm3QwPi920,Q225,BA,,,2020-12-07,3280815.0,295212.0,346275.0,403272.0,458385.0,447738.0,500182.0,463795.0,242498.0,123458.0,51210.0,,,,-20,-5,-22,-22,-7,1,50.0,8.0,7.239,,2,1.0,2.0,3.0,0.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,3.0,0.0
4,582,290,BA,ChIJ16k3xxWiSxMRDOm3QwPi920,Q225,BA,,,2020-09-10,3280815.0,295212.0,346275.0,403272.0,458385.0,447738.0,500182.0,463795.0,242498.0,123458.0,51210.0,,,,-3,9,38,1,-17,-4,40.74,20.677778,0.0,,1,1.0,2.0,3.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,3.0,0.0


In [5]:
# Download the pickled dictionary that groups the column names of df by kind.
# NOTE(security): unpickling can execute arbitrary code; this is only
# acceptable while the URL points at a repository that is trusted.
# NOTE: the name `vars` shadows the Python builtin of the same name; it is
# kept because the following cells refer to the dictionary by this name.
var_link = 'https://github.com/w-oke/covid_reproduction/raw/main/covid_google_var_dictionary.pkl'

with urllib.request.urlopen(var_link) as response:
    payload = response.read()

vars = pickle.loads(payload)

In [6]:
# List the group names under which the column names are organised.
vars.keys()

dict_keys(['y', 'region', 'date', 'population', 'string', 'float', 'rating'])

In [7]:
# Display the full mapping of group name -> list of column names.
vars

{'date': ['date'],
 'float': ['stringency_index',
  'average_temperature_celsius',
  'rainfall_mm',
  'snowfall_mm'],
 'population': ['population',
  'population_age_00_09',
  'population_age_10_19',
  'population_age_20_29',
  'population_age_30_39',
  'population_age_40_49',
  'population_age_50_59',
  'population_age_60_69',
  'population_age_70_79',
  'population_age_80_and_older',
  'area_sq_km',
  'cumulative_persons_vaccinated',
  'cumulative_persons_fully_vaccinated',
  'cumulative_vaccine_doses_administered'],
 'rating': ['school_closing',
  'workplace_closing',
  'cancel_public_events',
  'restrictions_on_gatherings',
  'public_transport_closing',
  'stay_at_home_requirements',
  'restrictions_on_internal_movement',
  'international_travel_controls',
  'income_support',
  'debt_relief',
  'fiscal_measures',
  'international_support',
  'public_information_campaigns',
  'testing_policy',
  'contact_tracing',
  'emergency_investment_in_healthcare',
  'investment_in_vaccines',
 

In [None]:
# Feature generation
#
# Derive rate features from the raw counts:
#   positive_test_rate_tested     - confirmed cases per test performed
#   positive_test_rate_population - confirmed cases per inhabitant
#   test_rate_population          - tests performed per inhabitant

# Rows with zero tests would otherwise produce inf (or NaN for 0/0), which
# would distort later normalization/scaling; treat the denominator as
# missing so the resulting rate is NaN instead.
tested_nonzero = df['new_tested'].replace(0, np.nan)

df['positive_test_rate_tested'] = df['new_confirmed'] / tested_nonzero
# NOTE(review): population is assumed to be strictly positive — confirm.
df['positive_test_rate_population'] = df['new_confirmed'] / df['population']
df['test_rate_population'] = df['new_tested'] / df['population']