In [1]:
import re

import numpy as np
import pandas as pd
import requests
import seaborn as sns
from bs4 import BeautifulSoup

### VARIATION IN GOVERNMENT RESPONSES TO COVID-19


The authors introduce the Oxford COVID-19 Government Response Tracker (OxCGRT), providing a systematic way to track the stringency of government responses to COVID-19 across countries and time. 

 More details at https://www.bsg.ox.ac.uk/research/publications/variation-government-responses-covid-19

In [2]:
# Display options: render up to 10000 rows and up to 150 characters per
# column before pandas truncates the output.
pd.options.display.max_rows = 10000
pd.options.display.max_colwidth = 150

# Load the OxCGRT spreadsheet and preview its first rows.
# An earlier run recorded data.shape as (10561, 35).
data = pd.read_excel(io='data/OxCGRT_Download_latest_data (1).xlsx')
data.head(n=5)

KeyboardInterrupt: 

In [None]:
# List the column labels of the loaded frame.
data.columns

In [None]:
# Which countries do we have?
# Compute the distinct names once, report how many there are, and leave the
# array as the cell's last expression so that both pieces are shown
# (previously the array was computed twice and only the count was displayed).
country_names = data.CountryName.unique()
print(len(country_names), 'distinct countries')
country_names

In [None]:
# Sort newest-first by Date (the latest date at the time of writing was 20200330).
# drop=True discards the old index instead of inserting it as an extra
# 'index' column: that column would leak into the later groupby
# aggregations, and re-running the cell would insert further index columns.
data = data.sort_values('Date', ascending=False).reset_index(drop=True)
data.head(20)
# The first Country to adopt

In [None]:
# Country analysis: number of records per country, largest first.
# (The bare `data.CountryName.unique()` that used to open this cell was
# removed: its value was discarded, so it had no effect.)
data_country = data.groupby(['CountryName'])['CountryName'].count()
data_country.sort_values(ascending=False)

## Data Cleaning

In [None]:
# Checking NAs
# (an earlier note recorded data.shape as 1036x16 here)
# Missing values per column; printed explicitly because a cell only
# auto-displays its last expression.
print(data.isna().sum())
# Heatmap of the missing cells. The trailing semicolon hides the Axes repr.
sns.heatmap(data.isnull(), cbar=False);

In [None]:
# Remove the eleven S<n>_Notes columns, which this notebook treats as empty.
empty_cols = [
    'S1_Notes', 'S2_Notes', 'S3_Notes', 'S4_Notes',
    'S5_Notes', 'S6_Notes', 'S7_Notes', 'S8_Notes',
    'S9_Notes', 'S10_Notes', 'S11_Notes',
]

data = data.drop(columns=empty_cols)
data.shape

In [None]:
# Drop (nearly) empty rows: keep only rows holding at least 2 non-NA values.
# Alternative considered earlier: data.dropna(axis=0, how='all').
# dropna returns a new frame, so the result must be assigned back;
# otherwise data.shape below still reports the unfiltered frame.
data = data.dropna(thresh=2)
data.shape

In [None]:
# TODO: how to remove repeated data rows and keep only the latest index?

## Stringency Index

In [None]:
# Per-country maximum of every column, ranked by the highest StringencyIndex.
# The sort has to be applied to the grouped frame: sorting `data` again
# would overwrite the aggregation with the raw rows.
stringency = data.groupby(['CountryName']).max()
stringency = stringency.sort_values(by=['StringencyIndex'], ascending=False)
stringency.head()

# Open questions:
# - mask for "is general"? can Brazil still be included?
# - mask for stringency > 90
# - how to keep each country's most recent update date?
# - convert Date into a proper date type?

# np.where returns T or F

In [None]:
# Latest known figures per country, ranked by ConfirmedCases.
# Summing added up every daily row of a country, which the original note on
# this cell flagged as wrong ("confirmed cases get summed; how to keep the
# last value?"). Instead, order chronologically and take the last non-null
# value of each column within each country.
# NOTE(review): assumes ConfirmedCases is a running total per country — confirm.
country_cases = data.sort_values('Date').groupby(['CountryName']).last()
country_cases = country_cases.sort_values(by=['ConfirmedCases'], ascending=False)
country_cases.head()

In [None]:
# TODO: drop the columns with confirmed cases, deaths, etc.; keep only the countries and the stringency index

# Population

In [None]:
# url = 'https://pt.wikipedia.org/wiki/Lista_de_pa%C3%ADses_por_popula%C3%A7%C3%A3o'
# pop = pd.read_csv(url)

## Web scraping

In [None]:
# Some insights
# does population density affect the spread of the virus?
# evolution of the number of cases vs. measures taken

In [None]:
# Web scraping: download the Wikipedia list of the world's most populous cities.
# (`re` is imported in the notebook's first cell, not here.)
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
url = 'https://pt.wikipedia.org/wiki/Lista_das_cidades_mais_populosas_do_mundo'
# The timeout keeps the cell from hanging forever on a stalled connection;
# raise_for_status() surfaces HTTP errors instead of letting an error page
# be parsed as if it were the article.
page = requests.get(url, headers=headers, timeout=30)
page.raise_for_status()
response = page.content
# Preview only the first bytes rather than dumping the whole document.
response[:500]

In [None]:
# Parse the downloaded HTML using the lxml parser.
soup = BeautifulSoup(response, 'lxml')
soup

In [None]:
# Select the table element: find_all returns a list of matches, so take the
# first one to get the table itself.
matching_tables = soup.find_all('table', class_='sortable wikitable')
table = matching_tables[0]
table

In [None]:
# Get the rows of the table (<tr>).
# The lookup previously collected <td> cells, which contradicted this cell's
# stated intent and the later `rows[1:]` slice that skips a single leading
# entry (NOTE(review): presumably the header row — confirm against the page).
rows = table.find_all('tr')
rows

In [None]:
# a href= reference
# a class="image" = image
# span style= pop

In [None]:
# Strip layout characters from every entry after the first: newlines become
# spaces and non-breaking spaces (\xa0) are removed.
# Author's note: `.text` calls get_text().
cleaned_rows = []
for entry in rows[1:]:
    cleaned_rows.append(entry.text.replace('\n', ' ').replace('\xa0', ''))
rows = cleaned_rows
rows
# still to handle: ' ', [1], &00000000, the trailing numbers, and stripping left zeros
# extract only the cities I want [20]

In [None]:
# Collect the letters of every entry, one character per list element.
# [^\W\d_] matches any Unicode letter, so accented characters (expected on
# this Portuguese-language page) are kept; [a-zA-Z] silently dropped them.
countries = [re.findall(r'[^\W\d_]', row) for row in rows]
countries

In [None]:
# Glue each list of single characters back into one string per entry.
countries = list(map(''.join, countries))
countries
# dict_countries = {countries[idx] : float(row [-5:]) for idx, row in enumerate(rows)}
# dict_countries

In [None]:
# pandas is already imported as `pd` in the notebook's first cell, so it is
# not re-imported here.
# NOTE(review): in the visible cells dict_countries is only assigned inside
# commented-out code, so this cell raises NameError until that mapping is
# actually built.
df = pd.DataFrame.from_dict(dict_countries, orient='index')
df