In [1]:
%matplotlib inline
from io import StringIO

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests as req
import seaborn as sns
from scipy.stats import ttest_ind, ttest_rel

# Print NumPy floating-point values with three digits of precision.
np.set_printoptions(precision=3)


In [2]:

# Portuguese Wikipedia page listing the Brazilian federative units by IDH (HDI).
url = 'http://pt.wikipedia.org/wiki/Lista_de_unidades_federativas_do_Brasil_por_IDH'


# In[3]:

# Bound the wait and fail loudly on an HTTP error status, so that an error
# page is never handed to the HTML table parser below.
response = req.get(url, timeout=30)
response.raise_for_status()
html_text = response.text


# In[4]:

# First table carrying the "wikitable" CSS class. The markup is wrapped in
# StringIO because passing a literal HTML string to read_html is deprecated.
table = pd.read_html(StringIO(html_text), attrs={"class":"wikitable"})[0]


# In[5]:

def idh_format(str):
    """Convert an index expressed in thousandths into a fraction.

    Parameters
    ----------
    str : str or number
        Any value accepted by ``float()``, for example ``"824"``.
        The parameter name shadows the builtin ``str``; it is kept unchanged
        so that keyword callers keep working.

    Returns
    -------
    float
        ``float(str)`` divided by 1000.
    """
    return float(str) / 1000.0


# ### Pre-processing the IDH-M data

# In[6]:

"""
  0,800 – 1 (Muito alto) - idh_level = 0
  0,700 - 0,799 (Alto)   - idh_level = 0
  0,600 - 0,699 (Médio)  - idh_level = 1
  0,500 - 0,599 (Baixo)  - idh_level = 2
  0 - 0,499 (Muito baixo)- idh_level = 3
"""
def idh_level(x):
    if x >= 0.7:
        return 0
    elif 0.6 <= x < 0.7:
        return 1
    elif 0.5 <= x < 0.6:
        return 2
    elif 0.4 <= x < 0.5:
        return 3
    else: raise Exception("Invalid!")
    
"""
  Abaixo da mediana de 2000 = level 0
  Igual ou acima da mediana de 2000 = level 1
"""
def idh_level2(t):
    def __level(x):
        if x >= t[4][2:].apply(lambda x: float(x)).median():
            return 1
        else: return 0   
    return __level

# In[7]:

# Overall IDH-M frame. Column 2 of the scraped table is the state name,
# column 3 the 2010 value and column 4 the 2000 value; the first two rows are
# header rows. idh_format rescales each raw value by 1/1000.
idhm_df = pd.DataFrame({
    u'Estado': table[2][2:].tolist(),
    u'I2010': table[3][2:].apply(idh_format).tolist(),
    u'I2000': table[4][2:].apply(idh_format).tolist(),
})
idhm_df["Ratio"] = idhm_df["I2010"] / idhm_df["I2000"]
# The split point must come from the rescaled I2000 column. Taking the median
# of the raw table column (as idh_level2(table) does) yields a threshold 1000x
# larger than the values being compared, which puts every state in level 0.
idhm_df["idh_level_2000"] = (idhm_df["I2000"] >= idhm_df["I2000"].median()).astype(int)



# In[15]:

# Per-state party shares, one row per federative unit.
# Column 0 is the state name. The frame built from this array further down
# reads column 1 as "PT", column 2 as "PSDB" and column 3 as "Outros".
# NOTE(review): presumably each number is the fraction of the 2000-2010 period
# attributed to that party (each row sums to 1) -- confirm against the source.
# Mixing strings and floats makes np.array store every cell as a string, which
# is why the numeric columns are converted back to float64 afterwards.
st_pa = np.array([
        [u"Distrito Federal", 0.0, 0.0, 1.0],
        [u"São Paulo", 0.0, 0.925, 0.075],
        [u"Santa Catarina", 0.0, 0.0, 1.0],
        [u"Rio de Janeiro", 0.4, 0.0, 0.6],
        [u"Paraná", 0.0, 0.0, 1.0],
        [u"Rio Grande do Sul", 0.2, 0.4, 0.4],
        [u"Espírito Santo", 0.0, 0.2, 0.8],
        [u"Goiás", 0.0, 0.6, 0.4],
        [u"Minas Gerais", 0.0, 0.8, 0.2],
        [u"Mato Grosso do Sul", 0.0, 0.6, 0.4],
        [u"Mato Grosso", 0.0, 0.2, 0.8],
        [u"Amapá", 0.075, 0.0, 0.925],
        [u"Roraima", 0.275, 0.4, 0.325], # double check
        [u"Tocantins", 0.0, 0.2, 0.8], 
        [u"Rondônia", 0.0, 0.4, 0.6],
        [u"Rio Grande do Norte", 0.0, 0.0, 1.0],
        [u"Ceará", 0.6, 0.0, 0.4],
        [u"Amazonas", 0.0, 0.0, 1.0],
        [u"Pernambuco", 0.0, 0.0, 1.0],
        [u"Sergipe", 0.4, 0.2, 0.4],
        [u"Acre", 1.0, 0.0, 0.0],
        [u"Bahia", 0.4, 0.0, 0.6],
        [u"Paraíba", 0.0, 0.55, 0.45],
        [u"Piauí", 0.8, 0.0, 0.2],
        [u"Pará", 0.4, 0.6, 0.0],
        [u"Maranhão", 0.0, 0.0, 1.0],
        [u"Alagoas", 0.0, 0.0, 1.0],
       ])


# State -> region lookup, one row per federative unit:
# column 0 is the state name, column 1 the region name.
st_re = np.array([
        [u"Distrito Federal", u"Centro-Oeste"],
        [u"São Paulo", u"Sudeste"],
        [u"Santa Catarina", u"Sul"],
        [u"Rio de Janeiro", u"Sudeste"],
        [u"Paraná", u"Sul"],
        [u"Rio Grande do Sul", u"Sul"],
        [u"Espírito Santo", u"Sudeste"],
        [u"Goiás", u"Centro-Oeste"],
        [u"Minas Gerais", u"Sudeste"],
        [u"Mato Grosso do Sul", u"Centro-Oeste"],
        [u"Mato Grosso", u"Centro-Oeste"],
        [u"Amapá", u"Norte"],
        [u"Roraima", u"Norte"], # double check
        [u"Tocantins", u"Norte"], 
        [u"Rondônia", u"Norte"],
        [u"Rio Grande do Norte", u"Nordeste"],
        [u"Ceará", u"Nordeste"],
        [u"Amazonas", u"Norte"],
        [u"Pernambuco", u"Nordeste"],
        [u"Sergipe", u"Nordeste"],
        [u"Acre", u"Norte"],
        [u"Bahia", u"Nordeste"],
        [u"Paraíba", u"Nordeste"],
        [u"Piauí", u"Nordeste"],
        [u"Pará", u"Norte"],
        [u"Maranhão", u"Nordeste"],
        [u"Alagoas", u"Nordeste"],
       ])


# st_pa is a string-typed matrix: column 0 stays as the state name, while the
# three share columns are parsed back into float64.
state_parties_df = pd.DataFrame({
    "Estado": st_pa[:, 0],
    "PSDB": st_pa[:, 2].astype(np.float64),
    "PT": st_pa[:, 1].astype(np.float64),
    "Outros": st_pa[:, 3].astype(np.float64),
})
# Two-column lookup of state name -> region name.
state_regions_df = pd.DataFrame(dict(Estado=st_re[:, 0], Regiao=st_re[:, 1]))

#df = idhm_df.merge(state_parties_df, on="Estado")
#df

In [3]:
# Persist the overall IDH-M frame. The state names contain accented
# characters, so the output encoding is pinned instead of left to the default.
idhm_df.to_csv("../data/brazil_states_idh_2000_2010.csv", encoding="utf-8")

In [4]:
# Persist the per-state party shares. The state names contain accented
# characters, so the output encoding is pinned instead of left to the default.
state_parties_df.to_csv("../data/brazil_states_parties_2000-2010.csv", encoding="utf-8")

In [5]:
# Persist the state -> region lookup. The state names contain accented
# characters, so the output encoding is pinned instead of left to the default.
state_regions_df.to_csv("../data/brazil_states_regions.csv", encoding="utf-8")

In [6]:
# Second table carrying the "wikitable" CSS class (its header row reads
# "IDHM-Renda"). The markup is wrapped in StringIO because passing a literal
# HTML string to read_html is deprecated.
table = pd.read_html(StringIO(html_text), attrs={"class":"wikitable"})[1]
table

Unnamed: 0,0,1,2,3,4
0,Posição,Unidades federativas,IDHM-Renda,,
1,Dados de 2010,Comparados aos de 2000,Em 2010,Em 2000,
2,1,(0),Distrito Federal,0.863,0.805
3,2,(0),São Paulo,0.789,0.756
4,3,(0),Rio de Janeiro,0.782,0.745
5,4,(1),Santa Catarina,0.773,0.717
6,5,(1),Rio Grande do Sul,0.769,0.72
7,6,(0),Paraná,0.757,0.704
8,7,(2),Espírito Santo,0.743,0.687
9,8,(2),Goiás,0.742,0.686


In [7]:
# Sanity check: build the median classifier for the current table and probe
# it with 0.5; the result is shown as the cell output.
f = idh_level2(t=table)
f(0.5)

0

In [8]:
# Frame for the current table: column 2 is the state name, column 3 the 2010
# value and column 4 the 2000 value; the first two rows are header rows.
# The values are parsed with float() as they are, without rescaling.
idhr_df = pd.DataFrame({
    u'Estado': table[2][2:].tolist(),
    u'I2010': table[3][2:].apply(float).tolist(),
    u'I2000': table[4][2:].apply(float).tolist(),
})
idhr_df["Ratio"] = idhr_df["I2010"].div(idhr_df["I2000"])
# 1 when the 2000 value is at or above the table's 2000 median, else 0.
idhr_df["idh_level_2000"] = idhr_df["I2000"].map(idh_level2(table))


In [9]:
# Show the frame as the cell output.
idhr_df

Unnamed: 0,Estado,I2000,I2010,Ratio,idh_level_2000
0,Distrito Federal,0.805,0.863,1.07205,1
1,São Paulo,0.756,0.789,1.043651,1
2,Rio de Janeiro,0.745,0.782,1.049664,1
3,Santa Catarina,0.717,0.773,1.078103,1
4,Rio Grande do Sul,0.72,0.769,1.068056,1
5,Paraná,0.704,0.757,1.075284,1
6,Espírito Santo,0.687,0.743,1.081514,1
7,Goiás,0.686,0.742,1.081633,1
8,Mato Grosso do Sul,0.687,0.74,1.077147,1
9,Mato Grosso,0.689,0.732,1.062409,1


In [10]:
# Third table carrying the "wikitable" CSS class. The markup is wrapped in
# StringIO because passing a literal HTML string to read_html is deprecated.
table = pd.read_html(StringIO(html_text), attrs={"class":"wikitable"})[2]
# Column 2 is the state name, column 3 the 2010 value and column 4 the 2000
# value; the first two rows are header rows. idh_format rescales by 1/1000.
idhl_df = pd.DataFrame({
    u'Estado': table[2][2:].tolist(),
    u'I2010': table[3][2:].apply(idh_format).tolist(),
    u'I2000': table[4][2:].apply(idh_format).tolist(),
})
idhl_df["Ratio"] = idhl_df["I2010"] / idhl_df["I2000"]
# Compute the threshold once instead of once per row.
# NOTE(review): this frame splits on the MEAN of I2000, whereas idh_level2
# splits on the median -- confirm the difference is intentional.
idhl_2000_mean = idhl_df["I2000"].mean()
idhl_df["idh_level_2000"] = (idhl_df["I2000"] >= idhl_2000_mean).astype(int)


In [11]:
# Show the frame as the cell output.
idhl_df

Unnamed: 0,Estado,I2000,I2010,Ratio,idh_level_2000
0,Distrito Federal,0.814,0.873,1.072482,1
1,Santa Catarina,0.812,0.86,1.059113,1
2,São Paulo,0.786,0.845,1.075064,1
3,Rio Grande do Sul,0.804,0.84,1.044776,1
4,Minas Gerais,0.759,0.838,1.104084,1
5,Rio de Janeiro,0.74,0.835,1.128378,1
6,Espírito Santo,0.777,0.835,1.074646,1
7,Mato Grosso do Sul,0.752,0.833,1.107713,1
8,Paraná,0.747,0.83,1.111111,1
9,Goiás,0.773,0.827,1.069858,1


In [12]:
# Fourth table carrying the "wikitable" CSS class. The markup is wrapped in
# StringIO because passing a literal HTML string to read_html is deprecated.
table = pd.read_html(StringIO(html_text), attrs={"class":"wikitable"})[3]
# Column 2 is the state name, column 3 the 2010 value and column 4 the 2000
# value; the first two rows are header rows. Values are parsed with float()
# as they are, without rescaling.
idhe_df = pd.DataFrame({
    u'Estado': table[2][2:].tolist(),
    u'I2010': table[3][2:].apply(float).tolist(),
    u'I2000': table[4][2:].apply(float).tolist(),
})
idhe_df["Ratio"] = idhe_df["I2010"] / idhe_df["I2000"]
# 1 when the 2000 value is at or above the table's 2000 median, else 0.
idhe_df["idh_level_2000"] = idhe_df["I2000"].apply(idh_level2(table))


In [13]:
# Show the frame as the cell output.
idhe_df

Unnamed: 0,Estado,I2000,I2010,Ratio,idh_level_2000
0,Distrito Federal,0.582,0.742,1.274914,1
1,São Paulo,0.581,0.719,1.237522,1
2,Santa Catarina,0.526,0.697,1.325095,1
3,Rio de Janeiro,0.53,0.675,1.273585,1
4,Paraná,0.522,0.668,1.279693,1
5,Espírito Santo,0.491,0.653,1.329939,1
6,Goiás,0.439,0.646,1.471526,1
7,Rio Grande do Sul,0.505,0.642,1.271287,1
8,Minas Gerais,0.47,0.638,1.357447,1
9,Mato Grosso,0.426,0.635,1.49061,1


In [14]:
# Persist the idhr frame. The state names contain accented characters, so the
# output encoding is pinned instead of left to the default.
idhr_df.to_csv("../data/brazil_states_idhr_2000_2010.csv", encoding="utf-8")

In [15]:
# Persist the idhl frame. The state names contain accented characters, so the
# output encoding is pinned instead of left to the default.
idhl_df.to_csv("../data/brazil_states_idhl_2000_2010.csv", encoding="utf-8")

In [16]:
# Persist the idhe frame. The state names contain accented characters, so the
# output encoding is pinned instead of left to the default.
idhe_df.to_csv("../data/brazil_states_idhe_2000_2010.csv", encoding="utf-8")