# Lidando com dados ausentes

Como resolver problemas com os `NA`s

In [1]:
import numpy as np
from numpy import nan as NA
import pandas as pd

In [2]:
data = pd.DataFrame([[1, 6.5, 3], [1, NA, NA], [NA, NA, NA], [NA, 6.5,3]])

In [3]:
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [4]:
data.dropna()

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [5]:
data.dropna(how="all")

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


Fazendo o drop nas colunas

In [7]:
data[3] = NA
data

Unnamed: 0,0,1,2,3
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [8]:
data.dropna(axis=1, how="all")

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


Utilizando um threshold

In [9]:
df = pd.DataFrame(np.random.randn(7,3))

In [10]:
df

Unnamed: 0,0,1,2
0,0.450917,2.548892,-0.544899
1,2.059419,-0.512759,1.81219
2,-1.829135,-0.08148,-0.273133
3,1.324284,1.44202,1.091253
4,-0.428856,-0.216336,-0.088486
5,-2.195168,2.353221,0.605746
6,-0.031473,0.494221,-0.925886


Introduzir NAs

In [11]:
df.iloc[:4, 1] = NA
df.iloc[:2, 2] = NA

In [12]:
df

Unnamed: 0,0,1,2
0,0.450917,,
1,2.059419,,
2,-1.829135,,-0.273133
3,1.324284,,1.091253
4,-0.428856,-0.216336,-0.088486
5,-2.195168,2.353221,0.605746
6,-0.031473,0.494221,-0.925886


In [14]:
df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,-1.829135,,-0.273133
3,1.324284,,1.091253
4,-0.428856,-0.216336,-0.088486
5,-2.195168,2.353221,0.605746
6,-0.031473,0.494221,-0.925886


# Preenchimento de valores faltantes

In [15]:
df.fillna(0)

Unnamed: 0,0,1,2
0,0.450917,0.0,0.0
1,2.059419,0.0,0.0
2,-1.829135,0.0,-0.273133
3,1.324284,0.0,1.091253
4,-0.428856,-0.216336,-0.088486
5,-2.195168,2.353221,0.605746
6,-0.031473,0.494221,-0.925886


In [16]:
df.fillna({1: 0.5, 2:0})

Unnamed: 0,0,1,2
0,0.450917,0.5,0.0
1,2.059419,0.5,0.0
2,-1.829135,0.5,-0.273133
3,1.324284,0.5,1.091253
4,-0.428856,-0.216336,-0.088486
5,-2.195168,2.353221,0.605746
6,-0.031473,0.494221,-0.925886


Substituir pela média

In [18]:
df.fillna(df.mean())

Unnamed: 0,0,1,2
0,0.450917,0.877035,0.081899
1,2.059419,0.877035,0.081899
2,-1.829135,0.877035,-0.273133
3,1.324284,0.877035,1.091253
4,-0.428856,-0.216336,-0.088486
5,-2.195168,2.353221,0.605746
6,-0.031473,0.494221,-0.925886


# Lidando com valores duplicados

In [19]:
data = pd.DataFrame({'k1': ['one', 'two'] * 3 + [ 'two'],
                     'k2': [1,1,2,3,3,4,4]})

In [20]:
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [21]:
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [22]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [23]:
data.drop_duplicates(['k1'])

Unnamed: 0,k1,k2
0,one,1
1,two,1


In [25]:
data.drop_duplicates(['k1'], keep='last')

Unnamed: 0,k1,k2
4,one,3
6,two,4


# Atualizando Valores

In [26]:
data = pd.Series([1, -999, 2, -999, 1000, 3])

In [27]:
data

0       1
1    -999
2       2
3    -999
4    1000
5       3
dtype: int64

In [28]:
data.replace(-999, NA)

0       1.0
1       NaN
2       2.0
3       NaN
4    1000.0
5       3.0
dtype: float64

In [29]:
data.replace({-999: NA, 1000: 0})

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

# Aplicando funções

Utilizando o dataset do [imdb](https://www.kaggle.com/stefanoleone992/imdb-extensive-dataset) disponível no kaggle.

In [30]:
imdb_dataset = pd.read_csv('/tmp/IMDb movies.csv', index_col='title')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [32]:
imdb_dataset['avg_vote']

title
Miss Jerry                        5.9
The Story of the Kelly Gang       6.1
Den sorte drøm                    5.8
Cleopatra                         5.2
L'Inferno                         7.0
                                 ... 
Le lion                           5.3
De Beentjes van Sint-Hildegard    7.7
Padmavyuhathile Abhimanyu         7.9
Sokagin Çocuklari                 6.4
La vida sense la Sara Amat        6.7
Name: avg_vote, Length: 85855, dtype: float64

In [51]:
def classificar_filme(nota_filme):
    if nota_filme['n1'] >= 8.0 and nota_filme['n2'] >= 8.0:
        return "bom"
    else:
        return "ruim"

In [35]:
imdb_dataset["avg_vote"].apply(classificar_filme)

title
Miss Jerry                        ruim
The Story of the Kelly Gang       ruim
Den sorte drøm                    ruim
Cleopatra                         ruim
L'Inferno                         ruim
                                  ... 
Le lion                           ruim
De Beentjes van Sint-Hildegard    ruim
Padmavyuhathile Abhimanyu         ruim
Sokagin Çocuklari                 ruim
La vida sense la Sara Amat        ruim
Name: avg_vote, Length: 85855, dtype: object

In [45]:
data = pd.DataFrame({"filme": ["Miss Jerry","The Story of the Kelly Gang","Cleopatra"],
                     "n1": [2, 5, 10],
                     "n2": [10, 5, 8]})

In [38]:
data

Unnamed: 0,filme,n1,n2
0,Miss Jerry,2,10
1,The Story of the Kelly Gang,5,5
2,Cleopatra,10,1


In [39]:
data.loc[:,'n1':]

Unnamed: 0,n1,n2
0,2,10
1,5,5
2,10,1


In [52]:
data.loc[:,'n1':].apply(classificar_filme, axis=1)

0    ruim
1    ruim
2     bom
dtype: object

# Discretização/binning

In [53]:
idades = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

In [54]:
bins = [18, 25, 35, 60, 100]

In [55]:
pd.cut(idades, bins)

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [57]:
cats = pd.cut(idades, bins)

In [58]:
cats.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [59]:
cats.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]],
              closed='right',
              dtype='interval[int64]')

In [60]:
pd.value_counts(cats)

(18, 25]     5
(25, 35]     3
(35, 60]     3
(60, 100]    1
dtype: int64

In [62]:
grupos = ['Jovem', 'Adulto', 'Meia-Idade', 'Senior']

In [63]:
pd.cut(idades, bins, labels=grupos)

['Jovem', 'Jovem', 'Jovem', 'Adulto', 'Jovem', ..., 'Adulto', 'Senior', 'Meia-Idade', 'Meia-Idade', 'Adulto']
Length: 12
Categories (4, object): ['Jovem' < 'Adulto' < 'Meia-Idade' < 'Senior']

# Detecção de Outliers

In [64]:
data = pd.DataFrame(np.random.randn(1000,4))

In [65]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.007111,-0.013061,-0.002569,0.039133
std,0.993191,0.991395,1.003261,1.013332
min,-2.807872,-3.420312,-3.514521,-3.026971
25%,-0.669398,-0.691784,-0.638471,-0.660065
50%,-0.016808,-0.028758,0.012593,0.044852
75%,0.617982,0.648727,0.663139,0.732981
max,3.240598,2.937819,3.035625,3.512846


In [66]:
col = data[2]

Procurando valores (em módulo) maiores do que 3, na coluna de índice `2`

In [67]:
col

0     -0.923469
1      1.633654
2     -0.743226
3      1.791038
4     -0.722124
         ...   
995    0.662769
996    1.613060
997    0.159773
998    0.353831
999   -0.577977
Name: 2, Length: 1000, dtype: float64

In [68]:
col[np.abs(col) > 3]

182    3.035625
388   -3.514521
495   -3.307287
Name: 2, dtype: float64

Generalizando para o dataset inteiro

In [74]:
data[(np.abs(data)>3).any(axis=1)]

Unnamed: 0,0,1,2,3
182,-0.725895,0.359582,3.035625,0.900846
259,1.335946,1.454937,-1.414669,3.512846
305,0.980448,1.232098,0.260727,-3.026971
353,0.018484,-3.420312,0.303408,1.160093
388,0.146849,1.454619,-3.514521,-0.005072
433,3.240598,-0.942152,2.340369,-0.767095
461,1.857147,-3.104729,1.397277,-2.150187
495,0.443444,-0.611386,-3.307287,-0.418265
617,1.250104,-0.299891,1.187286,3.020826
833,-0.015409,-3.065987,0.980958,-1.476884


In [75]:
data[(np.abs(data)>3).any(axis=1)] = np.sign(data) * 3

In [76]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.016499,-0.010609,0.010473,0.034559
std,1.028002,1.02068,1.029825,1.041731
min,-3.0,-3.0,-3.0,-3.0
25%,-0.670389,-0.698111,-0.638471,-0.669028
50%,-0.018346,-0.028758,0.012593,0.044852
75%,0.626294,0.649385,0.667242,0.732981
max,3.0,3.0,3.0,3.0


# Amostragem

In [77]:
df = pd.DataFrame(np.arange(5*4).reshape(5,4))

In [78]:
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [79]:
df.sample(n=3)

Unnamed: 0,0,1,2,3
3,12,13,14,15
0,0,1,2,3
4,16,17,18,19


In [82]:
df.sample(n=6, replace=True)

Unnamed: 0,0,1,2,3
1,4,5,6,7
3,12,13,14,15
1,4,5,6,7
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15


# Consumo de dados via API REST

Dados Json:
```json
["valor1", "valor2"]
{"k1": 1,"k2": 50}
1
2
3
"valor1"
"valor2"
```

In [111]:
url="https://api.nasa.gov/neo/rest/v1/neo/browse?page=1&api_key=R8G33YMr5NaMUi0ROfJ4mUQK2EiGdbr24ZklZWs9"

In [106]:
import requests

In [112]:
resp = requests.get(url)

In [113]:
data = resp.json()

In [114]:
data['page']

{'size': 20, 'total_elements': 25973, 'total_pages': 1299, 'number': 1}

In [115]:
asteroids = {"id": [], "nome": [], "perigoso": [], "diametro_min": [], 
             "diametro_max":[], "primeira_obs": [], "ultima_obs": []}

In [116]:
for asteroid in data['near_earth_objects']:
    asteroids['id'].append(asteroid['id'])
    asteroids['nome'].append(asteroid['name'])
    asteroids['perigoso'].append(asteroid['is_potentially_hazardous_asteroid'])
    asteroids['diametro_min'].append(asteroid['estimated_diameter']['kilometers']['estimated_diameter_min'])
    asteroids['diametro_max'].append(asteroid['estimated_diameter']['kilometers']['estimated_diameter_max'])
    asteroids['primeira_obs'].append(asteroid['orbital_data']['first_observation_date'])
    asteroids['ultima_obs'].append(asteroid['orbital_data']['last_observation_date'])

In [117]:
len(asteroids['id'])

20

In [118]:
asteroids_data = pd.DataFrame(asteroids, index=asteroids["id"],
                              columns=['nome', 'primeira_obs', 'ultima_obs',
                                       'diametro_min', 'diametro_max', 'perigoso'])

In [119]:
asteroids_data['perigoso'].describe()

count        20
unique        2
top       False
freq         13
Name: perigoso, dtype: object