<a href="https://colab.research.google.com/github/valdenio458/livro-python-para-analise-de-dados/blob/main/capitulo_7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

### 7.1 Tratando dados ausentes

In [None]:
# Sample Series of strings; the np.nan entry stands in for a missing value.
string_data = pd.Series(
    data=["aardvark", "artichoke", np.nan, "avocado"],
)

In [None]:
# Show the Series; the entry at index 2 is the missing value (NaN).
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [None]:
# Boolean mask that is True wherever an entry is missing.
# (`isna` is the method that `isnull` aliases.)
string_data.isna()

0    False
1    False
2     True
3    False
dtype: bool

Filtrando dados ausentes

In [None]:
from numpy import nan as NA

In [None]:
# Numeric Series with two missing entries (NA is the alias imported for
# numpy's nan).
data = pd.Series(
    data=[1, NA, 3.5, NA, 7],
)

In [None]:
# Show the Series; positions 1 and 3 hold the missing values.
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [None]:
# Drop the missing entries; the surviving values keep their original index
# labels (0, 2, 4).
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [None]:
# 4x3 frame mixing complete rows, partially missing rows and one row that is
# entirely missing, used to contrast the dropna options.
data = pd.DataFrame(
    data=[
        [1.0, 6.5, 3.0],
        [1.0, NA, NA],
        [NA, NA, NA],
        [NA, 6.5, 3.0],
    ],
)

In [None]:
# Drop every row that contains at least one missing value (the defaults,
# spelled out). The result is bound to a new name; `data` is not reassigned.
cleaned = data.dropna(axis=0, how="any")

In [None]:
# Show the original frame: it still contains all four rows.
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [None]:
# Show the result of dropna: only the fully populated row 0 is left.
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [None]:
# how='all' discards only the rows in which every value is NA.
data.dropna(axis=0, how="all")

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [None]:
# Add a column (label 4) made up entirely of NAs.
data[4] = NA

In [None]:
# Show the frame with the new all-NA column 4.
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [None]:
# To discard the columns that contain only NAs, pass axis=1 (i.e. 'columns')
# together with how='all'.
data.dropna(axis="columns", how="all")

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


Preenchendo dados ausentes

In [None]:
# 7x3 frame of standard-normal draws. A seeded Generator is used so the
# values (and every output derived from them) are identical on each
# Restart & Run All; outputs saved from an earlier unseeded run will differ
# until the notebook is re-executed.
df = pd.DataFrame(
    np.random.default_rng(42).standard_normal((7, 3))
)

In [None]:
# Show the freshly generated 7x3 frame before any values are blanked out.
df

Unnamed: 0,0,1,2
0,-1.931026,-0.379987,0.136201
1,-0.291564,-1.424945,2.314178
2,1.654676,1.361246,1.100587
3,-1.800199,0.029718,-0.931528
4,-2.376462,0.47689,0.395294
5,0.007235,-0.822494,0.914303
6,-1.007799,0.679759,0.796003


In [None]:
# Blank out the first four rows (positions 0-3) of column 1.
df.iloc[:4, 1] = NA

In [None]:
# Blank out the first two rows (positions 0-1) of column 2.
df.iloc[:2, 2] = NA

In [None]:
# Show the frame after the NA assignments.
# NOTE(review): the output saved with this cell shows column 0 as entirely
# NaN, which the two assignments above (rows 0-3 of column 1, rows 0-1 of
# column 2) cannot produce -- it looks like leftover kernel state. Re-run
# from a fresh kernel to confirm and refresh the saved outputs.
df

Unnamed: 0,0,1,2
0,,,
1,,,
2,,,1.100587
3,,,-0.931528
4,,0.47689,0.395294
5,,-0.822494,0.914303
6,,0.679759,0.796003


In [None]:
# Calling fillna with a constant replaces every missing value with it.
df.fillna(value=0)

Unnamed: 0,0,1,2
0,0.0,0.0,0.0
1,0.0,0.0,0.0
2,0.0,0.0,1.100587
3,0.0,0.0,-0.931528
4,0.0,0.47689,0.395294
5,0.0,-0.822494,0.914303
6,0.0,0.679759,0.796003


In [None]:
# Calling fillna with a dict lets each column (keyed by its label) use a
# different fill value; columns not listed are left as they are.
df.fillna(value={1: 0.5, 2: 2.0})

Unnamed: 0,0,1,2
0,,0.5,2.0
1,,0.5,2.0
2,,0.5,1.100587
3,,0.5,-0.931528
4,,0.47689,0.395294
5,,-0.822494,0.914303
6,,0.679759,0.796003


In [None]:
# fillna returns a new object by default. To modify the existing object
# instead, pass inplace=True.
df.fillna(value=5.0, inplace=True)

In [None]:
# Show df after the in-place fill: the former NaN cells now hold 5.0.
df

Unnamed: 0,0,1,2
0,5.0,5.0,5.0
1,5.0,5.0,5.0
2,5.0,5.0,1.100587
3,5.0,5.0,-0.931528
4,5.0,0.47689,0.395294
5,5.0,-0.822494,0.914303
6,5.0,0.679759,0.796003


### 7.2 Transformação de dados

Removendo duplicatas

In [7]:
# Example frame for the duplicate-handling cells: the last two rows carry
# the same ('two', 4) pair.
data = pd.DataFrame(
    {
        "k1": ["one", "two", "one", "two", "one", "two", "two"],
        "k2": [1, 1, 2, 3, 3, 4, 4],
    }
)

In [8]:
# Show the frame; rows 5 and 6 are identical.
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [9]:
# The `duplicated` method returns a boolean Series telling, for each row,
# whether it is a duplicate of a row seen earlier (keep='first' is the
# default, written out here).
data.duplicated(keep="first")

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [10]:
# `drop_duplicates` returns a DataFrame with the rows for which the
# `duplicated` array is False.
data.drop_duplicates(keep="first")

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [12]:
# Add a new column to the DataFrame: a running counter 0..n-1. The length is
# taken from the frame itself rather than hard-coded, so the cell keeps
# working if the number of rows in `data` changes.
data['v1'] = range(len(data))
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [None]:
# Filter the duplicates based only on the 'k1' column

In [15]:
# Consider only column 'k1' when looking for duplicates: the first row seen
# for each distinct 'k1' value is kept.
data.drop_duplicates(subset="k1")

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [22]:
# By default `duplicated` and `drop_duplicates` keep the first observed
# combination of values. Passing keep='last' keeps the last one instead.
data.drop_duplicates(subset=["k1", "k2"], keep="last")

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6
