## Análise de dados com Pandas: Filtrando e limpando dados do DataFrame

### Victor Hugo Negrisoli - Desenvolvedor de Software Full-Stack & Analista de Dados

In [3]:
import pandas as pd
import numpy as np

In [6]:
# Daily DatetimeIndex with 600 entries starting on 2018-01-01.
datas = pd.date_range('20180101', periods = 600, freq = 'D')

# Seeded generator (the seed value itself is arbitrary): without a fixed seed
# every run draws different numbers, so the examples below are not reproducible.
rng = np.random.default_rng(42)

# 600 x 5 frame of standard-normal draws, indexed by date, columns A..E.
df_datas = pd.DataFrame(
    rng.standard_normal((600, 5)),
    index = datas,
    columns = list('ABCDE')
)
df_datas

Unnamed: 0,A,B,C,D,E
2018-01-01,-1.221130,0.233367,-0.301261,-0.633673,1.078092
2018-01-02,0.234978,0.527498,2.281302,-0.611856,0.205659
2018-01-03,0.519014,-0.105936,-1.017509,0.523452,0.608871
2018-01-04,0.854911,1.847130,-1.561526,-0.863526,-0.725545
2018-01-05,0.175656,1.439507,-0.224236,0.821284,-0.515351
...,...,...,...,...,...
2019-08-19,-0.408917,0.846709,-0.859173,0.502167,0.976221
2019-08-20,-1.721271,0.247466,-0.868257,1.603525,-0.591663
2019-08-21,-0.264717,0.288417,2.003647,-0.527184,2.318121
2019-08-22,-0.863621,-1.141023,0.510898,-0.268479,-0.390499


In [9]:
# First five values of column D, selected by its column label
df_datas["D"].head()

2018-01-01   -0.633673
2018-01-02   -0.611856
2018-01-03    0.523452
2018-01-04   -0.863526
2018-01-05    0.821284
Freq: D, Name: D, dtype: float64

In [10]:
# Filtering rows by plain slicing: positions 1 to 4 (the end bound 5 is excluded)

df_datas[1:5]

Unnamed: 0,A,B,C,D,E
2018-01-02,0.234978,0.527498,2.281302,-0.611856,0.205659
2018-01-03,0.519014,-0.105936,-1.017509,0.523452,0.608871
2018-01-04,0.854911,1.84713,-1.561526,-0.863526,-0.725545
2018-01-05,0.175656,1.439507,-0.224236,0.821284,-0.515351


In [11]:
# First two rows (positions 0 and 1; the end bound 2 is excluded)
df_datas[0:2]

Unnamed: 0,A,B,C,D,E
2018-01-01,-1.22113,0.233367,-0.301261,-0.633673,1.078092
2018-01-02,0.234978,0.527498,2.281302,-0.611856,0.205659


#### Utilizando os indexadores loc e iloc para filtrar e indexar colunas

In [16]:
# Retrieve every row of columns B, C and D

df_datas.loc[:, ["B", "C", "D"]]

Unnamed: 0,B,C,D
2018-01-01,0.233367,-0.301261,-0.633673
2018-01-02,0.527498,2.281302,-0.611856
2018-01-03,-0.105936,-1.017509,0.523452
2018-01-04,1.847130,-1.561526,-0.863526
2018-01-05,1.439507,-0.224236,0.821284
...,...,...,...
2019-08-19,0.846709,-0.859173,0.502167
2019-08-20,0.247466,-0.868257,1.603525
2019-08-21,0.288417,2.003647,-0.527184
2019-08-22,-1.141023,0.510898,-0.268479


In [20]:
# Retrieve columns A and D for the dates 2019-08-15 through 2019-08-23.
# Label slices with loc include both endpoints.

df_datas.loc["20190815":"20190823", ["A", "D"]]

Unnamed: 0,A,D
2019-08-15,1.376147,-0.132721
2019-08-16,-1.678482,0.915535
2019-08-17,0.833073,2.281822
2019-08-18,-0.397184,0.546251
2019-08-19,-0.408917,0.502167
2019-08-20,-1.721271,1.603525
2019-08-21,-0.264717,-0.527184
2019-08-22,-0.863621,-0.268479
2019-08-23,0.136093,-1.675571


In [22]:
# Filtering by position: rows 0 to 4 and columns 0 to 1 (iloc end bounds are excluded)

df_datas.iloc[0:5, 0:2]

Unnamed: 0,A,B
2018-01-01,-1.22113,0.233367
2018-01-02,0.234978,0.527498
2018-01-03,0.519014,-0.105936
2018-01-04,0.854911,1.84713
2018-01-05,0.175656,1.439507


In [24]:
# Listing exactly which row and column positions to show, without using a slice (:)

df_datas.iloc[[1, 5, 6], [0, 1, 3]]

Unnamed: 0,A,B,D
2018-01-02,0.234978,0.527498,-0.611856
2018-01-06,0.484238,-1.440035,-0.682947
2018-01-07,-0.163155,0.518006,0.671321


In [26]:
# Retrieve only the first 5 rows of every column

df_datas.iloc[0:5, :]

Unnamed: 0,A,B,C,D,E
2018-01-01,-1.22113,0.233367,-0.301261,-0.633673,1.078092
2018-01-02,0.234978,0.527498,2.281302,-0.611856,0.205659
2018-01-03,0.519014,-0.105936,-1.017509,0.523452,0.608871
2018-01-04,0.854911,1.84713,-1.561526,-0.863526,-0.725545
2018-01-05,0.175656,1.439507,-0.224236,0.821284,-0.515351


#### Filtros booleanos nos DataFrames

In [27]:
# Retrieve only the rows whose value in column A is greater than 0.

df_datas[df_datas["A"] > 0]

Unnamed: 0,A,B,C,D,E
2018-01-02,0.234978,0.527498,2.281302,-0.611856,0.205659
2018-01-03,0.519014,-0.105936,-1.017509,0.523452,0.608871
2018-01-04,0.854911,1.847130,-1.561526,-0.863526,-0.725545
2018-01-05,0.175656,1.439507,-0.224236,0.821284,-0.515351
2018-01-06,0.484238,-1.440035,-0.352850,-0.682947,-0.566463
...,...,...,...,...,...
2019-08-12,0.079280,-0.225182,-0.398803,0.810179,1.592417
2019-08-14,0.235409,-0.488013,-1.574891,-0.252974,1.086916
2019-08-15,1.376147,-0.263587,-1.515491,-0.132721,0.363965
2019-08-17,0.833073,-1.068489,0.467799,2.281822,-0.847710


In [34]:
# Retrieve only the rows where A is greater than 0, B is negative and C is greater than 2.
# Each comparison is parenthesised because & binds tighter than the comparison operators.

df_datas[
    (df_datas["A"] > 0) 
    & (df_datas["B"] < 0)
    & (df_datas["C"] > 2)
]

Unnamed: 0,A,B,C,D,E
2018-04-20,0.419227,-1.307943,2.540393,-1.485366,-1.010795
2019-07-30,1.973072,-0.379561,2.28173,-2.177296,0.194996


#### Sumarização de dados

In [38]:
# Six consecutive daily dates starting on 2020-01-01.
datas = pd.date_range("20200101", periods = 6)

# Seeded generator (the seed value itself is arbitrary) so that re-running the
# notebook reproduces the same values instead of a fresh random sample.
rng = np.random.default_rng(43)

# 6 x 4 frame of standard-normal draws, indexed by date.
df = pd.DataFrame(rng.standard_normal((6, 4)), index = datas, columns = ["Var A", "Var B", "Var C", "Var D"])
df

Unnamed: 0,Var A,Var B,Var C,Var D
2020-01-01,0.54928,-0.627521,1.355975,-0.961552
2020-01-02,1.184611,-0.134021,-2.733932,0.442937
2020-01-03,2.172899,0.508876,1.173713,-0.061931
2020-01-04,-0.121575,-0.025763,-0.259804,-0.975096
2020-01-05,-0.382572,-0.721048,-0.4376,0.600537
2020-01-06,0.285178,0.606885,-0.475121,-1.066795


In [44]:
# Dimensions of df as a (rows, columns) tuple
df.shape

(6, 4)

In [47]:
# Data type of each column of df
df.dtypes

Var A    float64
Var B    float64
Var C    float64
Var D    float64
dtype: object

In [42]:
# Small frame with a different dtype per column; the scalar values
# (A, B and F) are repeated on every row.
df_2 = pd.DataFrame(
    {
        "A": 1.0,
        "B": pd.Timestamp("20130102"),
        "C": pd.Series(1, index=list(range(4)), dtype="float32"),
        "D": np.full(4, 3, dtype="int32"),
        "E": pd.Categorical(["test", "train", "test", "train"]),
        "F": "Python",
    }
)

df_2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,Python
1,1.0,2013-01-02,1.0,3,train,Python
2,1.0,2013-01-02,1.0,3,test,Python
3,1.0,2013-01-02,1.0,3,train,Python


In [43]:
# Dimensions of df_2 as a (rows, columns) tuple
df_2.shape

(4, 6)

In [49]:
# Data type of each column of df_2 (one different dtype per column)
df_2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [52]:
# Summary statistics of df_2; only the numeric columns (A, C and D) appear in the result
df_2.describe()

Unnamed: 0,A,C,D
count,4.0,4.0,4.0
mean,1.0,1.0,3.0
std,0.0,0.0,0.0
min,1.0,1.0,3.0
25%,1.0,1.0,3.0
50%,1.0,1.0,3.0
75%,1.0,1.0,3.0
max,1.0,1.0,3.0


In [54]:
# Reindex df: keep only the first four dates and append a new column
# "Var E", which has no data yet and is therefore filled with NaN.

df_reindex = df.reindex(index=datas[:4], columns=[*df.columns, "Var E"])

df_reindex

Unnamed: 0,Var A,Var B,Var C,Var D,Var E
2020-01-01,0.54928,-0.627521,1.355975,-0.961552,
2020-01-02,1.184611,-0.134021,-2.733932,0.442937,
2020-01-03,2.172899,0.508876,1.173713,-0.061931,
2020-01-04,-0.121575,-0.025763,-0.259804,-0.975096,


In [61]:
# Set column "Var E" to 1 on the first two rows (labels datas[0] through datas[1]) with loc.
# Label slices with loc include the end label.

df_reindex.loc[datas[0]:datas[1], "Var E"] = 1
df_reindex

Unnamed: 0,Var A,Var B,Var C,Var D,Var E
2020-01-01,0.54928,-0.627521,1.355975,-0.961552,1.0
2020-01-02,1.184611,-0.134021,-2.733932,0.442937,1.0
2020-01-03,2.172899,0.508876,1.173713,-0.061931,
2020-01-04,-0.121575,-0.025763,-0.259804,-0.975096,


In [62]:
# Set the last column ("Var E", position -1) to 1 on the rows at positions 0 and 1 with iloc.
# iloc slices exclude the end position, so the slice must be 0:2 to reach row 1;
# 0:1 would only touch row 0.

df_reindex.iloc[0:2, -1] = 1
df_reindex

Unnamed: 0,Var A,Var B,Var C,Var D,Var E
2020-01-01,0.54928,-0.627521,1.355975,-0.961552,1.0
2020-01-02,1.184611,-0.134021,-2.733932,0.442937,1.0
2020-01-03,2.172899,0.508876,1.173713,-0.061931,
2020-01-04,-0.121575,-0.025763,-0.259804,-0.975096,


In [63]:
# Summary statistics of df_reindex; count skips missing values, so "Var E" counts only 2
df_reindex.describe()

Unnamed: 0,Var A,Var B,Var C,Var D,Var E
count,4.0,4.0,4.0,4.0,2.0
mean,0.946304,-0.069607,-0.116012,-0.38891,1.0
std,0.976272,0.466184,1.888943,0.700099,0.0
min,-0.121575,-0.627521,-2.733932,-0.975096,1.0
25%,0.381566,-0.257396,-0.878336,-0.964938,1.0
50%,0.866945,-0.079892,0.456954,-0.511741,1.0
75%,1.431683,0.107897,1.219278,0.064286,1.0
max,2.172899,0.508876,1.355975,0.442937,1.0


#### Trabalhando com dados missing

In [67]:
# Sixty consecutive daily dates starting on 2019-01-01.
datas = pd.date_range("20190101", periods = 60, freq = "D")

# Seeded generator (the seed value itself is arbitrary) so that re-running the
# notebook reproduces the same values instead of a fresh random sample.
rng = np.random.default_rng(44)

# 60 x 5 frame of standard-normal draws, indexed by date, columns A..E.
df = pd.DataFrame(rng.standard_normal((60, 5)), index = datas, columns = list("ABCDE"))
df.head()

Unnamed: 0,A,B,C,D,E
2019-01-01,0.027203,-0.04711,0.761808,-0.31804,-0.006292
2019-01-02,-0.114876,-0.123229,2.090368,0.213893,1.069659
2019-01-03,-0.065452,1.171365,1.063261,-1.64953,-0.74232
2019-01-04,-0.106477,1.847486,1.666328,-1.8008,-2.379152
2019-01-05,0.269499,-0.799648,0.019286,1.083328,1.171357


In [68]:
# Dimensions of df as a (rows, columns) tuple
df.shape

(60, 5)

In [69]:
# Summary statistics (count, mean, std, min, quartiles, max) of each column of df
df.describe()

Unnamed: 0,A,B,C,D,E
count,60.0,60.0,60.0,60.0,60.0
mean,-0.026336,0.052683,0.008651,0.183137,-0.023397
std,0.941482,1.057704,0.929641,1.04756,1.04089
min,-2.573664,-1.828194,-2.078947,-1.8008,-2.379152
25%,-0.477166,-0.805543,-0.528927,-0.391989,-0.704222
50%,-0.098001,-0.013544,0.030027,0.198643,-0.037899
75%,0.589926,0.959175,0.735246,0.804951,0.641356
max,1.847435,1.932056,2.090368,3.393968,2.513811


In [81]:
# New column F: the value of A where A is positive, NaN everywhere else.
# Series.where keeps the values where the condition holds and puts NaN in the
# other rows. Assigning the whole filtered DataFrame (five columns) to a single
# column is ambiguous and raises ValueError on recent pandas versions.
df["F"] = df["A"].where(df["A"] > 0)
df.head()

Unnamed: 0,A,B,C,D,E,F
2019-01-01,0.027203,-0.04711,0.761808,-0.31804,-0.006292,0.0272027
2019-01-02,-0.114876,-0.123229,2.090368,0.213893,1.069659,
2019-01-03,-0.065452,1.171365,1.063261,-1.64953,-0.74232,
2019-01-04,-0.106477,1.847486,1.666328,-1.8008,-2.379152,
2019-01-05,0.269499,-0.799648,0.019286,1.083328,1.171357,0.269499


In [74]:
# Two independent copies of df, used by the dropna and fillna examples that follow
df2 = df.copy()
df3 = df.copy()

In [82]:
# Drop every row that has a missing value and show the first five rows that remain.
# dropna returns a new frame; df2 itself is not modified.
df2.dropna().head()

Unnamed: 0,A,B,C,D,E,F
2019-01-01,0.027203,-0.04711,0.761808,-0.31804,-0.006292,0.0272027
2019-01-05,0.269499,-0.799648,0.019286,1.083328,1.171357,0.269499
2019-01-08,0.427151,-0.830452,-0.032618,2.090207,-0.095589,0.427151
2019-01-09,0.216371,0.394205,-0.073069,1.196191,1.444111,0.216371
2019-01-13,1.363284,-0.323954,0.215033,-1.154273,1.27554,1.36328


In [83]:
# Replace every missing value with the mean of column A and show the first five rows.
# fillna returns a new frame; df3 itself is not modified.
df3.fillna(df["A"].mean()).head()

Unnamed: 0,A,B,C,D,E,F
2019-01-01,0.027203,-0.04711,0.761808,-0.31804,-0.006292,0.027203
2019-01-02,-0.114876,-0.123229,2.090368,0.213893,1.069659,-0.026336
2019-01-03,-0.065452,1.171365,1.063261,-1.64953,-0.74232,-0.026336
2019-01-04,-0.106477,1.847486,1.666328,-1.8008,-2.379152,-0.026336
2019-01-05,0.269499,-0.799648,0.019286,1.083328,1.171357,0.269499


#### Trabalhando com dados únicos

In [84]:
# Same mixed-dtype frame as before, extended with column G (repeated values)
# and column H (contains missing values).
df_2 = pd.DataFrame(
    {
        "A": 1.0,
        "B": pd.Timestamp("20130102"),
        "C": pd.Series(1, index=list(range(4)), dtype="float32"),
        "D": np.full(4, 3, dtype="int32"),
        "E": pd.Categorical(["test", "train", "test", "train"]),
        "F": "Python",
        "G": [2, 2, 4, 4],
        "H": [np.nan, 2, 4, np.nan],
    }
)

df_2

Unnamed: 0,A,B,C,D,E,F,G,H
0,1.0,2013-01-02,1.0,3,test,Python,2,
1,1.0,2013-01-02,1.0,3,train,Python,2,2.0
2,1.0,2013-01-02,1.0,3,test,Python,4,4.0
3,1.0,2013-01-02,1.0,3,train,Python,4,


In [87]:
# Number of distinct values in each column; dropna=False counts NaN as a value
df_2.nunique(dropna = False)

A    1
B    1
C    1
D    1
E    2
F    1
G    2
H    3
dtype: int64

In [88]:
# Number of distinct values in each row (axis=1); dropna=False counts NaN as a value
df_2.nunique(axis = 1, dropna = False)

0    7
1    6
2    6
3    7
dtype: int64

#### Remoção de duplicatas

In [90]:
# Remove rows with a repeated value in column G, keeping only the first occurrence

df_2.drop_duplicates(subset = "G")

Unnamed: 0,A,B,C,D,E,F,G,H
0,1.0,2013-01-02,1.0,3,test,Python,2,
2,1.0,2013-01-02,1.0,3,test,Python,4,4.0


In [91]:
# Remove rows with a repeated value in column G, keeping only the last occurrence

df_2.drop_duplicates(subset = "G", keep = "last")

Unnamed: 0,A,B,C,D,E,F,G,H
1,1.0,2013-01-02,1.0,3,train,Python,2,2.0
3,1.0,2013-01-02,1.0,3,train,Python,4,


In [92]:
# Remove every row whose value in column G is repeated (keep=False keeps none of them)

df_2.drop_duplicates(subset = "G", keep = False)

Unnamed: 0,A,B,C,D,E,F,G,H


#### Ordenação dos dados

In [94]:
# Small frame for the sorting examples; "Col 1" contains one missing value.
df = pd.DataFrame(
    {
        "Col 1": ["A", "A", "B", np.nan, "D", "C"],
        "Col 2": [2, 1, 9, 8, 7, 4],
        "Col 3": [0, 1, 9, 4, 2, 3],
    }
)
df

Unnamed: 0,Col 1,Col 2,Col 3
0,A,2,0
1,A,1,1
2,B,9,9
3,,8,4
4,D,7,2
5,C,4,3


In [95]:
# Sort the rows by "Col 2" with sort_values (ascending order by default)

df.sort_values(by = "Col 2")

Unnamed: 0,Col 1,Col 2,Col 3
1,A,1,1
0,A,2,0
5,C,4,3
4,D,7,2
3,,8,4
2,B,9,9


In [98]:
# Sort the rows by "Col 2" and then "Col 3", both in descending order

df.sort_values(by = ["Col 2", "Col 3"], ascending = False)

Unnamed: 0,Col 1,Col 2,Col 3
2,B,9,9
3,,8,4
4,D,7,2
5,C,4,3
0,A,2,0
1,A,1,1
