In [1]:
import pandas as pd
import numpy as np

In [2]:
# Eight consecutive daily timestamps starting at 2000-01-01; used below as a row index.
dates = pd.date_range(start='1/1/2000', periods=8)
dates

DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03', '2000-01-04',
               '2000-01-05', '2000-01-06', '2000-01-07', '2000-01-08'],
              dtype='datetime64[ns]', freq='D')

In [3]:
# 8x4 frame of standard-normal draws, indexed by `dates`, with columns A-D.
# The values are random, so they differ on every run.
df = pd.DataFrame(np.random.randn(8, 4), index=dates, columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2000-01-01,-0.330976,0.337886,0.761819,1.745169
2000-01-02,0.048962,0.321171,-1.848967,0.278036
2000-01-03,-0.812127,-0.185599,-0.721741,-1.674371
2000-01-04,0.812633,-0.122611,-0.751333,0.416713
2000-01-05,0.492959,0.279125,1.260756,-0.730884
2000-01-06,-0.262607,1.694246,2.284932,-0.034723
2000-01-07,0.84037,-1.418937,-2.197424,0.981226
2000-01-08,-0.670251,-0.85548,0.004085,0.808429


In [4]:
# Stack two DataFrames into a 3-D Panel: 'one' is df itself, 'two' is df with
# df.mean() subtracted from it.
# NOTE(review): pd.Panel appears to have been removed from current pandas releases
# (0.25+); confirm the pandas version this notebook targets before re-running.
panel = pd.Panel({
    'one': df,
    'two': df - df.mean()
})
panel

<class 'pandas.core.panel.Panel'>
Dimensions: 2 (items) x 8 (major_axis) x 4 (minor_axis)
Items axis: one to two
Major_axis axis: 2000-01-01 00:00:00 to 2000-01-08 00:00:00
Minor_axis axis: A to D

# 1 Different Choices for Indexing

In [5]:
s = df['A']

In [6]:
# Column A at row 2000-01-06 (position 5 in `dates`)
s[dates[5]]

-0.26260709249474767

In [7]:
panel['two']

Unnamed: 0,A,B,C,D
2000-01-01,-0.345847,0.33166,0.912803,1.52147
2000-01-02,0.034092,0.314946,-1.697983,0.054337
2000-01-03,-0.826998,-0.191824,-0.570757,-1.89807
2000-01-04,0.797763,-0.128836,-0.600349,0.193014
2000-01-05,0.478089,0.2729,1.41174,-0.954583
2000-01-06,-0.277477,1.68802,2.435916,-0.258422
2000-01-07,0.825499,-1.425162,-2.046439,0.757526
2000-01-08,-0.685121,-0.861705,0.155069,0.58473


In [8]:
df

Unnamed: 0,A,B,C,D
2000-01-01,-0.330976,0.337886,0.761819,1.745169
2000-01-02,0.048962,0.321171,-1.848967,0.278036
2000-01-03,-0.812127,-0.185599,-0.721741,-1.674371
2000-01-04,0.812633,-0.122611,-0.751333,0.416713
2000-01-05,0.492959,0.279125,1.260756,-0.730884
2000-01-06,-0.262607,1.694246,2.284932,-0.034723
2000-01-07,0.84037,-1.418937,-2.197424,0.981226
2000-01-08,-0.670251,-0.85548,0.004085,0.808429


In [9]:
# Return specific columns
df[['A', 'B']]

Unnamed: 0,A,B
2000-01-01,-0.330976,0.337886
2000-01-02,0.048962,0.321171
2000-01-03,-0.812127,-0.185599
2000-01-04,0.812633,-0.122611
2000-01-05,0.492959,0.279125
2000-01-06,-0.262607,1.694246
2000-01-07,0.84037,-1.418937
2000-01-08,-0.670251,-0.85548


In [10]:
df.loc[:, ['B', 'A']] = df[['A', 'B']]

In [11]:
# The attempted column swap had no effect: A and B still hold their original values.
# Assign the raw `.values` array instead to make the swap take effect.
df[['A', 'B']]

Unnamed: 0,A,B
2000-01-01,-0.330976,0.337886
2000-01-02,0.048962,0.321171
2000-01-03,-0.812127,-0.185599
2000-01-04,0.812633,-0.122611
2000-01-05,0.492959,0.279125
2000-01-06,-0.262607,1.694246
2000-01-07,0.84037,-1.418937
2000-01-08,-0.670251,-0.85548


In [12]:
df.loc[:, ['B', 'A']] = df[['A', 'B']].values

In [13]:
# This time the swap took effect: the values of A and B are exchanged.
df[['A', 'B']]

Unnamed: 0,A,B
2000-01-01,0.337886,-0.330976
2000-01-02,0.321171,0.048962
2000-01-03,-0.185599,-0.812127
2000-01-04,-0.122611,0.812633
2000-01-05,0.279125,0.492959
2000-01-06,1.694246,-0.262607
2000-01-07,-1.418937,0.84037
2000-01-08,-0.85548,-0.670251


# 2 Attribute Access

In [14]:
# Three-row Series labelled a, b, c in which every row holds the same string '1, 2, 3'.
# The string is passed as a scalar so pandas broadcasts it over the index; wrapping it
# in a one-element list does not match the three-label index and is rejected by
# current pandas releases.
sa = pd.Series('1, 2, 3', index=list('abc'))

In [15]:
dfa = df.copy()

In [16]:
sa

a    1, 2, 3
b    1, 2, 3
c    1, 2, 3
dtype: object

In [17]:
sa.b

'1, 2, 3'

In [18]:
dfa

Unnamed: 0,A,B,C,D
2000-01-01,0.337886,-0.330976,0.761819,1.745169
2000-01-02,0.321171,0.048962,-1.848967,0.278036
2000-01-03,-0.185599,-0.812127,-0.721741,-1.674371
2000-01-04,-0.122611,0.812633,-0.751333,0.416713
2000-01-05,0.279125,0.492959,1.260756,-0.730884
2000-01-06,1.694246,-0.262607,2.284932,-0.034723
2000-01-07,-1.418937,0.84037,-2.197424,0.981226
2000-01-08,-0.85548,-0.670251,0.004085,0.808429


In [19]:
dfa.A

2000-01-01    0.337886
2000-01-02    0.321171
2000-01-03   -0.185599
2000-01-04   -0.122611
2000-01-05    0.279125
2000-01-06    1.694246
2000-01-07   -1.418937
2000-01-08   -0.855480
Freq: D, Name: A, dtype: float64

In [20]:
panel

<class 'pandas.core.panel.Panel'>
Dimensions: 2 (items) x 8 (major_axis) x 4 (minor_axis)
Items axis: one to two
Major_axis axis: 2000-01-01 00:00:00 to 2000-01-08 00:00:00
Minor_axis axis: A to D

In [21]:
panel.one

Unnamed: 0,A,B,C,D
2000-01-01,-0.330976,0.337886,0.761819,1.745169
2000-01-02,0.048962,0.321171,-1.848967,0.278036
2000-01-03,-0.812127,-0.185599,-0.721741,-1.674371
2000-01-04,0.812633,-0.122611,-0.751333,0.416713
2000-01-05,0.492959,0.279125,1.260756,-0.730884
2000-01-06,-0.262607,1.694246,2.284932,-0.034723
2000-01-07,0.84037,-1.418937,-2.197424,0.981226
2000-01-08,-0.670251,-0.85548,0.004085,0.808429


In [22]:
sa

a    1, 2, 3
b    1, 2, 3
c    1, 2, 3
dtype: object

In [23]:
sa.a

'1, 2, 3'

In [24]:
sa.a = 5

In [25]:
sa

a          5
b    1, 2, 3
c    1, 2, 3
dtype: object

In [26]:
dfa.A = list(range(len(dfa.index)))

In [27]:
dfa

Unnamed: 0,A,B,C,D
2000-01-01,0,-0.330976,0.761819,1.745169
2000-01-02,1,0.048962,-1.848967,0.278036
2000-01-03,2,-0.812127,-0.721741,-1.674371
2000-01-04,3,0.812633,-0.751333,0.416713
2000-01-05,4,0.492959,1.260756,-0.730884
2000-01-06,5,-0.262607,2.284932,-0.034723
2000-01-07,6,0.84037,-2.197424,0.981226
2000-01-08,7,-0.670251,0.004085,0.808429


In [28]:
dfa['A'] = list(range(len(dfa.index))); dfa

Unnamed: 0,A,B,C,D
2000-01-01,0,-0.330976,0.761819,1.745169
2000-01-02,1,0.048962,-1.848967,0.278036
2000-01-03,2,-0.812127,-0.721741,-1.674371
2000-01-04,3,0.812633,-0.751333,0.416713
2000-01-05,4,0.492959,1.260756,-0.730884
2000-01-06,5,-0.262607,2.284932,-0.034723
2000-01-07,6,0.84037,-2.197424,0.981226
2000-01-08,7,-0.670251,0.004085,0.808429


In [29]:
# Item assignment with a new key even lets us create a brand-new column ('E').
dfa['E'] = list(range(len(dfa.index))); dfa

Unnamed: 0,A,B,C,D,E
2000-01-01,0,-0.330976,0.761819,1.745169,0
2000-01-02,1,0.048962,-1.848967,0.278036,1
2000-01-03,2,-0.812127,-0.721741,-1.674371,2
2000-01-04,3,0.812633,-0.751333,0.416713,3
2000-01-05,4,0.492959,1.260756,-0.730884,4
2000-01-06,5,-0.262607,2.284932,-0.034723,5
2000-01-07,6,0.84037,-2.197424,0.981226,6
2000-01-08,7,-0.670251,0.004085,0.808429,7


In [30]:
del dfa['E']; dfa

Unnamed: 0,A,B,C,D
2000-01-01,0,-0.330976,0.761819,1.745169
2000-01-02,1,0.048962,-1.848967,0.278036
2000-01-03,2,-0.812127,-0.721741,-1.674371
2000-01-04,3,0.812633,-0.751333,0.416713
2000-01-05,4,0.492959,1.260756,-0.730884
2000-01-06,5,-0.262607,2.284932,-0.034723
2000-01-07,6,0.84037,-2.197424,0.981226
2000-01-08,7,-0.670251,0.004085,0.808429


In [31]:
# Small integer frame with columns x and y, used for the row-assignment example.
x = pd.DataFrame(dict(x=[1, 2, 3], y=[3, 4, 5]))
x

Unnamed: 0,x,y
0,1,3
1,2,4
2,3,5


In [32]:
x.iloc[1]

x    2
y    4
Name: 1, dtype: int64

In [33]:
# Replace the values of the row at position 1 using a dict keyed by column name
x.iloc[1] = dict(x=9, y=99)
x

Unnamed: 0,x,y
0,1,3
1,9,99
2,3,5


# 3 Slicing ranges

In [34]:
s

2000-01-01    0.337886
2000-01-02    0.321171
2000-01-03   -0.185599
2000-01-04   -0.122611
2000-01-05    0.279125
2000-01-06    1.694246
2000-01-07   -1.418937
2000-01-08   -0.855480
Freq: D, Name: A, dtype: float64

In [35]:
s[:5]

2000-01-01    0.337886
2000-01-02    0.321171
2000-01-03   -0.185599
2000-01-04   -0.122611
2000-01-05    0.279125
Freq: D, Name: A, dtype: float64

In [36]:
s[:2]

2000-01-01    0.337886
2000-01-02    0.321171
Freq: D, Name: A, dtype: float64

In [37]:
# Reverses the display order
s[::-1]

2000-01-08   -0.855480
2000-01-07   -1.418937
2000-01-06    1.694246
2000-01-05    0.279125
2000-01-04   -0.122611
2000-01-03   -0.185599
2000-01-02    0.321171
2000-01-01    0.337886
Freq: -1D, Name: A, dtype: float64

In [38]:
s2 = s.copy()

In [39]:
s2[:5] = 0

In [40]:
s2

2000-01-01    0.000000
2000-01-02    0.000000
2000-01-03    0.000000
2000-01-04    0.000000
2000-01-05    0.000000
2000-01-06    1.694246
2000-01-07   -1.418937
2000-01-08   -0.855480
Freq: D, Name: A, dtype: float64

In [41]:
df

Unnamed: 0,A,B,C,D
2000-01-01,0.337886,-0.330976,0.761819,1.745169
2000-01-02,0.321171,0.048962,-1.848967,0.278036
2000-01-03,-0.185599,-0.812127,-0.721741,-1.674371
2000-01-04,-0.122611,0.812633,-0.751333,0.416713
2000-01-05,0.279125,0.492959,1.260756,-0.730884
2000-01-06,1.694246,-0.262607,2.284932,-0.034723
2000-01-07,-1.418937,0.84037,-2.197424,0.981226
2000-01-08,-0.85548,-0.670251,0.004085,0.808429


In [42]:
df[:3]

Unnamed: 0,A,B,C,D
2000-01-01,0.337886,-0.330976,0.761819,1.745169
2000-01-02,0.321171,0.048962,-1.848967,0.278036
2000-01-03,-0.185599,-0.812127,-0.721741,-1.674371


In [43]:
# Likewise reverses the row order of the DataFrame
df[::-1]

Unnamed: 0,A,B,C,D
2000-01-08,-0.85548,-0.670251,0.004085,0.808429
2000-01-07,-1.418937,0.84037,-2.197424,0.981226
2000-01-06,1.694246,-0.262607,2.284932,-0.034723
2000-01-05,0.279125,0.492959,1.260756,-0.730884
2000-01-04,-0.122611,0.812633,-0.751333,0.416713
2000-01-03,-0.185599,-0.812127,-0.721741,-1.674371
2000-01-02,0.321171,0.048962,-1.848967,0.278036
2000-01-01,0.337886,-0.330976,0.761819,1.745169


# 4 Selection By Label

In [44]:
df1 = pd.DataFrame(np.random.randn(5, 4), columns=list('ABCD'), index=pd.date_range('20130101', periods=5))
df1

Unnamed: 0,A,B,C,D
2013-01-01,0.143319,0.178123,-0.564554,-0.609368
2013-01-02,-1.101731,1.180249,2.062559,0.407836
2013-01-03,-0.192072,0.056286,1.46966,-0.672655
2013-01-04,1.779854,-1.40669,1.178098,-0.27868
2013-01-05,0.310398,2.008978,0.528252,0.321882


In [45]:
# Raises an error: the index is a DatetimeIndex, so integer labels cannot slice it
df1.loc[2:3]

TypeError: cannot do slice indexing on <class 'pandas.core.indexes.datetimes.DatetimeIndex'> with these indexers [2] of <class 'int'>

In [46]:
df1.loc['20130102':'20130104']

Unnamed: 0,A,B,C,D
2013-01-02,-1.101731,1.180249,2.062559,0.407836
2013-01-03,-0.192072,0.056286,1.46966,-0.672655
2013-01-04,1.779854,-1.40669,1.178098,-0.27868


In [47]:
s1 = pd.Series(np.random.randn(6), index=list('abcdef')); s1

a    0.619299
b   -0.529958
c   -1.606716
d    0.108094
e    1.909701
f   -0.678541
dtype: float64

In [48]:
# Selects from label 'c' onwards
s1.loc['c':]

c   -1.606716
d    0.108094
e    1.909701
f   -0.678541
dtype: float64

In [49]:
# Selects only label 'b'
s1.loc['b']

-0.52995761949021269

In [50]:
# Overwrite the values from label 'c' onwards
s1.loc['c':] = 0
s1

a    0.619299
b   -0.529958
c    0.000000
d    0.000000
e    0.000000
f    0.000000
dtype: float64

In [51]:
df1 = pd.DataFrame(np.random.randn(6, 4), 
                  index=list('abcdef'),
                  columns=list('ABCD'))
df1

Unnamed: 0,A,B,C,D
a,-1.569851,1.415394,0.612127,-0.976137
b,-1.207008,-0.041177,0.391768,-0.213627
c,-0.719906,-0.180798,-1.074607,0.266744
d,2.539821,-0.05462,-0.147434,-1.142354
e,0.912004,0.057911,-0.756624,-0.794643
f,0.985904,-0.401767,-0.041204,-0.841482


In [52]:
# Look up specific index labels (rows)
df1.loc[['a', 'b', 'd'], :]

Unnamed: 0,A,B,C,D
a,-1.569851,1.415394,0.612127,-0.976137
b,-1.207008,-0.041177,0.391768,-0.213627
d,2.539821,-0.05462,-0.147434,-1.142354


In [53]:
# Rows from 'd' onwards, columns 'A' through 'C'
df1.loc['d':, 'A':'C']

Unnamed: 0,A,B,C
d,2.539821,-0.05462,-0.147434
e,0.912004,0.057911,-0.756624
f,0.985904,-0.401767,-0.041204


In [54]:
# Equivalent to df1.xs('a')
df1.loc['a']

A   -1.569851
B    1.415394
C    0.612127
D   -0.976137
Name: a, dtype: float64

In [55]:
# Element-wise condition tested against the values of row 'a'
df1.loc['a'] > 0

A    False
B     True
C     True
D    False
Name: a, dtype: bool

In [56]:
df1.loc[:, df1.loc['a'] > 0] 

Unnamed: 0,B,C
a,1.415394,0.612127
b,-0.041177,0.391768
c,-0.180798,-1.074607
d,-0.05462,-0.147434
e,0.057911,-0.756624
f,-0.401767,-0.041204


### 4.1 Slicing with labels

In [57]:
# Integer labels deliberately left unsorted, to show how label slicing treats them.
s = pd.Series(['a', 'b', 'c', 'd', 'e'], index=[0, 3, 2, 5, 4])
s

0    a
3    b
2    c
5    d
4    e
dtype: object

In [58]:
# Label order matters for slicing: when the index is not sorted, a label slice returns
# everything positioned between the two endpoint labels. Here label 2 sits between
# labels 3 and 5 in the index, so it is part of the selection.
s.loc[3:5]


3    b
2    c
5    d
dtype: object

In [59]:
# Which is why the index can be sorted
s.sort_index()

0    a
2    c
3    b
4    e
5    d
dtype: object

In [60]:
# Sorting first and then slicing returns the labels 3 through 5 in sorted order
s.sort_index().loc[3:5]

3    b
4    e
5    d
dtype: object

### 4.2 Selection By Position

In [61]:
s1 = pd.Series(np.random.randn(5), index=list(range(0, 10, 2))); s1

0    0.221739
2    1.026768
4    0.376615
6    0.008601
8   -0.000664
dtype: float64

In [62]:
s1.iloc[:3]

0    0.221739
2    1.026768
4    0.376615
dtype: float64

In [63]:
s1.iloc[3]

0.0086014188515799034

In [64]:
s1.iloc[:3] = 0; s1

0    0.000000
2    0.000000
4    0.000000
6    0.008601
8   -0.000664
dtype: float64

In [65]:
df1 = pd.DataFrame(np.random.randn(6, 4),
                  index=list(range(0, 12, 2)),
                  columns=list(range(0, 8, 2)))
df1

Unnamed: 0,0,2,4,6
0,-1.507298,0.465877,0.190483,-1.388767
2,-0.530548,-0.526125,2.016582,-0.204417
4,-3.033768,-0.521815,-2.327173,-1.782179
6,-1.211719,0.074064,-0.059405,-1.050493
8,-0.243774,0.936565,-0.571977,-0.113823
10,0.041447,-1.070879,-0.801514,-0.855475


In [66]:
df1.iloc[:3]

Unnamed: 0,0,2,4,6
0,-1.507298,0.465877,0.190483,-1.388767
2,-0.530548,-0.526125,2.016582,-0.204417
4,-3.033768,-0.521815,-2.327173,-1.782179


In [67]:
# Combine row positions and column positions
df1.iloc[[1, 3, 4], [1, 3]]

Unnamed: 0,2,6
2,-0.526125,-0.204417
6,0.074064,-1.050493
8,0.936565,-0.113823


In [68]:
df1.iloc[1:3, :] 

Unnamed: 0,0,2,4,6
2,-0.530548,-0.526125,2.016582,-0.204417
4,-3.033768,-0.521815,-2.327173,-1.782179


In [69]:
df1.iloc[:,1:3] 

Unnamed: 0,2,4
0,0.465877,0.190483
2,-0.526125,2.016582
4,-0.521815,-2.327173
6,0.074064,-0.059405
8,0.936565,-0.571977
10,-1.070879,-0.801514


In [70]:
df1.iloc[1, 1]

-0.52612493576231456

In [71]:
df1.iloc[1]

0   -0.530548
2   -0.526125
4    2.016582
6   -0.204417
Name: 2, dtype: float64

In [72]:
# Plain Python list of six letters, used to compare list slicing with .iloc slicing.
x = ['a', 'b', 'c', 'd', 'e', 'f']
x

['a', 'b', 'c', 'd', 'e', 'f']

In [73]:
x[4:10]

['e', 'f']

In [74]:
x[8:10]

[]

In [75]:
s = pd.Series(x)
s

0    a
1    b
2    c
3    d
4    e
5    f
dtype: object

In [76]:
s.iloc[4:10]

4    e
5    f
dtype: object

In [77]:
s.iloc[8:10]

Series([], dtype: object)

In [78]:
df1 = pd.DataFrame(np.random.randn(5,2), columns=list('AB'))
df1

Unnamed: 0,A,B
0,1.107954,1.19112
1,1.151213,0.161311
2,0.544466,0.46911
3,-0.103324,-0.890721
4,1.339728,-1.482292


In [79]:
df1.iloc[:, 2:3]

0
1
2
3
4


In [80]:
df1.iloc[:, 1:3]

Unnamed: 0,B
0,1.19112
1,0.161311
2,0.46911
3,-0.890721
4,-1.482292


In [81]:
df1.iloc[4:6]

Unnamed: 0,A,B
4,1.339728,-1.482292


### 4.3 Selection By Callable

In [82]:
df1 = pd.DataFrame(np.random.randn(6, 4),
                  index=list('abcdef'),
                  columns=list('ABCD'))
df1

Unnamed: 0,A,B,C,D
a,-0.354447,0.17997,-0.687918,0.720501
b,-0.641584,1.049618,0.621509,1.257317
c,-0.293721,0.400003,-0.892102,-0.187255
d,-1.132476,-0.321653,0.00489,-0.896635
e,-0.255215,-0.044071,-1.904697,-1.074253
f,0.678106,0.141173,-1.186579,-0.797834


In [83]:
# All rows where column A is greater than zero
df1.loc[lambda df:df.A > 0, :]

Unnamed: 0,A,B,C,D
f,0.678106,0.141173,-1.186579,-0.797834


In [84]:
# Selects all rows of columns A and B
df1.loc[:, lambda df: ['A', 'B']]

Unnamed: 0,A,B
a,-0.354447,0.17997
b,-0.641584,1.049618
c,-0.293721,0.400003
d,-1.132476,-0.321653
e,-0.255215,-0.044071
f,0.678106,0.141173


In [85]:
# Same result as the label-based selection of A and B, but selecting by position
df1.iloc[:, lambda df: [0,1]]

Unnamed: 0,A,B
a,-0.354447,0.17997
b,-0.641584,1.049618
c,-0.293721,0.400003
d,-1.132476,-0.321653
e,-0.255215,-0.044071
f,0.678106,0.141173


In [86]:
df1[lambda df: df.columns[0]]

a   -0.354447
b   -0.641584
c   -0.293721
d   -1.132476
e   -0.255215
f    0.678106
Name: A, dtype: float64

In [87]:
df1.A.loc[lambda s:s > 0]

f    0.678106
Name: A, dtype: float64

# 5 Reindexing

In [88]:
s = pd.Series([1, 2, 3]); s

0    1
1    2
2    3
dtype: int64

In [89]:
s.loc[[1, 2]]

1    2
2    3
dtype: int64

In [90]:
s.loc[[1, 2, 3]]

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  """Entry point for launching an IPython kernel.


1    2.0
2    3.0
3    NaN
dtype: float64

In [91]:
# NOTE: the result of reindex is not assigned to anything, so it is discarded here;
# the trailing `; s` displays the original, unchanged Series (see the output below).
s.reindex([1, 2, 3]); s

0    1
1    2
2    3
dtype: int64

In [92]:
labels = [1, 2, 3] 
s.loc[s.index.intersection(labels)]

1    2
2    3
dtype: int64

# 6 Selecting Random Samples

In [93]:
s = pd.Series([0, 1, 2, 3, 4, 5]); s

0    0
1    1
2    2
3    3
4    4
5    5
dtype: int64

In [94]:
s.sample()

0    0
dtype: int64

In [95]:
s.sample()

5    5
dtype: int64

In [96]:
s.sample(n=3)

4    4
5    5
0    0
dtype: int64

In [97]:
s.sample(frac=0.5)

3    3
4    4
0    0
dtype: int64

In [98]:
s = pd.Series([0, 1, 2, 3, 4, 5]); s

0    0
1    1
2    2
3    3
4    4
5    5
dtype: int64

In [99]:
s.sample(n=6, replace=False)

4    4
0    0
5    5
2    2
3    3
1    1
dtype: int64

In [100]:
s.sample(n=6, replace=True)

0    0
3    3
5    5
3    3
4    4
5    5
dtype: int64

In [101]:
 example_weights = [0, 0, 0.2, 0.2, 0.2, 0.4]
s.sample(n=3, weights=example_weights)

2    2
5    5
4    4
dtype: int64

# 7 Setting With Enlargement

In [102]:
se = pd.Series([1, 2, 3]); se

0    1
1    2
2    3
dtype: int64

In [103]:
se[5] = 5
se

0    1
1    2
2    3
5    5
dtype: int64

In [104]:
# 3x2 integer frame (values 0..5, filled row by row) with columns A and B.
dfi = pd.DataFrame(np.arange(6).reshape(-1, 2), columns=list('AB'))
dfi

Unnamed: 0,A,B
0,0,1
1,2,3
2,4,5


In [105]:
dfi.loc[:,'C'] = dfi.loc[:,'A']
dfi

Unnamed: 0,A,B,C
0,0,1,0
1,2,3,2
2,4,5,4


In [106]:
dfi.loc[3] = 5

In [107]:
dfi

Unnamed: 0,A,B,C
0,0,1,0
1,2,3,2
2,4,5,4
3,5,5,5


# 8 Boolean indexing

In [108]:
s = pd.Series(range(-3, 4))
s

0   -3
1   -2
2   -1
3    0
4    1
5    2
6    3
dtype: int64

In [109]:
s[s > 0]

4    1
5    2
6    3
dtype: int64

In [110]:
# (s less than -1) OR (s greater than 0.5)
s[(s < -1) | (s > 0.5)]

0   -3
1   -2
4    1
5    2
6    3
dtype: int64

In [111]:
s[s<0]

0   -3
1   -2
2   -1
dtype: int64

In [112]:
# Negation: the opposite of s < 0
s[~(s<0)]

3    0
4    1
5    2
6    3
dtype: int64

In [113]:
df

Unnamed: 0,A,B,C,D
2000-01-01,0.337886,-0.330976,0.761819,1.745169
2000-01-02,0.321171,0.048962,-1.848967,0.278036
2000-01-03,-0.185599,-0.812127,-0.721741,-1.674371
2000-01-04,-0.122611,0.812633,-0.751333,0.416713
2000-01-05,0.279125,0.492959,1.260756,-0.730884
2000-01-06,1.694246,-0.262607,2.284932,-0.034723
2000-01-07,-1.418937,0.84037,-2.197424,0.981226
2000-01-08,-0.85548,-0.670251,0.004085,0.808429


In [114]:
# All rows whose value in column A is greater than zero
df[df['A'] > 0]

Unnamed: 0,A,B,C,D
2000-01-01,0.337886,-0.330976,0.761819,1.745169
2000-01-02,0.321171,0.048962,-1.848967,0.278036
2000-01-05,0.279125,0.492959,1.260756,-0.730884
2000-01-06,1.694246,-0.262607,2.284932,-0.034723


In [115]:
df2 = pd.DataFrame({
    'a': ['one', 'one', 'two', 'three', 'two', 'one', 'six'],
    'b': ['x', 'y', 'y', 'x', 'y', 'x', 'x'],
    'c': np.random.randn(7)
})
df2

Unnamed: 0,a,b,c
0,one,x,0.967069
1,one,y,1.34089
2,two,y,1.432307
3,three,x,-0.272445
4,two,y,-0.776809
5,one,x,-0.833244
6,six,x,0.74163


In [116]:
# Boolean mask marking the rows whose 'a' value starts with the letter 't'.
# The vectorized .str accessor replaces a per-element Python lambda.
criterion = df2['a'].str.startswith('t')
criterion

0    False
1    False
2     True
3     True
4     True
5    False
6    False
Name: a, dtype: bool

In [117]:
df2[criterion]

Unnamed: 0,a,b,c
2,two,y,1.432307
3,three,x,-0.272445
4,two,y,-0.776809


In [118]:
# Equivalent to the previous statement
df2[[x.startswith('t') for x in df2['a']]]

Unnamed: 0,a,b,c
2,two,y,1.432307
3,three,x,-0.272445
4,two,y,-0.776809


In [119]:
# Multiple criteria
df2[criterion & (df2['b'] == 'x')]

Unnamed: 0,a,b,c
3,three,x,-0.272445


# 9 Indexing with isin

In [120]:
# Values 0..4 labelled in descending order (4 down to 0), so label and value differ.
s = pd.Series(np.arange(5), index=np.arange(4, -1, -1), dtype='int64')
s

4    0
3    1
2    2
1    3
0    4
dtype: int64

In [121]:
s.isin([2, 4, 6])

4    False
3    False
2     True
1    False
0     True
dtype: bool

In [122]:
s[s.isin([2, 4, 6])]

2    2
0    4
dtype: int64

In [123]:
s[s.index.isin([2, 4, 6])]

4    0
2    2
dtype: int64

In [124]:
s.reindex([2, 4, 6])

2    2.0
4    0.0
6    NaN
dtype: float64

In [125]:
# Six values indexed by the product of the levels [0, 1] and ['a', 'b', 'c'].
s_mi = pd.Series(
    np.arange(6),
    index=pd.MultiIndex.from_product([[0, 1], list('abc')]),
)
s_mi

0  a    0
   b    1
   c    2
1  a    3
   b    4
   c    5
dtype: int32

In [126]:
s_mi.iloc[s_mi.index.isin([(1, 'a'), (2, 'b'), (0, 'c')])]

0  c    2
1  a    3
dtype: int32

In [127]:
s_mi.iloc[s_mi.index.isin(['a', 'c', 'e'], level=1)]

0  a    0
   c    2
1  a    3
   c    5
dtype: int32

In [128]:
# NOTE(review): the key 'ids' appears twice in this dict literal, so the second list
# silently replaces the first and the frame ends up with only 'ids' and 'vals'.
# The second key was presumably meant to be 'ids2' (a later cell looks up 'ids2' in
# `values`) — confirm before changing, since the recorded outputs depend on it.
df = pd.DataFrame({
    'vals': [1, 2, 3, 4],
    'ids': ['a', 'b', 'f', 'n'],
    'ids': ['a', 'n', 'c', 'n']
})
df

Unnamed: 0,ids,vals
0,a,1
1,n,2
2,c,3
3,n,4


In [129]:
values = ['a', 'b', 1, 3]

In [130]:
df.isin(values)

Unnamed: 0,ids,vals
0,True,True
1,False,False
2,False,True
3,False,False


In [131]:
values = {'ids': ['a', 'b'], 'vals': [1, 3]}

In [132]:
df.isin(values)

Unnamed: 0,ids,vals
0,True,True
1,False,False
2,False,True
3,False,False


In [133]:
values = {'ids': ['a', 'b'], 'ids2': ['a', 'c'], 'vals': [1, 3]}

In [134]:
row_mask = df.isin(values).all(1)
row_mask

0     True
1    False
2    False
3    False
dtype: bool

In [135]:
df[row_mask]

Unnamed: 0,ids,vals
0,a,1


# 10 The where() Method and Masking

In [136]:
s

4    0
3    1
2    2
1    3
0    4
dtype: int64

In [138]:
s[s > 0]

3    1
2    2
1    3
0    4
dtype: int64

In [140]:
# where() preserves the shape of the Series; entries failing the condition become NaN
s.where(s > 1)

4    NaN
3    NaN
2    2.0
1    3.0
0    4.0
dtype: float64

In [141]:
df

Unnamed: 0,ids,vals
0,a,1
1,n,2
2,c,3
3,n,4


In [142]:
df1

Unnamed: 0,A,B,C,D
a,-0.354447,0.17997,-0.687918,0.720501
b,-0.641584,1.049618,0.621509,1.257317
c,-0.293721,0.400003,-0.892102,-0.187255
d,-1.132476,-0.321653,0.00489,-0.896635
e,-0.255215,-0.044071,-1.904697,-1.074253
f,0.678106,0.141173,-1.186579,-0.797834


In [144]:
# Indexing with a boolean frame also preserves the shape
df1[df1 < 0]

Unnamed: 0,A,B,C,D
a,-0.354447,,-0.687918,
b,-0.641584,,,
c,-0.293721,,-0.892102,-0.187255
d,-1.132476,-0.321653,,-0.896635
e,-0.255215,-0.044071,-1.904697,-1.074253
f,,,-1.186579,-0.797834


In [146]:
df1.where(df1 < 0)

Unnamed: 0,A,B,C,D
a,-0.354447,,-0.687918,
b,-0.641584,,,
c,-0.293721,,-0.892102,-0.187255
d,-1.132476,-0.321653,,-0.896635
e,-0.255215,-0.044071,-1.904697,-1.074253
f,,,-1.186579,-0.797834


In [148]:
# Where the condition is False, the negated original value is used instead
df1.where(df1 < 0, -df1)

Unnamed: 0,A,B,C,D
a,-0.354447,-0.17997,-0.687918,-0.720501
b,-0.641584,-1.049618,-0.621509,-1.257317
c,-0.293721,-0.400003,-0.892102,-0.187255
d,-1.132476,-0.321653,-0.00489,-0.896635
e,-0.255215,-0.044071,-1.904697,-1.074253
f,-0.678106,-0.141173,-1.186579,-0.797834


In [149]:
s2 = s.copy()

In [151]:
s2[s2 < 0] = 0
s2

4    0
3    1
2    2
1    3
0    4
dtype: int64

In [153]:
df2 = df1.copy()
df2[df2 < 0] = 0
df2

Unnamed: 0,A,B,C,D
a,0.0,0.17997,0.0,0.720501
b,0.0,1.049618,0.621509,1.257317
c,0.0,0.400003,0.0,0.0
d,0.0,0.0,0.00489,0.0
e,0.0,0.0,0.0,0.0
f,0.678106,0.141173,0.0,0.0


In [154]:
df_orig = df1.copy()

In [159]:
df_orig.where(df1 > 0, -df1, inplace=True)
df_orig

Unnamed: 0,A,B,C,D
a,0.354447,0.17997,0.687918,0.720501
b,0.641584,1.049618,0.621509,1.257317
c,0.293721,0.400003,0.892102,0.187255
d,1.132476,0.321653,0.00489,0.896635
e,0.255215,0.044071,1.904697,1.074253
f,0.678106,0.141173,1.186579,0.797834


In [162]:
# Calling numpy's where directly produces the same result
df1.where(df1 < 0, -df1) == np.where(df1 <0, df1, -df1)

Unnamed: 0,A,B,C,D
a,True,True,True,True
b,True,True,True,True
c,True,True,True,True
d,True,True,True,True
e,True,True,True,True
f,True,True,True,True


In [163]:
df2 = df1.copy()
df2

Unnamed: 0,A,B,C,D
a,-0.354447,0.17997,-0.687918,0.720501
b,-0.641584,1.049618,0.621509,1.257317
c,-0.293721,0.400003,-0.892102,-0.187255
d,-1.132476,-0.321653,0.00489,-0.896635
e,-0.255215,-0.044071,-1.904697,-1.074253
f,0.678106,0.141173,-1.186579,-0.797834


In [164]:
# Only the rows in the positional slice 1:4 (b, c, d) take part in the mask; their
# positive entries are set to 3 and all other rows are left unchanged.
df2[ df2[1:4] > 0 ] = 3
df2

Unnamed: 0,A,B,C,D
a,-0.354447,0.17997,-0.687918,0.720501
b,-0.641584,3.0,3.0,3.0
c,-0.293721,3.0,-0.892102,-0.187255
d,-1.132476,-0.321653,3.0,-0.896635
e,-0.255215,-0.044071,-1.904697,-1.074253
f,0.678106,0.141173,-1.186579,-0.797834


In [165]:
df2 = df1.copy()
df2

Unnamed: 0,A,B,C,D
a,-0.354447,0.17997,-0.687918,0.720501
b,-0.641584,1.049618,0.621509,1.257317
c,-0.293721,0.400003,-0.892102,-0.187255
d,-1.132476,-0.321653,0.00489,-0.896635
e,-0.255215,-0.044071,-1.904697,-1.074253
f,0.678106,0.141173,-1.186579,-0.797834


In [166]:
df2.where(df2 > 0, df2['A'], axis='index')

Unnamed: 0,A,B,C,D
a,-0.354447,0.17997,-0.354447,0.720501
b,-0.641584,1.049618,0.621509,1.257317
c,-0.293721,0.400003,-0.293721,-0.293721
d,-1.132476,-1.132476,0.00489,-1.132476
e,-0.255215,-0.255215,-0.255215,-0.255215
f,0.678106,0.141173,0.678106,0.678106


In [167]:
# 3x3 integer frame: column A holds 1-3, B holds 4-6, C holds 7-9.
df3 = pd.DataFrame(dict(A=[1, 2, 3], B=[4, 5, 6], C=[7, 8, 9]))
df3

Unnamed: 0,A,B,C
0,1,4,7
1,2,5,8
2,3,6,9


In [169]:
# Keeps every value greater than 4; the others become NaN
df3.where(lambda x: x > 4)

Unnamed: 0,A,B,C
0,,,7
1,,5.0,8
2,,6.0,9


In [170]:
# The second argument supplies the replacement where the condition (> 4) is False:
# here the original value plus 10
df3.where(lambda x: x > 4, lambda x: x + 10)

Unnamed: 0,A,B,C
0,11,14,7
1,12,5,8
2,13,6,9


In [171]:
s

4    0
3    1
2    2
1    3
0    4
dtype: int64

In [172]:
s.mask(s >= 0)

4   NaN
3   NaN
2   NaN
1   NaN
0   NaN
dtype: float64

In [173]:
df1

Unnamed: 0,A,B,C,D
a,-0.354447,0.17997,-0.687918,0.720501
b,-0.641584,1.049618,0.621509,1.257317
c,-0.293721,0.400003,-0.892102,-0.187255
d,-1.132476,-0.321653,0.00489,-0.896635
e,-0.255215,-0.044071,-1.904697,-1.074253
f,0.678106,0.141173,-1.186579,-0.797834


In [175]:
df1.mask(df1 >= 0)

Unnamed: 0,A,B,C,D
a,-0.354447,,-0.687918,
b,-0.641584,,,
c,-0.293721,,-0.892102,-0.187255
d,-1.132476,-0.321653,,-0.896635
e,-0.255215,-0.044071,-1.904697,-1.074253
f,,,-1.186579,-0.797834


# 11 The query() Method (Experimental)

In [176]:
n = 10

In [182]:
df = pd.DataFrame(np.random.randn(n, 3), columns=list('abc')); df

Unnamed: 0,a,b,c
0,0.820717,0.556645,1.83169
1,-2.395832,0.206729,0.860208
2,0.10406,0.836671,1.651465
3,0.39872,0.570067,-1.701049
4,-0.072416,0.421049,-0.691881
5,0.423879,-0.680297,-0.239673
6,-0.73489,0.120235,-0.10176
7,1.243805,0.764088,-0.725253
8,-0.240856,-0.086164,-1.349885
9,0.359551,3.955407,0.442284


In [183]:
df[(df.a < df.b) & (df.b < df.c)]

Unnamed: 0,a,b,c
1,-2.395832,0.206729,0.860208
2,0.10406,0.836671,1.651465


In [187]:
df.query('(a < b) & (b < c)')

Unnamed: 0,a,b,c
1,-2.395832,0.206729,0.860208
2,0.10406,0.836671,1.651465


In [195]:
# Random integer frame with columns b and c; the upper bound of the draw is n // 2.
# Floor division keeps the bound an int, whereas `n / 2` is a float in Python 3.
df = pd.DataFrame(np.random.randint(n // 2, size=(n, 2)), columns=list('bc'))
df

Unnamed: 0,b,c
0,2,0
1,3,2
2,1,2
3,1,4
4,4,2
5,1,4
6,0,1
7,1,0
8,0,3
9,1,4


In [196]:
df.index.name = 'a'; df

Unnamed: 0_level_0,b,c
a,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2,0
1,3,2
2,1,2
3,1,4
4,4,2
5,1,4
6,0,1
7,1,0
8,0,3
9,1,4


In [198]:
df.query('a > b and b < c')

Unnamed: 0_level_0,b,c
a,Unnamed: 1_level_1,Unnamed: 2_level_1
2,1,2
3,1,4
5,1,4
6,0,1
8,0,3
9,1,4


In [199]:
df = pd.DataFrame(np.random.randint(n, size=(n, 2)), columns=list('bc'))
df

Unnamed: 0,b,c
0,5,7
1,6,9
2,5,6
3,3,5
4,8,0
5,2,5
6,5,0
7,2,5
8,3,5
9,1,8


In [200]:
df.query('index < b < c')

Unnamed: 0,b,c
0,5,7
1,6,9
2,5,6


In [201]:
df = pd.DataFrame({'a': np.random.randint(5, size=5)})
df

Unnamed: 0,a
0,4
1,3
2,0
3,3
4,2


In [203]:
df.index.name = 'a'; df

Unnamed: 0_level_0,a
a,Unnamed: 1_level_1
0,4
1,3
2,0
3,3
4,2


In [205]:
# 'a' resolves to the column named a, not to the index of the same name
df.query('a > 2')

Unnamed: 0_level_0,a
a,Unnamed: 1_level_1
0,4
1,3
3,3


In [206]:
df.query('index > 2')

Unnamed: 0_level_0,a
a,Unnamed: 1_level_1
3,3
4,2


### 11.1 MultiIndex query()

In [207]:
n = 10

In [210]:
colors = np.random.choice(['red', 'green'], size=n); colors

array(['red', 'red', 'red', 'green', 'red', 'red', 'green', 'red', 'green',
       'green'],
      dtype='<U5')

In [211]:
foods = np.random.choice(['eggs', 'ham'], size=n); foods

array(['eggs', 'eggs', 'eggs', 'ham', 'eggs', 'ham', 'ham', 'ham', 'eggs',
       'ham'],
      dtype='<U4')

In [213]:
index = pd.MultiIndex.from_arrays([colors, foods], names=['color', 'food'])
index

MultiIndex(levels=[['green', 'red'], ['eggs', 'ham']],
           labels=[[1, 1, 1, 0, 1, 1, 0, 1, 0, 0], [0, 0, 0, 1, 0, 1, 1, 1, 0, 1]],
           names=['color', 'food'])

In [214]:
df = pd.DataFrame(np.random.randn(n, 2), index=index); df

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1
color,food,Unnamed: 2_level_1,Unnamed: 3_level_1
red,eggs,0.504501,0.308099
red,eggs,0.284127,-0.301078
red,eggs,0.175139,-0.440275
green,ham,0.676975,-1.37676
red,eggs,0.365873,-0.281448
red,ham,-1.061871,-0.866456
green,ham,-0.536142,-2.899209
red,ham,0.255772,-3.018705
green,eggs,0.693401,-1.152582
green,ham,0.111968,-0.038173


In [215]:
df.query('color == "red"')

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1
color,food,Unnamed: 2_level_1,Unnamed: 3_level_1
red,eggs,0.504501,0.308099
red,eggs,0.284127,-0.301078
red,eggs,0.175139,-0.440275
red,eggs,0.365873,-0.281448
red,ham,-1.061871,-0.866456
red,ham,0.255772,-3.018705


In [216]:
df.index.names = [None, None]; df

Unnamed: 0,Unnamed: 1,0,1
red,eggs,0.504501,0.308099
red,eggs,0.284127,-0.301078
red,eggs,0.175139,-0.440275
green,ham,0.676975,-1.37676
red,eggs,0.365873,-0.281448
red,ham,-1.061871,-0.866456
green,ham,-0.536142,-2.899209
red,ham,0.255772,-3.018705
green,eggs,0.693401,-1.152582
green,ham,0.111968,-0.038173


In [217]:
df.query('ilevel_0 == "red"')

Unnamed: 0,Unnamed: 1,0,1
red,eggs,0.504501,0.308099
red,eggs,0.284127,-0.301078
red,eggs,0.175139,-0.440275
red,eggs,0.365873,-0.281448
red,ham,-1.061871,-0.866456
red,ham,0.255772,-3.018705


### 11.2 query() Use Cases

In [218]:
df = pd.DataFrame(np.random.rand(n, 3), columns=list('abc')); df

Unnamed: 0,a,b,c
0,0.683717,0.21932,0.253746
1,0.358265,0.695423,0.671867
2,0.498658,0.771475,0.926531
3,0.483084,0.388504,0.164051
4,0.174082,0.039103,0.173035
5,0.064435,0.640246,0.074634
6,0.814918,0.197132,0.213481
7,0.847691,0.90885,0.450902
8,0.343025,0.852729,0.689747
9,0.870733,0.360893,0.752112


In [219]:
df2 = pd.DataFrame(np.random.rand(n + 2, 3), columns=df.columns); df2

Unnamed: 0,a,b,c
0,0.455203,0.602751,0.081013
1,0.313924,0.613801,0.453896
2,0.682894,0.553612,0.855376
3,0.975352,0.594587,0.351525
4,0.212222,0.734058,0.891093
5,0.706012,0.323554,0.019186
6,0.021501,0.748007,0.537111
7,0.875334,0.3273,0.551592
8,0.013215,0.773567,0.13621
9,0.03983,0.96622,0.418365


In [220]:
expr = '0.0 <= a <= c <= 0.5'

In [223]:
# NOTE: in Python 3 `map` is lazy, so this cell only displays the map object and the
# queries are never evaluated; wrap the call in list(...) to see both filtered frames.
map(lambda frame: frame.query(expr), [df, df2])

<map at 0x1881ebd8a20>

### 11.3 The "in" and "not in" operators 

In [224]:
df = pd.DataFrame({
    'a': list('aabbccddeeff'),
    'b': list('aaaabbbbcccc'),
    'c': np.random.randint(5, size=12),
    'd': np.random.randint(9, size=12)
})
df

Unnamed: 0,a,b,c,d
0,a,a,0,0
1,a,a,4,3
2,b,a,4,6
3,b,a,1,2
4,c,b,4,6
5,c,b,2,3
6,d,b,0,7
7,d,b,0,1
8,e,c,2,6
9,e,c,4,3


In [225]:
df.query('a in b')

Unnamed: 0,a,b,c,d
0,a,a,0,0
1,a,a,4,3
2,b,a,4,6
3,b,a,1,2
4,c,b,4,6
5,c,b,2,3


In [226]:
# Plain pandas (without query) gives the same result
df[df.a.isin(df.b)]

Unnamed: 0,a,b,c,d
0,a,a,0,0
1,a,a,4,3
2,b,a,4,6
3,b,a,1,2
4,c,b,4,6
5,c,b,2,3


In [227]:
df.query('a not in b')

Unnamed: 0,a,b,c,d
6,d,b,0,7
7,d,b,0,1
8,e,c,2,6
9,e,c,4,3
10,f,c,2,6
11,f,c,4,5


In [228]:
# Plain pandas (without query) gives the same result
df[~df.a.isin(df.b)]

Unnamed: 0,a,b,c,d
6,d,b,0,7
7,d,b,0,1
8,e,c,2,6
9,e,c,4,3
10,f,c,2,6
11,f,c,4,5


In [229]:
df.query('a in b and c < d')

Unnamed: 0,a,b,c,d
2,b,a,4,6
3,b,a,1,2
4,c,b,4,6
5,c,b,2,3


In [234]:
# Plain pandas (without query) gives the same result
df[df.a.isin(df.b) & (df.c < df.d)]

Unnamed: 0,a,b,c,d
2,b,a,4,6
3,b,a,1,2
4,c,b,4,6
5,c,b,2,3


### 11.4 Boolean Operators

In [235]:
df = pd.DataFrame(np.random.rand(n, 3), columns=list('abc')); df

Unnamed: 0,a,b,c
0,0.40703,0.03287,0.12301
1,0.988224,0.298825,0.548394
2,0.755148,0.064187,0.89156
3,0.32009,0.988824,0.454689
4,0.485716,0.146062,0.158979
5,0.953393,0.404164,0.979055
6,0.098895,0.400052,0.378404
7,0.44393,0.515168,0.105889
8,0.169999,0.971467,0.512192
9,0.856632,0.830647,0.338432


In [236]:
df['bools'] = np.random.rand(len(df)) > 0.5

In [237]:
df

Unnamed: 0,a,b,c,bools
0,0.40703,0.03287,0.12301,False
1,0.988224,0.298825,0.548394,True
2,0.755148,0.064187,0.89156,True
3,0.32009,0.988824,0.454689,True
4,0.485716,0.146062,0.158979,True
5,0.953393,0.404164,0.979055,False
6,0.098895,0.400052,0.378404,True
7,0.44393,0.515168,0.105889,False
8,0.169999,0.971467,0.512192,False
9,0.856632,0.830647,0.338432,True


In [238]:
df.query('~bools')

Unnamed: 0,a,b,c,bools
0,0.40703,0.03287,0.12301,False
5,0.953393,0.404164,0.979055,False
7,0.44393,0.515168,0.105889,False
8,0.169999,0.971467,0.512192,False


In [239]:
df.query('not bools')

Unnamed: 0,a,b,c,bools
0,0.40703,0.03287,0.12301,False
5,0.953393,0.404164,0.979055,False
7,0.44393,0.515168,0.105889,False
8,0.169999,0.971467,0.512192,False


In [240]:
 df.query('not bools') == df[~df.bools]

Unnamed: 0,a,b,c,bools
0,True,True,True,True
5,True,True,True,True
7,True,True,True,True
8,True,True,True,True


### 11.5 Duplicate Data

In [254]:
df2 = pd.DataFrame({
    'a': ['one', 'one', 'two', 'two', 'two', 'three', 'four'],
    'b': ['x', 'y', 'x', 'y', 'x', 'x', 'x'],
    'c': np.random.randn(7)
})
df2

Unnamed: 0,a,b,c
0,one,x,1.584577
1,one,y,0.210624
2,two,x,0.097035
3,two,y,1.729868
4,two,x,0.595795
5,three,x,2.274867
6,four,x,-0.398296


In [253]:
df2.duplicated('a')

0    False
1     True
2    False
3     True
4     True
5    False
6    False
dtype: bool

In [244]:
# keep='last': every occurrence except the last one is flagged as a duplicate
df2.duplicated('a', keep='last')

0     True
1    False
2     True
3     True
4    False
5    False
6    False
dtype: bool

In [246]:
# keep=False: every row whose 'a' value occurs more than once is flagged
df2.duplicated('a', keep=False)

0     True
1     True
2     True
3     True
4     True
5    False
6    False
dtype: bool

In [247]:
df2.drop_duplicates('a')

Unnamed: 0,a,b,c
0,one,x,-0.612819
2,two,x,2.000303
5,three,x,0.499208
6,four,x,-0.673312


In [248]:
df2.drop_duplicates('a', keep='last')

Unnamed: 0,a,b,c
1,one,y,-0.339502
4,two,x,-2.026667
5,three,x,0.499208
6,four,x,-0.673312


In [249]:
df2.drop_duplicates('a', keep=False)

Unnamed: 0,a,b,c
5,three,x,0.499208
6,four,x,-0.673312


In [255]:
df2.duplicated(['a', 'b'])

0    False
1    False
2    False
3    False
4     True
5    False
6    False
dtype: bool

In [257]:
df2.drop_duplicates(['a', 'b'])

Unnamed: 0,a,b,c
0,one,x,1.584577
1,one,y,0.210624
2,two,x,0.097035
3,two,y,1.729868
5,three,x,2.274867
6,four,x,-0.398296


In [258]:
df3 = pd.DataFrame({
    'a': np.arange(6),
    'b': np.random.randn(6)
}, index=['a', 'a', 'b', 'c', 'b', 'a'])
df3

Unnamed: 0,a,b
a,0,0.808848
a,1,0.037349
b,2,-0.364358
c,3,-0.374786
b,4,-1.5215
a,5,2.231923


In [259]:
df3.index.duplicated()

array([False,  True, False, False,  True,  True], dtype=bool)

In [260]:
df3[~df3.index.duplicated()]

Unnamed: 0,a,b
a,0,0.808848
b,2,-0.364358
c,3,-0.374786


In [261]:
df3[~df3.index.duplicated(keep='last')]

Unnamed: 0,a,b
c,3,-0.374786
b,4,-1.5215
a,5,2.231923


In [262]:
df3[~df3.index.duplicated(keep=False)]

Unnamed: 0,a,b
c,3,-0.374786
