# Indexing y Multi-indexing

En este libro vamos a repasar indexing y selección para después pasar a un tema más complejo que se denomina multi-indexing 

In [2]:
import pandas as pd
import numpy as np

## Repaso: Indexing y Selección

### Basics

In [3]:
dates = pd.date_range('1/1/2000', periods=8)

In [4]:
df = pd.DataFrame(np.random.randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D'])
df

Unnamed: 0,A,B,C,D
2000-01-01,-1.385548,0.333766,0.478162,-0.755252
2000-01-02,0.232174,0.526203,-0.614703,-0.750943
2000-01-03,-0.700666,-0.043063,0.365381,0.135293
2000-01-04,0.167689,0.306283,-0.499565,0.554072
2000-01-05,0.160107,-1.357551,-0.724843,-2.055395
2000-01-06,-1.006647,0.314039,-0.636161,-0.725341
2000-01-07,-1.234014,-1.8923,1.58538,-1.4472
2000-01-08,-0.617764,0.848052,-0.167808,0.673527


In [10]:
panel = pd.Panel({'one':df,'two':df-df.mean()})
panel

<class 'pandas.core.panel.Panel'>
Dimensions: 2 (items) x 8 (major_axis) x 4 (minor_axis)
Items axis: one to two
Major_axis axis: 2000-01-01 00:00:00 to 2000-01-08 00:00:00
Minor_axis axis: A to D

#### Seleccionar por etiqueta

In [11]:
s = df['A'] #Seleccionar en un dataframe
s

2000-01-01   -1.385548
2000-01-02    0.232174
2000-01-03   -0.700666
2000-01-04    0.167689
2000-01-05    0.160107
2000-01-06   -1.006647
2000-01-07   -1.234014
2000-01-08   -0.617764
Freq: D, Name: A, dtype: float64

In [12]:
s[dates[5]] #Seleccionar en una serie

-1.0066472656720473

In [13]:
panel['one'] #Seleccionar en un panel

Unnamed: 0,A,B,C,D
2000-01-01,-1.385548,0.333766,0.478162,-0.755252
2000-01-02,0.232174,0.526203,-0.614703,-0.750943
2000-01-03,-0.700666,-0.043063,0.365381,0.135293
2000-01-04,0.167689,0.306283,-0.499565,0.554072
2000-01-05,0.160107,-1.357551,-0.724843,-2.055395
2000-01-06,-1.006647,0.314039,-0.636161,-0.725341
2000-01-07,-1.234014,-1.8923,1.58538,-1.4472
2000-01-08,-0.617764,0.848052,-0.167808,0.673527


#### Filas y columnas por etiqueta: loc

In [14]:
s1 = pd.Series(np.random.randn(6),index=list('abcdef'))
s1

a    0.291057
b    0.689482
c    0.734399
d    0.706290
e   -0.071455
f    0.038736
dtype: float64

In [15]:
s1.loc['c':]

c    0.734399
d    0.706290
e   -0.071455
f    0.038736
dtype: float64

In [16]:
df1 = pd.DataFrame(np.random.randn(6,4),index=list('abcdef'),columns=list('ABCD'))
df1

Unnamed: 0,A,B,C,D
a,0.954572,-1.075148,0.805682,-2.242652
b,0.129024,1.770631,-2.005802,-1.079239
c,0.104996,-1.239516,0.255903,-0.010419
d,1.611956,-0.81938,-2.454132,-0.300352
e,-1.907964,-1.062856,0.11213,-0.861355
f,0.021526,0.022782,0.353548,-0.105166


In [17]:
df1.loc['d':, 'A':'C']

Unnamed: 0,A,B,C
d,1.611956,-0.81938,-2.454132
e,-1.907964,-1.062856,0.11213
f,0.021526,0.022782,0.353548


#### Filas y columnas por posición: iloc

In [18]:
s1 = pd.Series(np.random.randn(5), index=list(range(0,10,2)))
s1

0    0.901284
2    1.823950
4   -0.181943
6    0.656310
8   -0.835871
dtype: float64

In [19]:
s1.iloc[:3]

0    0.901284
2    1.823950
4   -0.181943
dtype: float64

In [20]:
df1 = pd.DataFrame(np.random.randn(6,4),index=list(range(0,12,2)),columns=list(range(0,8,2)))
df1

Unnamed: 0,0,2,4,6
0,-0.413759,0.314202,1.219957,0.400629
2,-0.27139,-0.776967,0.873421,1.69335
4,1.313954,-0.061396,-0.96754,1.798445
6,-2.270469,-1.506359,0.79842,1.58017
8,0.666926,0.597965,1.300862,0.965257
10,0.639399,-0.723598,1.378302,-0.771591


In [16]:
df1.iloc[:3]

Unnamed: 0,0,2,4,6
0,-1.413857,-0.501133,-0.932773,-1.6883
2,0.37963,0.070933,1.691123,0.528713
4,-1.593157,-0.218655,1.019817,0.522174


In [21]:
df1.iloc[1:5, 2:4]

Unnamed: 0,4,6
2,0.873421,1.69335
4,-0.96754,1.798445
6,0.79842,1.58017
8,1.300862,0.965257


In [22]:
df1.iloc[[1, 3, 5], [1, 3]]

Unnamed: 0,2,6
2,-0.776967,1.69335
6,-1.506359,1.58017
10,-0.723598,-0.771591


#### Selección por una función

In [23]:
df1 = pd.DataFrame(np.random.randn(6, 4),index=list('abcdef'),columns=list('ABCD'))
df1

Unnamed: 0,A,B,C,D
a,0.389052,0.859056,-0.436732,0.072396
b,-0.24897,0.032571,0.712498,0.854963
c,1.039431,2.063472,-0.473162,1.884896
d,1.415044,0.541205,0.217151,1.041558
e,-0.850487,0.206528,0.899416,0.571748
f,-1.089972,0.40914,-0.786791,-1.622863


In [24]:
df1.loc[lambda df: df['A'] > 0, :]

Unnamed: 0,A,B,C,D
a,0.389052,0.859056,-0.436732,0.072396
c,1.039431,2.063472,-0.473162,1.884896
d,1.415044,0.541205,0.217151,1.041558


In [25]:
df1.loc[:, lambda df: ['A', 'B']]

Unnamed: 0,A,B
a,0.389052,0.859056
b,-0.24897,0.032571
c,1.039431,2.063472
d,1.415044,0.541205
e,-0.850487,0.206528
f,-1.089972,0.40914


In [26]:
df1['A'].loc[lambda s: s > 0]

a    0.389052
c    1.039431
d    1.415044
Name: A, dtype: float64

#### Seleccionar muestras aleatorias

In [27]:
s = pd.Series([0,1,2,3,4,5])
s

0    0
1    1
2    2
3    3
4    4
5    5
dtype: int64

In [28]:
s.sample(n=4) #Seleccionar una muestra aleatoria de tamaño 4

4    4
1    1
0    0
3    3
dtype: int64

In [29]:
s.sample(frac=0.5) #Seleccionar la mitad de las observaciones

3    3
5    5
2    2
dtype: int64

Estas operaciones se pueden hacer sin reemplazo (default) o con reemplazo

In [30]:
s.sample(n=6, replace=False) # Explícitamente sin reemplazo

4    4
1    1
5    5
3    3
0    0
2    2
dtype: int64

In [31]:
s.sample(n=6, replace=True) # Explícitamente con reemplazo

0    0
3    3
3    3
2    2
0    0
2    2
dtype: int64

Por último podemos colocar pesos específicos para la probabilidad de esas filas de ser elegidas con el argumento 'weights'

In [32]:
example_weights = [0, 0, 0.2, 0.2, 0.2, 0.4]

In [36]:
s.sample(n=3, weights=example_weights)

4    4
2    2
5    5
dtype: int64

#### Seleccionar y definir valores escalares

Si bien podemos seleccionar y definir valores usando loc o iloc, la manera más eficiente computacionalmente es utilizando los métodos **at** e **iat**

Vemos por ejemplo que

In [37]:
s.at[4]

4

In [38]:
df

Unnamed: 0,A,B,C,D
2000-01-01,-1.385548,0.333766,0.478162,-0.755252
2000-01-02,0.232174,0.526203,-0.614703,-0.750943
2000-01-03,-0.700666,-0.043063,0.365381,0.135293
2000-01-04,0.167689,0.306283,-0.499565,0.554072
2000-01-05,0.160107,-1.357551,-0.724843,-2.055395
2000-01-06,-1.006647,0.314039,-0.636161,-0.725341
2000-01-07,-1.234014,-1.8923,1.58538,-1.4472
2000-01-08,-0.617764,0.848052,-0.167808,0.673527


In [39]:
df.at[dates[3],'D']

0.554071624420442

Nota:

In [40]:
dates[3]

Timestamp('2000-01-04 00:00:00', freq='D')

In [41]:
type(dates[3])

pandas._libs.tslib.Timestamp

Veremos como manejar estos objetos en otro libro de Pandas

Manejarlos por índice funciona del siguiente modo

In [42]:
s.iat[5]

5

In [43]:
df.iat[3, 0]

0.16768944004778744

#### Seleccionar con booleanos: casos especiales

El caso canónico que hemos visto es el siguiente

In [44]:
df[df['A'] > 0]

Unnamed: 0,A,B,C,D
2000-01-02,0.232174,0.526203,-0.614703,-0.750943
2000-01-04,0.167689,0.306283,-0.499565,0.554072
2000-01-05,0.160107,-1.357551,-0.724843,-2.055395


Aquí veremos algunos casos más complejos en los que usamos lógica para filtrar un dataframe

In [45]:
df2 = pd.DataFrame({'a' : ['one', 'one', 'two', 'three', 'two', 'one', 'six'],
                           'b' : ['x', 'y', 'y', 'x', 'y', 'x', 'x'],
                            'c' : np.random.randn(7)})

In [46]:
df2

Unnamed: 0,a,b,c
0,one,x,-0.299466
1,one,y,1.43278
2,two,y,1.047035
3,three,x,-0.071687
4,two,y,0.359552
5,one,x,1.125888
6,six,x,1.293806


In [81]:
df2[[x.startswith('t') for x in df2['a']]]

Unnamed: 0,a,b,c
2,two,y,0.371906
3,three,x,0.176173
4,two,y,-0.273691


O haciendolo más complejo podemos escribir

In [84]:
df2[([x.startswith('t') for x in df2['a']]) & (df2['b']=='x')]

Unnamed: 0,a,b,c
3,three,x,0.176173


#### Seleccionando con el método 'isin'

In [48]:
s = pd.Series(np.arange(5), index=np.arange(5)[::-1], dtype='int64')
s

4    0
3    1
2    2
1    3
0    4
dtype: int64

In [49]:
s.isin([2, 4, 6])

4    False
3    False
2     True
1    False
0     True
dtype: bool

In [50]:
s[s.isin([2, 4, 6])]

2    2
0    4
dtype: int64

#### Introduciendo Multi-indexing

In [52]:
s_mi = pd.Series(np.arange(6),
                    index=pd.MultiIndex.from_product([[0, 1], ['a', 'b', 'c']]))
s_mi

0  a    0
   b    1
   c    2
1  a    3
   b    4
   c    5
dtype: int64

In [44]:
s_mi.iloc[s_mi.index.isin([(1, 'a'), (2, 'b'), (0, 'c')])] #el método multi-index isin acepta una lista de secuencias

0  c    2
1  a    3
dtype: int64

In [95]:
s_mi.iloc[s_mi.index.isin(['a', 'c', 'e'], level=1)]

0  a    0
   c    2
1  a    3
   c    5
dtype: int64

#### Listas y diccionarios como argumentos de isin

In [54]:
df = pd.DataFrame({'vals': [1, 2, 3, 4], 'ids': ['a', 'b', 'f', 'n'], 'ids2': ['a', 'n', 'c', 'n']})
df

Unnamed: 0,ids,ids2,vals
0,a,a,1
1,b,n,2
2,f,c,3
3,n,n,4


In [55]:
lista_values = ['a', 'b', 1, 3]
lista_values

['a', 'b', 1, 3]

In [56]:
df.isin(lista_values)

Unnamed: 0,ids,ids2,vals
0,True,True,True
1,True,False,False
2,False,False,True
3,False,False,False


Alternativamente para un diccinario podemos buscar valores específicos en columnas específicas

In [58]:
dict_values = {'ids': ['a', 'b'], 'vals': [1, 3]}

In [59]:
df.isin(dict_values)

Unnamed: 0,ids,ids2,vals
0,True,False,True
1,True,False,False
2,False,False,True
3,False,False,False


In [60]:
%store df

Stored 'df' (DataFrame)
