# Indexing y Multi-indexing

En este libro vamos a repasar indexing y selección, para después pasar a un tema más complejo: el multi-indexing.

In [2]:
import pandas as pd
import numpy as np

## Repaso: Indexing y Selección

### Basics

In [3]:
# Eight consecutive daily timestamps starting at 1 January 2000.
dates = pd.date_range(start='1/1/2000', periods=8, freq='D')

In [4]:
# Seed NumPy's global RNG before the first random draw so that, when the cells
# are run top to bottom, the random data is the same on every fresh run.
np.random.seed(42)
# 8x4 frame of standard-normal draws: one row per entry of `dates`, columns A-D.
df = pd.DataFrame(np.random.randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D'])
df

Unnamed: 0,A,B,C,D
2000-01-01,1.082712,-0.498114,0.488012,2.412402
2000-01-02,-0.308787,0.905746,-0.451486,0.864329
2000-01-03,1.741558,0.210442,-0.163237,-0.081883
2000-01-04,-0.819779,1.992856,-0.43031,-0.870303
2000-01-05,0.148878,1.859134,-0.231081,1.108654
2000-01-06,-0.167095,-0.708631,0.31359,0.484786
2000-01-07,-0.992399,-0.039384,-0.4191,1.559865
2000-01-08,1.128518,-0.266641,-0.537932,0.490813


In [5]:
# Build a 3-D Panel from two frames: 'one' is df itself, 'two' is df with each
# column's mean subtracted.
# NOTE(review): pd.Panel was deprecated and later removed from pandas (see the
# pandas release notes), so this cell only runs on old pandas versions.
panel = pd.Panel({'one':df,'two':df-df.mean()})
panel

<class 'pandas.core.panel.Panel'>
Dimensions: 2 (items) x 8 (major_axis) x 4 (minor_axis)
Items axis: one to two
Major_axis axis: 2000-01-01 00:00:00 to 2000-01-08 00:00:00
Minor_axis axis: A to D

#### Seleccionar por etiqueta

In [6]:
s = df['A']  # Select a column from a DataFrame by its label
s

2000-01-01    1.082712
2000-01-02   -0.308787
2000-01-03    1.741558
2000-01-04   -0.819779
2000-01-05    0.148878
2000-01-06   -0.167095
2000-01-07   -0.992399
2000-01-08    1.128518
Freq: D, Name: A, dtype: float64

In [7]:
s[dates[5]]  # Select a single value from a Series by its index label

-0.16709523770606061

In [8]:
panel['one']  # Select one item from a Panel by its key

Unnamed: 0,A,B,C,D
2000-01-01,1.082712,-0.498114,0.488012,2.412402
2000-01-02,-0.308787,0.905746,-0.451486,0.864329
2000-01-03,1.741558,0.210442,-0.163237,-0.081883
2000-01-04,-0.819779,1.992856,-0.43031,-0.870303
2000-01-05,0.148878,1.859134,-0.231081,1.108654
2000-01-06,-0.167095,-0.708631,0.31359,0.484786
2000-01-07,-0.992399,-0.039384,-0.4191,1.559865
2000-01-08,1.128518,-0.266641,-0.537932,0.490813


#### Filas y columnas por etiqueta: loc

In [9]:
# Six random normals labelled 'a' through 'f' for the label-based examples.
s1 = pd.Series(
    data=np.random.randn(6),
    index=list('abcdef'),
)
s1

a    0.633684
b   -0.726289
c    1.046788
d    0.755937
e   -0.086696
f   -0.384969
dtype: float64

In [10]:
# Label slice with an open end: from label 'c' through the last label.
s1.loc['c':]

c    1.046788
d    0.755937
e   -0.086696
f   -0.384969
dtype: float64

In [11]:
# 6x4 frame of random normals with row labels a-f and column labels A-D.
df1 = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=list('abcdef'),
    columns=list('ABCD'),
)
df1

Unnamed: 0,A,B,C,D
a,0.619074,-0.701089,1.417736,0.301368
b,-1.016768,-0.118694,-0.561012,0.383509
c,0.168499,0.688955,-1.422907,-0.505661
d,-2.735321,0.118948,-0.212234,-0.249859
e,0.118151,1.152005,0.274484,0.540116
f,-0.851055,0.182887,1.403356,0.004591


In [12]:
# .loc label slices include both endpoints: rows from 'd' through the last
# label, columns from 'A' through 'C'.
df1.loc['d':, 'A':'C']

Unnamed: 0,A,B,C
d,-2.735321,0.118948,-0.212234
e,0.118151,1.152005,0.274484
f,-0.851055,0.182887,1.403356


#### Filas y columnas por posición: iloc

In [13]:
# Five random normals with even integer labels, so label and position differ.
s1 = pd.Series(np.random.randn(5), index=[0, 2, 4, 6, 8])
s1

0   -0.174819
2   -0.136379
4    0.699752
6    0.829023
8    0.289819
dtype: float64

In [14]:
# Positional slice: the first three elements, regardless of their labels.
s1.iloc[:3]

0   -0.174819
2   -0.136379
4    0.699752
dtype: float64

In [15]:
# 6x4 frame of random normals whose row and column labels are even integers.
df1 = pd.DataFrame(
    np.random.randn(6, 4),
    index=[0, 2, 4, 6, 8, 10],
    columns=[0, 2, 4, 6],
)
df1

Unnamed: 0,0,2,4,6
0,-1.413857,-0.501133,-0.932773,-1.6883
2,0.37963,0.070933,1.691123,0.528713
4,-1.593157,-0.218655,1.019817,0.522174
6,1.430354,-0.160502,0.893263,0.627332
8,0.343327,1.911004,2.021934,0.092868
10,-1.438014,1.191018,-0.624401,-0.489499


In [16]:
# First three rows by position.
df1.iloc[:3]

Unnamed: 0,0,2,4,6
0,-1.413857,-0.501133,-0.932773,-1.6883
2,0.37963,0.070933,1.691123,0.528713
4,-1.593157,-0.218655,1.019817,0.522174


In [17]:
# Positional slices exclude the end: rows at positions 1-4, columns at positions 2-3.
df1.iloc[1:5, 2:4]

Unnamed: 0,4,6
2,1.691123,0.528713
4,1.019817,0.522174
6,0.893263,0.627332
8,2.021934,0.092868


In [18]:
# Lists of positions: rows 1, 3 and 5, columns 1 and 3.
df1.iloc[[1, 3, 5], [1, 3]]

Unnamed: 0,2,6
2,0.070933,0.528713
6,-0.160502,0.627332
10,1.191018,-0.489499


#### Selección por una función

In [19]:
# Fresh 6x4 frame of random normals (rows a-f, columns A-D) for the callable examples.
df1 = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=list('abcdef'),
    columns=list('ABCD'),
)
df1

Unnamed: 0,A,B,C,D
a,-1.37741,-1.300673,2.11695,-0.520631
b,-1.173102,-0.135465,0.889631,1.760113
c,1.386049,-0.405735,-0.774647,-0.892901
d,-1.090596,-0.210852,0.974558,-0.194796
e,-0.185026,-0.974662,-3.447973,-1.873164
f,0.801013,-0.37243,-0.572179,1.257244


In [20]:
# .loc accepts a callable: it receives the frame being indexed and here returns
# a boolean row mask (rows where column 'A' is positive).
df1.loc[lambda df: df['A'] > 0, :]

Unnamed: 0,A,B,C,D
c,1.386049,-0.405735,-0.774647,-0.892901
f,0.801013,-0.37243,-0.572179,1.257244


In [21]:
# The callable may also return column labels; this one ignores its argument and
# selects columns 'A' and 'B'.
df1.loc[:, lambda df: ['A', 'B']]

Unnamed: 0,A,B
a,-1.37741,-1.300673
b,-1.173102,-0.135465
c,1.386049,-0.405735
d,-1.090596,-0.210852
e,-0.185026,-0.974662
f,0.801013,-0.37243


In [22]:
# Same idea on a Series: the callable receives the Series and returns a boolean mask.
df1['A'].loc[lambda s: s > 0]

c    1.386049
f    0.801013
Name: A, dtype: float64

#### Seleccionar muestras aleatorias

In [23]:
# Integers 0..5 with the default integer index, used by the sampling examples.
s = pd.Series(list(range(6)))
s

0    0
1    1
2    2
3    3
4    4
5    5
dtype: int64

In [24]:
s.sample(n=4)  # Draw a random sample of size 4

3    3
5    5
1    1
4    4
dtype: int64

In [25]:
s.sample(frac=0.5)  # Draw half of the observations

3    3
2    2
0    0
dtype: int64

Estas operaciones se pueden hacer sin reemplazo (default) o con reemplazo

In [26]:
s.sample(n=6, replace=False)  # Explicitly without replacement

1    1
2    2
3    3
0    0
4    4
5    5
dtype: int64

In [27]:
s.sample(n=6, replace=True)  # Explicitly with replacement

2    2
5    5
2    2
0    0
4    4
0    0
dtype: int64

Por último podemos colocar pesos específicos para la probabilidad de esas filas de ser elegidas con el argumento 'weights'

In [28]:
# Six sampling weights: the first two entries are zero, then three of 0.2 and one of 0.4.
example_weights = [0, 0] + [0.2] * 3 + [0.4]

In [29]:
# Weighted sampling: `weights` supplies one selection weight per row, so rows
# with weight 0 are never drawn.
s.sample(n=3, weights=example_weights)

4    4
5    5
2    2
dtype: int64

#### Seleccionar y definir valores escalares

Si bien podemos seleccionar y definir valores usando loc o iloc, la manera más eficiente computacionalmente de acceder a un único valor escalar es utilizando los accesores **at** (por etiqueta) e **iat** (por posición).

Vemos por ejemplo que

In [62]:
# .at looks up a single scalar by index label.
s.at[4]

4

In [64]:
# Re-display df so the scalar lookups that follow can be checked against it.
df

Unnamed: 0,A,B,C,D
2000-01-01,-0.686828,-1.114131,0.104952,-0.289503
2000-01-02,0.947504,-0.397911,-0.490816,-0.896818
2000-01-03,-0.868306,-0.06689,-0.237893,0.512618
2000-01-04,-1.74535,-0.449802,-0.304172,-0.5737
2000-01-05,0.269781,-0.193423,0.825389,1.031012
2000-01-06,-0.907934,0.268942,1.090905,0.119219
2000-01-07,-0.698249,-0.757163,-0.980433,0.770444
2000-01-08,0.967514,-1.439328,0.368967,0.156992


In [70]:
# Scalar at the row labelled dates[3] and the column labelled 'D'.
df.at[dates[3],'D'] 

-0.57370015722489442

Nota:

In [71]:
# Show the index element that was used as the row label.
dates[3]

Timestamp('2000-01-04 00:00:00', freq='D')

In [72]:
# Inspect the Python type of that index element.
type(dates[3])

pandas.tslib.Timestamp

Veremos cómo manejar estos objetos en otro libro de Pandas.

Acceder por posición entera (con **iat**) funciona del siguiente modo:

In [30]:
# .iat looks up a single scalar by integer position.
s.iat[5]

5

In [31]:
# Scalar at row position 3, column position 0.
df.iat[3, 0]

-0.8197792240410835

#### Seleccionar con booleanos: casos especiales

El caso canónico que hemos visto es el siguiente

In [76]:
# Boolean indexing: keep only the rows where column 'A' is positive.
df[df['A'] > 0]

Unnamed: 0,A,B,C,D,E
2000-01-02,0.947504,-0.397911,-0.490816,-0.896818,
2000-01-05,0.269781,-0.193423,0.825389,1.031012,
2000-01-08,0.967514,-1.439328,0.368967,0.156992,


Aquí veremos algunos casos más complejos en los que usamos lógica para filtrar un dataframe

In [32]:
# Demo frame: two string columns ('a', 'b') and one column of random normals ('c').
df2 = pd.DataFrame(data={
    'a': ['one', 'one', 'two', 'three', 'two', 'one', 'six'],
    'b': ['x', 'y', 'y', 'x', 'y', 'x', 'x'],
    'c': np.random.randn(7),
})

In [33]:
# Display the demo frame used by the boolean-filter examples that follow.
df2

Unnamed: 0,a,b,c
0,one,x,-1.28791
1,one,y,1.285529
2,two,y,0.483912
3,three,x,-1.592708
4,two,y,-0.510384
5,one,x,2.856532
6,six,x,1.103245


In [81]:
# Keep the rows whose 'a' value starts with 't'. The vectorized .str accessor
# builds the boolean mask as a Series instead of looping over the column in Python.
df2[df2['a'].str.startswith('t')]

Unnamed: 0,a,b,c
2,two,y,0.371906
3,three,x,0.176173
4,two,y,-0.273691


O, haciéndolo más complejo, podemos escribir

In [84]:
# Combine two conditions with &: 'a' starts with 't' AND 'b' equals 'x'.
# Both operands are boolean Series, so the mask does not depend on the reflected
# `list & Series` operation, which newer pandas versions deprecate.
df2[df2['a'].str.startswith('t') & (df2['b'] == 'x')]

Unnamed: 0,a,b,c
3,three,x,0.176173


#### Seleccionando con el método 'isin'

In [35]:
# Values 0..4 stored under the descending labels 4..0, so labels and values differ.
s = pd.Series(data=np.arange(5), index=np.arange(4, -1, -1), dtype='int64')
s

4    0
3    1
2    2
1    3
0    4
dtype: int64

In [36]:
# Element-wise membership test on the values (not the index labels); 6 matches nothing.
s.isin([2, 4, 6])

4    False
3    False
2     True
1    False
0     True
dtype: bool

In [88]:
# Use the boolean result of isin as a mask to keep only the matching values.
s[s.isin([2, 4, 6])]

2    2
0    4
dtype: int64

#### Introduciendo Multi-indexing

In [42]:
# Two-level index: every combination of the outer keys 0/1 with the inner keys a/b/c.
mi_levels = [[0, 1], ['a', 'b', 'c']]
s_mi = pd.Series(np.arange(6), index=pd.MultiIndex.from_product(mi_levels))
s_mi

0  a    0
   b    1
   c    2
1  a    3
   b    4
   c    5
dtype: int64

In [44]:
# MultiIndex.isin accepts a list of key tuples; a tuple that is not in the index
# (here (2, 'b')) simply matches nothing.
wanted_keys = [(1, 'a'), (2, 'b'), (0, 'c')]
s_mi.iloc[s_mi.index.isin(wanted_keys)]

0  c    2
1  a    3
dtype: int64

In [95]:
# Restrict the membership test to the second index level (level=1); 'e' matches nothing.
s_mi.iloc[s_mi.index.isin(['a', 'c', 'e'], level=1)]

0  a    0
   c    2
1  a    3
   c    5
dtype: int64

#### Listas y diccionarios como argumentos de isin

In [45]:
# Small frame for the DataFrame.isin examples.
# NOTE: this rebinds `df`, which earlier cells used for the date-indexed frame.
df = pd.DataFrame({
    'vals': [1, 2, 3, 4],
    'ids': ['a', 'b', 'f', 'n'],
    'ids2': ['a', 'n', 'c', 'n'],
})
df

Unnamed: 0,ids,ids2,vals
0,a,a,1
1,b,n,2
2,f,c,3
3,n,n,4


In [46]:
# Mixed list of strings and integers to look up in the frame.
lista_values = ['a', 'b', 1, 3]
lista_values

['a', 'b', 1, 3]

In [102]:
# With a list, every cell of every column is tested for membership in that list.
df.isin(lista_values)

Unnamed: 0,ids,ids2,vals
0,True,True,True
1,True,False,False
2,False,False,True
3,False,False,False


Alternativamente, con un diccionario podemos buscar valores específicos en columnas específicas

In [103]:
# Per-column lookup values: column name -> list of values to match in that column.
dict_values = dict(ids=['a', 'b'], vals=[1, 3])

In [104]:
# With a dict, each column is tested only against the values listed under its
# own key; columns that are not keys of the dict come back all False.
df.isin(dict_values)

Unnamed: 0,ids,ids2,vals
0,True,False,True
1,True,False,False
2,False,False,True
3,False,False,False


In [47]:
# Persist `df` with IPython's %store magic so another session can restore it
# with `%store -r df`.
%store df

Stored 'df' (DataFrame)
