# Indexing
https://pandas.pydata.org/pandas-docs/stable/10min.html#min

In [1]:
import pandas as pd
import numpy as np

In [3]:
# Make sure resuls can be replicated
np.random.seed(0)
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2013-01-01,1.764052,0.400157,0.978738,2.240893
2013-01-02,1.867558,-0.977278,0.950088,-0.151357
2013-01-03,-0.103219,0.410599,0.144044,1.454274
2013-01-04,0.761038,0.121675,0.443863,0.333674
2013-01-05,1.494079,-0.205158,0.313068,-0.854096
2013-01-06,-2.55299,0.653619,0.864436,-0.742165


In [4]:
list('ABCD')

['A', 'B', 'C', 'D']

## 1. [] indexing

In [None]:
# a single column

In [8]:
# multiple columns
df[['A','B']]

Unnamed: 0,A,B
2013-01-01,1.764052,0.400157
2013-01-02,1.867558,-0.977278
2013-01-03,-0.103219,0.410599
2013-01-04,0.761038,0.121675
2013-01-05,1.494079,-0.205158
2013-01-06,-2.55299,0.653619


In [21]:
# row(s)
# : is a must
df[1:2]

Unnamed: 0,A,B,C,D
2013-01-02,1.867558,-0.977278,0.950088,-0.151357


In [23]:
df['2013-01-01':'2013-01-03']

Unnamed: 0,A,B,C,D
2013-01-01,1.764052,0.400157,0.978738,2.240893
2013-01-02,1.867558,-0.977278,0.950088,-0.151357
2013-01-03,-0.103219,0.410599,0.144044,1.454274


In [26]:
df[dates[0]:dates[2]]

Unnamed: 0,A,B,C,D
2013-01-01,1.764052,0.400157,0.978738,2.240893
2013-01-02,1.867558,-0.977278,0.950088,-0.151357
2013-01-03,-0.103219,0.410599,0.144044,1.454274


In [14]:
# boolean
df[df>0]

Unnamed: 0,A,B,C,D
2013-01-01,1.764052,0.400157,0.978738,2.240893
2013-01-02,1.867558,,0.950088,
2013-01-03,,0.410599,0.144044,1.454274
2013-01-04,0.761038,0.121675,0.443863,0.333674
2013-01-05,1.494079,,0.313068,
2013-01-06,,0.653619,0.864436,


In [18]:
df[df.A>0]

Unnamed: 0,A,B,C,D
2013-01-01,1.764052,0.400157,0.978738,2.240893
2013-01-02,1.867558,-0.977278,0.950088,-0.151357
2013-01-04,0.761038,0.121675,0.443863,0.333674
2013-01-05,1.494079,-0.205158,0.313068,-0.854096


In [20]:
type(df.A>0)

pandas.core.series.Series

## 2. Label-based Indexing (loc)

In [31]:
# select all columns of certain rows
df.loc[dates[:3],:]

Unnamed: 0,A,B,C,D
2013-01-01,1.764052,0.400157,0.978738,2.240893
2013-01-02,1.867558,-0.977278,0.950088,-0.151357
2013-01-03,-0.103219,0.410599,0.144044,1.454274


In [38]:
df.loc[[dates[0],dates[2]]]

Unnamed: 0,A,B,C,D
2013-01-01,1.764052,0.400157,0.978738,2.240893
2013-01-03,-0.103219,0.410599,0.144044,1.454274


In [40]:
# select all rows of certain columns
df.loc[:, 'A':'C']

Unnamed: 0,A,B,C
2013-01-01,1.764052,0.400157,0.978738
2013-01-02,1.867558,-0.977278,0.950088
2013-01-03,-0.103219,0.410599,0.144044
2013-01-04,0.761038,0.121675,0.443863
2013-01-05,1.494079,-0.205158,0.313068
2013-01-06,-2.55299,0.653619,0.864436


In [44]:
# select certain columns of certain rows
df.loc[[df.index[0], df.index[2]],['B','D']]

Unnamed: 0,B,D
2013-01-01,0.400157,2.240893
2013-01-03,0.410599,1.454274


In [47]:
# labels can be strings
df.index = [str(e) for e in df.index]

In [48]:
df.index

Index(['2013-01-01 00:00:00', '2013-01-02 00:00:00', '2013-01-03 00:00:00',
       '2013-01-04 00:00:00', '2013-01-05 00:00:00', '2013-01-06 00:00:00'],
      dtype='object')

In [54]:
# labels can be integers (not the same as positions)
df.index = list(range(2,8))

In [58]:
df.index

Int64Index([2, 3, 4, 5, 6, 7], dtype='int64')

In [59]:
df

Unnamed: 0,A,B,C,D
2,1.764052,0.400157,0.978738,2.240893
3,1.867558,-0.977278,0.950088,-0.151357
4,-0.103219,0.410599,0.144044,1.454274
5,0.761038,0.121675,0.443863,0.333674
6,1.494079,-0.205158,0.313068,-0.854096
7,-2.55299,0.653619,0.864436,-0.742165


In [57]:
# 2 is treated as a label by loc, not position
df.loc[2]

A    1.764052
B    0.400157
C    0.978738
D    2.240893
Name: 2, dtype: float64

## 3. Position-based Indexing (iloc)

In [65]:
# select all columns of certain rows
df.iloc[:3,:]

Unnamed: 0,A,B,C,D
2013-01-01,1.764052,0.400157,0.978738,2.240893
2013-01-02,1.867558,-0.977278,0.950088,-0.151357
2013-01-03,-0.103219,0.410599,0.144044,1.454274


In [66]:
df.iloc[:3]

Unnamed: 0,A,B,C,D
2013-01-01,1.764052,0.400157,0.978738,2.240893
2013-01-02,1.867558,-0.977278,0.950088,-0.151357
2013-01-03,-0.103219,0.410599,0.144044,1.454274


In [69]:
# select all rows of certain columns
df.iloc[:, [0,3]]

Unnamed: 0,A,D
2013-01-01,1.764052,2.240893
2013-01-02,1.867558,-0.151357
2013-01-03,-0.103219,1.454274
2013-01-04,0.761038,0.333674
2013-01-05,1.494079,-0.854096
2013-01-06,-2.55299,-0.742165


In [71]:
# select certain columns of certain rows
df.iloc[2:4, [0,3]]

Unnamed: 0,A,D
2013-01-03,-0.103219,1.454274
2013-01-04,0.761038,0.333674


In [73]:
df.iloc[2,3]

1.4542735069629751

In [74]:
df

Unnamed: 0,A,B,C,D
2013-01-01,1.764052,0.400157,0.978738,2.240893
2013-01-02,1.867558,-0.977278,0.950088,-0.151357
2013-01-03,-0.103219,0.410599,0.144044,1.454274
2013-01-04,0.761038,0.121675,0.443863,0.333674
2013-01-05,1.494079,-0.205158,0.313068,-0.854096
2013-01-06,-2.55299,0.653619,0.864436,-0.742165


## 4. Accessing a single element (at, iat, concatenated)

In [76]:
# label based
df.at[df.index[2], 'A']

-0.10321885179355784

In [80]:
df.at[dates[2], 'A']

-0.10321885179355784

In [82]:
# position based
df.iat[0,0]

1.764052345967664

In [83]:
df.iloc[0,0]

1.764052345967664

## 5. Position + Label-based Indexing

In [84]:
# df.columns.get_loc('A')
df.iloc[2,df.columns.get_loc('A')]

-0.10321885179355784

In [86]:
# df.columns.get_indexer(['A', 'B'])
df.iloc[2,df.columns.get_indexer(['A', 'B'])]

A   -0.103219
B    0.410599
Name: 2013-01-03 00:00:00, dtype: float64

In [88]:
df.loc[df.index[0],df.columns[[1,3]]]

B    0.400157
D    2.240893
Name: 2013-01-01 00:00:00, dtype: float64

In [90]:
df.loc[df.index[0],['A','C']]

A    1.764052
C    0.978738
Name: 2013-01-01 00:00:00, dtype: float64

In [92]:
# deprecated (ix)
df.ix[0,['A','C']]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  


A    1.764052
C    0.978738
Name: 2013-01-01 00:00:00, dtype: float64

## 6. Potential errors

In [95]:
# df[1,1] = None
df[1,1]=None
df

Unnamed: 0,A,B,C,D,"(1, 1)"
2013-01-01,1.764052,0.400157,0.978738,2.240893,
2013-01-02,1.867558,-0.977278,0.950088,-0.151357,
2013-01-03,-0.103219,0.410599,0.144044,1.454274,
2013-01-04,0.761038,0.121675,0.443863,0.333674,
2013-01-05,1.494079,-0.205158,0.313068,-0.854096,
2013-01-06,-2.55299,0.653619,0.864436,-0.742165,


In [97]:
df['new'] = -2
df

Unnamed: 0,A,B,C,D,"(1, 1)",new
2013-01-01,1.764052,0.400157,0.978738,2.240893,,-2
2013-01-02,1.867558,-0.977278,0.950088,-0.151357,,-2
2013-01-03,-0.103219,0.410599,0.144044,1.454274,,-2
2013-01-04,0.761038,0.121675,0.443863,0.333674,,-2
2013-01-05,1.494079,-0.205158,0.313068,-0.854096,,-2
2013-01-06,-2.55299,0.653619,0.864436,-0.742165,,-2


In [98]:
# df[1] = 2
df[1] = 2
df

Unnamed: 0,A,B,C,D,"(1, 1)",new,1
2013-01-01,1.764052,0.400157,0.978738,2.240893,,-2,2
2013-01-02,1.867558,-0.977278,0.950088,-0.151357,,-2,2
2013-01-03,-0.103219,0.410599,0.144044,1.454274,,-2,2
2013-01-04,0.761038,0.121675,0.443863,0.333674,,-2,2
2013-01-05,1.494079,-0.205158,0.313068,-0.854096,,-2,2
2013-01-06,-2.55299,0.653619,0.864436,-0.742165,,-2,2
