In [1]:
# discuss how to slice and dice the date and generally get the subset of pandas object
# Python and Numpy: Indexing operators "[]"
#                   Attribiute operators "."
# those operators provide quick access to Pandas data structures
# In order to do that we take advantage of the optimized pandas data access methods
# Pandas supports 3 types of Multi-axes indexing;
# 1 .loc() --> label based
# 2 .iloc() --> Integer based
# 3. ix() --> both label and integer based

In [2]:
import pandas as pd
import numpy as np

In [8]:
# .loc() has multiple access methods;
# A single scalar label
# A list of labels
# A slice object
# A boolean array
#
# loc takes two single/list/range operator seprated by ','
# FIRST ONE INDICATES THE ROW AND SECOND ONE INDICATES COLUMNS

In [4]:
df1=pd.DataFrame(np.random.randn(8, 4),
index=['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'], columns=['A', 'B', 'C', 'D'])
print(df1)

          A         B         C         D
a -0.502218  1.810672 -0.203699  0.000925
b  0.446390 -0.534370  0.287454  0.063961
c  0.246056 -0.383903 -1.334632 -3.355300
d  0.466529 -0.466289 -0.395543  0.173302
e  1.305217 -1.149312 -1.333190  0.316756
f -1.776167  0.297515  0.184875  0.412699
g  0.984320 -0.964657  1.000113 -1.902128
h -0.993946  1.929660 -1.189435 -0.559774


In [6]:
# select all rows for a specific column
print(df1.loc[:, 'A'])

a   -0.502218
b    0.446390
c    0.246056
d    0.466529
e    1.305217
f   -1.776167
g    0.984320
h   -0.993946
Name: A, dtype: float64


In [7]:
# select all rows for multiple columns, say list[]
print(df1.loc[:, ['A', 'C']])

          A         C
a -0.502218 -0.203699
b  0.446390  0.287454
c  0.246056 -1.334632
d  0.466529 -0.395543
e  1.305217 -1.333190
f -1.776167  0.184875
g  0.984320  1.000113
h -0.993946 -1.189435


In [9]:
# select few rows for multiple columns, say list[]
print(df1.loc[['a', 'b', 'f', 'h'], ['A', 'C']])

          A         C
a -0.502218 -0.203699
b  0.446390  0.287454
f -1.776167  0.184875
h -0.993946 -1.189435


In [11]:
# select a range of rows for all columns
print(df1.loc['d':'h'])

          A         B         C         D
d  0.466529 -0.466289 -0.395543  0.173302
e  1.305217 -1.149312 -1.333190  0.316756
f -1.776167  0.297515  0.184875  0.412699
g  0.984320 -0.964657  1.000113 -1.902128
h -0.993946  1.929660 -1.189435 -0.559774


In [12]:
# for getting values with a boolean array
print(df1.loc['a']>0)

A    False
B     True
C    False
D     True
Name: a, dtype: bool


In [13]:
# .iloc() --> Pandas provides various methods in order to get purely integer-based indexing (0-based indexing)
# An integer
# A list of integers
# A range of values

In [16]:
df2=pd.DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D'])

# select all rows for a specific column
print(df2.iloc[:4])

          A         B         C         D
0 -0.458188 -0.723821  1.203139 -1.573360
1 -0.606182 -1.113955  1.257961 -0.236944
2 -1.284485  0.194982  0.223861  0.877854
3  0.849394 -0.278745  0.065461  1.559150


In [19]:
# integer slicing
print(df2.iloc[:4])
print(df2.iloc[1:5, 2:4])

          A         B         C         D
0 -0.458188 -0.723821  1.203139 -1.573360
1 -0.606182 -1.113955  1.257961 -0.236944
2 -1.284485  0.194982  0.223861  0.877854
3  0.849394 -0.278745  0.065461  1.559150
          C         D
1  1.257961 -0.236944
2  0.223861  0.877854
3  0.065461  1.559150
4  2.079210  1.364467


In [21]:
# slicing through list of values
print(df2.iloc[[1, 3, 5], [1, 3]])
print(df2.iloc[1:3, :])
print(df2.iloc[:, 1:3])

          B         D
1 -1.113955 -0.236944
3 -0.278745  1.559150
5 -0.053883  0.946383
          A         B         C         D
1 -0.606182 -1.113955  1.257961 -0.236944
2 -1.284485  0.194982  0.223861  0.877854
          B         C
0 -0.723821  1.203139
1 -1.113955  1.257961
2  0.194982  0.223861
3 -0.278745  0.065461
4 -0.541671  2.079210
5 -0.053883  0.066775
6 -0.684205 -0.564774
7 -0.335316  0.384833


In [22]:
# .ix() --> besides pure label based on an integer-based
# Pandas provides a hybrid method for selecting and subsetting the object usign the .ix() operator

In [26]:
df3= pd.DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D'])
print(df3.ix[:])
# integer slicing
print(df3.ix[:4])

          A         B         C         D
0 -2.274758 -0.831959  0.174994 -0.255741
1 -0.344413 -0.178531 -1.106049  0.918453
2  1.011636  1.416053  0.726312 -0.940443
3 -0.188262 -0.653735 -0.972859  0.767472
4 -1.683423  0.106368  0.599044  1.872763
5 -0.503244 -0.523624  0.104896 -0.421260
6  0.282882  0.564843  0.918164 -0.847237
7 -0.348125  0.207524  0.001500  0.112598
          A         B         C         D
0 -2.274758 -0.831959  0.174994 -0.255741
1 -0.344413 -0.178531 -1.106049  0.918453
2  1.011636  1.416053  0.726312 -0.940443
3 -0.188262 -0.653735 -0.972859  0.767472
4 -1.683423  0.106368  0.599044  1.872763


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


In [24]:
# index slicing
print(df3.ix[:, 'A'])

0   -1.399241
1   -0.674457
2    0.712368
3    0.204011
4   -0.438710
5    1.022080
6   -0.830151
7   -0.343595
Name: A, dtype: float64


In [27]:
# Use of notations
# series --> s.loc[indexer]
# dataframe --> df.loc[row_index, col_index]
# panel --> p.loc[item_index, major_index, minor_index]

In [29]:
df4 = pd.DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D'])
print(df4)
print(df4['A'])

          A         B         C         D
0  0.286150  1.597984  1.530656 -0.733359
1 -1.279044 -0.105773  0.598463 -0.912623
2  1.826509 -0.203080 -0.269949  1.422620
3  0.219650 -1.163726 -1.775056 -0.864152
4  1.710932 -0.106214 -0.807421  0.454815
5  0.112195 -1.338337  0.256076  0.459335
6  1.041988  0.102152  1.189320 -0.910600
7  0.251924 -0.086450  0.610794  1.616250
0    0.286150
1   -1.279044
2    1.826509
3    0.219650
4    1.710932
5    0.112195
6    1.041988
7    0.251924
Name: A, dtype: float64


In [30]:
print(df4[['A', 'B']])

          A         B
0  0.286150  1.597984
1 -1.279044 -0.105773
2  1.826509 -0.203080
3  0.219650 -1.163726
4  1.710932 -0.106214
5  0.112195 -1.338337
6  1.041988  0.102152
7  0.251924 -0.086450


In [31]:
print(df4[2:2])

Empty DataFrame
Columns: [A, B, C, D]
Index: []


In [32]:
# Attribute Access --> columns can be selected using the attribute operator '.' (dot)
print(df4.A)

0    0.286150
1   -1.279044
2    1.826509
3    0.219650
4    1.710932
5    0.112195
6    1.041988
7    0.251924
Name: A, dtype: float64
