# Reindexing

In [1]:
# An important method on pandas objects is reindex, which means to
# create a new object with the values rearranged to align with the new index
import numpy as np
import pandas as pd

In [2]:
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b','a', 'c'])
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [3]:
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [4]:
obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0,2, 4])
obj3

0      blue
2    purple
4    yellow
dtype: object

In [5]:
obj3.reindex(range(6),method = 'ffill')

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [6]:
frame = pd.DataFrame(np.arange(9).reshape((3, 3)), index=['a', 'c', 'd'],
                     columns=['Ohio', 'Texas','California'])
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [7]:
frame2 = frame.reindex(['a', 'b', 'c', 'd'])
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [8]:
states = ['Texas', 'Utah', 'California']
frame.reindex(columns=states)

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


In [9]:
frame.loc[['a', 'c', 'd'], ['Texas', 'California']]

Unnamed: 0,Texas,California
a,1,2
c,4,5
d,7,8


# Dropping Entries from an Axis

In [10]:
obj = pd.Series(np.arange(5.), index=['a', 'b', 'c','d', 'e'])
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [11]:
new_obj = obj.drop('c')
new_obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [12]:
obj.drop(['d', 'c'])

a    0.0
b    1.0
e    4.0
dtype: float64

In [13]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
        index=['Ohio', 'Colorado', 'Utah','New York'],
        columns=['one', 'two', 'three','four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [14]:
data.drop(['Colorado', 'Ohio'])

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


In [15]:
data.drop('two', axis=1)

Unnamed: 0,one,three,four
Ohio,0,2,3
Colorado,4,6,7
Utah,8,10,11
New York,12,14,15


In [16]:
data.drop(['two', 'four'], axis='columns')

Unnamed: 0,one,three
Ohio,0,2
Colorado,4,6
Utah,8,10
New York,12,14


In [17]:
obj.drop('c', inplace=True)
obj # Many functions, like drop, which modify the size or shape of a Series or
# DataFrame, can manipulate an object in-place without returning a new object

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

# Indexing,Selection,and Filtering

In [18]:
# Series indexing (obj[...]) works analogously to NumPy array indexing,
# except you can use the Series’s index values instead of only integers

In [21]:
obj = pd.Series(np.arange(4.), index=['a', 'b', 'c','d'])
obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [22]:
obj[1]

  obj[1]


1.0

In [35]:
obj.iloc[1] # Since loc operator indexes exclusively with labels, there is also a iloc
# operator that indexes exclusively with integers to work consistently whether the index contains integers or not

1.0

In [23]:
obj['b']

1.0

In [24]:
obj[2:4]

c    2.0
d    3.0
dtype: float64

In [26]:
obj[[1,3]]

  obj[[1,3]]


b    1.0
d    3.0
dtype: float64

In [34]:
obj.iloc[[1,3]]

b    1.0
d    3.0
dtype: float64

In [27]:
obj[obj < 2]

a    0.0
b    1.0
dtype: float64

In [29]:
# While you can select data by label this way, the preferred way to select
# index values is the special loc operator:
obj.loc[['b','a','d']]

b    1.0
a    0.0
d    3.0
dtype: float64

In [37]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
        index=['Ohio', 'Colorado', 'Utah','New York'],
        columns=['one', 'two', 'three','four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [38]:
data['two']

Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int32

In [39]:
data[:2]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [40]:
data < 5

Unnamed: 0,one,two,three,four
Ohio,True,True,True,True
Colorado,True,False,False,False
Utah,False,False,False,False
New York,False,False,False,False


In [43]:
data[data < 5] = 0
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


# Selection on DataFrame with loc and iloc

In [46]:
# DataFrame has special operators loc and iloc for labelbased and integer-based indexing, 
# respectively
#  Since DataFrame is twodimensional, you can select a subset of the rows and columns 
# with NumPy like notation using either axis labels (loc) or integers (iloc)

In [47]:
data.loc['Colorado', ['two', 'three']]

two      5
three    6
Name: Colorado, dtype: int32

In [48]:
data.iloc[2,[3,0,1]]

four    11
one      8
two      9
Name: Utah, dtype: int32

In [49]:
data.loc[:'Utah','two']

Ohio        0
Colorado    5
Utah        9
Name: two, dtype: int32

In [50]:
data.iloc[:,:3][data.three > 5]

Unnamed: 0,one,two,three
Colorado,0,5,6
Utah,8,9,10
New York,12,13,14


# Integer Indexing Pitfalls

In [66]:
ser = pd.Series(np.arange(3.),index = [2,1,0])
ser

2    0.0
1    1.0
0    2.0
dtype: float64

In [67]:
# ser[0] may be 2.0 or 0.0
# [0] may represents 'position 0' or 'index 0' 

In [68]:
# ser[-1] 
# Here we have an index containing 0, 1, 2, but inferring what the user wants (label-based
# indexing or position-based) is difficult

In [69]:
# On the other hand, with a non-integer index, there is no potential for ambiguity
ser2 = pd.Series(np.arange(3.), index=['a', 'b', 'c'])
# ser2[-1]
ser2.iloc[-1]

2.0

In [70]:
# On the other hand, slicing with integers is always integer-oriented
ser[:2]

2    0.0
1    1.0
dtype: float64

# Arithmetric and Data Alignment

In [71]:
df1 = pd.DataFrame({'A': [1, 2]})
df2 = pd.DataFrame({'B': [3, 4]})

In [72]:
df1

Unnamed: 0,A
0,1
1,2


In [73]:
df2

Unnamed: 0,B
0,3
1,4


In [75]:
df1 + df2 # The internal data alignment introduces missing values in the label locations
# that don’t overlap

Unnamed: 0,A,B
0,,
1,,


In [77]:
# Arithmetic methods with fill values
df1.add(df2,fill_value = 0) # Using the add method on df1, I pass df2 and an argument to fill_value:

Unnamed: 0,A,B
0,1.0,3.0
1,2.0,4.0


In [78]:
1 / df1

Unnamed: 0,A
0,1.0
1,0.5


In [79]:
df1.rdiv(1)

Unnamed: 0,A
0,1.0
1,0.5


In [81]:
df1 = pd.DataFrame(np.arange(12.).reshape((3,4)),columns = list('abcd'))
df1

Unnamed: 0,a,b,c,d
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


In [82]:
df2 = pd.DataFrame(np.arange(20.).reshape(4,5),columns = list('abcde'))
df2.loc[1,'b'] = np.nan
df2

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,4.0
1,5.0,,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [83]:
df1.reindex(columns = df2.columns,fill_value = 0)

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,0
1,4.0,5.0,6.0,7.0,0
2,8.0,9.0,10.0,11.0,0


In [84]:
# Operation between DataFrame and Series
arr = np.arange(12.).reshape((3,4))
arr

array([[ 0.,  1.,  2.,  3.],
       [ 4.,  5.,  6.,  7.],
       [ 8.,  9., 10., 11.]])

In [85]:
arr[0]

array([0., 1., 2., 3.])

In [87]:
arr - arr[0] # When we subtract arr[0] from arr, the subtraction is performed once
# for each row. This is referred to as broadcasting

array([[0., 0., 0., 0.],
       [4., 4., 4., 4.],
       [8., 8., 8., 8.]])

In [90]:
#  Operations between a DataFrame and a Series are similar:
frame = pd.DataFrame(np.arange(12.).reshape((4,3)),columns = list('bde'),index = ['Utah','Ohio','Texas','Oregon'])
series = frame.iloc[0]

In [91]:
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [92]:
series

b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64

In [93]:
frame - series

Unnamed: 0,b,d,e
Utah,0.0,0.0,0.0
Ohio,3.0,3.0,3.0
Texas,6.0,6.0,6.0
Oregon,9.0,9.0,9.0


In [96]:
# If an index value is not found in either the DataFrame’s columns or the
# Series’s index, the objects will be reindexed to form the union
series2 = pd.Series(range(3),index = list('bef'))
series2

b    0
e    1
f    2
dtype: int64

In [97]:
frame + series2

Unnamed: 0,b,d,e,f
Utah,0.0,,3.0,
Ohio,3.0,,6.0,
Texas,6.0,,9.0,
Oregon,9.0,,12.0,


In [107]:
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [100]:
series3 = frame['d']
series3

Utah       1.0
Ohio       4.0
Texas      7.0
Oregon    10.0
Name: d, dtype: float64

In [106]:
frame.loc['Utah']

b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64

In [104]:
frame.sub(series3,axis = 'index') # The axis number that you pass is the axis to match on. In this case we mean
# to match on the DataFrame’s row index

Unnamed: 0,b,d,e
Utah,-1.0,0.0,1.0
Ohio,-1.0,0.0,1.0
Texas,-1.0,0.0,1.0
Oregon,-1.0,0.0,1.0


# Function Application and Mapping

In [110]:
frame = pd.DataFrame(np.random.standard_normal((4,3)),columns = list('bde'),index = ['Utah', 'Ohio', 'Texas',
'Oregon'])
frame

Unnamed: 0,b,d,e
Utah,-1.202854,-0.292076,0.559499
Ohio,-0.540819,-1.356891,-0.540711
Texas,0.074949,0.717486,-0.885752
Oregon,1.596922,1.320512,-0.937907


In [111]:
frame.abs()

Unnamed: 0,b,d,e
Utah,1.202854,0.292076,0.559499
Ohio,0.540819,1.356891,0.540711
Texas,0.074949,0.717486,0.885752
Oregon,1.596922,1.320512,0.937907


In [112]:
np.abs(frame)

Unnamed: 0,b,d,e
Utah,1.202854,0.292076,0.559499
Ohio,0.540819,1.356891,0.540711
Texas,0.074949,0.717486,0.885752
Oregon,1.596922,1.320512,0.937907


In [117]:
# Another frequent operation is applying a function on one-dimensional arrays to each column or row
# DataFrame’s apply method does exactly this:
f = lambda x: x.max() - x.min()
frame.apply(f)

b    2.799776
d    2.677403
e    1.497406
dtype: float64

In [121]:
frame.apply(f,axis = 'columns') # operate along axis1 (axis = 'columns')

Unnamed: 0,min,max
Utah,-1.202854,0.559499
Ohio,-1.356891,-0.540711
Texas,-0.885752,0.717486
Oregon,-0.937907,1.596922


In [122]:
def f(x):
    return pd.Series([x.min(),x.max()],index = ['min','max'])

In [123]:
frame.apply(f)

Unnamed: 0,b,d,e
min,-1.202854,-1.356891,-0.937907
max,1.596922,1.320512,0.559499


In [124]:
# Element-wise Python functions can be used, too
format = lambda x: '%.2f' % x

In [126]:
# frame.applymap(format)
frame.map(format)

Unnamed: 0,b,d,e
Utah,-1.2,-0.29,0.56
Ohio,-0.54,-1.36,-0.54
Texas,0.07,0.72,-0.89
Oregon,1.6,1.32,-0.94


In [127]:
frame['e'].map(format)

Utah       0.56
Ohio      -0.54
Texas     -0.89
Oregon    -0.94
Name: e, dtype: object

# Sorting and Ranging

In [130]:
obj = pd.Series(range(4),index = list('dabc'))
obj

d    0
a    1
b    2
c    3
dtype: int64

In [129]:
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

In [132]:
frame = pd.DataFrame(np.arange(8).reshape((2,4)),columns = list('dabc'),index =['three','one'])
frame

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


In [135]:
frame.sort_index(axis = 0)

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [134]:
frame.sort_index(axis = 1)

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [138]:
# The data is sorted in ascending order by default
frame.sort_index(axis = 1,ascending = False)

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


In [140]:
obj = pd.Series([4,7,-3,2]) # To sort a Series by its values, use its sort_values method
obj.sort_values()

2   -3
3    2
0    4
1    7
dtype: int64

In [141]:
# When sorting a DataFrame, you can use the data in one or more columns as
# the sort keys. To do so, pass one or more column names to the by option of sort_values

In [144]:
frame = pd.DataFrame({'b':[4, 7, -3, 2],'a':[0, 1,0, 1]})
frame

Unnamed: 0,b,a
0,4,0
1,7,1
2,-3,0
3,2,1


In [146]:
frame.sort_values(by = 'b')

Unnamed: 0,b,a
2,-3,0
3,2,1
0,4,0
1,7,1


In [151]:
frame.sort_values(by = ['a','b'])

Unnamed: 0,b,a
2,-3,0
0,4,0
3,2,1
1,7,1


In [152]:
obj = pd.Series([7, -5, 7, 4, 2, 0, 4])
obj.rank()

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

In [154]:
obj.rank?

[1;31mSignature:[0m
[0mobj[0m[1;33m.[0m[0mrank[0m[1;33m([0m[1;33m
[0m    [0maxis[0m[1;33m:[0m [1;34m'Axis'[0m [1;33m=[0m [1;36m0[0m[1;33m,[0m[1;33m
[0m    [0mmethod[0m[1;33m:[0m [1;34m"Literal['average', 'min', 'max', 'first', 'dense']"[0m [1;33m=[0m [1;34m'average'[0m[1;33m,[0m[1;33m
[0m    [0mnumeric_only[0m[1;33m:[0m [1;34m'bool_t'[0m [1;33m=[0m [1;32mFalse[0m[1;33m,[0m[1;33m
[0m    [0mna_option[0m[1;33m:[0m [1;34m"Literal['keep', 'top', 'bottom']"[0m [1;33m=[0m [1;34m'keep'[0m[1;33m,[0m[1;33m
[0m    [0mascending[0m[1;33m:[0m [1;34m'bool_t'[0m [1;33m=[0m [1;32mTrue[0m[1;33m,[0m[1;33m
[0m    [0mpct[0m[1;33m:[0m [1;34m'bool_t'[0m [1;33m=[0m [1;32mFalse[0m[1;33m,[0m[1;33m
[0m[1;33m)[0m [1;33m->[0m [1;34m'Self'[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Compute numerical data ranks (1 through n) along axis.

By default, equal values are assigned a rank that is the average of the

In [155]:
df = pd.DataFrame(data={'Animal': ['cat', 'penguin', 'dog','spider', 'snake'],
                        'Number_legs': [4, 2, 4, 8, np.nan]})
df

Unnamed: 0,Animal,Number_legs
0,cat,4.0
1,penguin,2.0
2,dog,4.0
3,spider,8.0
4,snake,


In [157]:
s = pd.Series(range(5), index=list("abcde"))
s["d"] = s["b"]
s

a    0
b    1
c    2
d    1
e    4
dtype: int64

In [158]:
s.rank() # average of 2 and 3 is 2.5

a    1.0
b    2.5
c    4.0
d    2.5
e    5.0
dtype: float64

In [160]:
s.rank(method='first')

a    1.0
b    2.0
c    4.0
d    3.0
e    5.0
dtype: float64

# Axis Index with Duplicate Labels

In [161]:
obj = pd.Series(range(5), index=['a', 'a', 'b', 'b','c'])
obj

a    0
a    1
b    2
b    3
c    4
dtype: int64

In [163]:
obj.index.is_unique

False

In [164]:
obj['a']

a    0
a    1
dtype: int64

In [165]:
obj['c']

4