In [1]:
import numpy as np
import pandas as pd
from pandas import Series
from pandas import DataFrame

In [3]:
# Four floats labelled with a deliberately unsorted index.
s1 = Series(data=[4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
s1

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

### Re-indexing

In [4]:
# Building a Series from an existing Series with an explicit index aligns by
# label (like reindex); none of these labels exist in s1, so every value is NaN.
s2 = Series(s1, index=['l', 'm', 'n', 'o'])
s2

l   NaN
m   NaN
n   NaN
o   NaN
dtype: float64

In [6]:
# Assigning to .index relabels the existing values by position (no alignment)
# and modifies s1 in place, unlike the label-aligned construction above.
s1.index = list('lmno')
s1

l    4.5
m    7.2
n   -5.3
o    3.6
dtype: float64

In [9]:
# reindex aligns by label: 'l' is dropped because it was not requested, and the
# labels absent from s1 ('p', 'q') come back as NaN.
s1.reindex(['m', 'n', 'o', 'p', 'q'])

m    7.2
n   -5.3
o    3.6
p    NaN
q    NaN
dtype: float64

In [10]:
# method='ffill' (forward fill): labels missing from s1 take the value of the
# nearest preceding label, so 'p' and 'q' both get the value stored at 'o'.
s1.reindex(['m', 'n', 'o', 'p', 'q'], method='ffill')

m    7.2
n   -5.3
o    3.6
p    3.6
q    3.6
dtype: float64

### How to align data

In [12]:
s1 = Series(data=[4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
s2 = Series(data=[0, 7, 1, 2, 3], index=['d', 'b', 'c', 'e', 'f'])

# The + operator aligns on labels and has no fill_value option, so any label
# present in only one of the two Series ('a', 'e', 'f') yields NaN.
s1 + s2

a     NaN
b    14.2
c     4.6
d     4.5
e     NaN
f     NaN
dtype: float64

In [16]:
# Without fill_value, add() gives the same result as +. Its advantage is the
# fill_value parameter, which replaces the default NaN for labels that are
# missing on one side.
s1.add(s2)

a     NaN
b    14.2
c     4.6
d     4.5
e     NaN
f     NaN
dtype: float64

In [17]:
# fill_value=0.0 stands in for the side that lacks a label, so 'a', 'e' and 'f'
# keep the value from the Series that does have them instead of becoming NaN.
s1.add(s2, fill_value=0.0)

a    -5.3
b    14.2
c     4.6
d     4.5
e     2.0
f     3.0
dtype: float64

In [18]:
# frame2 has one more row (3) and one more column ('e') than frame1.
frame1 = DataFrame(np.arange(12, dtype=float).reshape(3, 4), columns=['a', 'b', 'c', 'd'])
frame2 = DataFrame(np.arange(20, dtype=float).reshape(4, 5), columns=['a', 'b', 'c', 'd', 'e'])

# add() aligns on both axes; cells that exist in only one frame become NaN.
frame1.add(frame2)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,11.0,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [19]:
# With fill_value=0, a cell present in only one frame is added to 0, so row 3
# and column 'e' simply carry frame2's values.
frame1.add(frame2, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,11.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [20]:
# Flexible arithmetic methods: add, sub, div, mul, pow, floordiv.
# frame1 - frame2, treating cells missing from either frame as 0.
frame1.sub(frame2, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,0.0,0.0,0.0,-4.0
1,-1.0,-1.0,-1.0,-1.0,-9.0
2,-2.0,-2.0,-2.0,-2.0,-14.0
3,-15.0,-16.0,-17.0,-18.0,-19.0


In [21]:
# Reversed-operand variants: radd, rsub, rdiv, rmul, rpow, rfloordiv.
# frame1.rsub(frame2) computes frame2 - frame1 (missing cells treated as 0).
frame1.rsub(frame2, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,0.0,0.0,0.0,4.0
1,1.0,1.0,1.0,1.0,9.0
2,2.0,2.0,2.0,2.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [22]:
# Element-wise reciprocal: rdiv(1) computes 1 / frame1 for every cell (this is
# not a matrix inverse). The 0.0 entry divides to inf rather than raising.
frame1.rdiv(1)

Unnamed: 0,a,b,c,d
0,inf,1.0,0.5,0.333333
1,0.25,0.2,0.166667,0.142857
2,0.125,0.111111,0.1,0.090909


In [24]:
# Keep frame1's data but use frame2's column labels. Column 'e' does not exist
# in frame1, so it is created and filled with fill_value (1000 here) instead
# of the default NaN.
frame1.reindex(columns=frame2.columns, fill_value=1000)

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,1000
1,4.0,5.0,6.0,7.0,1000
2,8.0,9.0,10.0,11.0,1000


In [26]:
frame1 = DataFrame(np.arange(12, dtype=float).reshape(3, 4), columns=['a', 'b', 'c', 'd'])
frame2 = DataFrame(
    np.arange(12, dtype=float).reshape(4, 3),
    index=['Sinaloa', 'Tamaulipas', 'Jalisco', 'Nayarit'],
    columns=['b', 'd', 'e'],
)
# iloc[0] selects the first ROW (Sinaloa) as a Series labelled by column name.
s1 = frame2.iloc[0]
s1

b    0.0
d    1.0
e    2.0
Name: Sinaloa, dtype: float64

In [27]:
# Integer Series whose labels only partly overlap frame2's columns (b, d, e).
s2 = Series(range(3), index=list('bef'))
s2

b    0
e    1
f    2
dtype: int64

In [29]:
# 3x4 float matrix holding 0.0 .. 11.0 in row-major order.
arr = np.arange(12, dtype=float).reshape(3, 4)
arr

array([[ 0.,  1.,  2.,  3.],
       [ 4.,  5.,  6.,  7.],
       [ 8.,  9., 10., 11.]])

In [30]:
# Subtract the first row from every row of the matrix.
# NumPy broadcasting: arr[0] holds a single row, so it is stretched across the
# 3 rows of arr until both operands have the same shape.
arr - arr[0]

array([[0., 0., 0., 0.],
       [4., 4., 4., 4.],
       [8., 8., 8., 8.]])

In [32]:
# pandas broadcasts the same way: s1 (the first row of frame2) is matched to
# frame2's columns by label and subtracted from every row.
display(frame2)
display(s1)
frame2 - s1

Unnamed: 0,b,d,e
Sinaloa,0.0,1.0,2.0
Tamaulipas,3.0,4.0,5.0
Jalisco,6.0,7.0,8.0
Nayarit,9.0,10.0,11.0


b    0.0
d    1.0
e    2.0
Name: Sinaloa, dtype: float64

Unnamed: 0,b,d,e
Sinaloa,0.0,0.0,0.0
Tamaulipas,3.0,3.0,3.0
Jalisco,6.0,6.0,6.0
Nayarit,9.0,9.0,9.0


In [33]:
# Broadcast with partial label overlap: s2 only has labels 'b' and 'e', so it is
# subtracted from those two columns on every row; column 'd' has no matching
# label in s2 and becomes NaN.
s2 = Series(range(2), index=['b','e'])
display(frame2)
# NOTE(review): this displays s1, but the subtraction below uses s2 --
# display(s2) was presumably intended; confirm.
display(s1)
frame2 - s2

Unnamed: 0,b,d,e
Sinaloa,0.0,1.0,2.0
Tamaulipas,3.0,4.0,5.0
Jalisco,6.0,7.0,8.0
Nayarit,9.0,10.0,11.0


b    0.0
d    1.0
e    2.0
Name: Sinaloa, dtype: float64

Unnamed: 0,b,d,e
Sinaloa,0.0,,1.0
Tamaulipas,3.0,,4.0
Jalisco,6.0,,7.0
Nayarit,9.0,,10.0


In [34]:
# Keep the partially aligned result (NaN in column 'd') under a name.
f3 = frame2 - s2
f3

Unnamed: 0,b,d,e
Sinaloa,0.0,,1.0
Tamaulipas,3.0,,4.0
Jalisco,6.0,,7.0
Nayarit,9.0,,10.0


In [36]:
# Replace the NaNs with zeros using only add(): fill_value=0 substitutes 0 for
# each missing cell before the scalar 0 is added.
# (f3.fillna(0) is the usual way to do this; add() is used here as an exercise.)
f3.add(0, fill_value=0)

Unnamed: 0,b,d,e
Sinaloa,0.0,0.0,1.0
Tamaulipas,3.0,0.0,4.0
Jalisco,6.0,0.0,7.0
Nayarit,9.0,0.0,10.0


In [37]:
# Select column 'd' as a Series; it is labelled by frame2's row index
# (the state names), not by column names.
s3 = frame2['d']
s3

Sinaloa        1.0
Tamaulipas     4.0
Jalisco        7.0
Nayarit       10.0
Name: d, dtype: float64

In [38]:
display(frame2)
display(s3)
# By default a Series is matched against the DataFrame's column labels. s3 is
# labelled by state names, none of which are columns of frame2, so nothing is
# aligned: the result has the union of both label sets as columns and is all NaN.
frame2 - s3

Unnamed: 0,b,d,e
Sinaloa,0.0,1.0,2.0
Tamaulipas,3.0,4.0,5.0
Jalisco,6.0,7.0,8.0
Nayarit,9.0,10.0,11.0


Sinaloa        1.0
Tamaulipas     4.0
Jalisco        7.0
Nayarit       10.0
Name: d, dtype: float64

Unnamed: 0,Jalisco,Nayarit,Sinaloa,Tamaulipas,b,d,e
Sinaloa,,,,,,,
Tamaulipas,,,,,,,
Jalisco,,,,,,,
Nayarit,,,,,,,


In [40]:
display(frame2)
display(s3)
# Attempt to subtract s3 from frame2 using the row index.
# NOTE(review): this is the same expression as in the previous cell, so it still
# aligns on columns and yields all NaN; row-wise alignment needs
# frame2.sub(s3, axis='index').
frame2 - s3

Unnamed: 0,b,d,e
Sinaloa,0.0,1.0,2.0
Tamaulipas,3.0,4.0,5.0
Jalisco,6.0,7.0,8.0
Nayarit,9.0,10.0,11.0


Sinaloa        1.0
Tamaulipas     4.0
Jalisco        7.0
Nayarit       10.0
Name: d, dtype: float64

Unnamed: 0,Jalisco,Nayarit,Sinaloa,Tamaulipas,b,d,e
Sinaloa,,,,,,,
Tamaulipas,,,,,,,
Jalisco,,,,,,,
Nayarit,,,,,,,


In [41]:
# Subtract column 'd' from every column of frame2.
# axis='index' matches s3's labels against the row index instead of the
# columns, so each row has its own 'd' value subtracted.
frame2.sub(s3, axis='index')

Unnamed: 0,b,d,e
Sinaloa,-1.0,0.0,1.0
Tamaulipas,-1.0,0.0,1.0
Jalisco,-1.0,0.0,1.0
Nayarit,-1.0,0.0,1.0


### Functions and mapping

In [42]:
# Show frame2 again as the starting point for this section.
display(frame2)

Unnamed: 0,b,d,e
Sinaloa,0.0,1.0,2.0
Tamaulipas,3.0,4.0,5.0
Jalisco,6.0,7.0,8.0
Nayarit,9.0,10.0,11.0


In [43]:
# NumPy element-wise functions (ufuncs) can be applied directly to a DataFrame.
# First build a frame with negative values to try one on: rsub(1) computes
# 1 - frame2 element-wise.
frame3 = frame2.rsub(1)
display(frame3)

Unnamed: 0,b,d,e
Sinaloa,1.0,0.0,-1.0
Tamaulipas,-2.0,-3.0,-4.0
Jalisco,-5.0,-6.0,-7.0
Nayarit,-8.0,-9.0,-10.0


In [44]:
# np.abs is applied element-wise; the result is still a DataFrame with the
# same row and column labels.
np.abs(frame3)

Unnamed: 0,b,d,e
Sinaloa,1.0,0.0,1.0
Tamaulipas,2.0,3.0,4.0
Jalisco,5.0,6.0,7.0
Nayarit,8.0,9.0,10.0


In [45]:
def f(x):
    """Return the range of ``x``: its maximum minus its minimum.

    ``x`` may be any object exposing ``.max()`` and ``.min()``, such as a
    NumPy array or a pandas Series.
    """
    highest = x.max()
    lowest = x.min()
    return highest - lowest


# Quick check on a plain array: 4 - (-3) = 7.
f(np.array([4, 0, 2, -3]))

7

In [48]:
# Re-display frame3 for reference when reading the apply() results.
frame3

Unnamed: 0,b,d,e
Sinaloa,1.0,0.0,-1.0
Tamaulipas,-2.0,-3.0,-4.0
Jalisco,-5.0,-6.0,-7.0
Nayarit,-8.0,-9.0,-10.0


In [47]:
# apply() calls f once per column by default (max - min of each column):
# b: 1 - (-8) = 9, d: 0 - (-9) = 9, e: -1 - (-10) = 9.
frame3.apply(f)

b    9.0
d    9.0
e    9.0
dtype: float64

In [49]:
# axis='columns' calls f once per row, across that row's columns:
# Sinaloa: 1 - (-1) = 2, Tamaulipas: -2 - (-4) = 2, and so on.
frame3.apply(f, axis='columns')

Sinaloa       2.0
Tamaulipas    2.0
Jalisco       2.0
Nayarit       2.0
dtype: float64

In [51]:
def f2(x):
    """Summarise ``x`` as a Series with its minimum, maximum and median.

    ``x`` must expose ``.min()``, ``.max()`` and ``.median()`` (for example a
    pandas Series). The result is labelled 'min', 'max', 'median', in that order.
    """
    labels = ['min', 'max', 'median']
    stats = [x.min(), x.max(), x.median()]
    return Series(stats, index=labels)
# Because f2 returns a Series, apply() assembles a DataFrame: one column per
# original column and one row per statistic.
frame3.apply(f2)

Unnamed: 0,b,d,e
min,-8.0,-9.0,-10.0
max,1.0,0.0,-1.0
median,-3.5,-4.5,-5.5


In [52]:
# Row-wise version: one row per original row and one column per statistic.
frame3.apply(f2, axis='columns')

Unnamed: 0,min,max,median
Sinaloa,-1.0,1.0,0.0
Tamaulipas,-4.0,-2.0,-3.0
Jalisco,-7.0,-5.0,-6.0
Nayarit,-10.0,-8.0,-9.0
