# Mod12 Object Operations in Pandas

## Ufuncs: Operations Between DataFrame and Series

In [4]:
import pandas as pd
import numpy as np

### Review NumPy Operation

In [2]:
# Two-dimensional array: 2 rows, 3 columns
ar = np.array([[1, 2, 3],
               [10, 20, 30]])
ar

array([[ 1,  2,  3],
       [10, 20, 30]])

In [5]:
# 1d array: the first row of the 2d array `ar`
ar[0]

array([1, 2, 3])

Subtraction between a two-dimensional array and one of its rows is applied row-wise:

In [5]:
# The 1d row ar[0] is subtracted from every row of the 2d array
ar-ar[0]

array([[ 0,  0,  0],
       [ 9, 18, 27]])

### Pandas Operation

In Pandas, an operation between a DataFrame and a Series similarly operates row-wise by default:

In [43]:
# Seeded legacy generator so the sample matrix is reproducible
rng = np.random.RandomState(seed=42)
# 3x4 matrix of integers drawn from the half-open range [0, 10)
A = rng.randint(0, 10, size=(3, 4))
A

array([[6, 3, 7, 4],
       [6, 9, 2, 6],
       [7, 4, 3, 7]])

In [44]:
# Wrap the random matrix in a DataFrame with column labels Q, R, S, T
df = pd.DataFrame(data=A, columns=['Q', 'R', 'S', 'T'])
df

Unnamed: 0,Q,R,S,T
0,6,3,7,4
1,6,9,2,6
2,7,4,3,7


In [45]:
# First row of df as a Series; its index holds the column labels Q..T
df.iloc[0]

Q    6
R    3
S    7
T    4
Name: 0, dtype: int32

In [52]:
# The Series index (Q..T) lines up with df's columns,
# so row 0 is subtracted from every row of df
df - df.iloc[0]

Unnamed: 0,Q,R,S,T
0,0,0,0,0
1,0,6,-5,2
2,1,1,-4,3


## Pandas Objects Operations

In [5]:
# Label index made of the letters a..e
ind_char = pd.Index(list('abcde'))
ind_char

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [6]:
# Integer index 0..4
ind_num = pd.RangeIndex(start=0, stop=5, step=1)
ind_num

RangeIndex(start=0, stop=5, step=1)

In [7]:
# Values 10..14 labelled a..e
s1 = pd.Series(range(10, 15), index=ind_char)
s1

a    10
b    11
c    12
d    13
e    14
dtype: int64

In [8]:
# Even values 30..38 labelled a..e
s2 = pd.Series(range(30, 40, 2), index=ind_char)
s2

a    30
b    32
c    34
d    36
e    38
dtype: int64

In [9]:
# Constant frame: every cell is 100, rows 0..4, columns a..e
df1 = pd.DataFrame(data=100, index=ind_num, columns=ind_char)
df1

Unnamed: 0,a,b,c,d,e
0,100,100,100,100,100
1,100,100,100,100,100
2,100,100,100,100,100
3,100,100,100,100,100
4,100,100,100,100,100


In [38]:
# 5x5 frame holding 0..24 in row-major order, rows 0..4, columns a..e
df2 = pd.DataFrame(
    data=np.arange(25).reshape((5, 5)),
    index=ind_num,
    columns=ind_char,
)
df2

Unnamed: 0,a,b,c,d,e
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19
4,20,21,22,23,24


### Series and Series Operation (ch11 review)

Operations between two Series align on their index labels.

In [35]:
# Element-wise sum; both Series carry the labels a..e
s1 + s2

a    40
b    43
c    46
d    49
e    52
dtype: int64

In [18]:
# frac=1 samples every element, i.e. returns s1 in shuffled order.
# No random_state is passed, so the order can differ from run to run.
s1.sample(frac=1)

e    14
d    13
a    10
c    12
b    11
dtype: int64

In [19]:
# Shuffling s1 does not change the result:
# the addition pairs values by index label, not by position
s1.sample(frac=1) + s2

a    40
b    43
c    46
d    49
e    52
dtype: int64

### DataFrame and DataFrame Operation (ch11 review)

In [20]:
# Element-wise sum; df1 and df2 share the same index (0..4) and columns (a..e)
df1 + df2

Unnamed: 0,a,b,c,d,e
0,100,101,102,103,104
1,105,106,107,108,109
2,110,111,112,113,114
3,115,116,117,118,119
4,120,121,122,123,124


In [28]:
# Shuffle the rows first, then shuffle the columns (axis=1)
df2.sample(frac=1).sample(frac=1, axis=1)

Unnamed: 0,c,d,b,a,e
1,7,8,6,5,9
0,2,3,1,0,4
2,12,13,11,10,14
3,17,18,16,15,19
4,22,23,21,20,24


In [23]:
# Shuffle the second DataFrame on both axes before adding.
# The result is unchanged because index and column labels are aligned first.
df1 + df2.sample(frac=1).sample(frac=1, axis=1)

Unnamed: 0,a,b,c,d,e
0,100,101,102,103,104
1,105,106,107,108,109
2,110,111,112,113,114
3,115,116,117,118,119
4,120,121,122,123,124


In [29]:
# Add a scalar. A scalar has no labels to align with,
# so it is broadcast to every element
df1 + 1

Unnamed: 0,a,b,c,d,e
0,101,101,101,101,101
1,101,101,101,101,101
2,101,101,101,101,101
3,101,101,101,101,101
4,101,101,101,101,101


### DataFrame and Series Operation

In [50]:
# Series.keys() gives the index, DataFrame.keys() gives the columns;
# here both hold the same labels a..e
s1.keys(),df1.keys()

(Index(['a', 'b', 'c', 'd', 'e'], dtype='object'),
 Index(['a', 'b', 'c', 'd', 'e'], dtype='object'))

In [39]:
# s1's index labels (a..e) are matched against df1's column labels.
# The operator form and the explicit column-axis method give the same table.
display(df1 + s1)
display(df1.add(s1, axis='columns'))

Unnamed: 0,a,b,c,d,e
0,110,111,112,113,114
1,110,111,112,113,114
2,110,111,112,113,114
3,110,111,112,113,114
4,110,111,112,113,114


Unnamed: 0,a,b,c,d,e
0,110,111,112,113,114
1,110,111,112,113,114
2,110,111,112,113,114
3,110,111,112,113,114
4,110,111,112,113,114


In [41]:
# Values 50, 42, ..., 18 labelled with the integer index 0..4
s3 = pd.Series(range(50, 10, -8), index=ind_num)
s3

0    50
1    42
2    34
3    26
4    18
dtype: int64

In [42]:
# By default a DataFrame aligns the Series index with its *columns*.
# s3 is labelled 0..4, which matches none of the columns a..e, so the
# result holds the union of both label sets and every cell is NaN.
df1 + s3

Unnamed: 0,a,b,c,d,e,0,1,2,3,4
0,,,,,,,,,,
1,,,,,,,,,,
2,,,,,,,,,,
3,,,,,,,,,,
4,,,,,,,,,,


Using transposition to resolve the problem

In [43]:
# Transpose: the row labels 0..4 become the columns
df1.T

Unnamed: 0,0,1,2,3,4
a,100,100,100,100,100
b,100,100,100,100,100
c,100,100,100,100,100
d,100,100,100,100,100
e,100,100,100,100,100


In [44]:
# After transposing, the columns 0..4 match s3's index, so the values align
df1.T + s3

Unnamed: 0,0,1,2,3,4
a,150,142,134,126,118
b,150,142,134,126,118
c,150,142,134,126,118
d,150,142,134,126,118
e,150,142,134,126,118


In [45]:
# Transpose back so rows are 0..4 and columns are a..e again
(df1.T + s3).T

Unnamed: 0,a,b,c,d,e
0,150,150,150,150,150
1,142,142,142,142,142
2,134,134,134,134,134
3,126,126,126,126,126
4,118,118,118,118,118


A better way to resolve the problem is to pass the `axis` argument to the arithmetic method:
<details>
    <summary>Axis in DataFrame</summary>
    <img src='./img/2D_axis_1.jpg'>
</details>

In [48]:
# Match s3's index against df1's row labels (axis 0) instead of its columns.
# 'index' is the documented string name for axis 0; the integer 0 is equivalent.
display(df1.add(s3, axis='index'))
display(df1.add(s3, axis=0))

Unnamed: 0,a,b,c,d,e
0,150,150,150,150,150
1,142,142,142,142,142
2,134,134,134,134,134
3,126,126,126,126,126
4,118,118,118,118,118


Unnamed: 0,a,b,c,d,e
0,150,150,150,150,150
1,142,142,142,142,142
2,134,134,134,134,134
3,126,126,126,126,126
4,118,118,118,118,118


## Lab

<b>有一個 DataFrame df 如下，試著正確的運算 df - df['Z'] 的結果</b>

Hint: operate column-wise by specifying the ``axis`` keyword:

In [29]:
# Fix the global NumPy seed so the lab data is reproducible
np.random.seed(62)
# 3x4 matrix of integers drawn from the half-open range [0, 10)
A = np.random.randint(0, 10, size=(3, 4))

# Lab DataFrame with column labels W, X, Y, Z
df = pd.DataFrame(data=A, columns=['W', 'X', 'Y', 'Z'])
df

Unnamed: 0,W,X,Y,Z
0,2,8,1,9
1,3,5,1,4
2,5,1,9,8


In [30]:
# Column Z as a Series; its index is the row labels 0..2, not the column labels
df['Z']

0    9
1    4
2    8
Name: Z, dtype: int64