# Mod11 Ufuncs in Pandas

## Ufuncs: Index Preservation

In [1]:
import pandas as pd
import numpy as np

Review: a NumPy ufunc (universal function) is applied element-wise to an array.

In [2]:
# Integers 0 through 4, used below to demonstrate a NumPy ufunc.
arr = np.arange(0, 5)
arr

array([0, 1, 2, 3, 4])

In [4]:
np.exp(arr)

array([ 1.        ,  2.71828183,  7.3890561 , 20.08553692, 54.59815003])

In [3]:
# Seeded generator so the example data is reproducible; the DataFrame cell
# further down keeps drawing from this same generator.
rng = np.random.RandomState(seed=42)
# Four random integers in [0, 10), wrapped in a Series with the default index.
ser = pd.Series(data=rng.randint(low=0, high=10, size=4))
ser

0    6
1    3
2    7
3    4
dtype: int32

In [4]:
ser.values

array([6, 3, 7, 4])

In [5]:
# 3x4 frame of random integers in [0, 10), drawn from the existing `rng`,
# with columns labelled A through D.
df = pd.DataFrame(data=rng.randint(low=0, high=10, size=(3, 4)),
                  columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
0,6,9,2,6
1,7,4,3,7
2,7,2,5,4


In [8]:
df.values

array([[6, 9, 2, 6],
       [7, 4, 3, 7],
       [7, 2, 5, 4]])

In [8]:
np.exp(ser)

0     403.428793
1      20.085537
2    1096.633158
3      54.598150
dtype: float64

In [11]:
np.exp(df)

Unnamed: 0,A,B,C,D
0,403.428793,8103.083928,7.389056,403.428793
1,1096.633158,54.59815,20.085537,1096.633158
2,1096.633158,7.389056,148.413159,54.59815


In [11]:
np.exp(df['A'])

0     403.428793
1    1096.633158
2    1096.633158
Name: A, dtype: float64

In [12]:
np.exp(df[['B','D']])

Unnamed: 0,B,D
0,8103.083928,403.428793
1,54.59815,1096.633158
2,7.389056,54.59815


In [13]:
np.exp(df[0:2])

Unnamed: 0,A,B,C,D
0,403.428793,8103.083928,7.389056,403.428793
1,1096.633158,54.59815,20.085537,1096.633158


In [14]:
np.exp(df.iloc[1])

A    1096.633158
B      54.598150
C      20.085537
D    1096.633158
Name: 1, dtype: float64

In [16]:
np.exp(df.iloc[1]).index

Index(['A', 'B', 'C', 'D'], dtype='object')

In [15]:
# Arithmetic on the frame and a ufunc can be combined in one expression;
# the result keeps df's row index and column labels.
np.sin(df * np.pi / 4)

Unnamed: 0,A,B,C,D
0,-1.0,0.7071068,1.0,-1.0
1,-0.707107,1.224647e-16,0.707107,-0.7071068
2,-0.707107,1.0,-0.707107,1.224647e-16


## Ufuncs: Index Alignment

### Index alignment in Series

In [12]:
# Area per state, labelled by state name.
area = pd.Series(
    data=[1723337, 695662, 423967],
    index=['Alaska', 'Texas', 'California'],
    name='area',
)
# Population per state; its labels only partially overlap with those of `area`.
population = pd.Series(
    data=[38332521, 26448193, 19651127],
    index=['California', 'Texas', 'New York'],
    name='population',
)

In [13]:
area

Alaska        1723337
Texas          695662
California     423967
Name: area, dtype: int64

In [18]:
population

California    38332521
Texas         26448193
New York      19651127
Name: population, dtype: int64

In [15]:
population / area
# The result's index is the union of the indices of the two input Series.
# Labels present in only one input have no partner value, so they get NaN.

Alaska              NaN
California    90.413926
New York            NaN
Texas         38.018740
dtype: float64

In [16]:
# Two short Series whose integer indices overlap only on labels 1 and 2.
A = pd.Series(data=[2, 4, 6], index=[0, 1, 2])
B = pd.Series(data=[1, 3, 5], index=[1, 2, 3])

In [17]:
A

0    2
1    4
2    6
dtype: int64

In [18]:
B

1    1
2    3
3    5
dtype: int64

In [23]:
A + B

0    NaN
1    5.0
2    9.0
3    NaN
dtype: float64

If NaN is not the desired result, use the ``add`` method instead of the ``+`` operator: it accepts an explicit fill value for any elements in ``A`` or ``B`` that might be missing:

In [24]:
A.add(B, fill_value=0)

0    2.0
1    5.0
2    9.0
3    5.0
dtype: float64

### Index alignment in DataFrame

In [20]:
# Fresh generator with a fixed seed so the frame below is reproducible.
rng = np.random.RandomState(seed=42)
# 2x2 frame of random integers in [0, 20) with columns A and B.
A = pd.DataFrame(data=rng.randint(low=0, high=20, size=(2, 2)),
                 columns=['A', 'B'])
A

Unnamed: 0,A,B
0,6,19
1,14,10


In [21]:
# Re-create the generator from the same seed, so B is drawn from the start
# of the random stream again.
rng = np.random.RandomState(seed=42)
# 3x3 frame of random integers in [0, 10); note the column order B, A, C.
B = pd.DataFrame(data=rng.randint(low=0, high=10, size=(3, 3)),
                 columns=['B', 'A', 'C'])
B

Unnamed: 0,B,A,C
0,6,3,7
1,4,6,9
2,2,6,7


In [22]:
# Alignment happens on both the row index and the column labels, regardless
# of their order in the operands; positions that exist in only one of the
# two frames come out as NaN.
A + B

Unnamed: 0,A,B,C
0,9.0,25.0,
1,20.0,14.0,
2,,,


In [23]:
# Entries missing from one of the two frames are treated as 10 before adding.
A.add(other=B, fill_value=10)

Unnamed: 0,A,B,C
0,9.0,25.0,17.0
1,20.0,14.0,19.0
2,16.0,12.0,17.0


Fill with the mean of all values in ``A`` (computed by first stacking the rows of ``A``). Note that ``A.mean()`` on its own returns one mean per column, not the overall mean:

In [27]:
A.mean()

A    10.0
B    14.5
dtype: float64

<details>
    <summary><b>DataFrame.stack() 說明圖</b></summary>
    <img src='./img/df_stack.png'>
</details>

In [30]:
A.stack()

0  A     6
   B    19
1  A    14
   B    10
dtype: int64

In [31]:
# Overall mean across every cell of A, obtained by stacking A into one Series.
fill = A.stack().mean()
# np.mean over the underlying array gives the same value as A.stack().mean();
# both are printed, one per line.
print(fill, np.mean(A.values), sep='\n')

12.25
12.25


In [33]:
# Overall mean of A, used as the fill value for entries that are missing
# from one of the two frames.
fill = A.stack().mean()
A.add(other=B, fill_value=fill)

Unnamed: 0,A,B,C
0,9.0,25.0,19.25
1,20.0,14.0,21.25
2,18.25,14.25,19.25


## Lab

<b>有兩個 Series 如下，求兩個 Series 的和，遇到 NaN 以零代替</b>

In [35]:
# First operand: four values labelled a, c, d, e.
s1 = pd.Series(data=[8.6, -3.2, 6.1, 2.4], index=list('acde'))
# Second operand: five values labelled a, c, e, f, g.
s2 = pd.Series(data=[-4.2, 7.3, -2.7, 5, 1.8], index=list('acefg'))

In [36]:
s1

a    8.6
c   -3.2
d    6.1
e    2.4
dtype: float64

In [37]:
s2

a   -4.2
c    7.3
e   -2.7
f    5.0
g    1.8
dtype: float64

<b>有兩個 DataFrame 如下，求兩個 DataFrame 的乘積，遇到 NaN 以 1 代替</b>

In [38]:
# Seed NumPy's global generator so both frames are reproducible.
np.random.seed(61)
# 3x4 frame of random integers in [0, 10). Drawing with a 2-D size fills the
# array row by row, which matches drawing 12 values and reshaping to (3, 4).
df1 = pd.DataFrame(np.random.randint(10, size=(3, 4)),
                   columns=['a', 'b', 'c', 'd'])
# 4x5 frame of random integers in [0, 20), drawn next from the same stream.
df2 = pd.DataFrame(np.random.randint(20, size=(4, 5)),
                   columns=['a', 'b', 'c', 'd', 'e'])

In [39]:
df1

Unnamed: 0,a,b,c,d
0,3,2,2,0
1,2,1,7,5
2,3,7,0,9


In [40]:
df2

Unnamed: 0,a,b,c,d,e
0,10,19,15,4,2
1,19,10,5,14,8
2,2,13,6,3,2
3,15,13,14,15,9
