In [1]:
import numpy as np 
import pandas as pd

# Seed NumPy's global RNG so the random sample data below is identical on
# every "Restart Kernel -> Run All" instead of changing on each execution.
np.random.seed(0)

# Eight consecutive daily timestamps used as the row index of df.
index = pd.date_range("1/1/2000", periods=8)
# Five standard-normal draws labelled "a".."e".
s = pd.Series(np.random.randn(5), index=["a", "b", "c", "d", "e"])
# 8x3 frame of standard-normal draws indexed by the dates above.
df = pd.DataFrame(np.random.randn(8, 3), index=index, columns=["A", "B", "C"])

In [2]:
# Function application
# https://pandas.pydata.org/docs/user_guide/basics.html#function-application

# Tablewise Function Application: pipe()
# Row or Column-wise Function Application: apply()
# Aggregation API: agg() and transform()
# Applying Elementwise Functions: DataFrame.map() (named applymap() before pandas 2.1) and Series.map()

In [3]:
# Tablewise function application
# 주고 받는 단위가 Table이란 의미

In [4]:
def extract_city_name(df):
    """
    Add a ``city_name`` column holding the text of ``city_and_code`` before
    the first comma, e.g. ``"Chicago, IL"`` -> ``"Chicago"``.

    The function takes a DataFrame and returns a DataFrame, so it can be
    chained with ``DataFrame.pipe``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named ``city_and_code``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the extra ``city_name`` column. The input frame is
        not modified, so re-running the cell gives the same result.

    Raises
    ------
    KeyError
        If ``df`` has no ``city_and_code`` column.
    """
    city_name = df["city_and_code"].str.split(",").str.get(0)
    # assign() returns a copy instead of writing into the caller's frame.
    return df.assign(city_name=city_name)


def add_country_name(df, country_name=None):
    """
    Add a ``city_and_country`` column of the form
    ``"<city_name>-<country_name>"``, e.g. ``"Chicago"`` -> ``"Chicago-US"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named ``city_name``.
    country_name : str
        Suffix appended after the hyphen. The ``None`` default is kept only
        for signature compatibility; a value must be supplied.

    Returns
    -------
    pandas.DataFrame
        A new frame with the extra ``city_and_country`` column. The input
        frame is not modified.

    Raises
    ------
    TypeError
        If ``country_name`` is ``None``.
    KeyError
        If ``df`` has no ``city_name`` column.
    """
    # Fail early with a clear message rather than inside the string
    # concatenation below.
    if country_name is None:
        raise TypeError("country_name must be a string, not None")
    city_and_country = df["city_name"] + "-" + country_name
    # assign() returns a copy instead of writing into the caller's frame.
    return df.assign(city_and_country=city_and_country)

In [5]:
df_p = pd.DataFrame({"city_and_code": ["Chicago, IL"]})
df_p

Unnamed: 0,city_and_code
0,"Chicago, IL"


In [6]:
add_country_name(extract_city_name(df_p), country_name="US")

Unnamed: 0,city_and_code,city_name,city_and_country
0,"Chicago, IL",Chicago,Chicago-US


In [7]:
# pipe
df_p.pipe(extract_city_name).pipe(add_country_name, country_name="US")

Unnamed: 0,city_and_code,city_name,city_and_country
0,"Chicago, IL",Chicago,Chicago-US


* <span style='font-size:13px;'>기존에는 함수에 argument로 DataFrame을 전달해야 했지만, pipe를 쓰면 함수가 아닌 `DataFrame`에서 출발하여 pipe를 통해 함수를 차례로 적용한다.</span>
* <span style='font-size:13px;'>위의 예에서 볼 수 있듯이 code의 가독성이 더 좋다.</span>

In [8]:
import statsmodels.formula.api as sm

# Load the baseball data, using its "id" column as the row index.
bb = pd.read_csv("baseball.csv", index_col="id")

# Method chain: filter rows -> add a log column -> build an OLS model from a
# formula -> fit it -> display the regression summary.
(
    # Keep only rows with h > 0 so that np.log below receives positive values.
    bb.query("h > 0")
    # The lambda's `df` parameter is the frame produced by the previous step,
    # not the notebook-level variable named `df`.
    .assign(ln_h=lambda df: np.log(df.h))
    # pipe((callable, "data"), ...) passes the frame as the keyword argument
    # `data`; the formula string is passed as the first positional argument.
    .pipe((sm.ols, "data"), "hr ~ ln_h + year + g + C(lg)")
    .fit()
    .summary()
)

0,1,2,3
Dep. Variable:,hr,R-squared:,0.685
Model:,OLS,Adj. R-squared:,0.665
Method:,Least Squares,F-statistic:,34.28
Date:,"Tue, 01 Aug 2023",Prob (F-statistic):,3.48e-15
Time:,18:48:32,Log-Likelihood:,-205.92
No. Observations:,68,AIC:,421.8
Df Residuals:,63,BIC:,432.9
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-8484.7720,4664.146,-1.819,0.074,-1.78e+04,835.780
C(lg)[T.NL],-2.2736,1.325,-1.716,0.091,-4.922,0.375
ln_h,-1.3542,0.875,-1.547,0.127,-3.103,0.395
year,4.2277,2.324,1.819,0.074,-0.417,8.872
g,0.1841,0.029,6.258,0.000,0.125,0.243

0,1,2,3
Omnibus:,10.875,Durbin-Watson:,1.999
Prob(Omnibus):,0.004,Jarque-Bera (JB):,17.298
Skew:,0.537,Prob(JB):,0.000175
Kurtosis:,5.225,Cond. No.,14900000.0


In [9]:
# apply는 (1)함수와 (2)적용방향(axis)를 받아 처리한다.

In [27]:
# Ten daily rows starting 2017-01-01, three columns of standard-normal draws.
tsdf = pd.DataFrame(
    data=np.random.randn(10, 3),
    index=pd.date_range('1/1/2017', periods=10),
    columns=['A', 'B', 'C'],
)

tsdf

Unnamed: 0,A,B,C
2017-01-01,0.27837,0.246862,-0.090683
2017-01-02,-2.130206,1.096712,-2.103602
2017-01-03,-1.22315,-0.62089,-1.152949
2017-01-04,0.959679,0.579901,-0.492249
2017-01-05,-1.112302,-1.363842,-1.025832
2017-01-06,0.524362,-0.152818,-0.373051
2017-01-07,0.354844,-0.315082,0.962789
2017-01-08,-0.120536,-0.26976,-0.235639
2017-01-09,1.879419,1.558778,-0.75978
2017-01-10,0.268676,0.48373,0.651702


In [34]:
# Apply a lambda function through the apply method.
# With axis=0 the lambda receives one column at a time as a Series, so the
# result is a Series whose index is tsdf's column labels.
# idxmax() returns the index label where the maximum occurs, not the maximum
# value itself.
tsdf.apply(lambda column: column.idxmax(), axis=0)

A   2017-01-09
B   2017-01-09
C   2017-01-07
dtype: datetime64[ns]

In [11]:
df

Unnamed: 0,A,B,C
2000-01-01,0.526057,-1.502137,-1.19817
2000-01-02,0.929433,0.499059,0.534854
2000-01-03,-0.730648,-0.093314,0.206938
2000-01-04,0.519011,-1.743665,1.258101
2000-01-05,-0.125452,1.527041,0.45193
2000-01-06,-2.133801,0.27118,-0.230116
2000-01-07,0.752412,0.098321,0.170244
2000-01-08,-0.771045,1.118836,0.181748


In [12]:
# 일반 함수를 정의하고 apply method로 적용
# 일반 arg는 tuple의 형태로 전달, keyword arg는 "key=  "     🔰
def subtract_and_divide(x, sub, divide=1):
    """Return ``(x - sub) / divide``.

    Parameters
    ----------
    x : number or array-like
        Value(s) to transform.
    sub : number
        Amount subtracted from ``x`` first.
    divide : number, default 1
        Divisor applied after the subtraction.
    """
    shifted = x - sub
    return shifted / divide
df.apply(subtract_and_divide, args=(5,), divide=3)


Unnamed: 0,A,B,C
2000-01-01,-1.491314,-2.167379,-2.066057
2000-01-02,-1.356856,-1.500314,-1.488382
2000-01-03,-1.910216,-1.697771,-1.597687
2000-01-04,-1.493663,-2.247888,-1.2473
2000-01-05,-1.708484,-1.157653,-1.516023
2000-01-06,-2.377934,-1.576273,-1.743372
2000-01-07,-1.415863,-1.633893,-1.609919
2000-01-08,-1.923682,-1.293721,-1.606084


In [13]:
tsdf.iloc[3:7] = np.nan
tsdf

Unnamed: 0,A,B,C
2017-01-01,-0.723179,-0.390889,2.09954
2017-01-02,1.029738,-0.8728,0.862476
2017-01-03,0.772954,0.14131,-0.955953
2017-01-04,,,
2017-01-05,,,
2017-01-06,,,
2017-01-07,,,
2017-01-08,-0.070706,-0.291981,-1.32502
2017-01-09,1.308736,-0.167677,1.180992
2017-01-10,0.676163,-0.286896,0.302608


In [14]:
# Series Method를 apply로 적용     🔰🔰💢
tsdf.apply(pd.Series.interpolate)

Unnamed: 0,A,B,C
2017-01-01,-0.723179,-0.390889,2.09954
2017-01-02,1.029738,-0.8728,0.862476
2017-01-03,0.772954,0.14131,-0.955953
2017-01-04,0.604222,0.054652,-1.029766
2017-01-05,0.43549,-0.032006,-1.10358
2017-01-06,0.266758,-0.118664,-1.177393
2017-01-07,0.098026,-0.205323,-1.251207
2017-01-08,-0.070706,-0.291981,-1.32502
2017-01-09,1.308736,-0.167677,1.180992
2017-01-10,0.676163,-0.286896,0.302608


In [15]:
s = pd.Series([np.nan, "single_one", np.nan,
               "fill_two_more", np.nan, np.nan, np.nan,
               4.71, np.nan])
s

0              NaN
1       single_one
2              NaN
3    fill_two_more
4              NaN
5              NaN
6              NaN
7             4.71
8              NaN
dtype: object

In [16]:
# Forward-fill missing values, filling at most 2 consecutive NaNs per gap;
# any further NaNs in a longer run are left as NaN.
# ffill() is used instead of interpolate(method='pad'), which pandas has
# deprecated (as is interpolating an object-dtype Series).
s.ffill(limit=2)

0              NaN
1       single_one
2       single_one
3    fill_two_more
4    fill_two_more
5    fill_two_more
6              NaN
7             4.71
8             4.71
dtype: object

In [17]:
# Aggregation API(agg())
# Aggregating with multiple functions

In [18]:
df = pd.DataFrame([[1, 2, 3],
                   [4, 5, 6],
                   [7, 8, 9],
                   [np.nan, np.nan, np.nan]],
                  columns=['A', 'B', 'C'])
df

Unnamed: 0,A,B,C
0,1.0,2.0,3.0
1,4.0,5.0,6.0
2,7.0,8.0,9.0
3,,,


In [35]:
df.agg(['sum', 'min', 'std', 'median'])
# You can also pass "named methods as strings"('sum', 'mean' etc).

Unnamed: 0,A,B,C
sum,12.0,15.0,18.0
min,1.0,2.0,3.0
std,3.0,3.0,3.0
median,4.0,5.0,6.0


In [20]:
df.agg({'A' : ['sum', 'min'], 'B' : ['min', 'max']})

Unnamed: 0,A,B
sum,12.0,
min,1.0,2.0
max,,8.0


In [21]:
# Named aggregation: each keyword (x, y, z) becomes a row label of the result
# and maps to a (column, aggregation) pair.
# Aggregations are given as string names; passing the builtin max or np.mean
# is deprecated in recent pandas, which recommends the string form.
df.agg(x=('A', 'max'), y=('B', 'min'), z=('C', 'mean'))


Unnamed: 0,A,B,C
x,7.0,,
y,,2.0,
z,,,6.0


In [22]:
# lambda
tsdf["A"].agg(["sum", lambda x: x.mean()])


sum         2.993707
<lambda>    0.498951
Name: A, dtype: float64

In [23]:
# 정의 함수
def mymean(x):
    """Return the mean of ``x`` by calling its own ``mean()`` method."""
    average = x.mean()
    return average

tsdf["A"].agg(["sum", mymean])


sum       2.993707
mymean    0.498951
Name: A, dtype: float64

In [24]:
# Aggregating with a dict
# 어떤 column(row)에 어떤 함수를 적용할 것인가를 Dictionary에서 정의해서 준다.
tsdf.agg({'A': 'mean', 'B':'sum'})

A    0.498951
B   -1.868934
dtype: float64

In [38]:
# Transform API
tsdf.iloc[3:7] = np.nan
tsdf

Unnamed: 0,A,B,C
2017-01-01,0.27837,0.246862,-0.090683
2017-01-02,-2.130206,1.096712,-2.103602
2017-01-03,-1.22315,-0.62089,-1.152949
2017-01-04,,,
2017-01-05,,,
2017-01-06,,,
2017-01-07,,,
2017-01-08,-0.120536,-0.26976,-0.235639
2017-01-09,1.879419,1.558778,-0.75978
2017-01-10,0.268676,0.48373,0.651702


In [39]:
# Here transform() received a single function; this is equivalent to a "ufunc" application.
# transform() can also receive multiple functions (including lambdas) or a dictionary
tsdf.transform(np.abs)

Unnamed: 0,A,B,C
2017-01-01,0.27837,0.246862,0.090683
2017-01-02,2.130206,1.096712,2.103602
2017-01-03,1.22315,0.62089,1.152949
2017-01-04,,,
2017-01-05,,,
2017-01-06,,,
2017-01-07,,,
2017-01-08,0.120536,0.26976,0.235639
2017-01-09,1.879419,1.558778,0.75978
2017-01-10,0.268676,0.48373,0.651702


In [None]:
# Applying elementwise functions
# DataFrame.map (named applymap before pandas 2.1) for DataFrame, map for Series

In [41]:
s = pd.Series(
    ["six", "seven", "six", "seven", "six"], index=["a", "b", "c", "d", "e"]
)

t = pd.Series({"six": 6.0, "seven": 7.0})
# When a Series is built from a dict, the dict keys become the row index
# (whereas for a DataFrame built from a dict, the keys become column labels).

t

six      6.0
seven    7.0
dtype: float64

In [42]:
# Map each value of s to the entry of t whose index label equals that value.
# t is the Series defined beforehand to serve as the lookup table.
s.map(t)

a    6.0
b    7.0
c    6.0
d    7.0
e    6.0
dtype: float64