# 함수 적용 (apply)

함수 적용은 시리즈 또는 데이터플임의 개별 변수를 특정 함수에 대응시키는 것을 의미한다.
분석가가 직접 만든 함수 (lambda 함수 등)을 적용할 수 있으므로, 판다스에서 기본적으로 제공하는 함수로 처리하기 힘든 작업을 할 수 있음
이를 이용해서 모델링 시 필요한 새로운 변수를 만들거나, 기존 변수를 변경하는 등으로 활용 가능함 


In [1]:
import pandas as pd
from sklearn import datasets

In [4]:
data = datasets.load_iris()
df = pd.DataFrame(data.data, columns=data.feature_names)
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [5]:
def half_length(length):
    return length / 2

In [6]:
sepal_half = df.loc[:,['sepal length (cm)']].apply(half_length)
#전체 행 중에서 sepal length열과 교집합인 데이터를 부른다음 각각의 데이터를 half_length 함수의 인자로 대입, 해당함수의 리턴값으로 값이 변경됨
#그렇게 만들어진 시리즈를 sepal_half에 할당 
print(sepal_half.head())

   sepal length (cm)
0               2.55
1               2.45
2               2.35
3               2.30
4               2.50


In [7]:
sepal_half = df.loc[:,['sepal length (cm)']].apply(lambda x : x/2)
print(sepal_half.head())

   sepal length (cm)
0               2.55
1               2.45
2               2.35
3               2.30
4               2.50


In [8]:
def area(length, width):
    return length*width

In [18]:
sepal_area = df.loc[:,['sepal length (cm)', 'sepal width (cm)']].apply(lambda x : area(x['sepal length (cm)'], x['sepal width (cm)']), axis=1)
#전체 행과 sepal length, sepal width열의 교집합인 데이터(행 :? 열:2인 데이터프레임)을 부르고
#lambda의 x는 그 데이터프레임의 각 행을 의미한다. x['sepal length (cm)']는 해당 데이터프레임의 각행의 sepal length 열과의 교집합인 값을 의미한다.
#lambda 인자 : 식, 식자리에 앞에서 선언한 함수를 사용해도 좋다. 
#apply메소드는 기본적으로 '시리즈'에 적용가능함 따라서 데이터프레임에 적용시 axis = 1 또는 axis = 0 인자가 없으면 안됨 
#axis = 1 인자는 모든 행에 적용하겠다는 의미 
#즉 lambda의 x가 교집합 데이터의 각 행을 추출한다는 의미 
#만약 axis = 0을 하면 x가 교집합 데이터의 각 열을 추출함. 그러면 행과 열을 뽑을 수가 없음 
print(sepal_area.head())

0    17.85
1    14.70
2    15.04
3    14.26
4    18.00
dtype: float64


In [12]:
df_half = df.applymap(half_length) #applymap은 데이터프레임에 사용가능한 메소드이며 df안의 모든원소에 적용한다 

In [13]:
df_half

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,2.55,1.75,0.70,0.10
1,2.45,1.50,0.70,0.10
2,2.35,1.60,0.65,0.10
3,2.30,1.55,0.75,0.10
4,2.50,1.80,0.70,0.10
...,...,...,...,...
145,3.35,1.50,2.60,1.15
146,3.15,1.25,2.50,0.95
147,3.25,1.50,2.60,1.00
148,3.10,1.70,2.70,1.15


In [16]:
df_double = df.apply(lambda x: x*2 , axis = 0) #apply를 데이터프레임에 적용하기 위해서 축 정보를 입력함, axis = 0(모든 열에 적용)
                                               #여기서는 axis = 0 이나 axis =1 모두 같은 결과를 나타냄 
                                               #각 행을 뽑아서 2배를 하든 각 열을 뽑아서 2배를 하든 어차피 전체적으로 데이터는 2배가 되는 거니까
df_double.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,10.2,7.0,2.8,0.4
1,9.8,6.0,2.8,0.4
2,9.4,6.4,2.6,0.4
3,9.2,6.2,3.0,0.4
4,10.0,7.2,2.8,0.4


In [19]:
def area(length, width):
    return length*width 

df['sepal_area'] =  df.loc[:,['sepal length (cm)', 'sepal width (cm)']].apply(\
                    lambda x : area(x['sepal length (cm)'], x['sepal width (cm)']), axis=1)
#기존 데이터 프레임에 새로운 열을 추가해서 기존 데이터프레임에 반영 
print(df.head())


   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

   sepal_area  
0       17.85  
1       14.70  
2       15.04  
3       14.26  
4       18.00  


## 실습

1. petal length (cm) 변수에 제곱에 대한 시리즈 결과값을 추출하세요. 함수방식과 lambda함수 방식 모두 구현!

2. petal의 면적을 구하는 apply 함수를 구현하세요. 여기서는 lambda 방식을 활용해보세요

#### petal length 제곱 구하기

In [22]:
def square_length(length):
    return length**2

In [23]:
petal_square = df.loc[:,['petal length (cm)']].apply(square_length)
print(petal_square.head())

   petal length (cm)
0               1.96
1               1.96
2               1.69
3               2.25
4               1.96


#### petal의 면적을 구하는 apply 함수 구현

In [None]:
# petal 면적을 구하기 (lambda 이용하기)

In [None]:
# 기존 데이터셋(df)에서 하고싶은 방식을 이용하여 새로운 변수 2가지 이상을 만들어보고, 하나의 데이터셋으로 만들어보세요
# df.head() 시에 새로 추가된 변수가 같이 표현되어야 합니다.

seaborn 패키지에서 데이터를 받아서 아래 설명대로 진행하시면 됩니다. 설치가되어있지 않다면 아래 Pip install seaborn을 실행시키세요

1. 나이 열에 10살을 더하는 함수를 구현하여 적용하고 결과값을 확인하세요
2. 나이에 특정 파라미터 숫자를 입력받아서 더하는 함수를 구현하고, 파라미터에 5를 넣어서 결과값을 확인하세요.

-- ex) def add_age_num(age, num): -> age는 데이터에서 입력받고, num은 사용자가 넣을 수 있도록

In [None]:
! pip install seaborn

In [None]:
import seaborn as sns
data = sns.load_dataset('titanic')
data.head()

# 데이터 행/열 다루기

# 열 순서 변경하기

In [24]:
boston = datasets.load_boston()
boston = pd.DataFrame(boston.data, columns=boston.feature_names)
boston.head()


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np


        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_h

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [25]:
col_names = list(boston.columns)
print(col_names)

['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']


In [27]:
sorted_col_names = sorted(col_names) #리스트를 오름차순 정렬, reverse = True 인자 있으면 내림차순 정렬 
print(sorted_col_names) #sorted_col_names는 리스트임 
boston_col_sort = boston[sorted_col_names] #df[[]]구조 df[[a,b,c,d,...]]해당열을 모두 선택한다는 의미, 즉 오름차순으로 정렬된 df를 할당한다는 의미
boston_col_sort.head()

['AGE', 'B', 'CHAS', 'CRIM', 'DIS', 'INDUS', 'LSTAT', 'NOX', 'PTRATIO', 'RAD', 'RM', 'TAX', 'ZN']


Unnamed: 0,AGE,B,CHAS,CRIM,DIS,INDUS,LSTAT,NOX,PTRATIO,RAD,RM,TAX,ZN
0,65.2,396.9,0.0,0.00632,4.09,2.31,4.98,0.538,15.3,1.0,6.575,296.0,18.0
1,78.9,396.9,0.0,0.02731,4.9671,7.07,9.14,0.469,17.8,2.0,6.421,242.0,0.0
2,61.1,392.83,0.0,0.02729,4.9671,7.07,4.03,0.469,17.8,2.0,7.185,242.0,0.0
3,45.8,394.63,0.0,0.03237,6.0622,2.18,2.94,0.458,18.7,3.0,6.998,222.0,0.0
4,54.2,396.9,0.0,0.06905,6.0622,2.18,5.33,0.458,18.7,3.0,7.147,222.0,0.0


In [None]:
user_sort_col = ['AGE', 'B', 'DIS', 'INDUS', 'CHAS', 'NOX', 'LSTAT', 'NOX', 'RAD', 'RM', 'TAX', 'ZN', 'PTRATIO']
#내가 원하는 순서대로 리스트를 만들어서 열로 설정가능함 
boston_col_sort = boston[user_sort_col]
boston_col_sort.head()

## 열 이름 변경

In [28]:
boston.columns=['x1','x2','x3','x4','x5','x6','x7','x8','x9','x10','x11','x12','x13']
#df.columns = [새롭게 열의 이름을 지정]
boston.head()

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


### 주의사항 :: 열 갯수가 맞지 않을경우 아래와 같이 에러가 발생함

In [29]:
boston.columns=['x1','x2','x3','x4','x5','x6','x7','x8','x9','x10']
boston.head()

ValueError: Length mismatch: Expected axis has 13 elements, new values have 10 elements

In [31]:
boston.rename(columns={'ZN':'x1', 'AGE':'x2'}, inplace=True) #바꾸고 싶은거만 딕셔너리를 활용해서 바꿀 수 있음 
boston.head()

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


## 특정 조건식에 맞는 행 추출

In [32]:
boston = datasets.load_boston()
boston = pd.DataFrame(boston.data, columns=boston.feature_names)
boston.head()


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np


        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_h

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [34]:
bool_mask = boston['INDUS']>10 #보스턴 df의 특정열의 데이터를 선택하고 10이 넘는 것은 True로 값을 가지는 시리즈를 할당함 
print(bool_mask)

0      False
1      False
2      False
3      False
4      False
       ...  
501     True
502     True
503     True
504     True
505     True
Name: INDUS, Length: 506, dtype: bool


In [35]:
indus_over_10 = boston.loc[bool_mask,:] #true로 나온 행만 선택 , loc과 불리안의 활용으로 어떤 조건을 만족하는 특정열의 데이터가 속한 행만 추출 가능
indus_over_10.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
70,0.08826,0.0,10.81,0.0,0.413,6.417,6.6,5.2873,4.0,305.0,19.2,383.73,6.72
71,0.15876,0.0,10.81,0.0,0.413,5.961,17.5,5.2873,4.0,305.0,19.2,376.94,9.88
72,0.09164,0.0,10.81,0.0,0.413,6.065,7.8,5.2873,4.0,305.0,19.2,390.91,5.52
73,0.19539,0.0,10.81,0.0,0.413,6.245,6.2,5.2873,4.0,305.0,19.2,377.17,7.54
74,0.07896,0.0,12.83,0.0,0.437,6.273,6.0,4.2515,5.0,398.0,18.7,394.92,6.78


In [36]:
mask = (boston['AGE']>10) & (boston['TAX']>300) #여러조건을 하는 경우 반드시 조건별 ()를 사용한다.
boston.loc[mask,:]

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
6,0.08829,12.5,7.87,0.0,0.524,6.012,66.6,5.5605,5.0,311.0,15.2,395.60,12.43
7,0.14455,12.5,7.87,0.0,0.524,6.172,96.1,5.9505,5.0,311.0,15.2,396.90,19.15
8,0.21124,12.5,7.87,0.0,0.524,5.631,100.0,6.0821,5.0,311.0,15.2,386.63,29.93
9,0.17004,12.5,7.87,0.0,0.524,6.004,85.9,6.5921,5.0,311.0,15.2,386.71,17.10
10,0.22489,12.5,7.87,0.0,0.524,6.377,94.3,6.3467,5.0,311.0,15.2,392.52,20.45
...,...,...,...,...,...,...,...,...,...,...,...,...,...
496,0.28960,0.0,9.69,0.0,0.585,5.390,72.9,2.7986,6.0,391.0,19.2,396.90,21.14
497,0.26838,0.0,9.69,0.0,0.585,5.794,70.6,2.8927,6.0,391.0,19.2,396.90,14.10
498,0.23912,0.0,9.69,0.0,0.585,6.019,65.3,2.4091,6.0,391.0,19.2,396.90,12.92
499,0.17783,0.0,9.69,0.0,0.585,5.569,73.5,2.3999,6.0,391.0,19.2,395.77,15.10


## 실습해보세요

1. LSTAT > 20 이거나(or) RM < 5.0 인 데이터를 추출해보세요.

2. LSTAT > 20 이거나(or) RM < 5.0 인 데이터 중 열을 'CRIM', 'ZN', 'INDUS' 만 추출하세요. 

In [None]:
# 1. LSTAT > 20 이거나(or) RM < 5.0 인 데이터를 추출해보세요.

In [None]:
# 2. LSTAT > 20 이거나(or) RM < 5.0 인 데이터 중 열을 'CRIM', 'ZN', 'INDUS' 만 추출하세요. 

## 특정 조건식에 맞는 행 추출 (isin 활용)

In [40]:
boston.astype({'RAD':'int'}) #?
boston.head()


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [41]:
mask_1 = (boston['RAD'] == 4) | (boston['RAD'] == 5) | (boston['RAD'] == 6) #이렇게 or을 여러개 사용해서 특정조건 표현 가능 
(boston.loc[mask,:]).head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
6,0.08829,12.5,7.87,0.0,0.524,6.012,66.6,5.5605,5.0,311.0,15.2,395.6,12.43
7,0.14455,12.5,7.87,0.0,0.524,6.172,96.1,5.9505,5.0,311.0,15.2,396.9,19.15
8,0.21124,12.5,7.87,0.0,0.524,5.631,100.0,6.0821,5.0,311.0,15.2,386.63,29.93
9,0.17004,12.5,7.87,0.0,0.524,6.004,85.9,6.5921,5.0,311.0,15.2,386.71,17.1
10,0.22489,12.5,7.87,0.0,0.524,6.377,94.3,6.3467,5.0,311.0,15.2,392.52,20.45


In [None]:
mask_2 = boston['RAD'].isin([4,5,6]) #특정조건은 isin이 더 편리하다. 리스트 원소중 하나에 포함되면 True를 반환 
(boston.loc[mask,:]).head()

## 실습해보세요
seaborn 패키지에서 데이터를 받아서 아래 설명대로 진행하시면 됩니다. 설치가되어있지 않다면 아래 Pip install seaborn을 실행시키세요

1. isin 함수를 이용해서 'sibsp' 변수가 3 또는 4 또는 5인 경우를 추출하세요

2. isin 함수를 이용해서 'sex' male 인 경우를 추출하세요

In [None]:
! pip install seaborn

In [None]:
import seaborn as sns
data = sns.load_dataset('titanic')
data.head()

In [None]:
# isin 함수를 이용해서 'sibsp' 변수가 3 또는 4 또는 5인 경우를 추출하세요

In [None]:
# isin 함수를 이용해서 'sex' male 인 경우를 추출하세요