### 인덱싱, 데이터 조작, 인덱스 조작
- loc[] : 라벨 기반의 2차원 인덱싱 (함수 호출이 아니라 대괄호로 사용하는 인덱서)
- iloc[] : 순서(위치)를 나타내는 정수 기반의 2차원 인덱싱 (함수 호출이 아니라 대괄호로 사용하는 인덱서)

In [78]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Label-based indexing forms demonstrated in the following cells:
#   df.loc[row_label]
#   df.loc[row_label, column_label]

# 3x4 frame holding 10..21, with letter labels on both axes.
sample_df = pd.DataFrame(
    data=np.arange(10, 22).reshape(3, 4),
    index=list('abc'),
    columns=list('ABCD'),
)
sample_df

Unnamed: 0,A,B,C,D
a,10,11,12,13
b,14,15,16,17
c,18,19,20,21


In [4]:
sample_df.loc['a']

A    10
B    11
C    12
D    13
Name: a, dtype: int32

In [5]:
type(sample_df.loc['a'])

pandas.core.series.Series

In [6]:
# Series이기 때문에 관련 함수 사용 가능
sample_df.loc['a'].values

array([10, 11, 12, 13])

In [12]:
display(sample_df.loc['b':'c'])
print('*' * 50)

display(sample_df['b':'c']) 
print('*' * 50)

display(sample_df.loc[['b', 'c']]) # 세 방법 모두 같은 결과가 나온다.

Unnamed: 0,A,B,C,D
b,14,15,16,17
c,18,19,20,21


**************************************************


Unnamed: 0,A,B,C,D
b,14,15,16,17
c,18,19,20,21


**************************************************


Unnamed: 0,A,B,C,D
b,14,15,16,17
c,18,19,20,21


In [17]:
sample_df.A

a    10
b    14
c    18
Name: A, dtype: int32

In [18]:
type(sample_df.A)

pandas.core.series.Series

In [25]:
sample_df.loc[sample_df.A > 15]

Unnamed: 0,A,B,C,D
c,18,19,20,21


In [29]:
sample_df2 = pd.DataFrame(np.arange(10, 26).reshape(4, 4), columns = ['A', 'B', 'C', 'D'])
sample_df2

Unnamed: 0,A,B,C,D
0,10,11,12,13
1,14,15,16,17
2,18,19,20,21
3,22,23,24,25


In [34]:
sample_df2.loc[1:2] # end index - 1 이 아닌 2 로 인식

Unnamed: 0,A,B,C,D
1,14,15,16,17
2,18,19,20,21


In [39]:
sample_df

Unnamed: 0,A,B,C,D
a,10,11,12,13
b,14,15,16,17
c,18,19,20,21


In [37]:
sample_df.loc['a', 'A']

10

In [38]:
sample_df.loc['b':, 'A']

b    14
c    18
Name: A, dtype: int32

In [44]:
sample_df.loc['a', :]

A    10
B    11
C    12
D    13
Name: a, dtype: int32

In [48]:
sample_df.loc[['b', 'c'], ['C', 'D']]

Unnamed: 0,C,D
b,16,17
c,20,21


In [54]:
sample_df.loc[sample_df.A > 10, ['C', 'D']]

Unnamed: 0,C,D
b,16,17
c,20,21


In [49]:
# iloc
# 인덱스 이름이 아닌 정수 인덱스를 통한 접근
sample_df.iloc[0, 1]

11

In [50]:
sample_df.iloc[:, 1]

a    11
b    15
c    19
Name: B, dtype: int32

In [52]:
sample_df.iloc[0, 2:4]

C    12
D    13
Name: a, dtype: int32

In [53]:
sample_df.iloc[-1, 1:3]

B    19
C    20
Name: c, dtype: int32

In [55]:
sample_df.iloc[-1] = sample_df.iloc[-1] * 2
sample_df

Unnamed: 0,A,B,C,D
a,10,11,12,13
b,14,15,16,17
c,36,38,40,42


In [56]:
# count
# 결측치를 제외한 데이터의 개수를 반환
s = pd.Series(range(10))
s

0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: int64

In [58]:
# Mark two entries as missing and count what is left.
# `s` was built from range(), so it is an integer Series; convert it to float
# explicitly before storing NaN instead of relying on an implicit upcast.
s = s.astype('float64')
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
s.loc[[2, 5]] = np.nan
s.count()

8

In [60]:
np.random.seed(2)  # fixed seed so the random sample is reproducible
# 4x4 frame of random integers drawn from 0..4, stored as float64.
count_df = pd.DataFrame(
    data=np.random.randint(5, size=(4, 4)),
    dtype=np.float64,
)
count_df

Unnamed: 0,0,1,2,3
0,0.0,0.0,3.0,2.0
1,3.0,0.0,2.0,1.0
2,3.0,2.0,4.0,4.0
3,4.0,3.0,4.0,2.0


In [61]:
count_df.count()

0    4
1    4
2    4
3    4
dtype: int64

In [62]:
# Blank out three cells by (row position, column position) so that count()
# has missing values to skip.
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
for row_pos, col_pos in [(1, 0), (2, 3), (3, 0)]:
    count_df.iloc[row_pos, col_pos] = np.nan
count_df

Unnamed: 0,0,1,2,3
0,0.0,0.0,3.0,2.0
1,,0.0,2.0,1.0
2,3.0,2.0,4.0,
3,,3.0,4.0,2.0


In [63]:
count_df.count()

0    2
1    4
2    4
3    3
dtype: int64

In [80]:
import seaborn as sns

titanic = sns.load_dataset('titanic', engine = 'python')
titanic.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [88]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 17 columns):
survived       891 non-null int64
pclass         891 non-null int64
sex            891 non-null object
age            714 non-null float64
sibsp          891 non-null int64
parch          891 non-null int64
fare           891 non-null float64
embarked       889 non-null object
class          891 non-null category
who            891 non-null object
adult_male     891 non-null bool
deck           203 non-null category
embark_town    889 non-null object
alive          891 non-null object
alone          891 non-null bool
age_0          891 non-null int64
age_by_10      714 non-null float64
dtypes: bool(2), category(2), float64(3), int64(5), object(5)
memory usage: 94.5+ KB


In [89]:
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,age_0,age_by_10
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False,0,220.0
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,0,380.0
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True,0,260.0
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False,0,350.0
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True,0,350.0


In [90]:
titanic.count()

survived       891
pclass         891
sex            891
age            714
sibsp          891
parch          891
fare           891
embarked       889
class          891
who            891
adult_male     891
deck           203
embark_town    889
alive          891
alone          891
age_0          891
age_by_10      714
dtype: int64

In [91]:
titanic.columns

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone', 'age_0', 'age_by_10'],
      dtype='object')

In [92]:
type(titanic['pclass'])

pandas.core.series.Series

In [94]:
# value_counts()
# 특정 Series의 각 value들의 개수를 count하는 함수
titanic['pclass'].value_counts()

3    491
1    216
2    184
Name: pclass, dtype: int64

In [83]:
titanic['pclass'].value_counts().values

array([491, 216, 184], dtype=int64)

In [84]:
# 새로운 열 추가
# age_0 일괄적으로 0 할당
titanic['age_0'] = 0
titanic.columns

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone', 'age_0'],
      dtype='object')

In [85]:
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,age_0
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False,0
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,0
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True,0
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False,0
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True,0


In [86]:
# age의 각 값에 10을 곱한 age_by_10 컬럼 생성
titanic['age_by_10'] = titanic['age'] * 10
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,age_0,age_by_10
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False,0,220.0
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,0,380.0
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True,0,260.0
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False,0,350.0
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True,0,350.0


In [95]:
# parch와 sibsp의 값과 1을 더한 family_no 컬럼 생성
titanic['family_no'] = titanic['parch'] + titanic['sibsp'] + 1
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,age_0,age_by_10,family_no
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False,0,220.0,2
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,0,380.0,2
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True,0,260.0,1
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False,0,350.0,2
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True,0,350.0,1


In [96]:
# age_by_10 컬럼 값에 일괄적으로 +100 처리
titanic['age_by_10'] += 100
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,age_0,age_by_10,family_no
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False,0,320.0,2
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,0,480.0,2
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True,0,360.0,1
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False,0,450.0,2
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True,0,450.0,1


In [98]:
# drop : remove rows or columns.
# With inplace=True the deletion is applied to the original DataFrame itself;
# without it, drop() returns a new DataFrame.
# Drop the age_0 column into a separate frame.

# Keep the full frame here; head() is applied only for display below, so
# titanic_drop_df is not silently truncated to five rows.
titanic_drop_df = titanic.drop('age_0', axis = 1)
titanic_drop_df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,age_by_10,family_no
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False,320.0,2
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,480.0,2
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True,360.0,1
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False,450.0,2
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True,450.0,1


In [99]:
titanic.drop(['age_0', 'age_by_10', 'family_no'], axis = 1, inplace = True)
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [100]:
# 행 삭제
titanic.drop([0, 1, 2], axis = 0, inplace = True)
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True
5,0,3,male,,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
7,0,3,male,2.0,3,1,21.075,S,Third,child,False,,Southampton,no,False


In [105]:
print(type(titanic.index.values))
print(titanic.index.shape)

<class 'numpy.ndarray'>
(888,)


In [107]:
# 인덱스에 대한 슬라이싱 및 인덱싱
titanic.index.values[:5]

array([3, 4, 5, 6, 7], dtype=int64)

In [108]:
# 인덱스 추출
titanic.index.values[6]

9

In [231]:
series_fare = titanic['fare']
print('series')
print(series_fare[:10])
print('type', type(series_fare))

series
3     53.1000
4      8.0500
5      8.4583
6     51.8625
7     21.0750
8     11.1333
9     30.0708
10    16.7000
11    26.5500
12     8.0500
Name: fare, dtype: float64
type <class 'pandas.core.series.Series'>


In [233]:
# max, min, sum
print('max :', series_fare.max())
print('min :', series_fare.min())
print('sum :', series_fare.sum())
print('sum :', np.sum(series_fare))
print('*' * 50)

print('DC 10% :')
print(series_fare[:10] * 0.9)

max : 512.3292
min : 0.0
sum : 28607.491
sum : 28607.491
**************************************************
DC 10% :
3     47.79000
4      7.24500
5      7.61247
6     46.67625
7     18.96750
8     10.01997
9     27.06372
10    15.03000
11    23.89500
12     7.24500
Name: fare, dtype: float64


In [117]:
# reset_index() : 새로운 인덱스를 할당하고, 기존 인덱스는 인덱스라는 새로운 컬럼명으로 추가
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True
5,0,3,male,,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
7,0,3,male,2.0,3,1,21.075,S,Third,child,False,,Southampton,no,False


In [118]:
titanic_reset_index_df = titanic.reset_index(inplace = False)
titanic_reset_index_df.head()

Unnamed: 0,index,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
1,4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True
2,5,0,3,male,,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True
3,6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
4,7,0,3,male,2.0,3,1,21.075,S,Third,child,False,,Southampton,no,False


In [119]:
titanic_reset_index_df['fare'].head()

0    53.1000
1     8.0500
2     8.4583
3    51.8625
4    21.0750
Name: fare, dtype: float64

In [122]:
titanic_reset_index_df[['pclass', 'fare']].head()

Unnamed: 0,pclass,fare
0,1,53.1
1,3,8.05
2,3,8.4583
3,1,51.8625
4,3,21.075


In [129]:
# titanic_reset_index_df['pclass'] == 3
titanic_reset_index_df[titanic_reset_index_df['pclass'] == 3].head()

Unnamed: 0,index,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True
2,5,0,3,male,,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True
4,7,0,3,male,2.0,3,1,21.075,S,Third,child,False,,Southampton,no,False
5,8,1,3,female,27.0,0,2,11.1333,S,Third,woman,False,,Southampton,yes,False
7,10,1,3,female,4.0,1,1,16.7,S,Third,child,False,G,Southampton,yes,False


In [131]:
titanic_reset_index_df.iloc[0:7, 2:4]

Unnamed: 0,pclass,sex
0,1,female
1,3,male
2,3,male
3,1,male
4,3,male
5,3,female
6,2,female


In [132]:
titanic_reset_index_df.iloc[[4, 6, 8], [2, 4, 6]]

Unnamed: 0,pclass,age,parch
4,3,2.0,1
6,2,14.0,0
8,1,58.0,0


In [134]:
# age가 60 이상인 정보만 추출
titanic_reset_index_df[titanic_reset_index_df['age'] >= 60].head()

Unnamed: 0,index,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
30,33,0,2,male,66.0,0,0,10.5,S,Second,man,True,,Southampton,no,True
51,54,0,1,male,65.0,0,1,61.9792,C,First,man,True,B,Cherbourg,no,False
93,96,0,1,male,71.0,0,0,34.6542,C,First,man,True,A,Cherbourg,no,True
113,116,0,3,male,70.5,0,0,7.75,Q,Third,man,True,,Queenstown,no,True
167,170,0,1,male,61.0,0,0,33.5,S,First,man,True,B,Southampton,no,True


In [168]:
# age가 60 이상인 pclass, survived, who 만 추출
display(titanic_reset_index_df[titanic_reset_index_df['age'] >= 60][['pclass', 'survived', 'who']].head())
print('*' * 50)

display(titanic_reset_index_df.loc[titanic_reset_index_df['age'] >= 60, ['pclass', 'survived', 'who']].head())

Unnamed: 0,pclass,survived,who
30,2,0,man
51,1,0,man
93,1,0,man
113,3,0,man
167,1,0,man


**************************************************


Unnamed: 0,pclass,survived,who
30,2,0,man
51,1,0,man
93,1,0,man
113,3,0,man
167,1,0,man


In [156]:
# Select passengers who are older than 60, travel in first class, and are female.
is_over_60 = titanic_reset_index_df['age'] > 60
is_first_class = titanic_reset_index_df['pclass'] == 1
is_female = titanic_reset_index_df['sex'] == 'female'
titanic_reset_index_df[is_over_60 & is_first_class & is_female]

Unnamed: 0,index,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
272,275,1,1,female,63.0,1,0,77.9583,S,First,woman,False,D,Southampton,yes,False
826,829,1,1,female,62.0,0,0,80.0,,First,woman,False,B,,yes,True


In [197]:
# Sorting
#   sort_index()
#   sort_values()

np.random.seed(100)  # fixed seed so the random sample is reproducible
# 6x4 frame of random integers drawn from 0..9.
sort_df = pd.DataFrame(data=np.random.randint(low=0, high=10, size=(6, 4)))
sort_df

Unnamed: 0,0,1,2,3
0,8,8,3,7
1,7,0,4,2
2,5,2,2,2
3,1,0,8,4
4,0,9,6,2
5,4,1,5,3


In [198]:
sort_df.columns = ['A', 'B', 'C', 'D']
sort_df.index = pd.date_range('20201014', periods = 6)

sort_df

Unnamed: 0,A,B,C,D
2020-10-14,8,8,3,7
2020-10-15,7,0,4,2
2020-10-16,5,2,2,2
2020-10-17,1,0,8,4
2020-10-18,0,9,6,2
2020-10-19,4,1,5,3


In [199]:
# error
# np.random.shuffle(sort_df.index)

random_date = np.random.permutation(sort_df.index)
random_date

array(['2020-10-14T00:00:00.000000000', '2020-10-16T00:00:00.000000000',
       '2020-10-15T00:00:00.000000000', '2020-10-17T00:00:00.000000000',
       '2020-10-19T00:00:00.000000000', '2020-10-18T00:00:00.000000000'],
      dtype='datetime64[ns]')

In [200]:
sort_df2 = sort_df.reindex(index = random_date, columns = ['B', 'A', 'D', 'C'])
sort_df2

Unnamed: 0,B,A,D,C
2020-10-14,8,8,7,3
2020-10-16,2,5,2,2
2020-10-15,0,7,2,4
2020-10-17,0,1,4,8
2020-10-19,1,4,3,5
2020-10-18,9,0,2,6


In [234]:
# axis = 0 : row, axis = 1 : col
sort_df2.sort_index(axis = 0, ascending = True)

Unnamed: 0,B,A,D,C
2020-10-14,8,8,7,3
2020-10-15,0,7,2,4
2020-10-16,2,5,2,2
2020-10-17,0,1,4,8
2020-10-18,9,0,2,6
2020-10-19,1,4,3,5


In [235]:
# 특정 컬럼 값을 기준으로 행 정렬
sort_df2.sort_values(by = 'B', ascending = False)

Unnamed: 0,B,A,D,C
2020-10-18,9,0,2,6
2020-10-14,8,8,7,3
2020-10-16,2,5,2,2
2020-10-19,1,4,3,5
2020-10-15,0,7,2,4
2020-10-17,0,1,4,8


In [236]:
sort_df2['row_sum'] = sort_df2.sum(axis=1)
sort_df2

Unnamed: 0,B,A,D,C,row_sum
2020-10-14,8,8,7,3,26
2020-10-16,2,5,2,2,11
2020-10-15,0,7,2,4,13
2020-10-17,0,1,4,8,13
2020-10-19,1,4,3,5,13
2020-10-18,9,0,2,6,17


In [237]:
sort_df2.loc['row_sum'] = sort_df2.sum(axis = 0)
sort_df2

Unnamed: 0,B,A,D,C,row_sum
2020-10-14 00:00:00,8,8,7,3,26
2020-10-16 00:00:00,2,5,2,2,11
2020-10-15 00:00:00,0,7,2,4,13
2020-10-17 00:00:00,0,1,4,8,13
2020-10-19 00:00:00,1,4,3,5,13
2020-10-18 00:00:00,9,0,2,6,17
row_sum,20,25,20,28,93


In [194]:
# Shared building blocks for the three exercises below.
ages = titanic['age']
is_female = titanic['sex'] == 'female'
is_first_class = titanic['pclass'] == 1

# Mean age of the passengers.
print(round(ages.mean(), 2))

# Mean age of the female passengers.
print(round(ages[is_female].mean(), 2))

# Mean age of the female passengers in first class.
print(round(ages[is_first_class & is_female].mean(), 2))

29.7
27.88
34.57


### apply 변환
- 행이나 열 단위로 복잡한 데이터 가공이 필요한 경우 사용하는 함수이다
- lambda 식
- apply 함수는 인자로 함수를 넘겨받을 수 있다. 

In [204]:
def get_square(a):
    """Return ``a`` raised to the power of two."""
    squared = pow(a, 2)
    return squared

In [205]:
# get_square() returns the square (제곱) of its argument, not the square
# root (제곱근), so the printed label says 제곱.
print('제곱 :', get_square(3))

제곱 : 9


In [206]:
# The same squaring function written as a lambda expression.
lambda_square = lambda a : a**2
# The result is the square (제곱), not the square root (제곱근), so the
# printed label says 제곱.
print('제곱 :', lambda_square(3))

제곱 : 9


In [208]:
np.random.seed(100)

apply_df = pd.DataFrame(np.random.randint(0, 10, (6, 4)))
apply_df.columns = ['A', 'B', 'C', 'D']
apply_df.index = pd.date_range('20201014', periods = 6)
apply_df

Unnamed: 0,A,B,C,D
2020-10-14,8,8,3,7
2020-10-15,7,0,4,2
2020-10-16,5,2,2,2
2020-10-17,1,0,8,4
2020-10-18,0,9,6,2
2020-10-19,4,1,5,3


In [210]:
# 각 행의 column에 대해서 최대값 - 최소값을 구해 새로운 column 추가
# 각 column에서 최대값, 최소값을 구해 출력
func = lambda x : x.max() - x.min()

apply_df.apply(func, axis = 1)

2020-10-14    5
2020-10-15    7
2020-10-16    3
2020-10-17    8
2020-10-18    9
2020-10-19    4
Freq: D, dtype: int64

In [211]:
apply_df['row 최대 - 최소'] = apply_df.apply(func, axis = 1)
apply_df

Unnamed: 0,A,B,C,D,row 최대 - 최소
2020-10-14,8,8,3,7,5
2020-10-15,7,0,4,2,7
2020-10-16,5,2,2,2,3
2020-10-17,1,0,8,4,8
2020-10-18,0,9,6,2,9
2020-10-19,4,1,5,3,4


In [228]:
# Add embark_len: the number of characters in embark_town.
# Only real strings are measured; a missing town stays missing instead of being
# stringified to 'nan' and reported as length 3. The nullable 'Int64' dtype
# keeps the lengths as integers even though some entries are missing.
titanic_reset_index_df['embark_len'] = titanic_reset_index_df['embark_town'].apply(
    lambda x : len(x) if isinstance(x, str) else np.nan
).astype('Int64')
display(titanic_reset_index_df[['embark_town', 'embark_len']].head())
print('*' * 50)

# Add child_adult using an if ~ else expression: 'child' when age is 15 or
# younger, otherwise 'adult'.
# NOTE: a missing age (NaN) compares False, so it is labelled 'adult'.
titanic_reset_index_df['child_adult'] = titanic_reset_index_df['age'].apply(lambda x : 'child' if x <= 15 else 'adult')
display(titanic_reset_index_df[['age', 'child_adult']].head())

Unnamed: 0,embark_town,embark_len
0,Southampton,11
1,Southampton,11
2,Queenstown,10
3,Southampton,11
4,Southampton,11


**************************************************


Unnamed: 0,age,child_adult
0,35.0,adult
1,35.0,adult
2,,adult
3,54.0,adult
4,2.0,child
