# <span style="color:red">Pandas Overview</span>

- 구조화된 데이터의 처리를 지원하는 python 라이브러리
- 고성능 Array 계산 라이브러리인 numpy와 통합하여 <br/> 강력한 "스프레드시트" 처리 기능을 제공
- 인덱싱, 연산용 함수, 전처리 함수 등을 제공함 


---

# <span style="color:red">Pandas - series</span>

**dataframe**: Data Table 전체를 포함하는 Object <br/>
**series**: dataframe 중 하나의 column에 해당하는 데이터의 모음 Object

---


### Series
- Built on top of numpy.ndarray (it was a direct ndarray subclass only before pandas 0.13)
- Data: any type
- Index labels need not be ordered
- Duplicates are possible (but result in reduced functionality)

In [2]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [4]:
example_obj = Series() # press Shift + TAB to inspect the available parameters

In [5]:
list_data = [1,2,3,4,5]
example_obj = Series(data = list_data)
example_obj # a Series consists of an index and data; data is joined (aligned) on the index

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [6]:
list_data = [1,2,3,4,5]
list_name = ['a','b','c','d','e']
example_obj = Series(data = list_data, index = list_name)
example_obj

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [11]:
# A Series can also be built from a dict.
dict_data = {'a':1, 'b':2, 'c':3, 'd':4, 'e':5}
example_obj = Series(dict_data, dtype=np.float32, name="example_data")
example_obj

a    1.0
b    2.0
c    3.0
d    4.0
e    5.0
Name: example_data, dtype: float32

---

In [9]:
example_obj['a']

1.0

In [10]:
example_obj['a'] = 3.2
example_obj

a    3.2
b    2.0
c    3.0
d    4.0
e    5.0
Name: example_data, dtype: float32

In [13]:
example_obj.values # the data as a numpy array

array([1., 2., 3., 4., 5.], dtype=float32)

In [14]:
example_obj.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [16]:
example_obj.name = "number"
example_obj.index.name = "alphabet"
example_obj

alphabet
a    1.0
b    2.0
c    3.0
d    4.0
e    5.0
Name: number, dtype: float32

In [17]:
dict_data_1 = {'a':1, 'b':2, 'c':3, 'd':4, 'e':5}
indexes = ['a','b','c','d','e','f','g','h']
series_obj_1 = Series(dict_data_1, index=indexes)
series_obj_1

a    1.0
b    2.0
c    3.0
d    4.0
e    5.0
f    NaN
g    NaN
h    NaN
dtype: float64

---
# <span style="color:red">Pandas - data frame</span>
- numpy array-like
- Each column can have a different type
- Row and column index
- Size mutable: insert and delete columns


- Series를 모아서 만든 data table (기본 2차원)

In [18]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np

In [20]:
raw_data = {
    'first_name' : ['Jason','Molly', 'Tina', 'Jake', 'Amy'],
    'last_name' : ['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze'],
    'age' : [42, 52, 36, 24, 73],
    'city' : ['San Francisco', 'Baltimore', 'Miami','Douglas','Boston']
}
df = pd.DataFrame(raw_data, columns = ['first_name','last_name','age','city'])
df

Unnamed: 0,first_name,last_name,age,city
0,Jason,Miller,42,San Francisco
1,Molly,Jacobson,52,Baltimore
2,Tina,Ali,36,Miami
3,Jake,Milner,24,Douglas
4,Amy,Cooze,73,Boston


In [21]:
DataFrame(raw_data, columns=['age','city'])

Unnamed: 0,age,city
0,42,San Francisco
1,52,Baltimore
2,36,Miami
3,24,Douglas
4,73,Boston


In [22]:
DataFrame(raw_data, columns = ['first_name','last_name','age','city','debt'])

Unnamed: 0,first_name,last_name,age,city,debt
0,Jason,Miller,42,San Francisco,
1,Molly,Jacobson,52,Baltimore,
2,Tina,Ali,36,Miami,
3,Jake,Milner,24,Douglas,
4,Amy,Cooze,73,Boston,


---

In [23]:
# Column selection - extracts a Series
df = DataFrame(raw_data, columns = ['first_name','last_name','age','city','debt'])
df.first_name

0    Jason
1    Molly
2     Tina
3     Jake
4      Amy
Name: first_name, dtype: object

In [24]:
df['first_name']

0    Jason
1    Molly
2     Tina
3     Jake
4      Amy
Name: first_name, dtype: object

### loc은 index 이름, iloc은 index number

In [25]:
# Row selection - loc (by index label)
df.loc[1]

first_name        Molly
last_name      Jacobson
age                  52
city          Baltimore
debt                NaN
Name: 1, dtype: object

In [26]:
# Row selection - iloc (by integer position)
df['age'].iloc[1:]

1    52
2    36
3    24
4    73
Name: age, dtype: int64

In [30]:
s = pd.Series(np.nan, index = [49, 48, 47, 46, 45, 1,2,3,4,5])
s.loc[:3] # label-based: slices up to and including the label 3

49   NaN
48   NaN
47   NaN
46   NaN
45   NaN
1    NaN
2    NaN
3    NaN
dtype: float64

In [28]:
s.iloc[:3] # position-based: the first three rows

49   NaN
48   NaN
47   NaN
dtype: float64

---

### Column에 새로운 데이터 할당

In [31]:
# Assign to the column with bracket syntax. Attribute assignment
# (df.debt = ...) only updates a column that already exists; for a new name
# it would set a plain Python attribute instead of adding a column.
df['debt'] = df.age > 40
df

Unnamed: 0,first_name,last_name,age,city,debt
0,Jason,Miller,42,San Francisco,True
1,Molly,Jacobson,52,Baltimore,True
2,Tina,Ali,36,Miami,False
3,Jake,Milner,24,Douglas,False
4,Amy,Cooze,73,Boston,True


In [32]:
df.T # transpose

Unnamed: 0,0,1,2,3,4
first_name,Jason,Molly,Tina,Jake,Amy
last_name,Miller,Jacobson,Ali,Milner,Cooze
age,42,52,36,24,73
city,San Francisco,Baltimore,Miami,Douglas,Boston
debt,True,True,False,False,True


In [33]:
df.values

array([['Jason', 'Miller', 42, 'San Francisco', True],
       ['Molly', 'Jacobson', 52, 'Baltimore', True],
       ['Tina', 'Ali', 36, 'Miami', False],
       ['Jake', 'Milner', 24, 'Douglas', False],
       ['Amy', 'Cooze', 73, 'Boston', True]], dtype=object)

In [34]:
df.to_csv()

',first_name,last_name,age,city,debt\n0,Jason,Miller,42,San Francisco,True\n1,Molly,Jacobson,52,Baltimore,True\n2,Tina,Ali,36,Miami,False\n3,Jake,Milner,24,Douglas,False\n4,Amy,Cooze,73,Boston,True\n'

---
### Column을 삭제함

In [35]:
del df['debt']
df

Unnamed: 0,first_name,last_name,age,city
0,Jason,Miller,42,San Francisco
1,Molly,Jacobson,52,Baltimore
2,Tina,Ali,36,Miami
3,Jake,Milner,24,Douglas
4,Amy,Cooze,73,Boston


---

In [36]:
pop = {
    'Nevada': {2001: 2.4, 2002: 2.9},
    'Ohio' : {2000: 1.5, 2001: 1.7, 2002: 3.6}
}
DataFrame(pop)

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [37]:
values = Series(data=['M','F','F'], index=[0,1,3])
values

0    M
1    F
3    F
dtype: object

In [38]:
df['sex'] = values
df

Unnamed: 0,first_name,last_name,age,city,sex
0,Jason,Miller,42,San Francisco,M
1,Molly,Jacobson,52,Baltimore,F
2,Tina,Ali,36,Miami,
3,Jake,Milner,24,Douglas,F
4,Amy,Cooze,73,Boston,


---
# <span style="color:red">Pandas - selection & drop </span>

In [40]:
df['city'].head(3)

0    San Francisco
1        Baltimore
2            Miami
Name: city, dtype: object

In [41]:
df[['city', 'age']].head(3)

Unnamed: 0,city,age
0,San Francisco,42
1,Baltimore,52
2,Miami,36


### Selection with index number

In [43]:
df[:3] # an integer slice without a column name selects rows

Unnamed: 0,first_name,last_name,age,city,sex
0,Jason,Miller,42,San Francisco,M
1,Molly,Jacobson,52,Baltimore,F
2,Tina,Ali,36,Miami,


In [44]:
df['age'][:3] # a column name followed by a row slice returns only that column

0    42
1    52
2    36
Name: age, dtype: int64

### Series selection

In [47]:
account_series = df['age']
account_series[:3]

0    42
1    52
2    36
Name: age, dtype: int64

In [48]:
account_series[[0,1,2]]

0    42
1    52
2    36
Name: age, dtype: int64

In [49]:
account_series[account_series < 30]

3    24
Name: age, dtype: int64

### Index 변경

In [51]:
# Use the 'age' values as the row index, then remove the 'age' column.
df.index = df['age']
del df['age']
df.head()

Unnamed: 0_level_0,first_name,last_name,city,sex
age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
42,Jason,Miller,San Francisco,M
52,Molly,Jacobson,Baltimore,F
36,Tina,Ali,Miami,
24,Jake,Milner,Douglas,F
73,Amy,Cooze,Boston,


### basic, loc, iloc selection

In [52]:
df[['first_name', 'last_name']][:2] # column names plus a positional row slice

Unnamed: 0_level_0,first_name,last_name
age,Unnamed: 1_level_1,Unnamed: 2_level_1
42,Jason,Miller
52,Molly,Jacobson


In [56]:
df.loc[[42, 52], ['first_name', 'last_name']] # index labels and column names

Unnamed: 0_level_0,first_name,last_name
age,Unnamed: 1_level_1,Unnamed: 2_level_1
42,Jason,Miller
52,Molly,Jacobson


In [53]:
df.iloc[:2, :2] # row positions and column positions

Unnamed: 0_level_0,first_name,last_name
age,Unnamed: 1_level_1,Unnamed: 2_level_1
42,Jason,Miller
52,Molly,Jacobson


In [57]:
df[['first_name','last_name']].iloc[:4]

Unnamed: 0_level_0,first_name,last_name
age,Unnamed: 1_level_1,Unnamed: 2_level_1
42,Jason,Miller
52,Molly,Jacobson
36,Tina,Ali
24,Jake,Milner


---
### Index 재설정

In [58]:
# Replace the current index with a fresh 0..n-1 index and discard the old
# labels. reset_index adapts to the number of rows instead of hard-coding 5.
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,first_name,last_name,city,sex
0,Jason,Miller,San Francisco,M
1,Molly,Jacobson,Baltimore,F
2,Tina,Ali,Miami,
3,Jake,Milner,Douglas,F
4,Amy,Cooze,Boston,


### Data Drop 

In [60]:
df.drop(1) # drop the row whose index is 1

Unnamed: 0,first_name,last_name,city,sex
0,Jason,Miller,San Francisco,M
2,Tina,Ali,Miami,
3,Jake,Milner,Douglas,F
4,Amy,Cooze,Boston,


In [61]:
df.drop([0,1]) # drop several rows by passing a list of index values

Unnamed: 0,first_name,last_name,city,sex
2,Tina,Ali,Miami,
3,Jake,Milner,Douglas,F
4,Amy,Cooze,Boston,


In [63]:
df.drop('city', axis=1) # axis selects what is dropped: with axis=1 the column 'city'

Unnamed: 0,first_name,last_name,sex
0,Jason,Miller,M
1,Molly,Jacobson,F
2,Tina,Ali,
3,Jake,Milner,F
4,Amy,Cooze,


---
# <span style="color:red">Pandas - dataframe operations</span>

### Series operation

In [64]:
s1 = Series(range(1,6), index = list('abcde'))
s1

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [66]:
s2 = Series(range(5,11), index = list('bcedef'))
s2

b     5
c     6
e     7
d     8
e     9
f    10
dtype: int64

In [67]:
# Arithmetic is aligned on the index.
# Labels that are not present in both operands yield NaN.
s1.add(s2)

a     NaN
b     7.0
c     9.0
d    12.0
e    12.0
e    14.0
f     NaN
dtype: float64

In [68]:
s1+s2

a     NaN
b     7.0
c     9.0
d    12.0
e    12.0
e    14.0
f     NaN
dtype: float64

In [69]:
s1.add(s2, fill_value = 0)

a     1.0
b     7.0
c     9.0
d    12.0
e    12.0
e    14.0
f    10.0
dtype: float64

### Dataframe operation

In [70]:
df1 = DataFrame(
    np.arange(9).reshape(3,3),
    columns = list('abe')
)
df1

Unnamed: 0,a,b,e
0,0,1,2
1,3,4,5
2,6,7,8


In [72]:
df2 = DataFrame(
    np.arange(16).reshape(4,4),
    columns = list('abcd')
)
df2

Unnamed: 0,a,b,c,d
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15


df는 column과 index 모두 고려


add operation에 fill_value=0을 지정하면 한쪽에만 값이 있는 위치의 NaN을 0으로 간주하여 연산 (양쪽 모두 값이 없는 위치는 그대로 NaN) <br/>
**Operation types: add, sub, div, mul**

In [73]:
df1+df2

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,,,
1,7.0,9.0,,,
2,14.0,16.0,,,
3,,,,,


In [74]:
df1.add(df2, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,2.0,3.0,2.0
1,7.0,9.0,6.0,7.0,5.0
2,14.0,16.0,10.0,11.0,8.0
3,12.0,13.0,14.0,15.0,


### Series + Dataframe

In [75]:
df = DataFrame(
    np.arange(16).reshape(4,4),
    columns = list('abcd')
)
df

Unnamed: 0,a,b,c,d
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15


In [76]:
s = Series(np.arange(10,14), index = list('abcd'))
s

a    10
b    11
c    12
d    13
dtype: int32

In [77]:
df+s

Unnamed: 0,a,b,c,d
0,10,12,14,16
1,14,16,18,20
2,18,20,22,24
3,22,24,26,28


---

In [78]:
s2 = Series(np.arange(10,14))
s2

0    10
1    11
2    12
3    13
dtype: int32

In [79]:
df+s2

Unnamed: 0,a,b,c,d,0,1,2,3
0,,,,,,,,
1,,,,,,,,
2,,,,,,,,
3,,,,,,,,


In [81]:
df.add(s2, axis=0) # axis=0 matches s2 against the row index and broadcasts it across the columns

Unnamed: 0,a,b,c,d
0,10,11,12,13
1,15,16,17,18
2,20,21,22,23
3,25,26,27,28


---
# <span style="color:red">Pandas - lambda, map, apply</span>


### Lambda 함수
- 한 줄로 함수를 표현하는 익명 함수 기법
- Lisp 언어에서 시작된 기법으로 오늘날 현대언어에 많이 사용
---
> **lambda** argument : expression <br/>

In [82]:
# Conventional function definition
def f(x,y):
    return x+y

# The same function written as a lambda (rebinds the name f)
f = lambda x,y : x+y
f(1,4)

5

---

In [83]:
# A lambda that takes a single argument
f = lambda x : x/2
f(3)

1.5

In [84]:
f = lambda x : x**2
f(3)

9

In [85]:
(lambda x: x+1)(5) # a lambda called directly, without binding it to a name

6

---
### map 함수
- 함수와 sequence형 데이터를 인자로 받아 <br/>
각 element마다 입력받은 함수를 적용한 결과를 반환 (Python 3에서는 iterator를 반환하므로 list()로 감싸서 확인)
- 일반적으로 함수를 lambda 형태로 표현함
> map(function, sequence)

In [86]:
ex = [1,2,3,4,5]
f = lambda x : x**2
list(map(f, ex))

[1, 4, 9, 16, 25]

In [87]:
f = lambda x,y : x+y
list(map(f, ex, ex)) # a two-argument function needs two sequences

[2, 4, 6, 8, 10]

In [89]:
# An anonymous function can be passed to map directly.
# In Python 3 the result must be wrapped in list() to get the values.
list(map(lambda x: x+x, ex)) 

[2, 4, 6, 8, 10]

### map for series
- pandas의 series type 데이터에도 map함수 사용가능
- function 대신 dict, sequence형 자료 등으로 대체 가능

In [90]:
s1 = Series(np.arange(10))
s1.head(5)

0    0
1    1
2    2
3    3
4    4
dtype: int32

In [91]:
s1.map(lambda x:x**2).head(5)

0     0
1     1
2     4
3     9
4    16
dtype: int64

In [92]:
# Replace values through a dict lookup.
# Values that have no matching key become NaN.
z = {1:'A', 2:'B', 3:'C'}
s1.map(z).head(5)

0    NaN
1      A
2      B
3      C
4    NaN
dtype: object

In [95]:
# Replace each value of s1 with the s2 entry found at the matching index.
s2 = Series(np.arange(10,20))
s1.map(s2).head(5)

0    10
1    11
2    12
3    13
4    14
dtype: int32

### Example - map for series

In [169]:
# Sample records used by the map / replace / apply examples that follow.
raw_data = {
    # NOTE(review): the last 'earn' value has no decimal point and is about
    # 1000x larger than the others -- possibly a typo for 82089.345; confirm.
    'earn' :  [79571.299, 96396.988, 48710.666, 80478.096, 82089345],
    'height': [73.89, 66.23, 63.77, 63.22, 63.08],
    'sex':    ['male','female','female','female','female'],
    'race':   ['white', 'white', 'white', 'other', 'white'],
    'ed' :    [16, 16, 16, 16, 17],
    'age':    [49, 62, 33, 95, 43]
}
df = pd.DataFrame(raw_data, columns=['earn','height','sex','race','ed','age'])
df

Unnamed: 0,earn,height,sex,race,ed,age
0,79571.3,73.89,male,white,16,49
1,96396.99,66.23,female,white,16,62
2,48710.67,63.77,female,white,16,33
3,80478.1,63.22,female,other,16,95
4,82089340.0,63.08,female,white,17,43


In [128]:
df.sex.unique()

array(['male', 'female'], dtype=object)

In [129]:
# Encode the sex labels (str) as integer codes
df['sex_code'] = df.sex.map({'male':0, 'female':1})
df

Unnamed: 0,earn,height,sex,race,ed,age,sex_code
0,79571.3,73.89,male,white,16,49,0
1,96396.99,66.23,female,white,16,62,1
2,48710.67,63.77,female,white,16,33,1
3,80478.1,63.22,female,other,16,95,1
4,82089340.0,63.08,female,white,17,43,1


---
### Replace function 
- map 함수의 기능 중 데이터 변환 기능만 담당
- 데이터 변환 시 많이 사용하는 함수

In [130]:
df.sex.replace(
    {'male':0, 'female':1})

0    0
1    1
2    1
3    1
4    1
Name: sex, dtype: int64

In [131]:
# Replace the string labels with integer codes and assign the result back to
# the column. Calling replace(..., inplace=True) on `df.sex` is a chained
# in-place call: with pandas Copy-on-Write the intermediate Series behaves as
# a copy, so the DataFrame itself would be left unchanged.
df['sex'] = df['sex'].replace(
    ['male','female'],     # values to look for
    [0,1])                 # replacement for each value, matched by position
df

Unnamed: 0,earn,height,sex,race,ed,age,sex_code
0,79571.3,73.89,0,white,16,49,0
1,96396.99,66.23,1,white,16,62,1
2,48710.67,63.77,1,white,16,33,1
3,80478.1,63.22,1,other,16,95,1
4,82089340.0,63.08,1,white,17,43,1


---
### apply for dataframe
- map과 달리 series 전체(column)에 해당 함수를 적용
- 입력값을 series 데이터로 입력받아 handling 가능

In [136]:
df_info = df[['earn', 'height', 'age']]
df_info

Unnamed: 0,earn,height,age
0,79571.3,73.89,49
1,96396.99,66.23,62
2,48710.67,63.77,33
3,80478.1,63.22,95
4,82089340.0,63.08,43


In [137]:
f = lambda x: x.max() - x.min()
df_info.apply(f)

earn      8.204063e+07
height    1.081000e+01
age       6.200000e+01
dtype: float64

- 내장 연산 함수를 사용할 때도 똑같은 효과 거둘 수 있음
- mean, std 등 사용가능

In [138]:
df_info.sum()

earn      8.239450e+07
height    3.301900e+02
age       2.820000e+02
dtype: float64

In [139]:
df_info.apply(sum)

earn      8.239450e+07
height    3.301900e+02
age       2.820000e+02
dtype: float64

- scalar 값 이외에 series 값의 반환도 가능함

In [140]:
def f(x):
    """Summarise ``x`` by its smallest and largest value.

    Args:
        x: Any object exposing ``min()`` and ``max()``, such as a column
            handed over by ``DataFrame.apply``.

    Returns:
        A two-element Series labelled 'min' and 'max'.
    """
    lowest = x.min()
    highest = x.max()
    return Series(data=[lowest, highest], index=['min', 'max'])
df_info.apply(f)

Unnamed: 0,earn,height,age
min,48710.67,63.08,33
max,82089340.0,73.89,95


---
### applymap for dataframe
- series 단위가 아닌 element 단위로 함수를 적용함
- series 단위에 apply를 적용시킬 때와 같은 효과

In [142]:
# Negate every element of the frame.
# NOTE(review): DataFrame.applymap is deprecated in recent pandas releases
# (2.1+) in favor of DataFrame.map -- confirm the target pandas version.
f = lambda x: -x
df_info.applymap(f)

Unnamed: 0,earn,height,age
0,-79571.3,-73.89,-49
1,-96396.99,-66.23,-62
2,-48710.67,-63.77,-33
3,-80478.1,-63.22,-95
4,-82089340.0,-63.08,-43


In [143]:
df_info['earn'].apply(f)

0   -7.957130e+04
1   -9.639699e+04
2   -4.871067e+04
3   -8.047810e+04
4   -8.208934e+07
Name: earn, dtype: float64

---
# <span style="color:red">Pandas - pandas built-in functions</span>

### describe
- numeric type 데이터의 요약 정보를 보여줌

In [145]:
df.describe()

Unnamed: 0,earn,height,sex,ed,age,sex_code
count,5.0,5.0,5.0,5.0,5.0,5.0
mean,16478900.0,66.038,0.8,16.2,56.4,0.8
std,36677360.0,4.569614,0.447214,0.447214,23.995833,0.447214
min,48710.67,63.08,0.0,16.0,33.0,0.0
25%,79571.3,63.22,1.0,16.0,43.0,1.0
50%,80478.1,63.77,1.0,16.0,49.0,1.0
75%,96396.99,66.23,1.0,16.0,62.0,1.0
max,82089340.0,73.89,1.0,17.0,95.0,1.0


---
### unique
- series data의 유일한 값을 numpy array로 반환함

In [170]:
df.race.unique()

array(['white', 'other'], dtype=object)

In [171]:
np.array(dict(enumerate(df['race'].unique()))) # dict of {integer index: unique label}, wrapped in a numpy object array

array({0: 'white', 1: 'other'}, dtype=object)

In [172]:
# Take the labels straight from unique() and number them in order of
# appearance. This avoids round-tripping the codes through a string array
# and keeps each label in its original type, so replace() can match it.
key = df['race'].unique().tolist()
value = list(range(len(key)))

value, key # the integer codes and the labels they stand for

([0, 1], ['white', 'other'])

In [173]:
# Convert the label strings to their integer codes and assign the result
# back; an in-place replace on the selected column is a chained call that
# does not update the DataFrame under pandas Copy-on-Write.
df['race'] = df['race'].replace(to_replace=key, value=value)

In [174]:
# Apply the same encoding to the 'sex' column: take the labels straight from
# unique() and number them in order of appearance.
key = df['sex'].unique().tolist()
value = list(range(len(key)))

value, key

([0, 1], ['male', 'female'])

In [175]:
# Convert the 'sex' labels to integer codes (the 'race' column is encoded the
# same way). The result is assigned back because an in-place replace on the
# selected column does not update the DataFrame under pandas Copy-on-Write.
df['sex'] = df['sex'].replace(to_replace=key, value=value)

In [176]:
df

Unnamed: 0,earn,height,sex,race,ed,age
0,79571.3,73.89,0,0,16,49
1,96396.99,66.23,1,0,16,62
2,48710.67,63.77,1,0,16,33
3,80478.1,63.22,1,1,16,95
4,82089340.0,63.08,1,0,17,43


---
### sum
- 기본적인 column 또는 row 값의 연산을 지원
- sub, mean, min, max, count, median, mad, var 등 (mad는 pandas 2.0에서 제거됨)

In [177]:
df.sum(axis=0) # per column

earn      8.239450e+07
height    3.301900e+02
sex       4.000000e+00
race      1.000000e+00
ed        8.100000e+01
age       2.820000e+02
dtype: float64

In [178]:
df.sum(axis=1) # per row

0    7.971019e+04
1    9.654222e+04
2    4.882444e+04
3    8.065432e+04
4    8.208947e+07
dtype: float64

---
### isnull
- 각 값이 NaN(null)인지 여부를 True/False로 표시한, 원본과 같은 모양의 DataFrame을 반환함

In [179]:
df.isnull()

Unnamed: 0,earn,height,sex,race,ed,age
0,False,False,False,False,False,False
1,False,False,False,False,False,False
2,False,False,False,False,False,False
3,False,False,False,False,False,False
4,False,False,False,False,False,False


In [180]:
df.isnull().sum() # number of null (missing) values in each column

earn      0
height    0
sex       0
race      0
ed        0
age       0
dtype: int64

---
### sort_values
- column 값을 기준으로 데이터를 sorting

In [181]:
df.sort_values(['age','earn'], ascending=True) # 'age' takes precedence as the sort key

Unnamed: 0,earn,height,sex,race,ed,age
2,48710.67,63.77,1,0,16,33
4,82089340.0,63.08,1,0,17,43
0,79571.3,73.89,0,0,16,49
1,96396.99,66.23,1,0,16,62
3,80478.1,63.22,1,1,16,95


In [182]:
df.cumsum()

Unnamed: 0,earn,height,sex,race,ed,age
0,79571.3,73.89,0.0,0.0,16.0,49.0
1,175968.3,140.12,1.0,0.0,32.0,111.0
2,224679.0,203.89,2.0,0.0,48.0,144.0
3,305157.0,267.11,3.0,1.0,64.0,239.0
4,82394500.0,330.19,4.0,1.0,81.0,282.0


In [183]:
df.cummax()

Unnamed: 0,earn,height,sex,race,ed,age
0,79571.3,73.89,0.0,0.0,16.0,49.0
1,96396.99,73.89,1.0,0.0,16.0,62.0
2,96396.99,73.89,1.0,0.0,16.0,62.0
3,96396.99,73.89,1.0,1.0,16.0,95.0
4,82089340.0,73.89,1.0,1.0,17.0,95.0


---
### Correlation & Covariance
- 상관계수와 공분산을 구하는 함수 
- corr, cov, corrwith

In [184]:
df.age.corr(df.earn)

-0.31191765928447773

In [185]:
df.age.cov(df.earn)

-274519900.63965

In [186]:
df.corrwith(df.earn)

earn      1.000000
height   -0.361743
sex       0.249950
race     -0.249936
ed        1.000000
age      -0.311918
dtype: float64

In [187]:
df.corr()

Unnamed: 0,earn,height,sex,race,ed,age
earn,1.0,-0.361743,0.24995,-0.249936,1.0,-0.311918
height,-0.361743,1.0,-0.960563,-0.344736,-0.361863,-0.166655
sex,0.24995,-0.960563,1.0,0.25,0.25,0.172394
race,-0.249936,-0.344736,0.25,1.0,-0.25,0.899242
ed,1.0,-0.361863,0.25,-0.25,1.0,-0.312172
age,-0.311918,-0.166655,0.172394,0.899242,-0.312172,1.0
