## 데이터프레임에서 데이터 선택하기

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Raw columns for the example table. The three lists line up by position,
# one entry per country.
names = [
    'United States', 'Australia', 'Japan', 'India', 'Russia',
    'Morocco', 'Egypt', 'Korea', 'China', 'England',
]
# True where the country is recorded as driving on the right.
# NOTE(review): England is marked True although the UK drives on the left;
# confirm whether that is intended -- the displayed outputs rely on it.
dr = [
    True, False, False, False, True,
    True, True, True, True, True,
]
cpc = [
    809, 731, 588, 18, 200,
    70, 45, 122, 397, 255,
]

# Column label -> column values, in the order the columns should appear.
my_dict = dict(country=names, drives_right=dr, cars_per_cap=cpc)

In [3]:
# Build the example DataFrame: each dict key becomes a column, and the rows
# get the default integer index 0..9.
cars = pd.DataFrame(my_dict)
cars

Unnamed: 0,country,drives_right,cars_per_cap
0,United States,True,809
1,Australia,False,731
2,Japan,False,588
3,India,False,18
4,Russia,True,200
5,Morocco,True,70
6,Egypt,True,45
7,Korea,True,122
8,China,True,397
9,England,True,255


In [4]:
# 1개 컬럼 선택, series 로 추출됨

In [5]:
# A single column label inside [] returns that column as a Series.
cars['country']

0    United States
1        Australia
2            Japan
3            India
4           Russia
5          Morocco
6            Egypt
7            Korea
8            China
9          England
Name: country, dtype: object

In [6]:
# 1개 컬럼 선택, DataFrame 으로 추출됨

In [7]:
# A list of labels (double brackets) keeps the result a DataFrame, even when
# only one column is requested.
cars[['country']]

Unnamed: 0,country
0,United States
1,Australia
2,Japan
3,India
4,Russia
5,Morocco
6,Egypt
7,Korea
8,China
9,England


In [8]:
# row index 로 잘라내기

In [9]:
# A slice inside [] selects rows: rows 1 through 4 (the end, 5, is excluded).
cars[1:5]

Unnamed: 0,country,drives_right,cars_per_cap
1,Australia,False,731
2,Japan,False,588
3,India,False,18
4,Russia,True,200


## 컬럼명으로 선택하기 (loc)

In [10]:
# loc[rows, columns]: ':' keeps every row, the list picks columns by label.
cars.loc[:, ['country', 'drives_right']]

Unnamed: 0,country,drives_right
0,United States,True
1,Australia,False
2,Japan,False
3,India,False
4,Russia,True
5,Morocco,True
6,Egypt,True
7,Korea,True
8,China,True
9,England,True


In [11]:
# loc slices by index label and includes both ends, so 2:5 yields rows
# 2, 3, 4 and 5.
cars.loc[2:5, ['country', 'drives_right']]

Unnamed: 0,country,drives_right
2,Japan,False
3,India,False
4,Russia,True
5,Morocco,True


## 위치 인덱스로 선택하기 (iloc)

In [12]:
# iloc with a single integer returns the row at that position as a Series.
cars.iloc[3]

country         India
drives_right    False
cars_per_cap       18
Name: 3, dtype: object

In [13]:
# Lists of integer positions: rows 1, 2, 4 and columns 1, 2
# (drives_right, cars_per_cap).
cars.iloc[[1,2,4], [1,2]]

Unnamed: 0,drives_right,cars_per_cap
1,False,731
2,False,588
4,True,200


In [14]:
# iloc slices are positional and exclude the end: rows 2, 3, 4, all columns.
cars.iloc[2:5, :]

Unnamed: 0,country,drives_right,cars_per_cap
2,Japan,False,588
3,India,False,18
4,Russia,True,200


## Boolean Indexing 조건절을 이용한 데이터 선택

In [15]:
# Boolean indexing: drives_right is already a boolean column, so it is used
# directly as the row mask (comparing it with `== True` is redundant).
cars[cars.drives_right]

Unnamed: 0,country,drives_right,cars_per_cap
0,United States,True,809
4,Russia,True,200
5,Morocco,True,70
6,Egypt,True,45
7,Korea,True,122
8,China,True,397
9,England,True,255


In [16]:
# Keep only the rows whose cars_per_cap is greater than 500.
cars[cars.cars_per_cap > 500]

Unnamed: 0,country,drives_right,cars_per_cap
0,United States,True,809
1,Australia,False,731
2,Japan,False,588


In [17]:
cars[cars['country'].isin(['Korea','England'])]   # a string column can be filtered by matching against string values

Unnamed: 0,country,drives_right,cars_per_cap
7,Korea,True,122
9,England,True,255


## 데이터프레임에 값 추가 / 변경

In [18]:
# Ten values, one per row of cars, with a missing entry (np.nan) at position 3.
svalue = pd.Series([1,2,3,np.nan,6,8,9,7,4,5])

In [19]:
cars['level'] = svalue   # add a new column to the DataFrame from a Series
cars

Unnamed: 0,country,drives_right,cars_per_cap,level
0,United States,True,809,1.0
1,Australia,False,731,2.0
2,Japan,False,588,3.0
3,India,False,18,
4,Russia,True,200,6.0
5,Morocco,True,70,8.0
6,Egypt,True,45,9.0
7,Korea,True,122,7.0
8,China,True,397,4.0
9,England,True,255,5.0


In [20]:
cars.at[0, 'country']      # at : select a single value by index label and column name

'United States'

In [21]:
cars.at[2, 'level'] = 30   # overwrite the value at that index label and column name

In [22]:
# A one-element list keeps the result a DataFrame; shows the row just changed.
cars.iloc[[2]]

Unnamed: 0,country,drives_right,cars_per_cap,level
2,Japan,False,588,30.0


In [23]:
# NOTE(review): column position 0 is 'country', so this replaces the string
# 'Morocco' with the integer 77 -- presumably only to demonstrate iat; every
# later output carries this value.
cars.iat[5,0] = 77      # iat : row position and column position

In [24]:
# Show the row just changed (as a one-row DataFrame).
cars.iloc[[5]]

Unnamed: 0,country,drives_right,cars_per_cap,level
5,77,True,70,8.0


## Missing Data 처리

In [25]:
# Rows whose level value is null (NaN).
cars[pd.isnull(cars.level)]

Unnamed: 0,country,drives_right,cars_per_cap,level
3,India,False,18,


In [26]:
# The result is only displayed, not assigned, so cars itself keeps the row.
cars.dropna(how='any')   # drop every row that has a null in at least one column

Unnamed: 0,country,drives_right,cars_per_cap,level
0,United States,True,809,1.0
1,Australia,False,731,2.0
2,Japan,False,588,30.0
4,Russia,True,200,6.0
5,77,True,70,8.0
6,Egypt,True,45,9.0
7,Korea,True,122,7.0
8,China,True,397,4.0
9,England,True,255,5.0


In [27]:
# Replace the null in 'level' with the mean of the non-null 'level' values.
# The dict form restricts the fill to the 'level' column, so a null in any
# other column would not be overwritten with an unrelated number.
cars = cars.fillna(value={'level': cars['level'].mean()})
cars

Unnamed: 0,country,drives_right,cars_per_cap,level
0,United States,True,809,1.0
1,Australia,False,731,2.0
2,Japan,False,588,30.0
3,India,False,18,8.0
4,Russia,True,200,6.0
5,77,True,70,8.0
6,Egypt,True,45,9.0
7,Korea,True,122,7.0
8,China,True,397,4.0
9,England,True,255,5.0


## 데이터 합치기 (Merge)

#### Concat

In [28]:
# Split cars into three row slices, listed out of their original order:
# rows 7-9, then rows 3-6, then rows 0-2.
pieces = [cars[7:], cars[3:7], cars[:3]]

In [29]:
pd.concat(pieces)   ## the pieces are joined in list order; each row keeps its original index label

Unnamed: 0,country,drives_right,cars_per_cap,level
7,Korea,True,122,7.0
8,China,True,397,4.0
9,England,True,255,5.0
3,India,False,18,8.0
4,Russia,True,200,6.0
5,77,True,70,8.0
6,Egypt,True,45,9.0
0,United States,True,809,1.0
1,Australia,False,731,2.0
2,Japan,False,588,30.0


#### Merge  (SQL style)

In [30]:
# Two frames sharing a 'key' column with the same unique values.
left = pd.DataFrame({'key': ['Kate', 'Susan'], 'leftval': [1, 2]})
right = pd.DataFrame({'key': ['Kate', 'Susan'], 'rightval': [4, 5]})
# Join on 'key': one output row per matching key, with columns from both sides.
pd.merge(left, right, on='key')

Unnamed: 0,key,leftval,rightval
0,Kate,1,4
1,Susan,2,5


In [31]:
# Same join, but the key value 'Kate' is repeated on both sides.
left = pd.DataFrame({'key': ['Kate', 'Kate'], 'leftval': [1, 2]})
right = pd.DataFrame({'key': ['Kate', 'Kate'], 'rightval': [4, 5]})
df = pd.merge(left, right, on='key')   # duplicate keys multiply the rows: every left row pairs with every matching right row (2 x 2 = 4)
df

Unnamed: 0,key,leftval,rightval
0,Kate,1,4
1,Kate,1,5
2,Kate,2,4
3,Kate,2,5


#### Append

In [32]:
# Add one row to df. DataFrame.append was deprecated in pandas 1.4 and removed
# in 2.0, so the row is added with pd.concat instead; ignore_index=True
# renumbers the result 0..n-1.
s = pd.DataFrame({'key':['Jessica'], 'leftval':[9], 'rightval':[7]})
pd.concat([df, s], ignore_index=True)

Unnamed: 0,key,leftval,rightval
0,Kate,1,4
1,Kate,1,5
2,Kate,2,4
3,Kate,2,5
4,Jessica,9,7


## Grouping

In [33]:
# Sample frame for the grouping examples: two label columns plus two columns
# of random normal values. No seed is set here, so the numbers differ on
# every run.
# The values are scaled first and rounded afterwards: rounding first and then
# multiplying can leave binary floating-point residue
# (e.g. 0.29 * 100 -> 28.999999999999996).
df = pd.DataFrame({'col1' : ['A', 'B', 'A', 'B', 'A', 'B', 'A', 'A'],
                   'col2' : ['one', 'one', 'two', 'three','two', 'two', 'one', 'three'],
                   'col3' : (np.random.randn(8) * 100).round(),
                   'col4' : (np.random.randn(8) * 10).round(2)})

In [34]:
# Display the sample frame.
df

Unnamed: 0,col1,col2,col3,col4
0,A,one,-182.0,-5.9
1,B,one,123.0,-6.39
2,A,two,103.0,12.62
3,B,three,29.0,-1.23
4,A,two,-31.0,-10.66
5,B,two,-21.0,-13.59
6,A,one,-115.0,5.95
7,A,three,-80.0,7.41


In [35]:
# Sum the numeric columns (col3, col4) within each col1 group.
# numeric_only=True keeps the string column col2 out of the result; on
# pandas >= 2.0 the default would concatenate its strings instead.
df.groupby('col1').sum(numeric_only=True)

Unnamed: 0_level_0,col3,col4
col1,Unnamed: 1_level_1,Unnamed: 2_level_1
A,-305.0,9.42
B,131.0,-21.21


In [36]:
# Group by the (col1, col2) pair and sum the remaining columns; the result is
# indexed by both keys.
df.groupby(['col1','col2']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,col3,col4
col1,col2,Unnamed: 2_level_1,Unnamed: 3_level_1
A,one,-297.0,0.05
A,three,-80.0,7.41
A,two,72.0,1.96
B,one,123.0,-6.39
B,three,29.0,-1.23
B,two,-21.0,-13.59


In [37]:
# pivot table

In [38]:
# Pivot: rows = col1, columns = col2, cells = col3 aggregated with the
# default aggfunc (mean).
pd.pivot_table(df, values='col3', index=['col1'], columns=['col2'])

col2,one,three,two
col1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,-148.5,-80.0,36.0
B,123.0,29.0,-21.0
