## 데이터프레임에서 데이터 선택하기

In [1]:
import pandas as pd

In [2]:
import numpy as np

In [3]:
# Raw column data; position i in each list belongs to the same country.
names = [
    'United States', 'Australia', 'Japan', 'India', 'Russia',
    'Morocco', 'Egypt', 'Korea', 'China', 'England',
]
dr = [True, False, False, False, True, True, True, True, True, True]
cpc = [809, 731, 588, 18, 200, 70, 45, 122, 397, 255]

# Column label -> column values, ready to be handed to pd.DataFrame.
my_dict = dict(country=names, drives_right=dr, cars_per_cap=cpc)

In [4]:
# Build the frame with an explicit column order. pandas >= 0.23 on
# Python >= 3.6 keeps the dict's insertion order (country, drives_right,
# cars_per_cap), while the outputs shown in this notebook and the positional
# examples (iloc / iat) rely on cars_per_cap, country, drives_right.
cars = pd.DataFrame(my_dict, columns=['cars_per_cap', 'country', 'drives_right'])

In [5]:
# Display the frame that was just built (10 rows, default 0..9 index).
cars

Unnamed: 0,cars_per_cap,country,drives_right
0,809,United States,True
1,731,Australia,False
2,588,Japan,False
3,18,India,False
4,200,Russia,True
5,70,Morocco,True
6,45,Egypt,True
7,122,Korea,True
8,397,China,True
9,255,England,True


In [6]:
# Select a single column; the result is a Series

In [7]:
# A bare column label inside [] returns that column as a 1-D Series.
cars['country']

0    United States
1        Australia
2            Japan
3            India
4           Russia
5          Morocco
6            Egypt
7            Korea
8            China
9          England
Name: country, dtype: object

In [8]:
# Select a single column; the result is a DataFrame

In [9]:
# Wrapping the label in a list keeps the result two-dimensional
# (a DataFrame with one column).
cars[['country']]

Unnamed: 0,country
0,United States
1,Australia
2,Japan
3,India
4,Russia
5,Morocco
6,Egypt
7,Korea
8,China
9,England


In [10]:
# Slice out rows by row index

In [11]:
# An integer slice applied to the frame selects rows; the stop value is
# excluded, so rows 1 through 4 come back.
cars[1:5]

Unnamed: 0,cars_per_cap,country,drives_right
1,731,Australia,False
2,588,Japan,False
3,18,India,False
4,200,Russia,True


## 컬럼명으로 선택하기 (loc)

In [12]:
# loc takes [row labels, column labels]; ':' keeps every row.
cars.loc[:, ['country', 'drives_right']]

Unnamed: 0,country,drives_right
0,United States,True
1,Australia,False
2,Japan,False
3,India,False
4,Russia,True
5,Morocco,True
6,Egypt,True
7,Korea,True
8,China,True
9,England,True


In [13]:
# Label-based row slice: unlike a plain integer slice, loc includes the end
# label, so rows 2 through 5 come back.
cars.loc[2:5, ['country', 'drives_right']]

Unnamed: 0,country,drives_right
2,Japan,False
3,India,False
4,Russia,True
5,Morocco,True


## 위치 인덱스로 선택하기 (iloc)

In [14]:
# A single integer position returns that row as a Series whose index is the
# column names.
cars.iloc[3]

cars_per_cap       18
country         India
drives_right    False
Name: 3, dtype: object

In [15]:
# Lists of integer positions: rows 1, 2 and 4, columns at positions 1 and 2.
# NOTE: which columns sit at positions 1 and 2 depends on the frame's column
# order; the output shown assumes cars_per_cap, country, drives_right.
cars.iloc[[1,2,4], [1,2]]

Unnamed: 0,country,drives_right
1,Australia,False
2,Japan,False
4,Russia,True


In [16]:
# Positional row slice with the stop excluded (rows 2 through 4);
# ':' keeps every column.
cars.iloc[2:5, :]

Unnamed: 0,cars_per_cap,country,drives_right
2,588,Japan,False
3,18,India,False
4,200,Russia,True


## Boolean Indexing 조건절을 이용한 데이터 선택

In [17]:
# Boolean indexing: 'drives_right' already holds booleans, so the column
# itself is the row mask -- comparing it with `== True` is redundant.
cars[cars['drives_right']]

Unnamed: 0,cars_per_cap,country,drives_right
0,809,United States,True
4,200,Russia,True
5,70,Morocco,True
6,45,Egypt,True
7,122,Korea,True
8,397,China,True
9,255,England,True


In [18]:
# Comparing a column with a scalar yields a boolean mask; only the rows where
# the mask is True are kept.
cars[cars.cars_per_cap > 500]

Unnamed: 0,cars_per_cap,country,drives_right
0,809,United States,True
1,731,Australia,False
2,588,Japan,False


In [19]:
cars[cars['country'].isin(['Korea','England'])]   # string columns can be matched against string values

Unnamed: 0,cars_per_cap,country,drives_right
7,122,Korea,True
9,255,England,True


## 데이터프레임에 값 추가 / 변경

In [20]:
# One value per row of `cars`; the NaN at position 3 is a deliberate gap that
# the missing-data section works with later.
level_values = [1, 2, 3, np.nan, 6, 8, 9, 7, 4, 5]
svalue = pd.Series(level_values)

In [21]:
cars['level'] = svalue   # add a column to the DataFrame from a Series

In [22]:
# Display the frame to check the new 'level' column (row 3 is missing).
cars

Unnamed: 0,cars_per_cap,country,drives_right,level
0,809,United States,True,1.0
1,731,Australia,False,2.0
2,588,Japan,False,3.0
3,18,India,False,
4,200,Russia,True,6.0
5,70,Morocco,True,8.0
6,45,Egypt,True,9.0
7,122,Korea,True,7.0
8,397,China,True,4.0
9,255,England,True,5.0


In [23]:
cars.at[0, 'country']      # at: read one value by index label (here the row number) and column name

'United States'

In [24]:
cars.at[2, 'level'] = 30   # overwrite the value at that index label / column name

In [25]:
# A one-element list of positions returns row 2 as a DataFrame, which
# confirms the value written above.
cars.iloc[[2]]

Unnamed: 0,cars_per_cap,country,drives_right,level
2,588,Japan,False,30.0


In [26]:
# iat: scalar access by integer row position and integer column position.
# The column position is looked up by name instead of hard-coding 0, so the
# write always lands in 'cars_per_cap' whatever order the columns were
# created in (a hard-coded 0 would overwrite 'country' when the frame keeps
# the dict's insertion order).
cars.iat[5, cars.columns.get_loc('cars_per_cap')] = 77

In [27]:
# Show row 5 as a DataFrame to confirm the value written with iat.
cars.iloc[[5]]

Unnamed: 0,cars_per_cap,country,drives_right,level
5,77,Morocco,True,8.0


## Missing Data 처리

In [28]:
# pd.isnull flags the NaN entries of 'level'; the mask keeps only the rows
# whose level is missing.
cars[pd.isnull(cars.level)]

Unnamed: 0,cars_per_cap,country,drives_right,level
3,18,India,False,


In [29]:
# dropna returns a new frame; the result is only displayed here and `cars`
# is not reassigned.
cars.dropna(how='any')   # drop every row that has a null in at least one column

Unnamed: 0,cars_per_cap,country,drives_right,level
0,809,United States,True,1.0
1,731,Australia,False,2.0
2,588,Japan,False,30.0
4,200,Russia,True,6.0
5,77,Morocco,True,8.0
6,45,Egypt,True,9.0
7,122,Korea,True,7.0
8,397,China,True,4.0
9,255,England,True,5.0


In [30]:
# Replace the missing 'level' values with the column mean (mean() skips NaN).
# A {column: value} mapping restricts the fill to 'level'; a bare scalar
# would also overwrite NaNs in every other column with the mean of 'level'.
cars = cars.fillna(value={'level': cars['level'].mean()})

In [31]:
# Display the frame after filling: row 3 now carries the mean of 'level'.
cars

Unnamed: 0,cars_per_cap,country,drives_right,level
0,809,United States,True,1.0
1,731,Australia,False,2.0
2,588,Japan,False,30.0
3,18,India,False,8.0
4,200,Russia,True,6.0
5,77,Morocco,True,8.0
6,45,Egypt,True,9.0
7,122,Korea,True,7.0
8,397,China,True,4.0
9,255,England,True,5.0


## 데이터 합치기 (Merge)

#### Concat

In [32]:
# Cut the frame into three positional row ranges, deliberately listed out of
# their original order: rows 7-9, then rows 3-6, then rows 0-2.
pieces = [cars.iloc[7:], cars.iloc[3:7], cars.iloc[:3]]

In [33]:
# Each piece keeps its original index labels in the result.
pd.concat(pieces)   # stack the pieces in the order they appear in the list

Unnamed: 0,cars_per_cap,country,drives_right,level
7,122,Korea,True,7.0
8,397,China,True,4.0
9,255,England,True,5.0
3,18,India,False,8.0
4,200,Russia,True,6.0
5,77,Morocco,True,8.0
6,45,Egypt,True,9.0
0,809,United States,True,1.0
1,731,Australia,False,2.0
2,588,Japan,False,30.0


#### Merge  (SQL style)

In [34]:
# Left-hand table for the join: each key occurs once.
left = pd.DataFrame(dict(key=['Kate', 'Susan'], leftval=[1, 2]))

In [35]:
# Right-hand table for the join: same keys as the left table, each once.
right = pd.DataFrame(dict(key=['Kate', 'Susan'], rightval=[4, 5]))

In [36]:
# Join the two frames on the shared 'key' column (pd.merge defaults to an
# inner join). Each key occurs once per side, so rows pair up one-to-one.
pd.merge(left, right, on='key')

Unnamed: 0,key,leftval,rightval
0,Kate,1,4
1,Susan,2,5


In [37]:
# Redefine the left table so that the key 'Kate' occurs twice.
left = pd.DataFrame(dict(key=['Kate', 'Kate'], leftval=[1, 2]))

In [38]:
# Redefine the right table so that the key 'Kate' occurs twice as well.
right = pd.DataFrame(dict(key=['Kate', 'Kate'], rightval=[4, 5]))

In [39]:
df = pd.merge(left, right, on='key')   # duplicate key values multiply the rows (2 x 2 = 4 here)

In [40]:
# Display the merged frame: every left 'Kate' row is paired with every right one.
df

Unnamed: 0,key,leftval,rightval
0,Kate,1,4
1,Kate,1,5
2,Kate,2,4
3,Kate,2,5


#### Append

In [41]:
# A one-row frame with the same columns as the merged `df`, to be added below it.
s = pd.DataFrame(dict(key=['Jessica'], leftval=[9], rightval=[7]))

In [42]:
# Add the rows of `s` below `df` and renumber the index 0..n-1.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported equivalent and, like append, returns a new frame
# without modifying `df`.
pd.concat([df, s], ignore_index=True)

Unnamed: 0,key,leftval,rightval
0,Kate,1,4
1,Kate,1,5
2,Kate,2,4
3,Kate,2,5
4,Jessica,9,7


## Grouping

In [43]:
# Two columns of random normal draws: col3 is rounded to 2 decimals and scaled
# by 100, col4 is rounded to 3 decimals and scaled by 10. The draws are taken
# in column order (col3 first) and are unseeded, so values differ on every run.
col3_values = np.random.randn(8).round(2) * 100
col4_values = np.random.randn(8).round(3) * 10

df = pd.DataFrame({
    'col1': ['A', 'B', 'A', 'B', 'A', 'B', 'A', 'A'],
    'col2': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
    'col3': col3_values,
    'col4': col4_values,
})

In [44]:
# Display the sample frame used for the grouping examples.
df

Unnamed: 0,col1,col2,col3,col4
0,A,one,-159.0,-12.89
1,B,one,119.0,-12.23
2,A,two,32.0,-7.31
3,B,three,-6.0,-1.62
4,A,two,-32.0,10.65
5,B,two,-21.0,-2.15
6,A,one,105.0,4.06
7,A,three,31.0,-16.99


In [45]:
# Sum the numeric columns within each 'col1' group.
# numeric_only=True keeps the string column 'col2' out of the result; without
# it pandas >= 2.0 would "sum" col2 by concatenating its strings.
df.groupby('col1').sum(numeric_only=True)

Unnamed: 0_level_0,col3,col4
col1,Unnamed: 1_level_1,Unnamed: 2_level_1
A,-23.0,-22.48
B,92.0,-16.0


In [46]:
# Group by both columns; the result is indexed by (col1, col2) pairs and
# holds the per-group sums of col3 and col4.
df.groupby(['col1','col2']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,col3,col4
col1,col2,Unnamed: 2_level_1,Unnamed: 3_level_1
A,one,-54.0,-8.83
A,three,31.0,-16.99
A,two,0.0,3.34
B,one,119.0,-12.23
B,three,-6.0,-1.62
B,two,-21.0,-2.15


In [47]:
# pivot table

In [48]:
# One row per 'col1' value and one column per 'col2' value; each cell holds
# the mean of 'col3' for that pair (mean is pivot_table's default aggfunc).
pd.pivot_table(df, values='col3', index=['col1'], columns=['col2'])

col2,one,three,two
col1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,-27.0,31.0,0.0
B,119.0,-6.0,-21.0
