In [43]:
import pandas as pd
import numpy as np

## Renaming columns

In [39]:
df = pd.DataFrame({
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9]
})
df

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [40]:
df.rename(columns={'pop':'인구'})

Unnamed: 0,state,year,인구
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [41]:
df.rename(columns={'state':'주','year':'연도','pop':'인구'})

Unnamed: 0,주,연도,인구
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [46]:
## DataFrame의 컬럼 개수를 정확하게 일치시켜주어야 한다
df.columns = ['주','연도','인구']
df

Unnamed: 0,주,연도,인구
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9


#### Exercise1 df2의 column명을 '이름' , '나이' , '연도' , '점수' 로 바꾸세요.

In [8]:
df2 = pd.DataFrame({
    "name" : ['soyeon','hanseok','jiseob','bohyeon','bohyeon','hongjae'],
    "age" : [22, 23, 26, 22, 21, 23],
    "year" : [2016, 2017, 2018, 2019, 2019, 2017],
    "points" : [1.5, 1.7, 2.0, 3.8, 1.9, 2.3] 
})
df2

Unnamed: 0,name,age,year,points
0,soyeon,22,2016,1.5
1,hanseok,23,2017,1.7
2,jiseob,26,2018,2.0
3,bohyeon,22,2019,3.8
4,bohyeon,21,2019,1.9
5,hongjae,23,2017,2.3


In [10]:
## code here
df2.rename(columns={'name':'이름','age':'나이','year':'년도','points':'점수'})


Unnamed: 0,이름,나이,년도,점수
0,soyeon,22,2016,1.5
1,hanseok,23,2017,1.7
2,jiseob,26,2018,2.0
3,bohyeon,22,2019,3.8
4,bohyeon,21,2019,1.9
5,hongjae,23,2017,2.3


## Reordering columns

In [47]:
df.columns

Index(['주', '연도', '인구'], dtype='object')

In [48]:
df = df.reindex(columns = ['연도', '주', '인구'])
df

Unnamed: 0,연도,주,인구
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [49]:
df.reindex(columns = sorted(df.columns))

Unnamed: 0,연도,인구,주
0,Ohio,1.5,2000
1,Ohio,1.7,2001
2,Ohio,3.6,2002
3,Nevada,2.4,2001
4,Nevada,2.9,2002


#### Exercise2 df2의 column 순서를 name, year, points, age 순으로 바꾸세요. 

In [67]:
df2 = pd.DataFrame({
    "name" : ['soyeon','hanseok','jiseob','bohyeon','bohyeon','hongjae'],
    "age" : [22, 23, 26, 22, 21, 23],
    "year" : [2016, 2017, 2018, 2019, 2019, 2017],
    "points" : [1.5, 1.7, 2.0, 3.8, 1.9, 2.3] 
})
df2

Unnamed: 0,name,age,year,points
0,soyeon,22,2016,1.5
1,hanseok,23,2017,1.7
2,jiseob,26,2018,2.0
3,bohyeon,22,2019,3.8
4,bohyeon,21,2019,1.9
5,hongjae,23,2017,2.3


In [27]:
## code here

df2 = df2.reindex(columns = ['name', 'year', 'points','age'])
df2

Unnamed: 0,name,year,points,age
0,soyeon,2016,1.5,22
1,hanseok,2017,1.7,23
2,jiseob,2018,2.0,26
3,bohyeon,2019,3.8,22
4,bohyeon,2019,1.9,21
5,hongjae,2017,2.3,23


## Duplicate Rows

In [59]:
data = pd.DataFrame({
    'key1':['a', 'b', 'b', 'c', 'c'],
    'key2':['v', 'w', 'w', 'x', 'y'],
    'num':[1, 2, 2, 4, 5]
})
data

Unnamed: 0,key1,key2,num
0,a,v,1
1,b,w,2
2,b,w,2
3,c,x,4
4,c,y,5


In [62]:
## 중복여부 확인
data.duplicated()

0    False
1    False
2     True
3    False
4    False
dtype: bool

In [60]:
data.duplicated(['key1'])

0    False
1    False
2     True
3    False
4     True
dtype: bool

In [61]:
data.duplicated(['key1','key2'])

0    False
1    False
2     True
3    False
4    False
dtype: bool

In [63]:
## 중복값 처리
data.drop_duplicates()

Unnamed: 0,key1,key2,num
0,a,v,1
1,b,w,2
3,c,x,4
4,c,y,5


In [64]:
data.drop_duplicates(['key1']) # keep='first' default

Unnamed: 0,key1,key2,num
0,a,v,1
1,b,w,2
3,c,x,4


In [65]:
data.drop_duplicates(['key1','key2'])

Unnamed: 0,key1,key2,num
0,a,v,1
1,b,w,2
3,c,x,4
4,c,y,5


In [66]:
## 중복된 마지막 값을 남김
data.drop_duplicates(['key1'],keep='last')

Unnamed: 0,key1,key2,num
0,a,v,1
2,b,w,2
4,c,y,5


## Sorting values

In [68]:
df2 = pd.DataFrame({
    "name" : ['soyeon','hanseok','jiseob','bohyeon','bohyeon','hongjae'],
    "age" : [22, 23, 26, 22, 21, 23],
    "year" : [2016, 2017, 2018, 2019, 2019, 2017],
    "points" : [1.5, 1.7, 2.0, 3.8, 1.9, 2.3] 
})
df2

Unnamed: 0,name,age,year,points
0,soyeon,22,2016,1.5
1,hanseok,23,2017,1.7
2,jiseob,26,2018,2.0
3,bohyeon,22,2019,3.8
4,bohyeon,21,2019,1.9
5,hongjae,23,2017,2.3


In [69]:
df2.sort_values(by='age') ## 오름차순 정렬

Unnamed: 0,name,age,year,points
4,bohyeon,21,2019,1.9
0,soyeon,22,2016,1.5
3,bohyeon,22,2019,3.8
1,hanseok,23,2017,1.7
5,hongjae,23,2017,2.3
2,jiseob,26,2018,2.0


In [70]:
df2.sort_values(by='age', ascending=False) ## 내림차순 정렬

Unnamed: 0,name,age,year,points
2,jiseob,26,2018,2.0
1,hanseok,23,2017,1.7
5,hongjae,23,2017,2.3
0,soyeon,22,2016,1.5
3,bohyeon,22,2019,3.8
4,bohyeon,21,2019,1.9


## Merging DataFrame

<font color='darkgreen'>
##### Merge Types
<br><img align="left" src="http://drive.google.com/uc?export=view&id=1yEU_xZ9qZrZi8IIWTMhhMIKI95XxisX0" width=900 height=800>

In [111]:
df1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'], 'data1': range(7)})
df2 = pd.DataFrame({'key': ['a', 'b', 'd'], 'data2': range(3)})

In [73]:
display(df1,df2)

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,a,5
6,b,6


Unnamed: 0,key,data2
0,a,0
1,b,1
2,d,2


In [72]:
pd.merge(df1, df2)

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,1,1
2,b,6,1
3,a,2,0
4,a,4,0
5,a,5,0


In [79]:
pd.merge(df1, df2, on='key')

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,1,1
2,b,6,1
3,a,2,0
4,a,4,0
5,a,5,0


In [80]:
pd.merge(df1, df2, on='key', how='outer')

Unnamed: 0,key,data1,data2
0,b,0.0,1.0
1,b,1.0,1.0
2,b,6.0,1.0
3,a,2.0,0.0
4,a,4.0,0.0
5,a,5.0,0.0
6,c,3.0,
7,d,,2.0


In [81]:
pd.merge(df1, df2, on='key', how='left')

Unnamed: 0,key,data1,data2
0,b,0,1.0
1,b,1,1.0
2,a,2,0.0
3,c,3,
4,a,4,0.0
5,a,5,0.0
6,b,6,1.0


In [82]:
pd.merge(df1, df2, on='key', how='right')

Unnamed: 0,key,data1,data2
0,b,0.0,1
1,b,1.0,1
2,b,6.0,1
3,a,2.0,0
4,a,4.0,0
5,a,5.0,0
6,d,,2


- column명이 다른 경우

In [75]:
df3 = pd.DataFrame({'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                 'data1': range(7)})
df4 = pd.DataFrame({'rkey': ['a', 'b', 'd'], 'data2': range(3)})

In [76]:
display(df3,df4)

Unnamed: 0,lkey,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,a,5
6,b,6


Unnamed: 0,rkey,data2
0,a,0
1,b,1
2,d,2


In [77]:
pd.merge(df3, df4, left_on='lkey', right_on='rkey')

Unnamed: 0,lkey,data1,rkey,data2
0,b,0,b,1
1,b,1,b,1
2,b,6,b,1
3,a,2,a,0
4,a,4,a,0
5,a,5,a,0


#### Exercise2 두 데이터를 이용하여 고객당 방문한 지점의 종류를 파악할 수 있는 데이터를 만들어 보세요.

In [16]:
cs = pd.read_csv('dataCustomers.tab',sep='\t',encoding='cp949')
ts = pd.read_csv('dataTransactions.tab',sep='\t',encoding='cp949')
cs.head()

Unnamed: 0,custid,gender,age,marriage,residence,job
0,10070,여,28,미혼,Yongsan-gu,제조업
1,10139,여,28,미혼,Gangdong-gu,정보서비스
2,10208,여,28,미혼,Gwangjin-gu,제조업
3,10275,여,28,미혼,Eunpyeong-gu,개인사업
4,10350,남,28,미혼,Gangnam-gu,제조업


In [15]:
ts.head()

Unnamed: 0,datetime,custid,store,product,brand,corner,import,amount,installment
0,2000-05-01 10:43,18313,신촌점,4104840008000,샤넬,화장품,1,113000,3
1,2000-05-01 11:00,18313,신촌점,2700000000000,식품,일반식품,0,91950,3
2,2000-05-01 11:33,27222,신촌점,4545370944500,까사미아,가구,0,598000,3
3,2000-05-01 11:43,27222,신촌점,4500860043900,대아통상,기타,0,20100,1
4,2000-05-01 11:53,27222,신촌점,4538130048700,토이플러스,문화완구,0,24000,1


In [33]:
## code here

ct=pd.merge(cs,ts)
ct=ct.drop_duplicates(['custid','store']);ct

Unnamed: 0,custid,gender,age,marriage,residence,job,datetime,store,product,brand,corner,import,amount,installment
0,10070,여,28,미혼,Yongsan-gu,제조업,2000-06-27 12:23,무역점,4408173027000,1492마일즈,유니캐주얼,0,39000,1
1,10070,여,28,미혼,Yongsan-gu,제조업,2000-09-03 11:33,신촌점,4106530008200,메이컵포에버,화장품,1,18000,1
7,10070,여,28,미혼,Yongsan-gu,제조업,2000-11-04 17:24,본점,2800429313003,밀라노 본점,캐릭터캐주얼,0,298000,3
20,10139,여,28,미혼,Gangdong-gu,정보서비스,2000-05-07 15:53,천호점,4400502028400,휠라슈즈,스포츠,0,64000,3
36,10208,여,28,미혼,Gwangjin-gu,제조업,2000-05-03 16:53,천호점,4406261022900,엘르뿌뽕,유아동복,0,21500,1
209,10275,여,28,미혼,Eunpyeong-gu,개인사업,2000-07-12 18:50,신촌점,4215770013070,아니베에프,캐릭터캐주얼,0,30000,1
211,10275,여,28,미혼,Eunpyeong-gu,개인사업,2000-11-24 11:50,본점,4100830008000,라프레리,화장품,1,185000,3
212,10275,여,28,미혼,Eunpyeong-gu,개인사업,2000-12-08 18:23,무역점,4106530008200,메이컵포에버,화장품,1,44000,1
216,10350,남,28,미혼,Gangnam-gu,제조업,2000-05-11 19:03,무역점,4533170048500,앰비,문화완구,0,29000,1
217,10350,남,28,미혼,Gangnam-gu,제조업,2000-08-03 12:50,신촌점,2700000000000,식품,일반식품,0,37701,1


## Concatenating DataFrame

In [107]:
df5 = pd.DataFrame(np.arange(6).reshape(3,2),
                  index=['a','b','c'], columns=['one', 'two'])
df6 = pd.DataFrame(5+np.arange(4).reshape(2,2),
                  index=['a','c'], columns=['three','four'])

In [108]:
display(df5, df6)

Unnamed: 0,one,two
a,0,1
b,2,3
c,4,5


Unnamed: 0,three,four
a,5,6
c,7,8


In [114]:
pd.concat([df5,df6], axis=1) ## 열로 추가

Unnamed: 0,one,two,three,four
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


In [103]:
df7 = pd.DataFrame(np.arange(12).reshape(3,4), columns=['a', 'b', 'c', 'd'])
df8 = pd.DataFrame(12+np.arange(6).reshape(2,3), columns=['b','d','a'])

In [104]:
display(df7, df8)

Unnamed: 0,a,b,c,d
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11


Unnamed: 0,b,d,a
0,12,13,14
1,15,16,17


In [115]:
pd.concat([df7,df8]) ## 행으로 추가

Unnamed: 0,a,b,c,d
0,0,1,2.0,3
1,4,5,6.0,7
2,8,9,10.0,11
0,14,12,,13
1,17,15,,16


In [116]:
pd.concat([df7, df8], ignore_index=True) ## 겹치는 index 무시

Unnamed: 0,a,b,c,d
0,0,1,2.0,3
1,4,5,6.0,7
2,8,9,10.0,11
3,14,12,,13
4,17,15,,16


## Handling Missing Data

In [138]:
data = pd.merge(df3, df4, left_on='lkey', right_on='rkey',how='outer')
data

Unnamed: 0,lkey,data1,rkey,data2
0,b,0.0,b,1.0
1,b,1.0,b,1.0
2,b,6.0,b,1.0
3,a,2.0,a,0.0
4,a,4.0,a,0.0
5,a,5.0,a,0.0
6,c,3.0,,
7,,,d,2.0


- column별 / row별 결측값 개수 구하기

In [139]:
data.isnull()
# pd.isnull(data)

Unnamed: 0,lkey,data1,rkey,data2
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,False,False
5,False,False,False,False
6,False,False,True,True
7,True,True,False,False


In [140]:
## column별 개수
data.isnull().sum()
# pd.isnull(data).sum()

lkey     1
data1    1
rkey     1
data2    1
dtype: int64

In [141]:
## row별 개수
data.isnull().sum(1)
# pd.isnull(data).sum(1)

0    0
1    0
2    0
3    0
4    0
5    0
6    2
7    2
dtype: int64

In [142]:
data.notnull()
# pd.notnull(data)

Unnamed: 0,lkey,data1,rkey,data2
0,True,True,True,True
1,True,True,True,True
2,True,True,True,True
3,True,True,True,True
4,True,True,True,True
5,True,True,True,True
6,True,True,False,False
7,False,False,True,True


In [163]:
## column별 개수
data.notnull().sum()
# pd.notnull(data).sum()

lkey     7
data1    7
rkey     7
data2    7
dtype: int64

In [164]:
## row별 개수
data.notnull().sum(1)
# pd.notnull(data).sum(1)

0    4
1    4
2    4
3    4
4    4
5    4
6    2
7    2
dtype: int64

- 결측값 채우기

In [146]:
data.fillna(0)

Unnamed: 0,lkey,data1,rkey,data2
0,b,0.0,b,1.0
1,b,1.0,b,1.0
2,b,6.0,b,1.0
3,a,2.0,a,0.0
4,a,4.0,a,0.0
5,a,5.0,a,0.0
6,c,3.0,0,0.0
7,0,0.0,d,2.0


In [147]:
data.fillna('missing')

Unnamed: 0,lkey,data1,rkey,data2
0,b,0,b,1
1,b,1,b,1
2,b,6,b,1
3,a,2,a,0
4,a,4,a,0
5,a,5,a,0
6,c,3,missing,missing
7,missing,missing,d,2


In [150]:
## 결측값을 앞 방향으로 채우기
data.fillna(method='ffill')
# data.fillna(method='pad')

Unnamed: 0,lkey,data1,rkey,data2
0,b,0.0,b,1.0
1,b,1.0,b,1.0
2,b,6.0,b,1.0
3,a,2.0,a,0.0
4,a,4.0,a,0.0
5,a,5.0,a,0.0
6,c,3.0,a,0.0
7,c,3.0,d,2.0


In [151]:
## 결측값을 뒷 방향으로 채우기
data.fillna(method='bfill')

Unnamed: 0,lkey,data1,rkey,data2
0,b,0.0,b,1.0
1,b,1.0,b,1.0
2,b,6.0,b,1.0
3,a,2.0,a,0.0
4,a,4.0,a,0.0
5,a,5.0,a,0.0
6,c,3.0,d,2.0
7,,,d,2.0


In [152]:
## column별 평균으로 대체
data.fillna(data.mean())

Unnamed: 0,lkey,data1,rkey,data2
0,b,0.0,b,1.0
1,b,1.0,b,1.0
2,b,6.0,b,1.0
3,a,2.0,a,0.0
4,a,4.0,a,0.0
5,a,5.0,a,0.0
6,c,3.0,,0.714286
7,,3.0,d,2.0


In [154]:
## column마다 다른 값으로 대체
data.fillna({'lkey':'ㅋ','data1':100,'rkey':'ㅋㅋ','data2':-100})

Unnamed: 0,lkey,data1,rkey,data2
0,b,0.0,b,1.0
1,b,1.0,b,1.0
2,b,6.0,b,1.0
3,a,2.0,a,0.0
4,a,4.0,a,0.0
5,a,5.0,a,0.0
6,c,3.0,ㅋㅋ,-100.0
7,ㅋ,100.0,d,2.0


- 결측값이 들어있는 row / column 제거하기

In [155]:
data.dropna()

Unnamed: 0,lkey,data1,rkey,data2
0,b,0.0,b,1.0
1,b,1.0,b,1.0
2,b,6.0,b,1.0
3,a,2.0,a,0.0
4,a,4.0,a,0.0
5,a,5.0,a,0.0


In [160]:
data.dropna(axis=1)

0
1
2
3
4
5
6
7


In [156]:
data.dropna(how='all')

Unnamed: 0,lkey,data1,rkey,data2
0,b,0.0,b,1.0
1,b,1.0,b,1.0
2,b,6.0,b,1.0
3,a,2.0,a,0.0
4,a,4.0,a,0.0
5,a,5.0,a,0.0
6,c,3.0,,
7,,,d,2.0


In [161]:
data.dropna(how='all', axis=1)

Unnamed: 0,lkey,data1,rkey,data2
0,b,0.0,b,1.0
1,b,1.0,b,1.0
2,b,6.0,b,1.0
3,a,2.0,a,0.0
4,a,4.0,a,0.0
5,a,5.0,a,0.0
6,c,3.0,,
7,,,d,2.0


#### Exercise3 다음 데이터의 결측값을 확인하고 키와 몸무게는 평균으로, 나머지는  임의로 채운 후 결측치가 있는지 확인해 보세요.

In [39]:
player = pd.read_csv('players.csv');player

Unnamed: 0.1,Unnamed: 0,Player,height,weight,collage,born,birth_city,birth_state
0,0,Curly Armstrong,180.0,77.0,Indiana University,1918.0,,
1,1,Cliff Barker,188.0,83.0,University of Kentucky,1921.0,Yorktown,Indiana
2,2,Leo Barnhorst,193.0,86.0,University of Notre Dame,1924.0,,
3,3,Ed Bartels,196.0,88.0,North Carolina State University,1925.0,,
4,4,Ralph Beard,178.0,79.0,University of Kentucky,1927.0,Hardinsburg,Kentucky
5,5,Gene Berce,180.0,79.0,Marquette University,1926.0,,
6,6,Charlie Black,196.0,90.0,University of Kansas,1921.0,Arco,Idaho
7,7,Nelson Bobb,183.0,77.0,Temple University,1924.0,Philadelphia,Pennsylvania
8,8,Jake Bornheimer,196.0,90.0,Muhlenberg College,1927.0,New Brunswick,New Jersey
9,9,Vince Boryla,196.0,95.0,University of Denver,1927.0,East Chicago,Indiana


In [52]:
## code here

player.isnull().sum()
player.fillna({'height':player.height.mean(),'weight':player.weight.mean()},inplace = True)
player.fillna(0,inplace=True)
player.isnull().sum()

Unnamed: 0     0
Player         0
height         0
weight         0
collage        0
born           0
birth_city     0
birth_state    0
dtype: int64

# 수고하셨습니다~~~~