In [242]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [243]:
def seriesInfo(s) :
    print('value :', s.values)
    print('value type :', type(s.values))
    print('index :', s.index)
    print('index type :', type(s.index))
    print('index + value :')
    print(s)

In [246]:
price_series = pd.Series([4000, 3000, 3500, 2000], index = ['a', 'b', 'c', 'd'])
seriesInfo(price_series)

value : [4000 3000 3500 2000]
value type : <class 'numpy.ndarray'>
index : Index(['a', 'b', 'c', 'd'], dtype='object')
index type : <class 'pandas.core.indexes.base.Index'>
index + value :
a    4000
b    3000
c    3500
d    2000
dtype: int64


In [5]:
# label 인덱싱
price_series['a'] = 5000
seriesInfo(price_series)

value : [5000 3000 3500 2000]
value type : <class 'numpy.ndarray'>
index : Index(['a', 'b', 'c', 'd'], dtype='object')
index type : <class 'pandas.core.indexes.base.Index'>
index + value :
a    5000
b    3000
c    3500
d    2000
dtype: int64


In [6]:
# 배열 인덱싱
price_series[0] = 4000
seriesInfo(price_series)

value : [4000 3000 3500 2000]
value type : <class 'numpy.ndarray'>
index : Index(['a', 'b', 'c', 'd'], dtype='object')
index type : <class 'pandas.core.indexes.base.Index'>
index + value :
a    4000
b    3000
c    3500
d    2000
dtype: int64


In [7]:
# 값 추가
price_series['e'] = 1000
seriesInfo(price_series)

value : [4000 3000 3500 2000 1000]
value type : <class 'numpy.ndarray'>
index : Index(['a', 'b', 'c', 'd', 'e'], dtype='object')
index type : <class 'pandas.core.indexes.base.Index'>
index + value :
a    4000
b    3000
c    3500
d    2000
e    1000
dtype: int64


In [8]:
# 삭제
del price_series['e']
seriesInfo(price_series)

value : [4000 3000 3500 2000]
value type : <class 'numpy.ndarray'>
index : Index(['a', 'b', 'c', 'd'], dtype='object')
index type : <class 'pandas.core.indexes.base.Index'>
index + value :
a    4000
b    3000
c    3500
d    2000
dtype: int64


In [9]:
# Series는 인덱싱을 해야되기 때문에 순서가 있는 데이터 타입
# set 데이터 타입은 순서가 없기 때문에 Series로 만들 수 없다. -> 타입 캐스팅 후 생성
set = pd.Series(list({10, 20, 30, 40, 50}))
seriesInfo(set)

value : [40 10 50 20 30]
value type : <class 'numpy.ndarray'>
index : RangeIndex(start=0, stop=5, step=1)
index type : <class 'pandas.core.indexes.range.RangeIndex'>
index + value :
0    40
1    10
2    50
3    20
4    30
dtype: int64


In [10]:
pd.isnull(set)

0    False
1    False
2    False
3    False
4    False
dtype: bool

In [12]:
# null 값 넣기
# np.NaN 사용
set[0] = np.NaN
seriesInfo(set)

value : [nan 10. 50. 20. 30.]
value type : <class 'numpy.ndarray'>
index : RangeIndex(start=0, stop=5, step=1)
index type : <class 'pandas.core.indexes.range.RangeIndex'>
index + value :
0     NaN
1    10.0
2    50.0
3    20.0
4    30.0
dtype: float64


In [13]:
ser01 = pd.Series([100, 200, 300, 350], index = ['a', 'o', 'k', 'm'])
ser02 = pd.Series([400, 200, 350, 450], index = ['o', 'a', 'h', 'm'])

In [14]:
# 인덱스가 같지 않은 요소들은 NaN으로 나온다.
ser03 = ser01 + ser02
seriesInfo(ser03)

value : [300.  nan  nan 800. 600.]
value type : <class 'numpy.ndarray'>
index : Index(['a', 'h', 'k', 'm', 'o'], dtype='object')
index type : <class 'pandas.core.indexes.base.Index'>
index + value :
a    300.0
h      NaN
k      NaN
m    800.0
o    600.0
dtype: float64


In [15]:
# fill_value
# 인덱스가 같지 않은 요소들은 NaN이 아닌 원래 자신의 값으로 나온다.
ser04 = ser01.add(ser02, fill_value = 0)
seriesInfo(ser04)

value : [300. 350. 300. 800. 600.]
value type : <class 'numpy.ndarray'>
index : Index(['a', 'h', 'k', 'm', 'o'], dtype='object')
index type : <class 'pandas.core.indexes.base.Index'>
index + value :
a    300.0
h    350.0
k    300.0
m    800.0
o    600.0
dtype: float64


In [18]:
# 결측치를 채워넣는 함수
# fillna()
zser = ser03.fillna(0)
seriesInfo(zser)
print('*' * 50)

zser = ser03.fillna(ser03.mean())
seriesInfo(zser)

value : [300.   0.   0. 800. 600.]
value type : <class 'numpy.ndarray'>
index : Index(['a', 'h', 'k', 'm', 'o'], dtype='object')
index type : <class 'pandas.core.indexes.base.Index'>
index + value :
a    300.0
h      0.0
k      0.0
m    800.0
o    600.0
dtype: float64
**************************************************
value : [300.         566.66666667 566.66666667 800.         600.        ]
value type : <class 'numpy.ndarray'>
index : Index(['a', 'h', 'k', 'm', 'o'], dtype='object')
index type : <class 'pandas.core.indexes.base.Index'>
index + value :
a    300.000000
h    566.666667
k    566.666667
m    800.000000
o    600.000000
dtype: float64


In [19]:
# 결측치 제거
pd.notnull(ser03)

a     True
h    False
k    False
m     True
o     True
dtype: bool

In [20]:
subset = ser03[pd.notnull(ser03)]
seriesInfo(subset)

value : [300. 800. 600.]
value type : <class 'numpy.ndarray'>
index : Index(['a', 'm', 'o'], dtype='object')
index type : <class 'pandas.core.indexes.base.Index'>
index + value :
a    300.0
m    800.0
o    600.0
dtype: float64


# DataFrame
- 2차원 행렬 데이터에 인덱스를 붙인 것과 동일
- 행 인덱스, 열 인덱스를 붙일 수 있다. 

In [26]:
# 년도에 해당하는 도시별 인구수 정의
data = {
    '2020' : [5135748, 1534879, 9872135, 4325987],
    '2018' : [5349748, 1574912, 9513215, 4368547],
    '2016' : [3517458, 4455912, 5415215, 3541547],
    '2014' : [5212748, 5415912, 5154121, 8745547],
    '지역' : ['수도권', '경상권', '수도권', '경상권'],
    '증가율' : [0.6853, 0.0434, 0.0944, 0.0034]
}

columns = ['지역', '2014', '2016', '2018', '2020', '증가율']
pop_df = pd.DataFrame(data, index = ['서울', '부산', '경기', '대구'], columns = columns)
pop_df

Unnamed: 0,지역,2014,2016,2018,2020,증가율
서울,수도권,5212748,3517458,5349748,5135748,0.6853
부산,경상권,5415912,4455912,1574912,1534879,0.0434
경기,수도권,5154121,5415215,9513215,9872135,0.0944
대구,경상권,8745547,3541547,4368547,4325987,0.0034


In [27]:
pop_df.values

array([['수도권', 5212748, 3517458, 5349748, 5135748, 0.6853],
       ['경상권', 5415912, 4455912, 1574912, 1534879, 0.0434],
       ['수도권', 5154121, 5415215, 9513215, 9872135, 0.0944],
       ['경상권', 8745547, 3541547, 4368547, 4325987, 0.0034]], dtype=object)

In [28]:
pop_df.columns

Index(['지역', '2014', '2016', '2018', '2020', '증가율'], dtype='object')

In [29]:
pop_df.index

Index(['서울', '부산', '경기', '대구'], dtype='object')

In [30]:
pop_df.index.name = '도시'
pop_df.columns.name = '특성'
pop_df

특성,지역,2014,2016,2018,2020,증가율
도시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
서울,수도권,5212748,3517458,5349748,5135748,0.6853
부산,경상권,5415912,4455912,1574912,1534879,0.0434
경기,수도권,5154121,5415215,9513215,9872135,0.0944
대구,경상권,8745547,3541547,4368547,4325987,0.0034


In [37]:
def dfInfo(df) :
    print('df shape : {}'.format(df.shape))
    print('df size : {}'.format(df.size))
    print('df ndim : {}'.format(df.ndim))
    print('df index : {}'.format(df.index))
    print('df index type : {}'.format(type(df.index)))
    print('df columns : {}'.format(df.columns))
    print('df columns type : {}'.format(type(df.columns)))
    print('df values : \n{}'.format(df.values))
    print('df values type : {}'.format(type(df.values)))

In [38]:
dfInfo(pop_df)

df shape : (4, 6)
df size : 24
df ndim : 2
df index : Index(['서울', '부산', '경기', '대구'], dtype='object', name='도시')
df index type : <class 'pandas.core.indexes.base.Index'>
df columns : Index(['지역', '2014', '2016', '2018', '2020', '증가율'], dtype='object', name='특성')
df columns type : <class 'pandas.core.indexes.base.Index'>
df values : 
[['수도권' 5212748 3517458 5349748 5135748 0.6853]
 ['경상권' 5415912 4455912 1574912 1534879 0.0434]
 ['수도권' 5154121 5415215 9513215 9872135 0.0944]
 ['경상권' 8745547 3541547 4368547 4325987 0.0034]]
df values type : <class 'numpy.ndarray'>


In [41]:
# 열의 개수와 행의 개수가 각각 5개 이상
# 열에는 정수, 문자열, 실수, 날짜데이터가 각각 1개 이상 포함
from datetime import date, datetime, timedelta
test_date = datetime(2020, 10, 13)
date_list = [test_date + timedelta(days = i) for i in range(5)]

test_df = pd.DataFrame({
    '정수' : [9, 1, 6, 5, 3],
    '문자열' : ['apple', 'banana', 'peach', 'orange', 'grape'],
    '실수' : ['0.65', '0.89', '0.12', '0.49', '0.37'],
    '날짜' : date_list,
    '마지막' : ['best', 'good', 'normal', 'bad', 'worst']
})

dfInfo(test_df)
test_df

df shape : (5, 5)
df size : 25
df ndim : 2
df index : RangeIndex(start=0, stop=5, step=1)
df index type : <class 'pandas.core.indexes.range.RangeIndex'>
df columns : Index(['정수', '문자열', '실수', '날짜', '마지막'], dtype='object')
df columns type : <class 'pandas.core.indexes.base.Index'>
df values : 
[[9 'apple' '0.65' Timestamp('2020-10-13 00:00:00') 'best']
 [1 'banana' '0.89' Timestamp('2020-10-14 00:00:00') 'good']
 [6 'peach' '0.12' Timestamp('2020-10-15 00:00:00') 'normal']
 [5 'orange' '0.49' Timestamp('2020-10-16 00:00:00') 'bad']
 [3 'grape' '0.37' Timestamp('2020-10-17 00:00:00') 'worst']]
df values type : <class 'numpy.ndarray'>


Unnamed: 0,정수,문자열,실수,날짜,마지막
0,9,apple,0.65,2020-10-13,best
1,1,banana,0.89,2020-10-14,good
2,6,peach,0.12,2020-10-15,normal
3,5,orange,0.49,2020-10-16,bad
4,3,grape,0.37,2020-10-17,worst


In [43]:
# 행과 열을 전치
pop_df.T

도시,서울,부산,경기,대구
특성,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
지역,수도권,경상권,수도권,경상권
2014,5212748,5415912,5154121,8745547
2016,3517458,4455912,5415215,3541547
2018,5349748,1574912,9513215,4368547
2020,5135748,1534879,9872135,4325987
증가율,0.6853,0.0434,0.0944,0.0034


In [44]:
pop_df

특성,지역,2014,2016,2018,2020,증가율
도시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
서울,수도권,5212748,3517458,5349748,5135748,0.6853
부산,경상권,5415912,4455912,1574912,1534879,0.0434
경기,수도권,5154121,5415215,9513215,9872135,0.0944
대구,경상권,8745547,3541547,4368547,4325987,0.0034


In [45]:
pop_df['2014-2016 증가율'] = (pop_df['2016'] - pop_df['2014']) / pop_df['2014'] * 100
pop_df

특성,지역,2014,2016,2018,2020,증가율,2014-2016 증가율
도시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
서울,수도권,5212748,3517458,5349748,5135748,0.6853,-32.522002
부산,경상권,5415912,4455912,1574912,1534879,0.0434,-17.725547
경기,수도권,5154121,5415215,9513215,9872135,0.0944,5.065733
대구,경상권,8745547,3541547,4368547,4325987,0.0034,-59.504568


In [47]:
del pop_df['2014-2016 증가율']
pop_df

특성,지역,2014,2016,2018,2020,증가율
도시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
서울,수도권,5212748,3517458,5349748,5135748,0.6853
부산,경상권,5415912,4455912,1574912,1534879,0.0434
경기,수도권,5154121,5415215,9513215,9872135,0.0944
대구,경상권,8745547,3541547,4368547,4325987,0.0034


In [48]:
type(pop_df['지역'])

pandas.core.series.Series

In [49]:
# 부분 indexing
pop_df[['지역', '증가율']]

특성,지역,증가율
도시,Unnamed: 1_level_1,Unnamed: 2_level_1
서울,수도권,0.6853
부산,경상권,0.0434
경기,수도권,0.0944
대구,경상권,0.0034


In [50]:
test_df = pd.DataFrame(np.arange(12).reshape(3, 4))
test_df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11


In [51]:
# 열 indexing
test_df[2]

0     2
1     6
2    10
Name: 2, dtype: int32

In [59]:
# 행 indexing
# 항상 슬리이싱을 해야한다.
# 인덱스, 라벨 슬라이싱도 가능
pop_df

특성,지역,2014,2016,2018,2020,증가율
도시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
서울,수도권,5212748,3517458,5349748,5135748,0.6853
부산,경상권,5415912,4455912,1574912,1534879,0.0434
경기,수도권,5154121,5415215,9513215,9872135,0.0944
대구,경상권,8745547,3541547,4368547,4325987,0.0034


In [68]:
display(pop_df[:1])
display(pop_df[:'서울'])

특성,지역,2014,2016,2018,2020,증가율
도시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
서울,수도권,5212748,3517458,5349748,5135748,0.6853


특성,지역,2014,2016,2018,2020,증가율
도시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
서울,수도권,5212748,3517458,5349748,5135748,0.6853


In [92]:
display(pop_df[0:3])
display(pop_df['서울':'경기'])

특성,지역,2014,2016,2018,2020,증가율
도시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
서울,수도권,5212748,3517458,5349748,5135748,0.6853
부산,경상권,5415912,4455912,1574912,1534879,0.0434
경기,수도권,5154121,5415215,9513215,9872135,0.0944


특성,지역,2014,2016,2018,2020,증가율
도시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
서울,수도권,5212748,3517458,5349748,5135748,0.6853
부산,경상권,5415912,4455912,1574912,1534879,0.0434
경기,수도권,5154121,5415215,9513215,9872135,0.0944


In [77]:
# 특정 데이터 추출
# 열을 먼저 작성한 후 행을 작성한다.
pop_df['2020']['서울']

5135748

In [249]:
score_data = {
    'kor' : [80, 90, 70, 30],
    'eng' : [90, 70, 60, 40],
    'math' : [90, 60, 90, 70]
}

columns = ['kor', 'eng', 'math']
index = ['김지은', '황인범', '김정수', '최호진']

exec_df = pd.DataFrame(score_data, index = index, columns = columns)
display(exec_df)


# 모든 학생의 수학 점수를 시리즈로 출력
display(exec_df['math'])
print(type(exec_df['math']))
print('*' * 50)

# 모든 학생의 국어와 영어 점수를 데이터 프레임으로 만들기
display(exec_df[['kor', 'eng']])
print(type(exec_df[['kor', 'eng']]))
print('*' * 50)

# 모든 학생의 각 과목 평균 점수를 새로운 열로 추가
exec_df['avg'] = round((exec_df['kor'] + exec_df['eng'] + exec_df['math']) / 3, 2)
display(exec_df)
print('*' * 50)

# 최호진 학생의 영어 점수를 90점으로 수정하고 평균 점수도 다시 계산
exec_df['eng']['최호진'] = 90
exec_df['avg'] = round((exec_df['kor'] + exec_df['eng'] + exec_df['math']) / 3, 2)
display(exec_df)
print('*' * 50)

# 김지은 학생의 점수를 데이터 프레임으로 만들기
display(exec_df[:1])
print(type(exec_df[:1]))
print('*' * 50)

# 김정수 학생의 점수를 시리즈로 출력
# 1
sub_list = [i for i in exec_df.columns]

for i in sub_list :
    print(exec_df[2:3][i])
    print(type(exec_df[2:3][i]))
print('*' * 50)

# 2
display(exec_df.T['김정수']) 
print('*' * 50)

# 황인범 학생의 국어점수와 수학점수를 100점으로 수정하고 평균 점수도 다시 계산
exec_df['kor']['황인범'] = 100
exec_df['math']['황인범'] = 100
exec_df['avg'] = round((exec_df['kor'] + exec_df['eng'] + exec_df['math']) / 3, 2)
display(exec_df)

Unnamed: 0,kor,eng,math
김지은,80,90,90
황인범,90,70,60
김정수,70,60,90
최호진,30,40,70


김지은    90
황인범    60
김정수    90
최호진    70
Name: math, dtype: int64

<class 'pandas.core.series.Series'>
**************************************************


Unnamed: 0,kor,eng
김지은,80,90
황인범,90,70
김정수,70,60
최호진,30,40


<class 'pandas.core.frame.DataFrame'>
**************************************************


Unnamed: 0,kor,eng,math,avg
김지은,80,90,90,86.67
황인범,90,70,60,73.33
김정수,70,60,90,73.33
최호진,30,40,70,46.67


**************************************************


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,kor,eng,math,avg
김지은,80,90,90,86.67
황인범,90,70,60,73.33
김정수,70,60,90,73.33
최호진,30,90,70,63.33


**************************************************


Unnamed: 0,kor,eng,math,avg
김지은,80,90,90,86.67


<class 'pandas.core.frame.DataFrame'>
**************************************************
김정수    70
Name: kor, dtype: int64
<class 'pandas.core.series.Series'>
김정수    60
Name: eng, dtype: int64
<class 'pandas.core.series.Series'>
김정수    90
Name: math, dtype: int64
<class 'pandas.core.series.Series'>
김정수    73.33
Name: avg, dtype: float64
<class 'pandas.core.series.Series'>
**************************************************


kor     70.00
eng     60.00
math    90.00
avg     73.33
Name: 김정수, dtype: float64

**************************************************


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,kor,eng,math,avg
김지은,80,90,90,86.67
황인범,100,70,100,90.0
김정수,70,60,90,73.33
최호진,30,90,70,63.33


### 데이터 입출력
- 매직명령어 : %%

In [136]:
%%writefile sample01.csv
col01, col02, col03
1, 1, 1
2, 2, 2
3, 3, 3
4, 4, 4

Writing sample01.csv


In [140]:
court_df = pd.read_csv('./data/court_code.txt', sep = '\t', encoding = 'cp949')
dfInfo(court_df)

df shape : (46180, 3)
df size : 138540
df ndim : 2
df index : RangeIndex(start=0, stop=46180, step=1)
df index type : <class 'pandas.core.indexes.range.RangeIndex'>
df columns : Index(['법정동코드', '법정동명', '폐지여부'], dtype='object')
df columns type : <class 'pandas.core.indexes.base.Index'>
df values : 
[[1100000000 '서울특별시' '존재']
 [1111000000 '서울특별시 종로구' '존재']
 [1111010100 '서울특별시 종로구 청운동' '존재']
 ...
 [5013032024 '제주특별자치도 서귀포시 표선면 가시리' '존재']
 [5013032025 '제주특별자치도 서귀포시 표선면 세화리' '존재']
 [5013032026 '제주특별자치도 서귀포시 표선면 토산리' '존재']]
df values type : <class 'numpy.ndarray'>


In [141]:
court_df.head()

Unnamed: 0,법정동코드,법정동명,폐지여부
0,1100000000,서울특별시,존재
1,1111000000,서울특별시 종로구,존재
2,1111010100,서울특별시 종로구 청운동,존재
3,1111010200,서울특별시 종로구 신교동,존재
4,1111010300,서울특별시 종로구 궁정동,존재


In [142]:
court_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46180 entries, 0 to 46179
Data columns (total 3 columns):
법정동코드    46180 non-null int64
법정동명     46180 non-null object
폐지여부     46180 non-null object
dtypes: int64(1), object(2)
memory usage: 1.1+ MB


In [145]:
# 1. 폐지여부가 존재인 것들만 데이터프레임으로 만들기
subset_df = court_df[court_df['폐지여부'] == '존재']
subset_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20544 entries, 0 to 46179
Data columns (total 3 columns):
법정동코드    20544 non-null int64
법정동명     20544 non-null object
폐지여부     20544 non-null object
dtypes: int64(1), object(2)
memory usage: 642.0+ KB


In [146]:
subset_df.head()

Unnamed: 0,법정동코드,법정동명,폐지여부
0,1100000000,서울특별시,존재
1,1111000000,서울특별시 종로구,존재
2,1111010100,서울특별시 종로구 청운동,존재
3,1111010200,서울특별시 종로구 신교동,존재
4,1111010300,서울특별시 종로구 궁정동,존재


In [152]:
# Pandas에서 문자열 전처리를 할 경우 반드시 str 사용
# 법정동명 앞 5자리만 추출
display(subset_df['법정동명'].str[:5].head())
print('*' * 50)

# 법정동명 마지막 한글자만 추출
display(subset_df['법정동명'].str[-1].head())

0    서울특별시
1    서울특별시
2    서울특별시
3    서울특별시
4    서울특별시
Name: 법정동명, dtype: object

**************************************************


0    시
1    구
2    동
3    동
4    동
Name: 법정동명, dtype: object

In [157]:
# expand = True : 분리된 문자열 리스트로 새로운 데이터프레임을 생성
display(subset_df['법정동명'].str.split(' ').head())
print('*' * 50)

display(subset_df['법정동명'].str.split(' ', expand = True).head())

0              [서울특별시]
1         [서울특별시, 종로구]
2    [서울특별시, 종로구, 청운동]
3    [서울특별시, 종로구, 신교동]
4    [서울특별시, 종로구, 궁정동]
Name: 법정동명, dtype: object

**************************************************


Unnamed: 0,0,1,2,3,4
0,서울특별시,,,,
1,서울특별시,종로구,,,
2,서울특별시,종로구,청운동,,
3,서울특별시,종로구,신교동,,
4,서울특별시,종로구,궁정동,,


In [173]:
# 특정 글자로 시작하는 startswith() : boolean 타입 반환
# 서울로 시작하는 데이터만 추출
display(subset_df[subset_df['법정동명'].str.startswith('서울')].tail())
print('*' * 50)

# 법정동명중 '동' 으로만 끝나는 데이터만 추출
display(subset_df[subset_df['법정동명'].str.endswith('동')].head())
print('*' * 50)

# 법정동명중 특정 글자를 포함하는 데이터만 추철 : str.contains()
# 강서구를 포함하는 데이터 필터링
display(subset_df[subset_df['법정동명'].str.contains('강서구')].head())
print('*' * 50)

# 법정동명에서 공백을 다른 문자(_)로 대체
display(subset_df['법정동명'].str.replace(' ', '_').head(10))
print('*' * 50)

Unnamed: 0,법정동코드,법정동명,폐지여부
1107,1174010600,서울특별시 강동구 둔촌동,존재
1108,1174010700,서울특별시 강동구 암사동,존재
1109,1174010800,서울특별시 강동구 성내동,존재
1110,1174010900,서울특별시 강동구 천호동,존재
1111,1174011000,서울특별시 강동구 강일동,존재


**************************************************


Unnamed: 0,법정동코드,법정동명,폐지여부
2,1111010100,서울특별시 종로구 청운동,존재
3,1111010200,서울특별시 종로구 신교동,존재
4,1111010300,서울특별시 종로구 궁정동,존재
5,1111010400,서울특별시 종로구 효자동,존재
6,1111010500,서울특별시 종로구 창성동,존재


**************************************************


Unnamed: 0,법정동코드,법정동명,폐지여부
737,1150000000,서울특별시 강서구,존재
740,1150010100,서울특별시 강서구 염창동,존재
741,1150010200,서울특별시 강서구 등촌동,존재
742,1150010300,서울특별시 강서구 화곡동,존재
743,1150010400,서울특별시 강서구 가양동,존재


**************************************************


0            서울특별시
1        서울특별시_종로구
2    서울특별시_종로구_청운동
3    서울특별시_종로구_신교동
4    서울특별시_종로구_궁정동
5    서울특별시_종로구_효자동
6    서울특별시_종로구_창성동
7    서울특별시_종로구_통의동
8    서울특별시_종로구_적선동
9    서울특별시_종로구_통인동
Name: 법정동명, dtype: object

**************************************************


In [181]:
# 공백이 들어있는 경우 공백 제거 및 대소문자 처리
empty_df = pd.DataFrame({
    'col01' : ['abcd    ', '   FFFfdg  ', 'abCCe  '],
    'col02' : ['   dfkSDFjf', 'asdf  ', 'IndlfL   ']
})
empty_df

Unnamed: 0,col01,col02
0,abcd,dfkSDFjf
1,FFFfdg,asdf
2,abCCe,IndlfL


In [213]:
# str.strip(), str.lstrirp(), str.rstrip()
# str.lower(), str.upper(), str.swapcase()
test_ltrip = empty_df['col01'].str.rstrip()
test_ltrip[1]

'   FFFfdg'

In [203]:
weather_df = pd.read_csv('./data/weather_20201012.csv', sep = ',', encoding = 'cp949')
weather_df.head()

Unnamed: 0,날짜,지점,평균기온(℃),최저기온(℃),최고기온(℃)
0,1907-10-01,108,13.5,7.9,20.7
1,1907-10-02,108,16.2,7.9,22.0
2,1907-10-03,108,16.2,13.1,21.3
3,1907-10-04,108,16.5,11.2,22.0
4,1907-10-05,108,17.6,10.9,25.4


In [250]:
# 기온이 가장 높았던 날은 언제이고 몇도인지 데이터프레임으로 출력
# 1
idx = np.argmax(weather_df['최고기온(℃)'])
display(weather_df[idx:idx + 1])

# 2
display(weather_df[weather_df['최고기온(℃)'] == weather_df['최고기온(℃)'].max()])

will be corrected to return the positional maximum in the future.
Use 'series.values.argmax' to get the position of the maximum now.
  return bound(*args, **kwds)


Unnamed: 0,날짜,지점,평균기온(℃),최저기온(℃),최고기온(℃)
40051,2018-08-01,108,33.6,27.8,39.6


Unnamed: 0,날짜,지점,평균기온(℃),최저기온(℃),최고기온(℃)
40051,2018-08-01,108,33.6,27.8,39.6


In [251]:
name_df = pd.read_csv('./data/year2020_baby_name.csv', sep = ',', encoding = 'cp949')
name_df.head()

Unnamed: 0,NAME,GENDER,COUNT
0,Isabella,F,22731
1,Sophia,F,20477
2,Emma,F,17179
3,Olivia,F,16860
4,Ava,F,15300


In [252]:
# 정렬 : sort_values(by = , ascending = )
# 타입변환 : astype(type)
sort_df = name_df.sort_values(by = 'COUNT', ascending = False)
sort_df.head(10)

Unnamed: 0,NAME,GENDER,COUNT
0,Isabella,F,22731
19698,Jacob,M,21875
1,Sophia,F,20477
19699,Ethan,M,17866
2,Emma,F,17179
19700,Michael,M,17133
19701,Jayden,M,17030
19702,William,M,16870
3,Olivia,F,16860
19703,Alexander,M,16634


In [253]:
rank_df = name_df.sort_values(by = 'COUNT', ascending = False)['COUNT'].rank(ascending = False)
rank = rank_df.astype('int64')
rank

sort_df['RANK'] = rank
sort_df.head(10)

Unnamed: 0,NAME,GENDER,COUNT,RANK
0,Isabella,F,22731,1
19698,Jacob,M,21875,2
1,Sophia,F,20477,3
19699,Ethan,M,17866,4
2,Emma,F,17179,5
19700,Michael,M,17133,6
19701,Jayden,M,17030,7
19702,William,M,16870,8
3,Olivia,F,16860,9
19703,Alexander,M,16634,10


In [232]:
# gender를 기준으로 M 데이터프레임을 만들기
# gender를 기준으로 F 데이터프레임을 만들기
M_df = sort_df[sort_df['GENDER'] == 'M']
F_df = sort_df[sort_df['GENDER'] == 'F']

display(M_df.head())
print('*' * 50)

display(F_df.head())

Unnamed: 0,NAME,GENDER,COUNT,RANK
19698,Jacob,M,21875,2
19699,Ethan,M,17866,4
19700,Michael,M,17133,6
19701,Jayden,M,17030,7
19702,William,M,16870,8


**************************************************


Unnamed: 0,NAME,GENDER,COUNT,RANK
0,Isabella,F,22731,1
1,Sophia,F,20477,3
2,Emma,F,17179,5
3,Olivia,F,16860,9
4,Ava,F,15300,15


In [233]:
# merge
# 행의 개수가 같고 인덱스 번호가 같아야 가능하다.
gender_df = pd.merge(M_df, F_df)
gender_df

Unnamed: 0,NAME,GENDER,COUNT,RANK


In [235]:
m_df = M_df.reset_index(drop = True)
m_df.head()

Unnamed: 0,NAME,GENDER,COUNT,RANK
0,Jacob,M,21875,2
1,Ethan,M,17866,4
2,Michael,M,17133,6
3,Jayden,M,17030,7
4,William,M,16870,8


In [254]:
gender_df = pd.merge(m_df, F_df, left_index = True, right_index = True)
gender_df.head()

Unnamed: 0,NAME_x,GENDER_x,COUNT_x,RANK_x,NAME_y,GENDER_y,COUNT_y,RANK_y
0,Jacob,M,21875,2,Isabella,F,22731,1
1,Ethan,M,17866,4,Sophia,F,20477,3
2,Michael,M,17133,6,Emma,F,17179,5
3,Jayden,M,17030,7,Olivia,F,16860,9
4,William,M,16870,8,Ava,F,15300,15


In [255]:
rank = gender_df['COUNT_x'].rank(ascending = False)
rank = rank.astype('int64')

gender_df['RANK'] = rank
gender_df.head()

Unnamed: 0,NAME_x,GENDER_x,COUNT_x,RANK_x,NAME_y,GENDER_y,COUNT_y,RANK_y,RANK
0,Jacob,M,21875,2,Isabella,F,22731,1,1
1,Ethan,M,17866,4,Sophia,F,20477,3,2
2,Michael,M,17133,6,Emma,F,17179,5,3
3,Jayden,M,17030,7,Olivia,F,16860,9,4
4,William,M,16870,8,Ava,F,15300,15,5
