# 전국 신규 민간 아파트 분양가격 동향 분석

## import

In [2]:
import pandas as pd
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides pandas deprecation / chained-assignment
# warnings raised by later cells — consider narrowing the filter.
warnings.filterwarnings('ignore')

## 데이터 불러오기

<img src='./img/avg15.09_sample.png' width='70%'>

In [6]:
# Read the three raw CSV files.
# avg15.09.csv needs two options to work around the layout problems shown in
# the image above: header=3 takes the row at index 3 as the header row, and
# thousands=',' parses comma-grouped numbers as numeric values.
avg15_09_ori = pd.read_csv(
    "aptPriceDataSet/avg15.09.csv",
    header=3,
    thousands=",",
)
avg18_07_ori = pd.read_csv(
    "aptPriceDataSet/avg18.07.csv",
    thousands=",",
)
shop_201806_01_ori = pd.read_csv("aptPriceDataSet/shop_201806_01.csv")

In [7]:
# Work on copies so that the DataFrames loaded from disk stay untouched.
avg15_09, avg18_07 = avg15_09_ori.copy(), avg18_07_ori.copy()

## 데이터 확인

### avg15_09

In [8]:
# Dimensions of avg15_09 as (rows, columns).
avg15_09.shape

(21, 27)

In [9]:
# Column labels of avg15_09 as parsed from the CSV header row.
avg15_09.columns

Index(['시도', '시군구', '12월', '1월', '2월', '3월', '4월', '5월', '6월', '7월', '8월',
       '9월', '10월', '11월', '12월.1', '1월.1', '2월.1', '3월.1', '4월.1', '5월.1',
       '6월.1', '7월.1', '8월.1', '9월.1', '전월비', '전년말비', '전년동월비'],
      dtype='object')

In [10]:
# Preview the first rows of avg15_09.
avg15_09.head()

Unnamed: 0,시도,시군구,12월,1월,2월,3월,4월,5월,6월,7월,...,3월.1,4월.1,5월.1,6월.1,7월.1,8월.1,9월.1,전월비,전년말비,전년동월비
0,전국,,8059,8130,8195,8204,8289,8358,8344,8333,...,8563,8613,8624,8627,8643,8678,8665,-13,82,207
1,서울,,18189,17925,17925,18016,18098,19446,18867,18742,...,19415,18842,18367,18374,18152,18443,17969,-474,-2300,-1434
2,6대광역시,부산,8111,8111,9078,8965,9402,9501,9453,9457,...,9279,9327,9345,9515,9559,9581,9608,1,430,477
3,,대구,8080,8080,8077,8101,8267,8274,8360,8360,...,8441,8446,8568,8542,8542,8795,8863,27,400,350
4,,인천,10204,10204,10408,10408,10000,9844,10058,9974,...,9876,9938,10551,10443,10443,10449,10450,-162,-150,-131


In [11]:
# Per-column non-null counts and dtypes of avg15_09.
avg15_09.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21 entries, 0 to 20
Data columns (total 27 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   시도      7 non-null      object
 1   시군구     14 non-null     object
 2   12월     21 non-null     int64 
 3   1월      21 non-null     int64 
 4   2월      21 non-null     int64 
 5   3월      21 non-null     int64 
 6   4월      21 non-null     int64 
 7   5월      21 non-null     int64 
 8   6월      21 non-null     int64 
 9   7월      21 non-null     int64 
 10  8월      21 non-null     int64 
 11  9월      21 non-null     int64 
 12  10월     21 non-null     int64 
 13  11월     21 non-null     int64 
 14  12월.1   21 non-null     int64 
 15  1월.1    21 non-null     int64 
 16  2월.1    21 non-null     int64 
 17  3월.1    21 non-null     int64 
 18  4월.1    21 non-null     int64 
 19  5월.1    21 non-null     int64 
 20  6월.1    21 non-null     int64 
 21  7월.1    21 non-null     int64 
 22  8월.1    21 non-null     int6

### avg18_07

In [12]:
# Dimensions of avg18_07 as (rows, columns).
avg18_07.shape

(2890, 5)

In [13]:
# Column labels of avg18_07.
avg18_07.columns

Index(['지역명', '규모구분', '연도', '월', '분양가격(㎡)'], dtype='object')

In [14]:
# Per-column non-null counts and dtypes of avg18_07.
avg18_07.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2890 entries, 0 to 2889
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   지역명      2890 non-null   object
 1   규모구분     2890 non-null   object
 2   연도       2890 non-null   int64 
 3   월        2890 non-null   int64 
 4   분양가격(㎡)  2750 non-null   object
dtypes: int64(2), object(3)
memory usage: 113.0+ KB


## 데이터 전처리

In [15]:
# Recompute the three difference columns directly from the monthly prices so
# they can be compared with the 전월비 / 전년말비 / 전년동월비 values in the file.
# Each new column is '9월.1' minus the base column listed below.
for check_col, base_col in {
    '전월비확인': '8월.1',       # difference from the previous month column
    '전년말비확인': '12월.1',    # difference from the second '12월' column
    '전년동월비확인': '9월',     # difference from the first '9월' column
}.items():
    avg15_09[check_col] = avg15_09['9월.1'].astype(int) - avg15_09[base_col].astype(int)
avg15_09

Unnamed: 0,시도,시군구,12월,1월,2월,3월,4월,5월,6월,7월,...,6월.1,7월.1,8월.1,9월.1,전월비,전년말비,전년동월비,전월비확인,전년말비확인,전년동월비확인
0,전국,,8059,8130,8195,8204,8289,8358,8344,8333,...,8627,8643,8678,8665,-13,82,207,-13,81,206
1,서울,,18189,17925,17925,18016,18098,19446,18867,18742,...,18374,18152,18443,17969,-474,-2300,-1434,-474,-2300,-1435
2,6대광역시,부산,8111,8111,9078,8965,9402,9501,9453,9457,...,9515,9559,9581,9608,1,430,477,27,400,350
3,,대구,8080,8080,8077,8101,8267,8274,8360,8360,...,8542,8542,8795,8863,27,400,350,68,610,414
4,,인천,10204,10204,10408,10408,10000,9844,10058,9974,...,10443,10443,10449,10450,-162,-150,-131,1,430,477
5,,광주,6098,7326,7611,7346,7346,7523,7659,7612,...,7881,8089,8231,8083,-148,334,281,-148,335,281
6,,대전,8321,8321,8321,8341,8341,8341,8333,8333,...,8079,8079,8079,7917,68,610,414,-162,-150,-131
7,,울산,8090,8090,8090,8153,8153,8153,8153,8153,...,9190,9190,9215,9215,0,324,722,0,324,722
8,,,8151,8355,8597,8552,8585,8606,8669,8648,...,8942,8984,9058,9023,-36,325,352,-35,325,352
9,경기,,10855,10855,10791,10784,10876,10646,10266,10124,...,10573,10518,10573,10341,-232,-38,-160,-232,-38,-160


In [16]:
# The 전월비 / 전년말비 / 전년동월비 values in the file are attached to the wrong
# rows, so the recomputed *확인 columns are used instead and the originals
# are removed in place.
avg15_09.drop(columns=['전월비', '전년말비', '전년동월비'], inplace=True)
avg15_09

Unnamed: 0,시도,시군구,12월,1월,2월,3월,4월,5월,6월,7월,...,3월.1,4월.1,5월.1,6월.1,7월.1,8월.1,9월.1,전월비확인,전년말비확인,전년동월비확인
0,전국,,8059,8130,8195,8204,8289,8358,8344,8333,...,8563,8613,8624,8627,8643,8678,8665,-13,81,206
1,서울,,18189,17925,17925,18016,18098,19446,18867,18742,...,19415,18842,18367,18374,18152,18443,17969,-474,-2300,-1435
2,6대광역시,부산,8111,8111,9078,8965,9402,9501,9453,9457,...,9279,9327,9345,9515,9559,9581,9608,27,400,350
3,,대구,8080,8080,8077,8101,8267,8274,8360,8360,...,8441,8446,8568,8542,8542,8795,8863,68,610,414
4,,인천,10204,10204,10408,10408,10000,9844,10058,9974,...,9876,9938,10551,10443,10443,10449,10450,1,430,477
5,,광주,6098,7326,7611,7346,7346,7523,7659,7612,...,7861,7914,7877,7881,8089,8231,8083,-148,335,281
6,,대전,8321,8321,8321,8341,8341,8341,8333,8333,...,8067,8145,8272,8079,8079,8079,7917,-162,-150,-131
7,,울산,8090,8090,8090,8153,8153,8153,8153,8153,...,8629,9380,9192,9190,9190,9215,9215,0,324,722
8,,,8151,8355,8597,8552,8585,8606,8669,8648,...,8692,8858,8967,8942,8984,9058,9023,-35,325,352
9,경기,,10855,10855,10791,10784,10876,10646,10266,10124,...,10469,10684,10685,10573,10518,10573,10341,-232,-38,-160


In [17]:
# Replace the ambiguous month headers with explicit 'YYYY_M' labels:
# 2013_12, then 2014_1 .. 2014_12, then 2015_1 .. 2015_9.
# The identifier columns and the three recomputed columns keep their names.
year_month_cols = (
    ['2013_12']
    + [f'2014_{month}' for month in range(1, 13)]
    + [f'2015_{month}' for month in range(1, 10)]
)
avg15_09.columns = (
    ['시도', '시군구']
    + year_month_cols
    + ['전월비확인', '전년말비확인', '전년동월비확인']
)
avg15_09

Unnamed: 0,시도,시군구,2013_12,2014_1,2014_2,2014_3,2014_4,2014_5,2014_6,2014_7,...,2015_3,2015_4,2015_5,2015_6,2015_7,2015_8,2015_9,전월비확인,전년말비확인,전년동월비확인
0,전국,,8059,8130,8195,8204,8289,8358,8344,8333,...,8563,8613,8624,8627,8643,8678,8665,-13,81,206
1,서울,,18189,17925,17925,18016,18098,19446,18867,18742,...,19415,18842,18367,18374,18152,18443,17969,-474,-2300,-1435
2,6대광역시,부산,8111,8111,9078,8965,9402,9501,9453,9457,...,9279,9327,9345,9515,9559,9581,9608,27,400,350
3,,대구,8080,8080,8077,8101,8267,8274,8360,8360,...,8441,8446,8568,8542,8542,8795,8863,68,610,414
4,,인천,10204,10204,10408,10408,10000,9844,10058,9974,...,9876,9938,10551,10443,10443,10449,10450,1,430,477
5,,광주,6098,7326,7611,7346,7346,7523,7659,7612,...,7861,7914,7877,7881,8089,8231,8083,-148,335,281
6,,대전,8321,8321,8321,8341,8341,8341,8333,8333,...,8067,8145,8272,8079,8079,8079,7917,-162,-150,-131
7,,울산,8090,8090,8090,8153,8153,8153,8153,8153,...,8629,9380,9192,9190,9190,9215,9215,0,324,722
8,,,8151,8355,8597,8552,8585,8606,8669,8648,...,8692,8858,8967,8942,8984,9058,9023,-35,325,352
9,경기,,10855,10855,10791,10784,10876,10646,10266,10124,...,10469,10684,10685,10573,10518,10573,10341,-232,-38,-160


### avg15_09

#### 결측치 처리

In [23]:
# Show the rows of avg15_09 whose '시도' value is missing.
avg15_09.loc[avg15_09['시도'].isna()]

Unnamed: 0,시도,시군구,2013_12,2014_1,2014_2,2014_3,2014_4,2014_5,2014_6,2014_7,...,2015_3,2015_4,2015_5,2015_6,2015_7,2015_8,2015_9,전월비확인,전년말비확인,전년동월비확인
3,,대구,8080,8080,8077,8101,8267,8274,8360,8360,...,8441,8446,8568,8542,8542,8795,8863,68,610,414
4,,인천,10204,10204,10408,10408,10000,9844,10058,9974,...,9876,9938,10551,10443,10443,10449,10450,1,430,477
5,,광주,6098,7326,7611,7346,7346,7523,7659,7612,...,7861,7914,7877,7881,8089,8231,8083,-148,335,281
6,,대전,8321,8321,8321,8341,8341,8341,8333,8333,...,8067,8145,8272,8079,8079,8079,7917,-162,-150,-131
7,,울산,8090,8090,8090,8153,8153,8153,8153,8153,...,8629,9380,9192,9190,9190,9215,9215,0,324,722
8,,,8151,8355,8597,8552,8585,8606,8669,8648,...,8692,8858,8967,8942,8984,9058,9023,-35,325,352
13,,충북,6589,6589,6611,6625,6678,6598,6587,6586,...,6783,6790,6805,6682,6601,6603,6606,3,-137,22
14,,충남,6365,6365,6379,6287,6552,6591,6644,6805,...,7161,7017,6975,6939,6935,6942,6939,-3,-50,57
15,,전북,6282,6281,5946,5966,6277,6306,6351,6319,...,6542,6551,6556,6601,6750,6580,6885,305,302,166
16,,전남,5678,5678,5678,5696,5736,5656,5609,5780,...,5825,5940,6050,6243,6286,6289,6245,-44,461,441


In [24]:
# Show the rows of avg15_09 whose '시군구' value is missing.
avg15_09.loc[avg15_09['시군구'].isna()]

Unnamed: 0,시도,시군구,2013_12,2014_1,2014_2,2014_3,2014_4,2014_5,2014_6,2014_7,...,2015_3,2015_4,2015_5,2015_6,2015_7,2015_8,2015_9,전월비확인,전년말비확인,전년동월비확인
0,전국,,8059,8130,8195,8204,8289,8358,8344,8333,...,8563,8613,8624,8627,8643,8678,8665,-13,81,206
1,서울,,18189,17925,17925,18016,18098,19446,18867,18742,...,19415,18842,18367,18374,18152,18443,17969,-474,-2300,-1435
8,,,8151,8355,8597,8552,8585,8606,8669,8648,...,8692,8858,8967,8942,8984,9058,9023,-35,325,352
9,경기,,10855,10855,10791,10784,10876,10646,10266,10124,...,10469,10684,10685,10573,10518,10573,10341,-232,-38,-160
10,수도권,,13083,12995,13041,13069,12991,13312,13064,12947,...,13253,13155,13201,13130,13038,13155,12920,-235,-636,-373
11,세종,,7601,7600,7532,7814,7908,7934,8067,8067,...,8555,8546,8546,8671,8669,8695,8715,20,155,433
20,,,6432,6462,6435,6443,6566,6552,6578,6605,...,6873,6899,6900,6925,6961,6933,7019,86,203,321


In [25]:
# Fill in the missing labels identified above.
# .loc slices are label-based and include both end points.
for first_row, last_row, sido in ((3, 8, '6대광역시'), (13, 20, '지방')):
    avg15_09.loc[first_row:last_row, '시도'] = sido

# These rows have an empty '시군구'; reuse the '시도' name for it.
for row, sigungu in ((1, '서울'), (9, '경기'), (11, '세종')):
    avg15_09.loc[row, '시군구'] = sigungu

# Remove the nationwide row (0), the 수도권 row (10) and the two unlabeled
# rows (8, 20), which the original note describes as average rows.
avg15_09.drop(index=[0, 8, 20, 10], inplace=True)

# Renumber the remaining rows from 0.
avg15_09 = avg15_09.reset_index(drop=True)

avg15_09

Unnamed: 0,시도,시군구,2013_12,2014_1,2014_2,2014_3,2014_4,2014_5,2014_6,2014_7,...,2015_3,2015_4,2015_5,2015_6,2015_7,2015_8,2015_9,전월비확인,전년말비확인,전년동월비확인
0,서울,서울,18189,17925,17925,18016,18098,19446,18867,18742,...,19415,18842,18367,18374,18152,18443,17969,-474,-2300,-1435
1,6대광역시,부산,8111,8111,9078,8965,9402,9501,9453,9457,...,9279,9327,9345,9515,9559,9581,9608,27,400,350
2,6대광역시,대구,8080,8080,8077,8101,8267,8274,8360,8360,...,8441,8446,8568,8542,8542,8795,8863,68,610,414
3,6대광역시,인천,10204,10204,10408,10408,10000,9844,10058,9974,...,9876,9938,10551,10443,10443,10449,10450,1,430,477
4,6대광역시,광주,6098,7326,7611,7346,7346,7523,7659,7612,...,7861,7914,7877,7881,8089,8231,8083,-148,335,281
5,6대광역시,대전,8321,8321,8321,8341,8341,8341,8333,8333,...,8067,8145,8272,8079,8079,8079,7917,-162,-150,-131
6,6대광역시,울산,8090,8090,8090,8153,8153,8153,8153,8153,...,8629,9380,9192,9190,9190,9215,9215,0,324,722
7,경기,경기,10855,10855,10791,10784,10876,10646,10266,10124,...,10469,10684,10685,10573,10518,10573,10341,-232,-38,-160
8,세종,세종,7601,7600,7532,7814,7908,7934,8067,8067,...,8555,8546,8546,8671,8669,8695,8715,20,155,433
9,지방,강원,6230,6230,6230,6141,6373,6350,6350,6268,...,6182,6924,6846,6986,7019,7008,7121,113,756,702


### avg18_07

#### 결측치 처리

In [26]:
# List the rows of avg18_07 in which the 분양가격(㎡) value is missing (NaN).
avg18_07.loc[avg18_07['분양가격(㎡)'].isna()]

Unnamed: 0,지역명,규모구분,연도,월,분양가격(㎡)
368,광주,전용면적 85㎡초과 102㎡이하,2016,2,
369,광주,전용면적 102㎡초과,2016,2,
374,대전,전용면적 102㎡초과,2016,2,
388,강원,전용면적 85㎡초과 102㎡이하,2016,2,
421,제주,전용면적 60㎡이하,2016,2,
...,...,...,...,...,...
2841,울산,전용면적 60㎡이하,2018,7,
2843,울산,전용면적 85㎡초과 102㎡이하,2018,7,
2844,울산,전용면적 102㎡초과,2018,7,
2878,경북,전용면적 85㎡초과 102㎡이하,2018,7,


In [27]:
# Missing prices block the later arithmetic, so replace NaN with '0'.
# The column is object-typed at this point, hence the string '0'.
# The result is assigned back rather than calling fillna(..., inplace=True)
# on the selected column: that chained in-place form operates on a temporary
# object under pandas Copy-on-Write and leaves the DataFrame unchanged.
avg18_07['분양가격(㎡)'] = avg18_07['분양가격(㎡)'].fillna('0')
avg18_07[avg18_07['분양가격(㎡)'].isnull()]     # expected to be empty now

Unnamed: 0,지역명,규모구분,연도,월,분양가격(㎡)


In [28]:
# Besides NaN, some prices are stored as a two-space string ('  ').
# List the rows where that is the case.
avg18_07.loc[avg18_07['분양가격(㎡)'].eq('  ')]

Unnamed: 0,지역명,규모구분,연도,월,분양가격(㎡)
28,광주,전용면적 85㎡초과 102㎡이하,2015,10,
29,광주,전용면적 102㎡초과,2015,10,
34,대전,전용면적 102㎡초과,2015,10,
81,제주,전용면적 60㎡이하,2015,10,
113,광주,전용면적 85㎡초과 102㎡이하,2015,11,
114,광주,전용면적 102㎡초과,2015,11,
119,대전,전용면적 102㎡초과,2015,11,
166,제주,전용면적 60㎡이하,2015,11,
198,광주,전용면적 85㎡초과 102㎡이하,2015,12,
199,광주,전용면적 102㎡초과,2015,12,


In [29]:
# Blank ('  ') prices also break the arithmetic, so replace them with '0'.
# The assignment is restricted to the price column: assigning through a
# row-only boolean mask (df[mask] = '0') overwrites every column of the
# matching rows, which would wipe out 지역명, 규모구분, 연도 and 월 as well.
blank_price = avg18_07['분양가격(㎡)'] == '  '
avg18_07.loc[blank_price, '분양가격(㎡)'] = '0'
avg18_07

Unnamed: 0,지역명,규모구분,연도,월,분양가격(㎡)
0,서울,전체,2015,10,5841
1,서울,전용면적 60㎡이하,2015,10,5652
2,서울,전용면적 60㎡초과 85㎡이하,2015,10,5882
3,서울,전용면적 85㎡초과 102㎡이하,2015,10,5721
4,서울,전용면적 102㎡초과,2015,10,5879
...,...,...,...,...,...
2885,제주,전체,2018,7,3334
2886,제주,전용면적 60㎡이하,2018,7,0
2887,제주,전용면적 60㎡초과 85㎡이하,2018,7,3356
2888,제주,전용면적 85㎡초과 102㎡이하,2018,7,3226


In [30]:
# Some values contain a comma, which prevents numeric conversion.
# Show the rows whose 분양가격(㎡) value contains a comma.
# regex=False matches the comma literally; na=False treats any non-string
# entry as "no match" so the mask stays purely boolean.
avg18_07[avg18_07['분양가격(㎡)'].str.contains(',', regex=False, na=False)]



In [31]:
# The column holds strings, so replace() has to go through the .str accessor.
# Remove the commas from every value.
price_text = avg18_07['분양가격(㎡)']
avg18_07['분양가격(㎡)'] = price_text.str.replace(',', '')

In [34]:
# Add a 분양가격(3.3㎡) column so the unit lines up with the prices in avg15_09
# (per the original note). The per-㎡ price is cast to int, multiplied by 3.3,
# and cast back to int, which discards the fractional part.
price_per_sqm = avg18_07['분양가격(㎡)'].astype(int)
avg18_07['분양가격(3.3㎡)'] = price_per_sqm.mul(3.3).astype(int)
avg18_07

Unnamed: 0,지역명,규모구분,연도,월,분양가격(㎡),분양가격(3.3㎡)
0,서울,전체,2015,10,5841,19275
1,서울,전용면적 60㎡이하,2015,10,5652,18651
2,서울,전용면적 60㎡초과 85㎡이하,2015,10,5882,19410
3,서울,전용면적 85㎡초과 102㎡이하,2015,10,5721,18879
4,서울,전용면적 102㎡초과,2015,10,5879,19400
...,...,...,...,...,...,...
2885,제주,전체,2018,7,3334,11002
2886,제주,전용면적 60㎡이하,2018,7,0,0
2887,제주,전용면적 60㎡초과 85㎡이하,2018,7,3356,11074
2888,제주,전용면적 85㎡초과 102㎡이하,2018,7,3226,10645
