# 라이브러리 로드

In [1]:
# 기초 전처리
import pandas as pd
import numpy as np
import geopandas as gpd
from tqdm import tqdm
%matplotlib inline

# 시각화
import matplotlib.pyplot as plt
import seaborn as sns
import folium
import os

# Default figure size for matplotlib plots
plt.rcParams['figure.figsize'] = (12, 9)

# Raise the pandas display limits so wide tables are printed with all columns
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 10000)

# Silence every warning (NOTE: this also hides pandas warnings raised later in the notebook)
import warnings
warnings.filterwarnings(action = 'ignore')

##### **필요함수 정의**

In [2]:
def load_data(path, *, encoding='CP949', zone_type='A'):
    """Read a trade-area CSV file and keep the rows of one trade-area type.

    Args:
        path: Path of the CSV file to read.
        encoding: Text encoding handed to ``pd.read_csv``. Defaults to
            ``'CP949'``.
        zone_type: Value of the ``상권_구분_코드`` column to keep. Defaults to
            ``'A'`` (alley trade areas, 골목상권).

    Returns:
        A new DataFrame holding only the matching rows. The row index of the
        file is preserved (it is not reset).
    """
    df = pd.read_csv(path, encoding=encoding)

    # Return an explicit copy so pandas does not treat the result as a slice
    # of the full table when callers drop or fill columns in place.
    return df[df['상권_구분_코드'] == zone_type].copy()

##### 베이스 데이터 로드

In [3]:
# Base table of trade-area codes (상권_코드); the feature tables below are joined onto it.
df_base = pd.read_csv('../data/base_data/df_base.csv')
df_base.head()

Unnamed: 0,상권_코드
0,1000001
1,1000002
2,1000003
3,1000004
4,1000005


# 상권별 평균 지가 Feature
## 골목상권별 행정동 코드 테이블

In [4]:
# Trade-area boundary table, restricted to alley trade areas by load_data.
store_code = load_data('../data/row_data/서울시 우리마을가게 상권분석서비스(상권영역).csv')

# Remove the columns that are not needed further on.
store_code = store_code.drop(
    columns=[
        '기준_년월_코드',
        '상권_구분_코드',
        '상권_구분_코드_명',
        '시군구_코드',
        '형태정보',
    ]
)
store_code.head(3)

Unnamed: 0,상권_코드,상권_코드_명,엑스좌표_값,와이좌표_값,행정동_코드
70,1000334,삼양로93길,201388,459892,11305660
71,1000354,덕릉로60길,203646,459682,11320513
72,1000355,도당로13가길,203280,463008,11320700


In [5]:
# Left-join onto the base table so every base trade area is kept.
store_code = df_base.merge(store_code, how='left', on='상권_코드')
print(store_code.shape)
store_code.head(3)

(1009, 5)


Unnamed: 0,상권_코드,상권_코드_명,엑스좌표_값,와이좌표_값,행정동_코드
0,1000001,계동길,198799,453610,11110600
1,1000002,난계로27길,201996,452630,11110710
2,1000003,돈화문로11가길,198977,452902,11110615


## 서울시 법정동 코드 테이블

In [6]:
# Administrative-district classification table; the 'Unnamed: 4' column is discarded.
area_cat = pd.read_csv('../data/geo/한국행정구역분류_2021.10.1.기준.csv')
area_cat = area_cat.drop(columns='Unnamed: 4')
area_cat.head(3)

Unnamed: 0,시도,행정구역코드,행정기관코드,법정동코드
0,서울특별시,11.0,1100000000,1100000000
1,서울특별시,11010.0,1111000000,1111000000
2,서울특별시,1101072.0,1111051500,1111010100


In [7]:
# Keep the Seoul rows only. The explicit copy makes area_cat an independent
# frame, so the column assignments below do not write into a slice of the
# original table.
area_cat = area_cat[area_cat['시도'] == '서울특별시'].copy()

# Drop the last two digits of both codes (10 digits -> 8 digits). The later
# merge compares 행정기관코드 with the 8-digit 행정동_코드 of store_code.
for code_col in ['행정기관코드', '법정동코드']:
    area_cat[code_col] = area_cat[code_col].astype(str).str[:-2].astype(int)

area_cat.head(3)

Unnamed: 0,시도,행정구역코드,행정기관코드,법정동코드
0,서울특별시,11.0,11000000,11000000
1,서울특별시,11010.0,11110000,11110000
2,서울특별시,1101072.0,11110515,11110101


In [8]:
# Attach the Seoul legal-dong codes to each trade area through its administrative-dong code.
# One administrative dong can match several legal dongs, so rows may multiply;
# the printed number is the count of distinct trade areas.
df_pre = store_code.merge(
    area_cat,
    how='left',
    left_on='행정동_코드',
    right_on='행정기관코드',
)
print(df_pre['상권_코드'].nunique())
df_pre.head(3)

1009


Unnamed: 0,상권_코드,상권_코드_명,엑스좌표_값,와이좌표_값,행정동_코드,시도,행정구역코드,행정기관코드,법정동코드
0,1000001,계동길,198799,453610,11110600,서울특별시,1101060.0,11110600.0,11110146.0
1,1000001,계동길,198799,453610,11110600,서울특별시,1101060.0,11110600.0,11110147.0
2,1000001,계동길,198799,453610,11110600,서울특별시,1101060.0,11110600.0,11110148.0


In [9]:
# Manually supplied legal-dong codes for the trade areas whose code is null
# after the merge: (trade-area name, legal-dong code) pairs.
_null_fill_pairs = [
    ('노해로17길', 11305103),
    ('노해로23길', 11305103),
    ('노해로33길', 11305103),
    ('덕릉로40길', 11305102),
    ('덕릉로41길', 11305102),
    ('도봉로99길', 11305103),
    ('삼각산로28길', 11305103),
    ('삼양로77길', 11305103),
    ('수유로12길', 11305103),
    ('오패산로67길', 11305102),
    ('한천로109길', 11305102),
    ('한천로124나길', 11305102),
    ('한천로130길', 11305102),
    ('한천로132길', 11305102),
    ('한천로139가길', 11305103),
    ('한천로140길', 11305103),
    ('상일로5길', 11740103),
    ('천호대로219길', 11740103),
]

# Parallel lists, in the same order, as consumed by the fill loop.
null_list = [name for name, _ in _null_fill_pairs]
법정동코드 = [code for _, code in _null_fill_pairs]

In [10]:
# Write the manually supplied legal-dong code into every row of the listed
# trade areas.
for alley_name, legal_code in zip(null_list, 법정동코드):
    # Select by label: a boolean mask for the rows and the column name for the
    # column. This stays correct if the index is not 0..n-1 or if the column
    # order changes (the positional form relied on both).
    df_pre.loc[df_pre['상권_코드_명'] == alley_name, '법정동코드'] = legal_code

# The int cast below cannot represent NaN, so report any trade area that is
# still missing a code by name instead of failing inside the cast.
unfilled = df_pre.loc[df_pre['법정동코드'].isnull(), '상권_코드_명'].unique()
if len(unfilled) > 0:
    raise ValueError(f'법정동코드 is still missing for: {list(unfilled)}')

# The merge turned the codes into floats; convert float -> int -> str.
df_pre['법정동코드'] = df_pre['법정동코드'].astype(int).astype(str)
df_pre['법정동코드'].isnull().sum()

0

In [11]:
# Inspect the size of the table after the legal-dong join and the manual fill.
print(df_pre.shape)
df_pre.head(3)

(1731, 9)


Unnamed: 0,상권_코드,상권_코드_명,엑스좌표_값,와이좌표_값,행정동_코드,시도,행정구역코드,행정기관코드,법정동코드
0,1000001,계동길,198799,453610,11110600,서울특별시,1101060.0,11110600.0,11110146
1,1000001,계동길,198799,453610,11110600,서울특별시,1101060.0,11110600.0,11110147
2,1000001,계동길,198799,453610,11110600,서울특별시,1101060.0,11110600.0,11110148


## 공시지가 테이블

In [12]:
# Official land price (공시지가) table for Seoul, 2020; the file is read as cp949.
jiga = pd.read_csv('../data/geo/서울시공시지가(2020).csv', encoding='cp949')
print(jiga.shape)
jiga.head()

(909537, 13)


Unnamed: 0.1,Unnamed: 0,고유번호,법정동코드,법정동명,특수지구분코드,특수지구분명,지번,기준연도,기준월,공시지가,공시일자,표준지여부,데이터기준일자
0,30,1.11101e+18,1111010100,서울특별시 종로구 청운동,1,일반,1,2020,1,4357000,2020-05-29,N,2021-08-03
1,43,1.11101e+18,1111010100,서울특별시 종로구 청운동,1,일반,01월 01일,2020,1,1392000,2020-05-29,N,2021-08-03
2,75,1.11101e+18,1111010100,서울특별시 종로구 청운동,1,일반,01월 02일,2020,1,2520000,2020-05-29,N,2021-08-03
3,107,1.11101e+18,1111010100,서울특별시 종로구 청운동,1,일반,01월 03일,2020,1,4337000,2020-05-29,N,2021-08-03
4,120,1.11101e+18,1111010100,서울특별시 종로구 청운동,1,일반,01월 04일,2020,1,1554000,2020-05-29,N,2021-08-03


In [13]:
# Original note: the July records cover 258 legal-dong codes while df_pre has
# 416; the January records cover more codes than df_pre, so January is used.
# The explicit copy makes jiga an independent frame, so the column assignment
# in the following step does not write into a slice of the full table.
jiga = jiga[jiga['기준월'] == 1].copy()

# Remove the columns that are not needed.
jiga = jiga.drop(
    columns=[
        'Unnamed: 0', '고유번호', '법정동명', '특수지구분코드',
        '특수지구분명', '지번', '기준연도', '공시일자', '표준지여부',
        '데이터기준일자', '기준월',
    ]
)
jiga.head(3)

Unnamed: 0,법정동코드,공시지가
0,1111010100,4357000
1,1111010100,1392000
2,1111010100,2520000


In [14]:
# Bring the legal-dong code into the same form as df_pre['법정동코드']:
# a string without its last two characters.
jiga['법정동코드'] = jiga['법정동코드'].astype(str).str[:-2]
jiga.head(3)

Unnamed: 0,법정동코드,공시지가
0,11110101,4357000
1,11110101,1392000
2,11110101,2520000


In [15]:
# Join the land-price records onto df_pre by legal-dong code.
df_pre = df_pre.merge(jiga, how='left', on='법정동코드')

# Count the rows that received no land price.
print(df_pre['공시지가'].isnull().sum())
df_pre.head(5)

0


Unnamed: 0,상권_코드,상권_코드_명,엑스좌표_값,와이좌표_값,행정동_코드,시도,행정구역코드,행정기관코드,법정동코드,공시지가
0,1000001,계동길,198799,453610,11110600,서울특별시,1101060.0,11110600.0,11110146,5352000
1,1000001,계동길,198799,453610,11110600,서울특별시,1101060.0,11110600.0,11110146,5000000
2,1000001,계동길,198799,453610,11110600,서울특별시,1101060.0,11110600.0,11110146,5871000
3,1000001,계동길,198799,453610,11110600,서울특별시,1101060.0,11110600.0,11110146,5871000
4,1000001,계동길,198799,453610,11110600,서울특별시,1101060.0,11110600.0,11110146,8360000


In [16]:
# Mean official land price per trade area, taken over all of its joined
# land-price rows; the result has the columns 상권_코드 and 공시지가.
mean_price_by_code = df_pre.groupby('상권_코드')['공시지가'].mean()
df_pre = mean_price_by_code.reset_index()
print(df_pre.shape)
df_pre.head()

(1009, 2)


Unnamed: 0,상권_코드,공시지가
0,1000001,4675446.0
1,1000002,3689914.0
2,1000003,12411850.0
3,1000004,4357117.0
4,1000005,1612400.0


# 배후지 아파트 관련 Feature

In [17]:
# Apartment statistics of each trade area's hinterland, restricted to alley trade areas by load_data.
apt = load_data('../data/row_data/서울시 우리마을가게 상권분석서비스(상권배후지-아파트).csv')
apt.head(3)

Unnamed: 0,기준_년_코드,기준_분기_코드,상권_구분_코드,상권_구분_코드_명,상권_코드,상권_코드_명,아파트_단지_수,아파트_면적_66_제곱미터_미만_세대_수,아파트_면적_66_제곱미터_세대_수,아파트_면적_99_제곱미터_세대_수,아파트_면적_132_제곱미터_세대_수,아파트_면적_165_제곱미터_세대_수,아파트_가격_1_억_미만_세대_수,아파트_가격_1_억_세대_수,아파트_가격_2_억_세대_수,아파트_가격_3_억_세대_수,아파트_가격_4_억_세대_수,아파트_가격_5_억_세대_수,아파트_가격_6_억_이상_세대_수,아파트_평균_면적,아파트_평균_시가
0,2021,2,A,골목상권,1000507,가로공원로58길,135,1059,322,10,0,0,809,296,286,0,0,0,0,45,96832655
1,2021,2,A,골목상권,1001010,풍성로37가길,423,4401,1604,300,1,20,602,2231,1073,628,1055,581,156,47,180148225
2,2021,2,A,골목상권,1001009,천호옛길,204,2325,1405,316,9,0,197,911,504,667,912,646,218,51,208523475


In [18]:
# Keep the four quarters from 2020 Q3 to 2021 Q2.
_apt_year = apt['기준_년_코드']
_apt_quarter = apt['기준_분기_코드']
기준년_2020_3분기 = (_apt_year == 2020) & (_apt_quarter == 3)
기준년_2020_4분기 = (_apt_year == 2020) & (_apt_quarter == 4)
기준년_2021_1분기 = (_apt_year == 2021) & (_apt_quarter == 1)
기준년_2021_2분기 = (_apt_year == 2021) & (_apt_quarter == 2)

# The explicit copy makes apt an independent frame, so the new columns that
# are assigned to it later do not write into a slice of the full table.
apt = apt[기준년_2020_3분기 | 기준년_2020_4분기 | 기준년_2021_1분기 | 기준년_2021_2분기].copy()

# Show which years and quarters remain after the filter.
print(apt.기준_년_코드.unique())
print(apt.기준_분기_코드.unique())

# The period columns are no longer needed once the rows are selected.
apt = apt.drop(columns=['기준_년_코드', '기준_분기_코드'])
apt.head(3)

[2021 2020]
[2 1 4 3]


Unnamed: 0,상권_구분_코드,상권_구분_코드_명,상권_코드,상권_코드_명,아파트_단지_수,아파트_면적_66_제곱미터_미만_세대_수,아파트_면적_66_제곱미터_세대_수,아파트_면적_99_제곱미터_세대_수,아파트_면적_132_제곱미터_세대_수,아파트_면적_165_제곱미터_세대_수,아파트_가격_1_억_미만_세대_수,아파트_가격_1_억_세대_수,아파트_가격_2_억_세대_수,아파트_가격_3_억_세대_수,아파트_가격_4_억_세대_수,아파트_가격_5_억_세대_수,아파트_가격_6_억_이상_세대_수,아파트_평균_면적,아파트_평균_시가
0,A,골목상권,1000507,가로공원로58길,135,1059,322,10,0,0,809,296,286,0,0,0,0,45,96832655
1,A,골목상권,1001010,풍성로37가길,423,4401,1604,300,1,20,602,2231,1073,628,1055,581,156,47,180148225
2,A,골목상권,1001009,천호옛길,204,2325,1405,316,9,0,197,911,504,667,912,646,218,51,208523475


In [19]:
# Remove the trade-area type and name columns; 상권_코드 stays as the key.
apt = apt.drop(columns=['상권_구분_코드', '상권_코드_명', '상권_구분_코드_명'])
print(apt.shape)
apt.head(3)

(4036, 16)


Unnamed: 0,상권_코드,아파트_단지_수,아파트_면적_66_제곱미터_미만_세대_수,아파트_면적_66_제곱미터_세대_수,아파트_면적_99_제곱미터_세대_수,아파트_면적_132_제곱미터_세대_수,아파트_면적_165_제곱미터_세대_수,아파트_가격_1_억_미만_세대_수,아파트_가격_1_억_세대_수,아파트_가격_2_억_세대_수,아파트_가격_3_억_세대_수,아파트_가격_4_억_세대_수,아파트_가격_5_억_세대_수,아파트_가격_6_억_이상_세대_수,아파트_평균_면적,아파트_평균_시가
0,1000507,135,1059,322,10,0,0,809,296,286,0,0,0,0,45,96832655
1,1001010,423,4401,1604,300,1,20,602,2231,1073,628,1055,581,156,47,180148225
2,1001009,204,2325,1405,316,9,0,197,911,504,667,912,646,218,51,208523475


In [20]:
# The apartment features kept here are the complex count, the household
# counts per price band, the average area and the average market price.
# The price bands are regrouped in the following step into under 200M,
# 200M-400M and 500M-and-above KRW.

# Remove the household counts per floor-area band.
_area_band_columns = [
    '아파트_면적_66_제곱미터_미만_세대_수',
    '아파트_면적_66_제곱미터_세대_수',
    '아파트_면적_99_제곱미터_세대_수',
    '아파트_면적_132_제곱미터_세대_수',
    '아파트_면적_165_제곱미터_세대_수',
]
apt = apt.drop(columns=_area_band_columns)
apt.head(3)

Unnamed: 0,상권_코드,아파트_단지_수,아파트_가격_1_억_미만_세대_수,아파트_가격_1_억_세대_수,아파트_가격_2_억_세대_수,아파트_가격_3_억_세대_수,아파트_가격_4_억_세대_수,아파트_가격_5_억_세대_수,아파트_가격_6_억_이상_세대_수,아파트_평균_면적,아파트_평균_시가
0,1000507,135,809,296,286,0,0,0,0,45,96832655
1,1001010,423,602,2231,1073,628,1055,581,156,47,180148225
2,1001009,204,197,911,504,667,912,646,218,51,208523475


In [21]:
# Assumption of the analysis: apartment prices influence the income of a
# trade area, so the seven price bands are collapsed into three.
# Parentheses are used for the multi-line sums; a trailing backslash before a
# blank line only works as long as that blank line is never removed.
apt['아파트_가격_2억_미만_세대_수'] = (
    apt['아파트_가격_1_억_미만_세대_수']
    + apt['아파트_가격_1_억_세대_수']
)

apt['아파트_가격_2~4억_세대_수'] = (
    apt['아파트_가격_2_억_세대_수']
    + apt['아파트_가격_3_억_세대_수']
    + apt['아파트_가격_4_억_세대_수']
)

apt['아파트_가격_5억_이상_세대수'] = (
    apt['아파트_가격_5_억_세대_수']
    + apt['아파트_가격_6_억_이상_세대_수']
)

# Remove the seven original band columns now that they are summed up.
apt = apt.drop(
    columns=[
        '아파트_가격_1_억_미만_세대_수', '아파트_가격_1_억_세대_수',
        '아파트_가격_2_억_세대_수', '아파트_가격_3_억_세대_수',
        '아파트_가격_4_억_세대_수', '아파트_가격_5_억_세대_수',
        '아파트_가격_6_억_이상_세대_수',
    ]
)

apt.head(3)

Unnamed: 0,상권_코드,아파트_단지_수,아파트_평균_면적,아파트_평균_시가,아파트_가격_2억_미만_세대_수,아파트_가격_2~4억_세대_수,아파트_가격_5억_이상_세대수
0,1000507,135,45,96832655,1105,286,0
1,1001010,423,47,180148225,2833,2756,737
2,1001009,204,51,208523475,1108,2083,864


In [22]:
# Plain mean of the average apartment area over the rows of each trade area.
아파트_평균_면적 = apt.groupby('상권_코드')['아파트_평균_면적'].mean()

# Row-level total market price = complex count x average market price.
apt['아파트_총_시가'] = apt['아파트_단지_수'].mul(apt['아파트_평균_시가'])

# The row-level average is replaced by a per-trade-area average below.
apt = apt.drop(columns='아파트_평균_시가')

# Per-trade-area sums of the complex count and of the total market price.
_apt_totals = apt.groupby('상권_코드')[['아파트_단지_수', '아파트_총_시가']].sum()
총_아파트_수 = _apt_totals['아파트_단지_수']
총_아파트_시가 = _apt_totals['아파트_총_시가']

# Average market price weighted by complex count, kept as a one-column
# table so that it can be merged later.
아파트_평균_시가 = 총_아파트_시가.div(총_아파트_수).to_frame(name='아파트_평균_시가')
아파트_평균_시가

Unnamed: 0_level_0,아파트_평균_시가
상권_코드,Unnamed: 1_level_1
1000001,2.552432e+08
1000002,2.299791e+08
1000003,1.777366e+08
1000004,1.485003e+08
1000005,3.376317e+08
...,...
1001006,2.213663e+08
1001007,1.545962e+08
1001008,1.854689e+08
1001009,2.028161e+08


In [23]:
# The average area was already aggregated separately, so it is left out of the sums.
_apt_without_area = apt.drop(columns='아파트_평균_면적')

# Sum the remaining columns per trade area and turn 상권_코드 back into a column.
apt = _apt_without_area.groupby('상권_코드').sum().reset_index()

apt.head()

Unnamed: 0,상권_코드,아파트_단지_수,아파트_가격_2억_미만_세대_수,아파트_가격_2~4억_세대_수,아파트_가격_5억_이상_세대수,아파트_총_시가
0,1000001,544,2800,582,245,138852322968
1,1000002,179,4372,5775,2984,41166258826
2,1000003,32,982,1386,0,5687571448
3,1000004,917,4753,2021,124,136174787035
4,1000005,421,1210,2220,1006,142142959722


In [24]:
# Attach the per-trade-area averages (market price first, then area); both
# are indexed by 상권_코드.
for _per_code_average in (아파트_평균_시가, 아파트_평균_면적):
    apt = apt.merge(_per_code_average, how='left', left_on='상권_코드', right_index=True)

# The summed total market price was only an intermediate value.
apt = apt.drop(columns=['아파트_총_시가'])
print(apt.shape)
apt.head(3)

(1009, 7)


Unnamed: 0,상권_코드,아파트_단지_수,아파트_가격_2억_미만_세대_수,아파트_가격_2~4억_세대_수,아파트_가격_5억_이상_세대수,아파트_평균_시가,아파트_평균_면적
0,1000001,544,2800,582,245,255243200.0,68.0
1,1000002,179,4372,5775,2984,229979100.0,49.75
2,1000003,32,982,1386,0,177736600.0,50.0


# 집객 시설 관련 Feature

In [25]:
# Facility counts per trade area, restricted to alley trade areas by load_data.
facilities = load_data('../data/row_data/서울시 우리마을가게 상권분석서비스(상권-집객시설).csv')
# Per the original note, a count of zero is stored as NaN in this file.
facilities = facilities.fillna(0)
facilities.head()

Unnamed: 0,기준_년_코드,기준_분기_코드,상권_구분_코드,상권_구분_코드_명,상권_코드,상권_코드_명,집객시설_수,관공서_수,은행_수,종합병원_수,일반_병원_수,약국_수,유치원_수,초등학교_수,중학교_수,고등학교_수,대학교_수,백화점_수,슈퍼마켓_수,극장_수,숙박_시설_수,공항_수,철도_역_수,버스_터미널_수,지하철_역_수,버스_정거장_수
11,2021,2,A,골목상권,1000865,사임당로17길,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
12,2021,2,A,골목상권,1000869,서초대로15길,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13,2021,2,A,골목상권,1000870,서초대로23길,19,1.0,3.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
14,2021,2,A,골목상권,1000138,능동로19길,5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
15,2021,2,A,골목상권,1000622,고척로27길,10,2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0


In [26]:
# Keep the four quarters from 2020 Q3 to 2021 Q2.
_fac_year = facilities['기준_년_코드']
_fac_quarter = facilities['기준_분기_코드']
기준년_2020_3분기 = (_fac_year == 2020) & (_fac_quarter == 3)
기준년_2020_4분기 = (_fac_year == 2020) & (_fac_quarter == 4)
기준년_2021_1분기 = (_fac_year == 2021) & (_fac_quarter == 1)
기준년_2021_2분기 = (_fac_year == 2021) & (_fac_quarter == 2)

facilities = facilities[기준년_2020_3분기 | 기준년_2020_4분기 | 기준년_2021_1분기 | 기준년_2021_2분기]

# Show which years and quarters remain after the filter.
print(facilities.기준_년_코드.unique())
print(facilities.기준_분기_코드.unique())

facilities = facilities.drop(
    columns=[
        # columns not needed for the aggregation
        '기준_년_코드', '기준_분기_코드',
        '상권_구분_코드', '상권_구분_코드_명', '상권_코드_명',
        # columns whose total over all rows is 0 (per the original note)
        '백화점_수', '공항_수', '철도_역_수', '버스_터미널_수',
    ]
)
facilities.head(3)

[2021 2020]
[2 1 4 3]


Unnamed: 0,상권_코드,집객시설_수,관공서_수,은행_수,종합병원_수,일반_병원_수,약국_수,유치원_수,초등학교_수,중학교_수,고등학교_수,대학교_수,슈퍼마켓_수,극장_수,숙박_시설_수,지하철_역_수,버스_정거장_수
11,1000865,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
12,1000869,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13,1000870,19,1.0,3.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0


In [27]:
# Sum every facility count per trade area; 상권_코드 becomes the index.
# NOTE(review): the rows span the four retained quarters, so each value is a
# sum over those quarters rather than a single-quarter count - confirm this is intended.
facilities = facilities.groupby(['상권_코드']).sum()
facilities.head()

Unnamed: 0_level_0,집객시설_수,관공서_수,은행_수,종합병원_수,일반_병원_수,약국_수,유치원_수,초등학교_수,중학교_수,고등학교_수,대학교_수,슈퍼마켓_수,극장_수,숙박_시설_수,지하철_역_수,버스_정거장_수
상권_코드,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1000001,230,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,48.0,114.0,0.0,34.0
1000002,29,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,9.0,0.0,4.0
1000003,152,1.0,3.0,0.0,0.0,0.0,4.0,8.0,0.0,0.0,1.0,0.0,3.0,27.0,0.0,16.0
1000004,44,0.0,0.0,0.0,0.0,11.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,3.0,0.0,20.0
1000005,27,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0,0.0,12.0


# 최종 테이블 구성 및 저장

In [28]:
# Start from the base table joined with the mean land price.
df_full = df_base.merge(df_pre, on='상권_코드')

# Apartment features; missing values become 0.
# Original note: trade area 1000039 (명동길) has no apartments.
df_full = df_full.merge(apt, how='left', on='상권_코드').fillna(0)

# Facility features; missing values become 0.
# Original note: trade area 1000868 (사평대로57길) has no facilities at all.
df_full = df_full.merge(facilities, how='left', on='상권_코드').fillna(0)

# Use the trade-area code as the index, then report the shape and the
# remaining null counts per column.
df_full = df_full.set_index('상권_코드')
print(df_full.shape, '\n')
print(df_full.isnull().sum())
df_full.head()

(1009, 23) 

공시지가                 0
아파트_단지_수             0
아파트_가격_2억_미만_세대_수    0
아파트_가격_2~4억_세대_수     0
아파트_가격_5억_이상_세대수     0
아파트_평균_시가            0
아파트_평균_면적            0
집객시설_수               0
관공서_수                0
은행_수                 0
종합병원_수               0
일반_병원_수              0
약국_수                 0
유치원_수                0
초등학교_수               0
중학교_수                0
고등학교_수               0
대학교_수                0
슈퍼마켓_수               0
극장_수                 0
숙박_시설_수              0
지하철_역_수              0
버스_정거장_수             0
dtype: int64


Unnamed: 0_level_0,공시지가,아파트_단지_수,아파트_가격_2억_미만_세대_수,아파트_가격_2~4억_세대_수,아파트_가격_5억_이상_세대수,아파트_평균_시가,아파트_평균_면적,집객시설_수,관공서_수,은행_수,종합병원_수,일반_병원_수,약국_수,유치원_수,초등학교_수,중학교_수,고등학교_수,대학교_수,슈퍼마켓_수,극장_수,숙박_시설_수,지하철_역_수,버스_정거장_수
상권_코드,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
1000001,4675446.0,544.0,2800.0,582.0,245.0,255243200.0,68.0,230.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,48.0,114.0,0.0,34.0
1000002,3689914.0,179.0,4372.0,5775.0,2984.0,229979100.0,49.75,29.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,9.0,0.0,4.0
1000003,12411850.0,32.0,982.0,1386.0,0.0,177736600.0,50.0,152.0,1.0,3.0,0.0,0.0,0.0,4.0,8.0,0.0,0.0,1.0,0.0,3.0,27.0,0.0,16.0
1000004,4357117.0,917.0,4753.0,2021.0,124.0,148500300.0,58.0,44.0,0.0,0.0,0.0,0.0,11.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,3.0,0.0,20.0
1000005,1612400.0,421.0,1210.0,2220.0,1006.0,337631700.0,102.0,27.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0,0.0,12.0


In [29]:
# Output location of the assembled feature table.
path = '../data/indicator/preprocess_geo.csv'

# Write the table (the 상권_코드 index is written as a column), then read the
# file back to check the result.
df_full.to_csv(path)
pd.read_csv(path).head()

Unnamed: 0,상권_코드,공시지가,아파트_단지_수,아파트_가격_2억_미만_세대_수,아파트_가격_2~4억_세대_수,아파트_가격_5억_이상_세대수,아파트_평균_시가,아파트_평균_면적,집객시설_수,관공서_수,은행_수,종합병원_수,일반_병원_수,약국_수,유치원_수,초등학교_수,중학교_수,고등학교_수,대학교_수,슈퍼마켓_수,극장_수,숙박_시설_수,지하철_역_수,버스_정거장_수
0,1000001,4675446.0,544.0,2800.0,582.0,245.0,255243200.0,68.0,230.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,48.0,114.0,0.0,34.0
1,1000002,3689914.0,179.0,4372.0,5775.0,2984.0,229979100.0,49.75,29.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,9.0,0.0,4.0
2,1000003,12411850.0,32.0,982.0,1386.0,0.0,177736600.0,50.0,152.0,1.0,3.0,0.0,0.0,0.0,4.0,8.0,0.0,0.0,1.0,0.0,3.0,27.0,0.0,16.0
3,1000004,4357117.0,917.0,4753.0,2021.0,124.0,148500300.0,58.0,44.0,0.0,0.0,0.0,0.0,11.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,3.0,0.0,20.0
4,1000005,1612400.0,421.0,1210.0,2220.0,1006.0,337631700.0,102.0,27.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0,0.0,12.0
