# 라이브러리 로드

In [1]:
import tqdm
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import geopandas as gpd
import matplotlib.pyplot as plt

from scipy.stats import skew 
from scipy.stats import norm
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

warnings.filterwarnings(action='ignore')
plt.style.use('seaborn-whitegrid')
plt.rc('font', family='Malgun Gothic')  # 한글 폰트 사용을 위해서 세팅
pd.set_option('max_columns', 80) # 최대 컬럼 갯수 지정
pd.set_option('max_rows', 80) # 최대 로우 갯수 지정

In [2]:
def load_data(path):
    df = pd.read_csv(path, encoding='CP949')
    df = df[ df['상권_구분_코드']=='A' ]  # 골목상권으로 테이블 제한
    
    return df

# 데이터 로드

In [3]:
df_2021 = load_data('../data/row_data/서울시 우리마을가게 상권분석서비스(상권-추정매출).csv')
df_2020 = load_data('../data/row_data/서울시 우리마을가게상권분석서비스(상권-추정매출)_2020.csv')
df_2019 = load_data('../data/row_data/서울시 우리마을가게 상권분석서비스(상권-추정매출)_2019.csv')
df_2018 = load_data('../data/row_data/서울시 우리마을가게 상권분석서비스(상권-추정매출)_2018.csv')
df_2017 = load_data('../data/row_data/서울시 우리마을가게 상권분석서비스(상권-추정매출)_2017.csv')

# 기준 인덱스가 될 테이블 생성
bs_area = gpd.read_file('data/geo/상권_geometry_좌표.geojson', driver= 'GeoJSON')
bs_area = bs_area[bs_area['TRDAR_SE_C'] == 'A']
bs_area = bs_area.sort_values('TRDAR_CD')
df_base = bs_area['TRDAR_CD'].reset_index(drop=True).to_frame()
df_base.columns = ['상권_코드']

df_base.drop(804, axis=0, inplace=True)  # 2020년 이전은 존재하지 않는 상권코드
df_base['상권_코드'] = df_base['상권_코드'].astype(int)
df_base.to_csv('./data/base_data/df_base.csv', index=False)

print(df_base.shape)
df_base.head()

(1009, 1)


Unnamed: 0,상권_코드
0,1000001
1,1000002
2,1000003
3,1000004
4,1000005


In [4]:
income_2017 = df_2017.groupby(['상권_코드'])['분기당_매출_건수'].sum().to_frame()
income_2018 = df_2018.groupby(['상권_코드'])['분기당_매출_건수'].sum().to_frame()
income_2019 = df_2019.groupby(['상권_코드'])['분기당_매출_건수'].sum().to_frame()
income_2020 = df_2020.groupby(['상권_코드'])['분기당_매출_건수'].sum().to_frame()
income_2021 = df_2021.groupby(['상권_코드'])['분기당_매출_건수'].sum().to_frame()

df_full = pd.merge(df_base, income_2017, left_on='상권_코드', right_index=True, )
df_full = pd.merge(df_full, income_2018, on='상권_코드', suffixes=('_2017', '_2018'))
df_full = pd.merge(df_full, income_2019, on='상권_코드', suffixes=('_2018', '_2019'))
df_full = pd.merge(df_full, income_2020, on='상권_코드', suffixes=('_2019', '_2020'))
df_full = pd.merge(df_full, income_2021, on='상권_코드', suffixes=('_2020', '_2021'))
df_full.set_index('상권_코드', inplace=True)

df_full['합계'] = df_full.sum(axis=1)
df_full.head()

Unnamed: 0_level_0,분기당_매출_건수_2017,분기당_매출_건수_2018,분기당_매출_건수_2019,분기당_매출_건수_2020,분기당_매출_건수,합계
상권_코드,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1000001,1256289,1271610,1111664,995901,524965,5160429
1000002,1071093,1249259,1106194,841004,407875,4675425
1000003,1432450,1787480,1496646,1344778,601382,6662736
1000004,3201450,2529601,1521309,1105801,537648,8895809
1000005,517763,518973,396981,385697,171974,1991388


In [5]:
new_table = bs_area.copy()
new_table['TRDAR_CD'] = new_table['TRDAR_CD'].astype(int)

df_full = pd.merge(df_full, new_table[['TRDAR_CD', 'area']], left_index=True, right_on='TRDAR_CD', how='left')
df_full['면적당_매출'] = df_full['합계'] / df_full['area']

print(df_full.shape)
df_full.head()

(1009, 9)


Unnamed: 0,분기당_매출_건수_2017,분기당_매출_건수_2018,분기당_매출_건수_2019,분기당_매출_건수_2020,분기당_매출_건수,합계,TRDAR_CD,area,면적당_매출
914,1256289,1271610,1111664,995901,524965,5160429,1000001,125350.780853,41.167905
915,1071093,1249259,1106194,841004,407875,4675425,1000002,31696.155647,147.507636
916,1432450,1787480,1496646,1344778,601382,6662736,1000003,113805.687263,58.544842
856,3201450,2529601,1521309,1105801,537648,8895809,1000004,44023.149162,202.071164
857,517763,518973,396981,385697,171974,1991388,1000005,367064.087898,5.425178


In [6]:
target = df_full[['TRDAR_CD','면적당_매출']]
target.columns = ['상권_코드', '면적당_매출']
target.set_index('상권_코드', inplace=True)

target.to_csv('../data/base_data/target.csv')
pd.read_csv('../data/base_data/target.csv').head()

Unnamed: 0,상권_코드,면적당_매출
0,1000001,41.167905
1,1000002,147.507636
2,1000003,58.544842
3,1000004,202.071164
4,1000005,5.425178
