In [20]:
# Core preprocessing libraries
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import folium
import geopandas as gpd
import os

import warnings
# Silence every warning for the rest of the notebook (this also hides pandas'
# chained-assignment and deprecation warnings).
warnings.filterwarnings(action = 'ignore')

# Show floats in pandas output with two decimal places
pd.options.display.float_format = '{:.2f}'.format

# Check the working directory and set it as the current path for file loading.
# os.chdir() receives the value os.getcwd() just returned, so the directory is unchanged.
a = os.getcwd()
os.chdir(a)

In [21]:
# Load the synonym dictionary (built from FastText similarity) for
# taste, kindness, mood and price (positive/negative).
import json

# The context manager closes the file even if json.load() raises.
# NOTE(review): the file is opened with the platform default encoding, as before;
# pass encoding=... explicitly once the encoding of keyword_dict.json is confirmed.
with open('keyword_dict.json') as f:
    # json.load returns the parsed JSON object (a dict keyed by category)
    keyword_dict = json.load(f)

# 전체 과정 설명
- 1) 웹크롤링한 서울시 상점정보를 geojson으로 만든다. (상점별 x,y좌표 보유한 df)
- 2) QGIS를 활용해 상점별 좌표를 기준으로 최근접 이웃 알고리즘을 적용하여, 각 상점을 가장 가까이 위치한 상권과 연결시킨다.
- 3) Qgis로 작업한 geojson파일을 불러와서 상점정보 & 상권정보와 연결 시킨다. => 각 상점별 정보 + 상권정보 테이블 생성 (10281개 매장)
- 4) 웹크롤링한 매장별 리뷰데이터를 3)에서 만든 테이블과 연결하여 각 매장별 리뷰 테이블을 만든다.

# 서울시 상점 정보로 geojson만들기

In [22]:
# Load the Seoul store information table and preview the first two rows
df = pd.read_csv('가게정보_서울시.csv')
df.head(2)

Unnamed: 0,name,code_ne,r_type,address,tel,score,detail,strong,datalab_1,review_num,blog_num,x,y
0,이삭토스트 개포동역점,38473617,토스트,서울 강남구 개포로 512 1층 109호,02-451-5421,4.7,이삭토스트 서울 개포동역점의 바로주문 페이지 입니다. [찾아오시는 길] -개포동역(...,,{},239,15.0,127.07,37.49
1,개포동장군주먹고기,1258223030,돼지고기구이,서울 강남구 개포로82길 9-15 1층,0507-1300-7367,4.49,개포동 186-9 1층입니다. 개포 5단지 인근 상가거리에있습니다. 개포동역 5번출...,,{},49,20.0,127.07,37.49


In [23]:
# Keep only the columns needed for the GeoJSON (store name and coordinates).
# .copy() makes `store` an independent frame, so the column assignments done on
# it afterwards modify `store` itself instead of a slice of `df`.
store = df[['name', 'x', 'y']].copy()

In [24]:
# Build point geometries from the store coordinates for the GeoJSON conversion
from shapely.geometry import Point

# Restore pandas' default float display
pd.reset_option('display.float_format')

# Make sure both coordinate columns are floats
store['x'] = store['x'].astype(float)
store['y'] = store['y'].astype(float)

# One Point(x, y) per store, then wrap the table in a GeoDataFrame (EPSG:4326)
geometry = [Point(x_coord, y_coord) for x_coord, y_coord in zip(store['x'], store['y'])]
store_w_gpd = gpd.GeoDataFrame(store, crs='EPSG:4326', geometry=geometry)

In [25]:
# Preview the first rows of the store GeoDataFrame
store_w_gpd.head()

Unnamed: 0,name,x,y,geometry
0,이삭토스트 개포동역점,127.068842,37.489307,POINT (127.06884 37.48931)
1,개포동장군주먹고기,127.068254,37.488977,POINT (127.06825 37.48898)
2,드림트레이,127.046346,37.477771,POINT (127.04635 37.47777)
3,라피노,127.049622,37.477314,POINT (127.04962 37.47731)
4,블랑커피,127.050083,37.475814,POINT (127.05008 37.47581)


In [26]:
# 결과 저장
#store_w_gpd.to_file("서울시_상점좌표.geojson", driver='GeoJSON', encoding='utf-8')

# Qgis로 일치영역 찾기
- 상권_geometry_좌표(골목).geojson 에서 각 상권의 중심점을 구하고, 서울시 상점좌표(store_w_gpd)를 활용해 상점과 상권 중심점 사이의 최근접 허브(포인트)를 계산 (결과값은 HubName에 저장)
- 각 상점 별로 가장 근접한 상권(HubName)을 찾았다.

# 상권정보와 상점정보 합치기

In [27]:
# Stores with their nearest commercial area (the area name is in HubName)
# NOTE(review): the commented-out export above writes GeoJSON as utf-8 while these
# files are read as cp949 — confirm the encoding of the QGIS outputs.
place = gpd.read_file('상점_인근_상권.geojson', driver= 'GeoJSON', encoding='cp949')

# Commercial-area (alley district) geometry and attributes
bs_area = gpd.read_file('상권_geometry_좌표(골목).geojson', driver= 'GeoJSON', encoding='cp949')

In [28]:
# Join the stores-in-area table (place) with the area attributes (bs_area)
# on the area name: place.HubName is matched to bs_area.TRDAR_CD_N.
merged_df = pd.merge(place, bs_area, left_on='HubName',right_on='TRDAR_CD_N')
merged_df.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 10569 entries, 0 to 10568
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   name        10569 non-null  object  
 1   HubName     10569 non-null  object  
 2   geometry_x  10569 non-null  geometry
 3   TRDAR_SE_C  10569 non-null  object  
 4   TRDAR_SE_1  10569 non-null  object  
 5   TRDAR_CD    10569 non-null  object  
 6   TRDAR_CD_N  10569 non-null  object  
 7   XCNTS_VALU  10569 non-null  int64   
 8   YDNTS_VALU  10569 non-null  int64   
 9   SIGNGU_CD   10569 non-null  object  
 10  ADSTRD_CD   10569 non-null  object  
 11  STDR_YM_CD  10569 non-null  object  
 12  area        10569 non-null  float64 
 13  perimeter   10569 non-null  float64 
 14  x           10569 non-null  float64 
 15  y           10569 non-null  float64 
 16  geometry_y  10569 non-null  geometry
dtypes: float64(4), geometry(2), int64(2), object(9)
memory usage: 1.5+ MB


In [29]:
# Keep only the columns needed for the next merge:
# store name, area code (TRDAR_CD) and area name (TRDAR_CD_N)
merged_bs_area  = merged_df[['name','TRDAR_CD','TRDAR_CD_N']]
merged_bs_area.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10569 entries, 0 to 10568
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   name        10569 non-null  object
 1   TRDAR_CD    10569 non-null  object
 2   TRDAR_CD_N  10569 non-null  object
dtypes: object(3)
memory usage: 330.3+ KB


In [30]:
# Keep only the store attributes that are needed downstream
store2  = df[['name','code_ne','r_type','address','score','review_num','blog_num','x','y']]
store2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10569 entries, 0 to 10568
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   name        10569 non-null  object 
 1   code_ne     10569 non-null  int64  
 2   r_type      10569 non-null  object 
 3   address     10569 non-null  object 
 4   score       10551 non-null  object 
 5   review_num  10486 non-null  object 
 6   blog_num    10225 non-null  object 
 7   x           10569 non-null  float64
 8   y           10569 non-null  float64
dtypes: float64(2), int64(1), object(6)
memory usage: 743.3+ KB


In [31]:
# Attach the store attributes (store2) to the store -> area table (merged_bs_area).
# The join key is the store name, which is not unique (10569 rows but 10281
# distinct names), so every repeated name is paired with all rows sharing it
# and the result has more rows (11203) than either input.
store_review_by_bs = pd.merge(merged_bs_area, store2, on='name', how='left')
store_review_by_bs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11203 entries, 0 to 11202
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   name        11203 non-null  object 
 1   TRDAR_CD    11203 non-null  object 
 2   TRDAR_CD_N  11203 non-null  object 
 3   code_ne     11203 non-null  int64  
 4   r_type      11203 non-null  object 
 5   address     11203 non-null  object 
 6   score       11185 non-null  object 
 7   review_num  11118 non-null  object 
 8   blog_num    10847 non-null  object 
 9   x           11203 non-null  float64
 10  y           11203 non-null  float64
dtypes: float64(2), int64(1), object(8)
memory usage: 1.0+ MB


In [32]:
# The number of distinct store names is the same before and after the merge,
# so no store name was lost.
original_store_count = store['name'].nunique(dropna=False)
merged_store_count = store_review_by_bs['name'].nunique(dropna=False)
area_count = store_review_by_bs['TRDAR_CD'].nunique(dropna=False)

print('기존 상점의 개수:', original_store_count)
print('상권내 상점의 개수:', merged_store_count)

print('상권개수:', area_count)

기존 상점의 개수: 10281
상권내 상점의 개수: 10281
상권개수: 879


In [33]:
# Duplicate rows appear because the merge key `name` is not unique
# (10569 store rows, 10281 distinct names): same-named stores get paired
# with each other by the name-based merge.
# For now keep the first row of each name and drop the rest.
# NOTE(review): this also collapses different stores that share a name into a
# single row — joining on a unique store key would avoid that.
store_review_by_bs = store_review_by_bs.drop_duplicates('name')

In [34]:
# Summary after de-duplication; x and y are the store coordinates
store_review_by_bs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10281 entries, 0 to 11202
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   name        10281 non-null  object 
 1   TRDAR_CD    10281 non-null  object 
 2   TRDAR_CD_N  10281 non-null  object 
 3   code_ne     10281 non-null  int64  
 4   r_type      10281 non-null  object 
 5   address     10281 non-null  object 
 6   score       10263 non-null  object 
 7   review_num  10199 non-null  object 
 8   blog_num    9943 non-null   object 
 9   x           10281 non-null  float64
 10  y           10281 non-null  float64
dtypes: float64(2), int64(1), object(8)
memory usage: 963.8+ KB


In [35]:
# Display the de-duplicated store table (pandas truncates the rendered output)
store_review_by_bs

Unnamed: 0,name,TRDAR_CD,TRDAR_CD_N,code_ne,r_type,address,score,review_num,blog_num,x,y
0,이삭토스트 개포동역점,1000892,개포로82길,38473617,토스트,서울 강남구 개포로 512 1층 109호,4.7,239,15.0,127.068842,37.489307
1,개포동장군주먹고기,1000892,개포로82길,1258223030,돼지고기구이,서울 강남구 개포로82길 9-15 1층,4.49,49,20.0,127.068254,37.488977
2,커피마마퀸 개포동점,1000892,개포로82길,1233704717,"카페,디저트",서울 강남구 삼성로 38,4.75,51,7.0,127.066845,37.488688
3,강남초장,1000892,개포로82길,1277913587,생선회,서울 강남구 개포로 510 1층 110~111호,4.04,23,10.0,127.068344,37.489288
4,스텔라떡볶이 개포점,1000892,개포로82길,1264091478,떡볶이,"서울 강남구 개포로82길 13-15 지상1층 5호, 9호",4.51,67,48.0,127.068348,37.488821
...,...,...,...,...,...,...,...,...,...,...,...
11198,오르새피자 중화점,1000234,동일로139길,905419007,피자,서울 중랑구 중랑역로 104 동원데쟈뷰,4.41,75,23,127.077721,37.602700
11199,콩닢,1000234,동일로139길,584670846,두부요리,서울 중랑구 동일로 799-6,4.22,151,31,127.078980,37.601979
11200,김연수 소갈비살,1000234,동일로139길,1480979128,소고기구이,서울 중랑구 중랑역로 94,4.2,65,24,127.077886,37.601927
11201,총각집곱창,1000234,동일로139길,18882325,"곱창,막창,양",서울 중랑구 중랑역로 100 디엠씨빌딩,4.48,385,102,127.077798,37.602430


In [36]:
# Save the result. index=False is not passed, so the DataFrame index is
# written as the first (unnamed) CSV column.
store_review_by_bs.to_csv('상권_내_상점.csv', encoding='utf-8')

In [37]:
# Store information is available for 876 of the 1010 alley commercial areas
store_review_by_bs['TRDAR_CD'].nunique(dropna=False)

876

# 상권별 리뷰데이터 생성

In [38]:
# Load the review table
review_data = pd.read_csv('리뷰정보_서울시.csv', encoding='utf-8')
review_data.info() # 1,136,877 rows in total; 1,105,677 of them have review text (r_comments)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1136877 entries, 0 to 1136876
Data columns (total 9 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   name           1136877 non-null  object 
 1   code_no        1136877 non-null  int64  
 2   reviewer_info  1136877 non-null  object 
 3   r_score        1136877 non-null  float64
 4   r_date         1136877 non-null  object 
 5   r_times        1136877 non-null  object 
 6   proving        1136690 non-null  object 
 7   r_comments     1105677 non-null  object 
 8   owner_reply    90998 non-null    object 
dtypes: float64(1), int64(1), object(7)
memory usage: 78.1+ MB


In [39]:
# Attach the reviews to the per-area store table (store_review_by_bs).
# The join key is the store name; a left join keeps stores without any review.
# NOTE(review): code_ne (stores) and code_no (reviews) look like the same store id
# in the preview rows — if confirmed, joining on that id would avoid mixing
# reviews of different stores that share a name.
review_by_bs_area = pd.merge(store_review_by_bs, review_data, on='name', how='left')
review_by_bs_area.head(5)

Unnamed: 0,name,TRDAR_CD,TRDAR_CD_N,code_ne,r_type,address,score,review_num,blog_num,x,y,code_no,reviewer_info,r_score,r_date,r_times,proving,r_comments,owner_reply
0,이삭토스트 개포동역점,1000892,개포로82길,38473617,토스트,서울 강남구 개포로 512 1층 109호,4.7,239,15.0,127.068842,37.489307,38473617.0,"{'평균_별점': '5', '사진': 'NA', '리뷰': '1'}",5.0,2021.10.04,2번째 방문,주문,맛있어요! ㅋㅋ사장님 두분도 친절하세요! 최고,
1,이삭토스트 개포동역점,1000892,개포로82길,38473617,토스트,서울 강남구 개포로 512 1층 109호,4.7,239,15.0,127.068842,37.489307,38473617.0,"{'평균_별점': '4.5', '사진': 6, '리뷰': '4'}",5.0,2021.10.02,3번째 방문,주문,맛있고 친절해요!!,감사합니다 ^^**********
2,이삭토스트 개포동역점,1000892,개포로82길,38473617,토스트,서울 강남구 개포로 512 1층 109호,4.7,239,15.0,127.068842,37.489307,38473617.0,"{'평균_별점': '4', '사진': 'NA', '리뷰': '369'}",4.0,2021.09.27,1번째 방문,영수증,굿,감사합니다 ^^
3,이삭토스트 개포동역점,1000892,개포로82길,38473617,토스트,서울 강남구 개포로 512 1층 109호,4.7,239,15.0,127.068842,37.489307,38473617.0,"{'평균_별점': '4.1', '사진': 'NA', '리뷰': '490'}",3.5,2021.09.27,1번째 방문,영수증,굿,감사합니다 ^^
4,이삭토스트 개포동역점,1000892,개포로82길,38473617,토스트,서울 강남구 개포로 512 1층 109호,4.7,239,15.0,127.068842,37.489307,38473617.0,"{'평균_별점': '4.6', '사진': 'NA', '리뷰': '64'}",5.0,2021.09.23,1번째 방문,영수증,사장님 친절하세용ㅎㅎ,감사합니다 ^^


In [40]:
# review_by_bs_area.to_csv('상권별_리뷰.csv', encoding='utf-8', index=False)

# 상권별 비율 지표 뽑아내기

## 상권별 review 비율

In [41]:
# Keep only the rows that actually carry review text
review_by_bs_area = review_by_bs_area.dropna(subset=['r_comments'])

In [42]:
# Count the remaining missing values per column
review_by_bs_area.isnull().sum()

name                   0
TRDAR_CD               0
TRDAR_CD_N             0
code_ne                0
r_type                 0
address                0
score                  0
review_num            85
blog_num            5334
x                      0
y                      0
code_no                0
reviewer_info          0
r_score                0
r_date                 0
r_times                0
proving              187
r_comments             0
owner_reply      1016091
dtype: int64

In [43]:
# Number of reviews per commercial area (non-null r_comments per TRDAR_CD)
review_cnt_by_bs_area = review_by_bs_area.groupby(['TRDAR_CD'])['r_comments'].count()
# NOTE: `df` is rebound here — from this cell on it no longer refers to the
# store table loaded from the CSV.
df = review_cnt_by_bs_area.to_frame()

In [44]:
# Move TRDAR_CD from the index into a column and name the count column review_cnt
df = df.reset_index().rename(columns={'r_comments': 'review_cnt'})
df

Unnamed: 0,TRDAR_CD,review_cnt
0,1000001,1392
1,1000002,424
2,1000003,6280
3,1000004,1951
4,1000005,5527
...,...,...
870,1001006,1859
871,1001007,1182
872,1001008,2235
873,1001009,265


In [45]:
# Share of all reviews that belongs to each commercial area
total_review_cnt = df['review_cnt'].sum()
df['review_ratio'] = df['review_cnt']/total_review_cnt
df.head()

Unnamed: 0,TRDAR_CD,review_cnt,review_ratio
0,1000001,1392,0.001259
1,1000002,424,0.000383
2,1000003,6280,0.00568
3,1000004,1951,0.001765
4,1000005,5527,0.004999


## 상권별 평균 평점

In [46]:
# Convert the store-level rating column `score` (stored as strings) to float.
# The cast reads `score` itself: casting `r_score` here would overwrite the store
# rating with the per-review rating and throw the cleaned values away.
# Thousands separators are stripped so that '1,099' parses as 1099; anything that
# still cannot be parsed becomes NaN instead of aborting the notebook.
# NOTE(review): 1099 is far outside the rating values seen elsewhere in this
# column — it looks like a misplaced count; confirm against the raw data.
review_by_bs_area['score'] = pd.to_numeric(
    review_by_bs_area['score'].astype(str).str.replace(',', '', regex=False),
    errors='coerce',
)

In [47]:
# Mean review rating (r_score) per commercial area
avg_score = review_by_bs_area.groupby(['TRDAR_CD'])['r_score'].mean()
df_score = avg_score.to_frame()

In [48]:
# Move TRDAR_CD from the index into a column and name the mean column avg_score
df_score = df_score.reset_index().rename(columns={'r_score': 'avg_score'})
df_score

Unnamed: 0,TRDAR_CD,avg_score
0,1000001,4.414511
1,1000002,4.198113
2,1000003,4.366481
3,1000004,4.237314
4,1000005,4.425547
...,...,...
870,1001006,4.344002
871,1001007,4.368866
872,1001008,4.479642
873,1001009,4.633962


In [49]:
# Append the average rating to the per-area indicator table (joined on TRDAR_CD)
df = pd.merge(df, df_score, on='TRDAR_CD')

In [50]:
# Check the merged columns on the first row
df.head(1)

Unnamed: 0,TRDAR_CD,review_cnt,review_ratio,avg_score
0,1000001,1392,0.001259,4.414511


## 상권별 사장님(owner) 댓글 응답률

In [51]:
# Mark reviews that have no owner reply with the placeholder string '활활활활'.
# The filled column is assigned back: an inplace fillna on a column selection
# does not reliably update the parent DataFrame (it is a no-op under pandas
# copy-on-write).
review_by_bs_area['owner_reply'] = review_by_bs_area['owner_reply'].fillna('활활활활')

def change_1(x):
    """Flag whether a review received an owner reply.

    Args:
        x: value of an ``owner_reply`` cell — the reply text, the '활활활활'
            placeholder used for missing replies, or a missing value.

    Returns:
        int: 0 when there is no reply (placeholder, None or NaN), otherwise 1.
    """
    # Missing values are treated as "no reply" too, so the result stays correct
    # even when the placeholder fill has not been applied to the column.
    if pd.isna(x) or x == '활활활활':
        return 0
    return 1

# 1/0 flag per review: did the owner reply?
review_by_bs_area['tmp'] = review_by_bs_area['owner_reply'].apply(change_1)
# Mean of the flag per commercial area = share of reviews with an owner reply
상권당_평균리플 = review_by_bs_area.groupby(['TRDAR_CD'])['tmp'].mean().to_frame()

In [52]:
# Add the per-area owner-reply rate (column `tmp`) by matching df.TRDAR_CD
# to the index of 상권당_평균리플
df = pd.merge(df, 상권당_평균리플, left_on='TRDAR_CD', right_index=True)
df.head(1)

Unnamed: 0,TRDAR_CD,review_cnt,review_ratio,avg_score,tmp
0,1000001,1392,0.001259,4.414511,0.081178


## 상권별 키워드 언급 리뷰 비율

각 상권의 리뷰 중 맛(맛있다)·친절·분위기·가격(싸다/비싸다)을 표현한 리뷰의 비율을 구하고, 이를 전체 리뷰에서의 비율로 나누어 상대 지표로 만든다.

In [53]:
def change_2(x, key, keywords=None):
    """Return 1 if the review text mentions any keyword of a category, else 0.

    Args:
        x: review text to search.
        key: category name used to look up the keyword list (e.g. 'taste').
        keywords: optional mapping of category -> list of keyword strings.
            Defaults to the notebook-level ``keyword_dict``.

    Returns:
        int: 1 when at least one keyword is a substring of ``x``; 0 otherwise,
        including when ``x`` is not a string (e.g. a missing value).
    """
    if keywords is None:
        keywords = keyword_dict
    # Substring search only makes sense on text; a float NaN would raise TypeError
    if not isinstance(x, str):
        return 0
    return int(any(val in x for val in keywords[key]))

In [54]:
# tqdm.auto picks the notebook or console progress bar automatically;
# tqdm_notebook is deprecated.
from tqdm.auto import tqdm

# Review categories; each one is a key of keyword_dict
sorts = ['taste','kindness','mood', 'pos_price', 'neg_price']
for sort_key in tqdm(sorts):
    # 1/0 flag per review: does the text contain any keyword of this category?
    review_by_bs_area['tmp_'+sort_key] = review_by_bs_area['r_comments'].apply(
        lambda text, key=sort_key: change_2(text, key)
    )

  0%|          | 0/5 [00:00<?, ?it/s]

In [55]:
# Per-area keyword mention rate relative to the overall rate.
# The flag columns are named explicitly ('tmp_' + category) instead of being
# taken by position, so the result does not depend on the column order of
# review_by_bs_area.
df2 = df.copy()
for col in ['tmp_' + sort_key for sort_key in sorts]:
    # (share of flagged reviews in the area) / (share of flagged reviews overall);
    # values above 1 mean the keyword is mentioned more often than on average.
    # The variable name 상권당_평균리플 is reused from the owner-reply cell.
    상권당_평균리플 = (
        review_by_bs_area.groupby(['TRDAR_CD'])[col].mean() / review_by_bs_area[col].mean()
    ).to_frame()
    df2 = pd.merge(df2, 상권당_평균리플, left_on='TRDAR_CD', right_index=True)
df2

Unnamed: 0,TRDAR_CD,review_cnt,review_ratio,avg_score,tmp,tmp_taste,tmp_kindness,tmp_mood,tmp_pos_price,tmp_neg_price
0,1000001,1392,0.001259,4.414511,0.081178,0.967455,0.972514,1.374311,0.663147,1.016724
1,1000002,424,0.000383,4.198113,0.002358,0.860697,0.835577,0.581762,0.466527,0.560294
2,1000003,6280,0.005680,4.366481,0.062102,1.033080,1.171051,1.537087,0.850445,1.154985
3,1000004,1951,0.001765,4.237314,0.015889,0.951627,0.724453,0.831636,0.824620,0.922310
4,1000005,5527,0.004999,4.425547,0.065497,0.995085,1.032358,1.506493,0.856557,1.066334
...,...,...,...,...,...,...,...,...,...,...
870,1001006,1859,0.001681,4.344002,0.001614,0.956420,0.750275,1.014326,0.461089,0.815692
871,1001007,1182,0.001069,4.368866,0.018613,0.970932,0.965456,1.150093,0.970628,0.949335
872,1001008,2235,0.002021,4.479642,0.063982,0.989588,0.830960,0.662194,0.938147,0.839036
873,1001009,265,0.000240,4.633962,0.000000,1.052542,0.605133,0.537807,0.646917,1.106283


In [56]:
# Korean output headers, in the same order as the current columns of df2
korean_headers = [
    '상권_코드', '리뷰_수', '리뷰_비율', '평균_별점', '사장님_댓글_비율',
    '맛있다_비율', '친절_비율', '분위기_비율', '싸다_비율', '비싸다_비율',
]
df2.columns = korean_headers
df2.head(1)

Unnamed: 0,상권_코드,리뷰_수,리뷰_비율,평균_별점,사장님_댓글_비율,맛있다_비율,친절_비율,분위기_비율,싸다_비율,비싸다_비율
0,1000001,1392,0.001259,4.414511,0.081178,0.967455,0.972514,1.374311,0.663147,1.016724


In [57]:
# Save the per-area indicator table. index=False is not passed, so the
# DataFrame index is written as the first (unnamed) CSV column.
df2.to_csv('상권_문화_지표.csv')