In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import json

In [2]:
api_keys = json.load(open('api_keys.json', 'r'))

In [71]:
import requests
import json
import time

class NaverMaps():
    headers = {}
    nv_url = "https://naveropenapi.apigw.ntruss.com"
    congress_coor = "126.9178693,37.5281375"
    search_url_format = nv_url+ '/map-place/v1/search?query={query}&coordinate='+congress_coor
    geocode_url_format = nv_url+"/map-geocode/v2/geocode?query={query}&coordinate="+congress_coor
    
    def __init__(self, api_key_id, api_key):
        self.headers = {"X-NCP-APIGW-API-KEY-ID":api_key_id,
                        "X-NCP-APIGW-API-KEY":api_key,
                    }
    def geocode(self, query_string):
        return self.query({'query':query_string}, self.geocode_url_format)
    
    def search(self, query_string):
        return self.query({'query':query_string}, self.search_url_format)
    
    def query(self, query_dict, api_url, retry=5):
        query_dict['query'] = query_dict['query'].replace('#',' ').replace('&',' ')
        r = requests.get(api_url.format(**query_dict), headers=self.headers)
        d = json.loads(r.text)
        if d['status'] != 'OK':
            if retry > 0:
                print("%s retry: %s" % (query_dict, retry))
                time.sleep(3)
                return self.query(query_dict, api_url, retry=retry-1)
            else:
                raise BaseException(query_string, d)
        return d

In [72]:
m = NaverMaps(api_keys['naver_maps_id'], api_keys['naver_maps_key'])

# 1. 주소 있는 데이터 읽어서 확인 

In [113]:
df = pd.read_csv('./jonmat2017.csv')
df = df[df.columns[:-1]]

In [114]:
df[:3]

Unnamed: 0,총연번,의원번호,의원명,당,당ID,지역명,연월일,내역,지출액,사용처,분류,주소,의원지출액순위
0,17_000018,3,강길부,자유한국당,200,울산 울주군,2017.1.6,상임위원회의,160000,여의도소호정,간담회_식대,,75
1,17_000019,3,강길부,자유한국당,200,울산 울주군,2017.1.7,지역현안회의,84000,두남일회,간담회_식대,,75
2,17_000020,3,강길부,자유한국당,200,울산 울주군,2017.1.8,지역예산회의,36000,경기식당,간담회_식대,,75


In [115]:
df['주소'] = df['주소'].apply(lambda x: x.strip() if type(x)==str else '')
df['지출액'] = df['지출액'].apply(lambda x: int(x.replace(',','')))

# 2. 주소 있는것 지오코딩

In [28]:
uniq = df['주소'].unique()

In [29]:
try:
    addr_dict = json.load(open('geocode_dict.json', 'r'))
except:
    addr_dict = {}

In [38]:
for addr in uniq:
    if addr == '':
        
    if addr in addr_dict:
        continue
    addr_dict[addr] = m.geocode(addr)

In [40]:
json.dump(addr_dict, open('geocode_dict.json', 'w'))

In [39]:
addr_dict[''] = {'lat':0, 'lng':0}

In [41]:
for enum in enumerate(addr_dict):
    if enum[0] > 2:
        break
    print(enum)

(0, '서울특별시 영등포구 여의대방로68길 17')
(1, '서울특별시 영등포구 여의도동 45-20 동북빌딩 203-204')
(2, '서울특별시 강남구 개포로 623')


In [42]:
df_latlng = df['주소'].apply(lambda x: pd.Series({f:addr_dict[x][f] for f in ['lat','lng']}))
df_latlng[:3]

Unnamed: 0,lat,lng
0,0.0,0.0
1,0.0,0.0
2,0.0,0.0


In [116]:
df = pd.concat([df, df_latlng], axis=1)

In [117]:
df.query("주소!=''")[:3].T

Unnamed: 0,345,346,347
총연번,17_001545,17_001554,17_001556
의원번호,5,5,5
의원명,강석호,강석호,강석호
당,자유한국당,자유한국당,자유한국당
당ID,200,200,200
지역명,경북 영양군영덕군봉화군울진군,경북 영양군영덕군봉화군울진군,경북 영양군영덕군봉화군울진군
연월일,2017.1.11,2017.1.17,2017.1.17
내역,지역현안관련정책개발보좌직원간담회,상임위법안소위관련보좌직원정책개발간담회,정책개발기자간담회
지출액,105000,160000,350000
사용처,또래오래치킨,청도,잔비어스


# 3.0 네이버 API 써보기

In [83]:
import re
def refine_name(name, lv=1):
    name = name.replace('(주)',' ')
    if lv < 2:
        return name
    return re.sub('\(.*[\)]','',name)
refine_name('(주)파리크라상천등산(평택방향)', lv=2)

'파리크라상천등산 평택방향'

In [93]:
m = NaverMaps(api_keys['naver_maps_id'], api_keys['naver_maps_key'])
def search_naver_place(name, lv=0):
    places = m.search(name)['places']
    if len(places) > 0:
        return places[0]
    elif lv > 1:
        return {'name':name}
    else:
        lv+=1
        name = refine_name(name, lv=lv)
        return search_naver_place(name, lv=lv)

def refine_naver_dict(naver_place_dict):
    if 'x' not in naver_place_dict:
        naver_place_dict['jibun_address']='검색불가'
        naver_place_dict['x'] = '-2'
        naver_place_dict['y'] = '-2'
        return naver_place_dict
    
    return {x:naver_place_dict[x] for x in ['name','jibun_address','x','y']}
    
d = search_naver_place('(주)파리크라상천등산(평택방향)')
print(d)
print(refine_naver_dict(d))


{'name': '파리크라상 천등산(제천방향)주유소', 'road_address': '충청북도 충주시 산척면 평택제천고속도로 106', 'jibun_address': '충청북도 충주시 산척면 영덕리 산182-46', 'phone_number': '043-844-2988', 'x': '127.9411031', 'y': '37.0616363', 'distance': 104252.8448329165, 'sessionId': '1LUN1moBe9kwkY1_4cKP'}
{'name': '파리크라상 천등산(제천방향)주유소', 'jibun_address': '충청북도 충주시 산척면 영덕리 산182-46', 'x': '127.9411031', 'y': '37.0616363'}


In [95]:
venues = df['사용처'].unique()
venue_dict = {}

In [96]:
for v in venues:
    if v not in venue_dict or venue_dict[v]['jibun_address']=='검색불가':
        venue_dict[v] = refine_naver_dict(search_naver_place(v))

In [97]:
json.dump(venue_dict, open('venue_dict.json','w'))

In [106]:
len(venue_dict)

6762

In [118]:
df_naver_place = df['사용처'].apply(lambda x: pd.Series({f:venue_dict[x][f] for f in ['name','jibun_address','x','y']}))
df_naver_place[:3]

Unnamed: 0,name,jibun_address,x,y
0,소호정여의도점,서울특별시 영등포구 여의도동 13,126.921312,37.530978
1,두남일회,울산광역시 남구 옥동 591-4,129.2886255,35.5358108
2,경기식당,경상남도 양산시 하북면 순지리 537,129.0814101,35.4934277


In [119]:
df = pd.concat([df, df_naver_place], axis=1)

In [120]:
df[:3]

Unnamed: 0,총연번,의원번호,의원명,당,당ID,지역명,연월일,내역,지출액,사용처,분류,주소,의원지출액순위,lat,lng,name,jibun_address,x,y
0,17_000018,3,강길부,자유한국당,200,울산 울주군,2017.1.6,상임위원회의,160000,여의도소호정,간담회_식대,,75,0.0,0.0,소호정여의도점,서울특별시 영등포구 여의도동 13,126.921312,37.530978
1,17_000019,3,강길부,자유한국당,200,울산 울주군,2017.1.7,지역현안회의,84000,두남일회,간담회_식대,,75,0.0,0.0,두남일회,울산광역시 남구 옥동 591-4,129.2886255,35.5358108
2,17_000020,3,강길부,자유한국당,200,울산 울주군,2017.1.8,지역예산회의,36000,경기식당,간담회_식대,,75,0.0,0.0,경기식당,경상남도 양산시 하북면 순지리 537,129.0814101,35.4934277


In [125]:
df.groupby(df['jibun_address']=='검색불가').count()['x']

jibun_address
False    17477
True      3810
Name: x, dtype: int64

In [126]:
df_google_xy = df.query('lat > 0')

In [145]:
df_google_xy['dist'] = df_google_xy.apply(lambda x: abs(x['lat']-float(x['y'])) + abs(x['lng']-float(x['x'])), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [154]:
df_google_xy[df_google_xy['dist'] < 0.02].sort_values(by='dist', ascending=False)

Unnamed: 0,총연번,의원번호,의원명,당,당ID,지역명,연월일,내역,지출액,사용처,분류,주소,의원지출액순위,lat,lng,name,jibun_address,x,y,dist
2840,17_018337,356,김성수,더불어민주당,100,비례대표,2017.7.20,의원간티타임,18000,폴라리스,간담회_다과,서울특별시 영등포구 여의도동 43,6,37.521462,126.930659,해성폴라리스오피스텔,서울특별시 영등포구 영등포동1가 27-1,126.9150525,37.5181123,0.018957
2890,17_018431,356,김성수,더불어민주당,100,비례대표,2017.9.25,언론공정성실현을위한정책간담회및만찬,181000,엉터리생고기,간담회_식대,서울특별시 영등포구 국회대로 72길 17,6,37.529520,126.921248,엉터리생고기 동여의도점,서울특별시 영등포구 여의도동 45-5 대우메종오피스텔 지하1층,126.9287230,37.5192760,0.017720
345,17_001545,5,강석호,자유한국당,200,경북 영양군영덕군봉화군울진군,2017.1.11,지역현안관련정책개발보좌직원간담회,105000,또래오래치킨,사무실_식대비,서울특별시 영등포구 여의대방로68길 17,4,37.517729,126.932872,또래오래 신길2호점,서울특별시 영등포구 신길동 89-102,126.9209375,37.5122500,0.017413
11620,17_077439,185,유승민,바른정당,2020,대구 동구을,2017.1.8,정책간담회,861000,함지산업,간담회_식대,서울특별시 서초구 방배동 80 7-1,2,37.478131,126.989814,함지산업,서울특별시 서초구 방배동 807-1,126.9919728,37.4932528,0.017280
2797,17_018260,356,김성수,더불어민주당,100,비례대표,2017.5.19,조찬간담회,14000,용문식당,간담회_식대,서울특별시 용산구 효창공원로 110,6,37.551676,126.963796,용문해장국,서울특별시 용산구 용문동 8-95,126.9608625,37.5379583,0.016652
2715,17_018129,356,김성수,더불어민주당,100,비례대표,2017.2.4,조찬간담회,35000,용문식당,간담회_식대,서울특별시 용산구 효창공원로 110,6,37.551676,126.963796,용문해장국,서울특별시 용산구 용문동 8-95,126.9608625,37.5379583,0.016652
5665,17_036735,86,노웅래,더불어민주당,100,서울 마포구갑,2017.5.9,정책현안간담회다과비등,19600,매스커피,간담회_다과,서울특별시 영등포구 국회대로74길 20,3,37.529587,126.921526,매스커피 여의도점,서울특별시 영등포구 여의도동 44-21,126.9290482,37.5207816,0.016328
2761,17_018202,356,김성수,더불어민주당,100,비례대표,2017.4.3,기자단오찬간담회,44000,진진,언론_기자식대등,서울특별시 영등포구 국회대로 72길 11 프린스텔,6,37.529936,126.920813,진진만두국,서울특별시 영등포구 여의도동 36-4 오륜빌딩 3층,126.9272337,37.5201994,0.016157
5842,17_037017,86,노웅래,더불어민주당,100,서울 마포구갑,2017.12.21,4차산업혁명관련조찬간담회보좌진등8인,160650,뚜레주르,간담회_식대,서울특별시 영등포구 의사당대로 1,3,37.519749,126.929719,뚜레쥬르 여의서로점,서울특별시 영등포구 여의도동 11-11 한서리버파크 201호,126.9247150,37.5308270,0.016082
5757,17_036880,86,노웅래,더불어민주당,100,서울 마포구갑,2017.8.16,정책현안간담회비보좌진등6인,107000,창고,간담회_식대,서울특별시 영등포구 국회대로70길 15-1,3,37.529623,126.920383,창고43 본점1관점,서울특별시 영등포구 여의도동 36-2,126.9269444,37.5207631,0.015421


# 저장

In [156]:
df.to_csv('df_naver.csv', index=False)