In [140]:
import csv, os, sys, json
import urllib.parse
import urllib.request

import pandas as pd
import skmob
from skmob.preprocessing import clustering


In [141]:
# Preprocess the reverse-geocoding address information.
def process_addr(addr_info):
    """Build a place-search keyword from a reverse-geocoding response.

    Parameters
    ----------
    addr_info : dict
        Decoded JSON response. Must contain a ``'results'`` list whose items
        carry a ``'name'`` of ``'roadaddr'``, ``'addr'`` or ``'admcode'`` plus
        the ``'land'`` / ``'region'`` sub-dicts read below.

    Returns
    -------
    str
        The building name when a ``'roadaddr'`` result provides one;
        otherwise the administrative area, lot address and road address
        joined by single spaces. Empty components are skipped, so the
        keyword never contains doubled or trailing spaces and is ``''``
        when nothing usable was returned.

    Raises
    ------
    KeyError
        If the response does not have the expected structure.
    """
    def _join(parts):
        # Skip empty components (e.g. an empty area4 or number2) so that the
        # keyword is separated by exactly one space and has no padding.
        return ' '.join(part.strip() for part in parts if part and part.strip())

    def _region_names(region):
        return [region['area' + str(level)]['name'] for level in (1, 2, 3, 4)]

    name_exist = False
    addr = ''
    adm = ''
    roadaddr = ''
    building_name = ''

    for info in addr_info['results']:
        kind = info['name']

        if kind == 'roadaddr':
            # When a road address is available, prefer its building name.
            land = info['land']
            building_name = land['addition0']['value']
            if len(building_name) == 0:
                roadaddr = _join([land['name'], land['number1'], land['number2']])
            else:
                name_exist = True
                # Keep only the text before the first '.' of the building name.
                building_name = building_name.split('.')[0]

        elif kind == 'addr':
            land = info['land']
            addr = _join(_region_names(info['region'])
                         + [land['number1'], land['number2']])

        elif kind == 'admcode':
            adm = _join(_region_names(info['region']))

    if name_exist:
        return building_name
    return _join([adm, addr, roadaddr])

In [142]:
def coords_to_place(x, y):
    """Reverse-geocode a coordinate pair into a place-search keyword.

    Parameters
    ----------
    x, y : str
        Coordinates already formatted as strings; they are sent to the API as
        ``"x,y"``. The callers in this notebook pass longitude as ``x`` and
        latitude as ``y``.

    Returns
    -------
    str
        The keyword produced by ``process_addr`` from the API response.

    Raises
    ------
    RuntimeError
        If the API credentials are not configured, or the API answers with a
        status other than 200.
    urllib.error.URLError
        If the HTTP request itself fails.
    """
    # Credentials come from the environment so that secrets are not stored
    # in the notebook.
    client_id = os.environ.get("NCP_APIGW_API_KEY_ID")
    client_secret = os.environ.get("NCP_APIGW_API_KEY")
    if not client_id or not client_secret:
        raise RuntimeError(
            "Set the NCP_APIGW_API_KEY_ID and NCP_APIGW_API_KEY environment "
            "variables before calling coords_to_place()."
        )

    coords = ','.join([x, y])

    # NOTE(review): 'roadaddr' is requested twice and 'addr' is not requested,
    # although process_addr also handles 'addr' results -- confirm whether
    # 'addr,admcode,roadaddr' was intended.
    url = ("https://naveropenapi.apigw.ntruss.com/map-reversegeocode/v2/gc"
           "?request=coordsToaddr&coords=" + coords +
           "&sourcecrs=epsg:4326&output=json&orders=roadaddr,admcode,roadaddr")

    request = urllib.request.Request(url)
    request.add_header("X-NCP-APIGW-API-KEY-ID", client_id)
    request.add_header("X-NCP-APIGW-API-KEY", client_secret)

    # The context manager closes the HTTP response even if decoding fails.
    with urllib.request.urlopen(request) as response:
        rescode = response.getcode()
        if rescode != 200:
            # Fail loudly instead of continuing without a parsed response.
            raise RuntimeError(f"Error Code:{rescode}")
        addr_info = json.loads(response.read().decode('utf-8'))

    return process_addr(addr_info)

In [143]:
def place_to_category(place):
    """Look up the category of a place through the Naver local-search API.

    Parameters
    ----------
    place : str
        Search keyword. An empty keyword or the literal ``"none"`` is not
        sent to the API.

    Returns
    -------
    str
        The ``'category'`` field of the first search hit, or ``"none"`` when
        the keyword is empty/``"none"`` or the search returns no items.

    Raises
    ------
    RuntimeError
        If the API credentials are not configured, or the API answers with a
        status other than 200.
    urllib.error.URLError
        If the HTTP request itself fails.
    """
    if not place or place == "none":
        return "none"

    # Credentials come from the environment so that secrets are not stored
    # in the notebook.
    client_id = os.environ.get("NAVER_CLIENT_ID")
    client_secret = os.environ.get("NAVER_CLIENT_SECRET")
    if not client_id or not client_secret:
        raise RuntimeError(
            "Set the NAVER_CLIENT_ID and NAVER_CLIENT_SECRET environment "
            "variables before calling place_to_category()."
        )

    encText = urllib.parse.quote(place)
    url = "https://openapi.naver.com/v1/search/local?query=" + encText  # JSON result

    request = urllib.request.Request(url)
    request.add_header("X-Naver-Client-Id", client_id)
    request.add_header("X-Naver-Client-Secret", client_secret)

    # The context manager closes the HTTP response even if decoding fails.
    with urllib.request.urlopen(request) as response:
        rescode = response.getcode()
        if rescode != 200:
            # Fail loudly instead of continuing without a parsed response.
            raise RuntimeError(f"Error Code:{rescode}")
        category_info = json.loads(response.read().decode('utf-8'))

    items = category_info['items']
    if len(items) == 0:
        return "none"

    # Only the first (top-ranked) hit is used.
    return items[0]['category']

In [144]:
# Device IDs are the all-digit sub-directory names of ./staypoint,
# sorted as strings (lexicographic order).
device_id_list = sorted(
    entry.name
    for entry in os.scandir('./staypoint')
    if entry.is_dir() and entry.name.isdigit()
)

# For every device, reverse-geocode each row of its cluster_*.csv files and
# write the result to address_<n>.csv next to the input file.
fields = ['deviceid', 'latitude', 'longitude', 'cluster', 'address']

for device_id in device_id_list:
    path_dir = os.path.join('./cluster', device_id)

    for cluster in os.listdir(path_dir):
        if not cluster.startswith("cluster_"):
            continue

        df = pd.read_csv(os.path.join(path_dir, cluster))

        # One API call per row; coords_to_place takes longitude first,
        # each formatted with 7 decimals.
        addresses = [
            coords_to_place("{:.7f}".format(longitude), "{:.7f}".format(latitude))
            for latitude, longitude in zip(df['latitude'], df['longitude'])
        ]

        # Build the output frame once instead of concatenating row by row.
        result = df[['deviceid', 'latitude', 'longitude', 'cluster']].copy()
        result['address'] = addresses
        result = result[fields]

        # "cluster_<n>.csv" -> "<n>"
        c = cluster.split('_')[1].split('.')[0]
        result.to_csv(os.path.join(path_dir, 'address_' + c + '.csv'), index=False)

KeyboardInterrupt: 

In [150]:
# For every device, pick one representative row per cluster file (the row
# closest to the cluster's mean coordinate), look up its place category and
# write one line per cluster to <device>/category.csv.
cluster0 = 0   # uncategorised representatives whose 'cluster' value is 0
totalCnt = 0   # all uncategorised representatives

fields = ['deviceid', 'latitude', 'longitude',
          'datetime', 'leaving_datetime', 'cluster', 'category']

for device_id in device_id_list:
    path_dir = os.path.join('./cluster', device_id)
    # List the directory before category.csv is (re)created.
    file_list = os.listdir(path_dir)

    # 'with' closes the output file even if an API call raises; UTF-8 keeps
    # the Korean category text readable by pd.read_csv (which defaults to UTF-8).
    with open(os.path.join(path_dir, 'category.csv'), 'w',
              newline='', encoding='utf-8') as f:
        write = csv.writer(f)
        write.writerow(fields)

        for cluster in file_list:
            if not cluster.startswith("cluster_"):
                continue

            df = pd.read_csv(os.path.join(path_dir, cluster))
            if df.empty:
                # Nothing to represent this cluster with.
                continue
            df['datetime'] = pd.to_datetime(df['datetime'])

            # Manhattan distance of every row to the mean coordinate;
            # idxmin returns the first row with the smallest distance.
            distance = ((df['latitude'] - df['latitude'].mean()).abs()
                        + (df['longitude'] - df['longitude'].mean()).abs())
            idx = distance.idxmin()
            representative = df.loc[idx]

            lat_text = "{:.7f}".format(representative['latitude'])
            lon_text = "{:.7f}".format(representative['longitude'])
            # coords_to_place takes longitude first.
            place = coords_to_place(lon_text, lat_text)
            category = place_to_category(place)

            if category == 'none':
                totalCnt += 1
                if representative['cluster'] == 0:
                    cluster0 += 1
                print(f"devide ID: {representative['deviceid']} cluster: {representative['cluster']} keyword: {place}")

            # NOTE(review): the row is written in the input file's own column
            # order plus 'category'; it lines up with the header only if the
            # cluster files' columns are exactly the first six names in
            # `fields` -- confirm.
            df['category'] = category
            write.writerow(df.loc[idx])
    print()

# Share of uncategorised representatives that belong to cluster 0.
if totalCnt:
    print(f"cluster 0: {(cluster0 / totalCnt) * 100:.2f}")
else:
    print("cluster 0: n/a (every cluster was categorised)")

devide ID: 100 cluster: 0 keyword: 서울특별시 성동구 마장동  청계천로12가길 60

devide ID: 103 cluster: 12 keyword: 가로판매대-15
devide ID: 103 cluster: 11 keyword: 영화관씨네일레븐

devide ID: 104 cluster: 3 keyword: 서울특별시 종로구 종로1.2.3.4가동  수표로 86 1

devide ID: 107 cluster: 0 keyword: 서울특별시 동대문구 제기동  약령시로9길 45

devide ID: 108 cluster: 0 keyword: 서울특별시 성동구 사근동  사근동11길 11 1

devide ID: 110 cluster: 0 keyword: 서울특별시 성동구 왕십리2동  왕십리로31나길 27 1

devide ID: 112 cluster: 5 keyword: 서울특별시 성동구 왕십리2동  왕십리로 339

devide ID: 113 cluster: 0 keyword: 서울특별시 성동구 마장동  무학로14가길 3 11


devide ID: 116 cluster: 3 keyword: 인천광역시 부평구 십정2동  백범로 466
devide ID: 116 cluster: 0 keyword: 왕십리문화공원 공중화장실

devide ID: 118 cluster: 9 keyword: 서울특별시 동대문구 용신동  고산자로34길 25


devide ID: 125 cluster: 2 keyword: 서울특별시 성동구 금호2.3가동  독서당로 303 2
devide ID: 125 cluster: 0 keyword: 서울특별시 성동구 금호2.3가동  금호산9길 59 24

devide ID: 128 cluster: 11 keyword: 서울특별시 동대문구 답십리1동  천호대로 281
devide ID: 128 cluster: 6 keyword: 가로판매대-15




devide ID: 139 cluster: 4 keyword: 서울특별시 동대

In [None]:
# Rewrite each device's category.csv with its rows ordered by cluster.
for device_id in device_id_list:
    category_path = './cluster/' + device_id + '/category.csv'
    sorted_categories = pd.read_csv(category_path).sort_values(by=['cluster'])
    sorted_categories.to_csv(category_path, index=False)

In [None]:
# df = pd.read_csv('./cluster/51/address_0.csv')
# for i in range(len(df)):
#     x = str(df.loc[i]['latitude'])
#     y = str(df.loc[i]['longitude'])
#     place = coords_to_place(y, x)
#     category = place_to_category(place)
    # print(place)
    # print(category)


In [None]:
# x = str(37.5646166)
# y = str(127.0356865)
# x = str(37.5687204)
# y = str(127.0261365)
# place = coords_to_place(y,x)
# category = place_to_category(place)
# print(place)
# print(category)

In [None]:
# Device IDs are the all-digit sub-directory names of ./staypoint,
# sorted as strings (lexicographic order).
device_id_list = sorted(
    entry.name
    for entry in os.scandir('./staypoint')
    if entry.is_dir() and entry.name.isdigit()
)

# device_id_list = ['51']

# Compare, per device, the share of clusters categorised as 'none' in an
# earlier run ("before") with the share in the current ./cluster output
# ("after").
# The baseline directory can be overridden through BASELINE_CLUSTER_DIR; the
# default is the location the notebook was originally run against.
baseline_cluster_dir = os.environ.get(
    'BASELINE_CLUSTER_DIR',
    '/Users/lordly/Downloads/graduation-master/cluster',
)

for device_id in device_id_list:
    df1 = pd.read_csv(os.path.join(baseline_cluster_dir, device_id, 'category.csv'))
    df2 = pd.read_csv('./cluster/'+device_id+'/category.csv')

    # Each rate is relative to its own file's row count, so the "after" value
    # stays correct even if the two files differ in length; an empty file
    # yields NaN instead of raising ZeroDivisionError.
    before = (df1['category'] == 'none').mean() * 100
    after = (df2['category'] == 'none').mean() * 100
    print(f"deviceID: {device_id} before : {before:.2f}%, after: {after:.2f}%")

deviceID: 100 before : 37.50%, after: 12.50%
deviceID: 103 before : 40.00%, after: 13.33%
deviceID: 104 before : 70.00%, after: 10.00%
deviceID: 107 before : 66.67%, after: 11.11%
deviceID: 108 before : 100.00%, after: 50.00%
deviceID: 110 before : 100.00%, after: 100.00%
deviceID: 112 before : 75.00%, after: 12.50%
deviceID: 113 before : 75.00%, after: 25.00%
deviceID: 115 before : 40.00%, after: 0.00%
deviceID: 116 before : 50.00%, after: 50.00%
deviceID: 118 before : 72.73%, after: 9.09%
deviceID: 123 before : 20.00%, after: 0.00%
deviceID: 125 before : 100.00%, after: 66.67%
deviceID: 128 before : 66.67%, after: 16.67%
deviceID: 129 before : 100.00%, after: 0.00%
deviceID: 136 before : 60.00%, after: 0.00%
deviceID: 138 before : 40.00%, after: 0.00%
deviceID: 139 before : 37.50%, after: 12.50%
deviceID: 140 before : 60.00%, after: 0.00%
deviceID: 141 before : 50.00%, after: 16.67%
deviceID: 142 before : 66.67%, after: 19.05%
deviceID: 147 before : 33.33%, after: 0.00%
deviceID: 149

In [None]:
# print(coords_to_place("127.0420499", "37.5665961"))
# Manual spot check of both API helpers with fixed inputs.
sample_place = coords_to_place("126.9711747", "37.5801189")
print(sample_place)
sample_category = place_to_category("서울삼육병원")
print(sample_category)

서울특별시 종로구 청운효자동  
건강,의료>병원,의원
