In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import sys

In [2]:
# 역과 구 매칭
stn_gu = pd.read_csv('data/subway.csv', encoding='utf-8')

# 역별 승하차객 in/out
inout_pop = pd.read_csv('data/seoul_boarding_pop_by_subway.csv', encoding='utf-8')
inout_pop = inout_pop.drop('datetime', axis=1)

# 승차와 하차 비중
inout_rate = 0.5

# 구별 유동인구
floating_pop = pd.read_csv('data/skt_floating_pop.csv', encoding='utf-8')
floating_pop_mean = floating_pop.groupby(['hour','age','sex','si','gu'], as_index=False).mean()
floating_pop_mean = floating_pop_mean.drop(['datetime', 'si'], axis=1)

In [3]:
for stn_id in tqdm(range(len(inout_pop))):
    for time in ['morning', 'day', 'evening', 'night']:
        
        stn_name = inout_pop.iloc[stn_id, 1]
        gu_match = (stn_gu['station'] == stn_name)
        gu = stn_gu[gu_match].gu.iloc[0] # 동일역이 여러개 나올 수 있어 맨 윗값 취함
        
        if time == 'morning': hours = range(4, 10)
        elif time == 'day': hours = range(10, 16)
        elif time == 'evening': hours = range(16, 20)
        else: hours = range(20, 28) # night
        
        ad_effect = np.zeros((2,5)) # 광고효과지수
        
        for sex in range(2):
            for age in range(5):
                
                for h in hours:
                    
                    # 해당 역의 h시-(h+1)시 승하차인원 계산
                    inout_pop_h = (inout_pop.iloc[stn_id, 2*(h-3)+0]
                                    + inout_rate * inout_pop.iloc[stn_id, 2*(h-3)+1])
                    
                    
                    # 해당 구 h시-(h+1)시 유동인구에서 해당 연령대 및 성별 비율 계산
                    cond_gu = (floating_pop_mean['gu'] == gu)
                    
                    if sex == 0:
                        cond_sex = (floating_pop_mean['sex'] == '남성')
                    else:
                        cond_sex = (floating_pop_mean['sex'] == '여성')
                    
                    if age != 4:
                        cond_age = (floating_pop_mean['age'] == 10*(age+2))
                    else:
                        cond_age = ((floating_pop_mean['age'] == 60) | (floating_pop_mean['age'] == 70))
                    
                    cond_h = (floating_pop_mean['hour'] == h % 24)
                    
                    numer = floating_pop_mean[cond_gu & cond_sex & cond_age & cond_h]['floating_pop'].sum()
                    # 분자는 해당 구 h시-(h+1)시 유동인구에서 해당 연령대 및 성별 인원
                    
                    denom = floating_pop_mean[cond_gu & cond_h]['floating_pop'].sum()
                    # 분모는 해당 구 h시-(h+1)시 전체 유동인구
                    
                    floating_pop_rate_h = numer / denom
                    
                    
                    # 광고효과지수는 승하차인원*유동인구 비율을 시간에 따라 더함
                    ad_effect[sex][age] += inout_pop_h * floating_pop_rate_h
                
                
                
        globals()['stn%03d_%s' % (stn_id, time)] = ad_effect # 변수명 ex. stn164_evening

100%|████████████████████████████████████████████████████████████████████████████████| 376/376 [06:51<00:00,  1.09s/it]


In [4]:
# 클러스터링 위해 하나의 배열로 모으기
# row_number = 4*stn_id + time
# column_number = 5*sex + age
# 20남/30남/40남/50남/6-70남/20여/30여/40여/50여/6-70여

objects = np.zeros((len(inout_pop)*4, 2*5))

mod = sys.modules[__name__]


for stn_id in range(len(inout_pop)):
    for time in ['morning', 'day', 'evening', 'night']:
        
        stn_id_time = getattr(mod, 'stn%03d_%s' % (stn_id, time))
        
        if time == 'morning': t = 0
        elif time == 'day': t = 1
        elif time == 'evening': t = 2
        else: t = 3 # night
            
        for sex in range(2):
            for age in range(5):
                
                objects[4*stn_id + t, 5*sex + age] = stn_id_time[sex, age]