In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import sys
import csv

In [None]:
# Parse the subway boarding table once and take both columns from the same frame
# (the file was previously read and parsed twice, once per column).
subway_raw = pd.read_csv('./seoul_boarding_pop_by_subway.csv')
stations = subway_raw.station
lines = subway_raw.line

In [None]:
# Load the clustering parameters; the file is EUC-KR encoded.
objects = pd.read_csv('./clustering_parameters.csv', encoding='EUC-KR')

# Display order used at the end of the cell: id/station/time first, then the
# columns exactly as they came from the CSV. It is captured before the helper
# columns are added, so 'line' is not part of it.
col_names = pd.Index(['id', 'station', 'time']).append(objects.columns)

# Every station/line entry is repeated four times in a row and the four time
# labels are cycled, i.e. the CSV is assumed to hold four consecutive rows per
# station in this time order -- TODO confirm against the file.
objects['station'] = stations.repeat(4).tolist()
objects['line'] = lines.repeat(4).tolist()
objects['id'] = objects['line'] + objects['station']
objects['time'] = ['출근', '낮', '퇴근', '밤'] * len(stations)

# Display only: reindex returns a new frame, `objects` itself keeps all columns.
objects.reindex(columns=col_names)


In [None]:
twenties = ['20남', '30남', '20여', '30여']
sixties = ['6-70남', '6-70여']

# Weight every row's age-group columns by that row's '총인원' value and add
# everything up (presumably the age columns are shares of the head count --
# TODO confirm). This is computed with the same vectorized expression that the
# per-station aggregation uses, so the reference slope and the per-station
# points treat missing values identically (pandas sums skip NaN, whereas a
# manual accumulation loop would turn the whole slope into NaN).
headcount = objects['총인원']
t = (objects[twenties].sum(axis=1) * headcount).sum()
s = (objects[sixties].sum(axis=1) * headcount).sum()

# Overall ratio of the 60-70s total to the 20-30s total; used as the slope of
# the reference line y = slope * x.
slope = s/t
slope

In [None]:
# Riders in their 20s-30s vs riders in their 60s-70s,
# with the time bands merged per station.

twenties = ['20남', '30남', '20여', '30여']
sixties = ['6-70남', '6-70여']
def func(x):
    """Collapse a group of rows into weighted totals for the two age brackets.

    For each bracket the listed columns are summed per row, multiplied by the
    row's '총인원' value and then summed over all rows of ``x``.

    Returns a Series with the entries 'twenties' and 'sixties'.
    """
    headcount = x['총인원']
    weighted = {
        'twenties': x[twenties].sum(axis=1).mul(headcount).sum(),
        'sixties': x[sixties].sum(axis=1).mul(headcount).sum(),
    }
    return pd.Series(weighted, index=['twenties', 'sixties'])

# One row per station id with the weighted 20-30s and 60-70s totals.
df = objects.groupby(['id']).apply(func)
x = df['twenties']
y = df['sixties']
df = df.reset_index()

# NOTE(review): 'AppleGothic' is presumably chosen so the Korean labels render;
# the font has to be installed on the machine running the notebook.
plt.rc('font', family='AppleGothic')
plt.scatter(x, y)
# Reference line through the origin with the overall 60-70s / 20-30s ratio.
plt.plot(x, x*slope)
# x and y are indexed by station id (strings), so they are walked in order
# rather than looked up with integer keys; integer keys on a non-integer index
# rely on a positional fallback that pandas has deprecated.
for txt, x_pos, y_pos in zip(df.id, x, y):
    plt.annotate(txt, (x_pos, y_pos))
plt.show()

In [None]:
# Split the stations by the side of the reference line y = slope * x they fall
# on, and record the vertical distance to the line.
below = {}  # under the line: fewer 60-70s riders than the overall ratio predicts
above = {}  # on or over the line: at least as many 60-70s riders as predicted
# Dedicated loop names keep the x / y Series of the plotting cell intact.
for n_twenties, n_sixties, station in zip(df['twenties'], df['sixties'], df['id']):
    expected = slope*n_twenties
    if expected > n_sixties:
        below[station] = expected - n_sixties
    else:
        above[station] = n_sixties - expected

# From here on both names hold lists of (station id, distance) pairs, largest
# distance first.
above = sorted(above.items(), key=(lambda item: item[1]), reverse=True)
below = sorted(below.items(), key=(lambda item: item[1]), reverse=True)
print([item[0] for item in above[:10]])
print([item[0] for item in below[:10]])


In [None]:
floating = pd.read_csv('./skt_floating_pop.csv')

# Collapse the individual timestamps: one row per (gu, hour, age, sex).
floating = floating.drop('datetime', axis=1).groupby(['gu', 'hour', 'age', 'sex']).sum().reset_index()
# Totals per (gu, hour). The first positional parameter of GroupBy.sum is
# `numeric_only`, not a column selector, so the intent is spelled out here:
# keep the numeric columns only (this drops the text column 'sex').
pop_by_gu = floating.drop('age', axis=1).groupby(['gu', 'hour']).sum(numeric_only=True).reset_index()

def get_percentage(gu, hour, floating_pop):
    """Return ``floating_pop`` divided by the total for the same gu and hour.

    The total is taken from the first row of the module-level ``pop_by_gu``
    table whose ``gu`` and ``hour`` match; an IndexError is raised when no
    row matches.
    """
    same_slot = (pop_by_gu.gu == gu) & (pop_by_gu.hour == hour)
    total = pop_by_gu.loc[same_slot, 'floating_pop'].values[0]
    return floating_pop / total

# Share of every (gu, hour, age, sex) row within the total of its (gu, hour)
# slot. The group totals are broadcast back to the rows in a single pass
# instead of filtering the totals table once per row.
floating['percentage'] = floating['floating_pop'] / floating.groupby(['gu', 'hour'])['floating_pop'].transform('sum')
floating

In [None]:
import re

# Raw subway boarding table; the hourly columns are merged further down in this cell.
boarding = pd.read_csv(filepath_or_buffer='./seoul_boarding_pop_by_subway.csv')
def repl(name):
    """Turn an hourly column header into '<start hour>_<marker>'.

    The start hour is the text before the first '-', with the '시' suffix
    removed and parsed as an integer (so leading zeros disappear). The marker
    is '승차' when the header contains that word and '하차' otherwise.
    """
    suffix = "_승차" if "승차" in name else "_하차"
    start_hour = int(name.split("-")[0].replace("시", ""))
    return str(start_hour) + suffix
    
# Normalise the hourly headers (everything between the first three and the last
# three columns) to '<hour>_승차' / '<hour>_하차'.
hourly_headers = boarding.columns[3:-3]
boarding = boarding.rename(columns={header: repl(header) for header in hourly_headers})

# For every hour, merge the two directional columns into a single column whose
# label is the integer hour, and accumulate the daily total along the way.
boarding['total'] = 0
for hour in range(24):
    on_col = "{}_승차".format(hour)
    off_col = "{}_하차".format(hour)
    boarding[hour] = boarding[on_col] + boarding[off_col]
    boarding = boarding.drop(columns=[on_col, off_col])
    boarding['total'] += boarding[hour]
boarding

In [None]:
# Inspect the three stations furthest above and the three furthest below the
# reference line.
_above = [item[0] for item in above[:3]]
_below = [item[0] for item in below[:3]]
age = floating.age.unique()
sex = floating.sex.unique()
hours = list(range(24))

for _id in _above+_below:
    boarding_pop = boarding[(boarding.ID==_id)]
    gu = boarding_pop.gu.values[0]
    # Hourly counts of this station (first matching row) as a Series keyed by
    # hour, so they can be looked up for all rows at once.
    hourly_boarding = pd.Series(boarding_pop[hours].iloc[0].to_numpy(dtype=float), index=hours)
    floating_pop = floating[floating.gu==gu].reset_index()
    # Estimated riders per row: the station's count for that hour times the
    # row's share of the gu's floating population in that hour.
    floating_pop['estimated'] = floating_pop['hour'].map(hourly_boarding) * floating_pop['percentage']
    floating_pop['id'] = _id
    # numeric_only is the first parameter of GroupBy.sum; it is named here
    # explicitly instead of passing a column name in its place.
    data = floating_pop.groupby(['age', 'hour']).sum(numeric_only=True)

    # Share of boardings/alightings by age group and hour
    x = np.arange(0, 24)
    # Hours without any riders are set to 0.1 to avoid dividing by zero.
    boarding_total_by_hour = hourly_boarding.replace(0, 0.1)
    for a in age:
        # reindex keeps one value per hour even if an hour is missing for this
        # age group (the gap is plotted as NaN).
        y = data.loc[a, 'estimated'].reindex(hours) / boarding_total_by_hour
        plt.plot(x, y, label=a)
    plt.legend()
    plt.title(_id+" 승하차 비율")
    plt.show()

    for a in age:
        y = data.loc[a, 'estimated'].reindex(hours)
        plt.plot(x, y, label=a)
    plt.legend()
    plt.title(_id+" 추정 승하차 고객수")
    plt.show()



In [None]:
# Pie charts: the plain floating-population age shares of the station's gu
# next to the estimated age shares of the station's own riders.
_above = [item[0] for item in above[:3]]
_below = [item[0] for item in below[:3]]
age = floating.age.unique()
sex = floating.sex.unique()
hours = list(range(24))

for _id in _above+_below:
    boarding_pop = boarding[boarding.ID==_id]
    gu = boarding_pop.gu.iloc[0]
    floating_pop = floating[floating.gu==gu]
    # numeric_only is the first parameter of GroupBy.sum; name it explicitly.
    pie_chart = floating_pop.groupby(['age']).sum(numeric_only=True)
    plt.pie(pie_chart.floating_pop.values,
            labels=pie_chart.index.values,
            autopct='%1.2f%%',
            )
    plt.title(_id+" 유동인구 비율 통계")
    plt.show()

    # The estimate is recomputed for the station at hand. Reading a leftover
    # `data` frame from an earlier cell would show the same (last processed)
    # station in every pie.
    hourly_boarding = pd.Series(boarding_pop[hours].iloc[0].to_numpy(dtype=float), index=hours)
    estimated = floating_pop['hour'].map(hourly_boarding) * floating_pop['percentage']
    pie_chart = estimated.groupby(floating_pop['age']).sum()
    plt.pie(pie_chart.values,
            labels=pie_chart.index.values,
            autopct='%1.2f%%',
            )
    plt.title(_id+" 추정 승하차객 비율 통계")
    plt.show()

In [None]:
# Plots per time band: morning commute, daytime, evening commute, night.
_above = [item[0] for item in above[:3]]
_below = [item[0] for item in below[:3]]
age = floating.age.unique()
sex = floating.sex.unique()
# Hours belonging to each band, in the same order as the labels below.
times = [range(4, 10), range(10, 16), range(16, 20), list(range(20, 24))+list(range(0, 4))]
time_texts = ["출근", "낮", "퇴근", "밤"]
hours = list(range(24))

for _id in _above+_below:
    boarding_pop = boarding[(boarding.ID==_id)]
    gu = boarding_pop.gu.values[0]
    # Hourly counts of this station (first matching row), keyed by hour.
    hourly_boarding = pd.Series(boarding_pop[hours].iloc[0].to_numpy(dtype=float), index=hours)
    floating_pop = floating[floating.gu==gu].reset_index()
    # Estimated riders per row: station count for the hour times the row's
    # share of the gu's floating population in that hour.
    floating_pop['estimated'] = floating_pop['hour'].map(hourly_boarding) * floating_pop['percentage']

    # Estimated riders per age group for every band, computed once and used by
    # both figures.
    by_band = [
        floating_pop[floating_pop.hour.isin(band_hours)].groupby('age')['estimated'].sum()
        for band_hours in times
    ]

    # NOTE(review): the x axis assumes exactly six age groups, placed at
    # 20, 30, ..., 70 -- confirm against the 'age' column.
    x = np.arange(20, 80, 10)
    for label, y in zip(time_texts, by_band):
        plt.plot(x, y, label=label)
    plt.legend()
    plt.title(_id+" 시간대별 승하차 수")
    plt.show()

    # Same curves, each normalised by the band's total.
    for label, y in zip(time_texts, by_band):
        plt.plot(x, y/y.sum(), label=label)
    plt.legend()
    plt.title(_id+" 시간대별 승하차 비율")
    plt.show()

