# 서울시 생활인구
## 서울 생활인구 현황 (2022.09.28. 기준)
### url: https://data.seoul.go.kr/dataVisual/seoul/seoulLivingPopulation.do

1. 집계구 단위 서울 생활인구(내국인)
    - url: https://data.seoul.go.kr/dataList/OA-14979/F/1/datasetView.do
    - 설명: 서울시가 보유한 공공데이터와 통신데이터로 측정한 특정시점에 서울의 특정 지역에 존재하는 인구 중 내국인

<!-- <br> -->


2. 집계구 단위 서울 생활인구(장기체류 외국인)
    - url: https://data.seoul.go.kr/dataList/OA-14978/F/1/datasetView.do
    - 설명:서울시가 보유한 공공데이터와 통신데이터로 측정한 특정시점에 서울의 특정 지역에 존재하는 인구 중 장기체류 외국인

<!-- <br> -->


3. 집계구 단위 서울 생활인구(단기체류 외국인)
    - url: https://data.seoul.go.kr/dataList/OA-14980/F/1/datasetView.do
    - 설명:서울시가 보유한 공공데이터와 통신데이터로 측정한 특정시점에 서울의 특정 지역에 존재하는 인구 중 단기체류 외국인

<!-- <br> -->



※ 개인정보 비 식별화를 위하여 ‘3명’ 이하인 경우 “ * ” 처리


In [1]:
import os
import glob
import pandas as pd
import numpy as np

In [2]:
from tqdm.auto import tqdm, trange
from time import sleep

## 1. 집계구 단위 서울 생활인구(내국인)
    - a)2022년 01월~08월
    - b)2019년, 2020년, 2021년
    - 2019년 10월 15-27일 데이터 없음
## 2. 집계구 단위 서울 생활인구(장기체류 외국인)
    - a)2022년 01월~08월
    - b)2019년, 2020년, 2021년
## 3. 집계구 단위 서울 생활인구(단기체류 외국인)
    - a)2022년 01월~08월
    - b)2019년, 2020년, 2021년

In [3]:
# 폴더 안 파일명 리스트 가져오기
local_people_2019_list = []
local_people_2020_list = []
local_people_2021_list = []

long_foreigner_list = []
temp_foreinger_list= []
pbar = tqdm(os.listdir(r"./data/집계동 단위 서울 생활인구/local_people/2019"))
for file in pbar:
    if file.endswith(".csv"):
        pbar.set_description(file)
        local_people_2019_list.append(file)
        
pbar = tqdm(os.listdir(r"./data/집계동 단위 서울 생활인구/local_people/2020"))
for file in pbar:
    if file.endswith(".csv"):
        pbar.set_description(file)
        local_people_2020_list.append(file)
        
pbar = tqdm(os.listdir(r"./data/집계동 단위 서울 생활인구/local_people/2021"))
for file in pbar:
    if file.endswith(".csv"):
        pbar.set_description(file)
        local_people_2021_list.append(file)

pbar = tqdm(os.listdir(r"./data/집계동 단위 서울 생활인구/long_foreigner/"))
for file in pbar:
    if file.endswith(".csv"):
        pbar.set_description(file)
        long_foreigner_list.append(file)
 
pbar = tqdm(os.listdir(r"./data/집계동 단위 서울 생활인구/temp_foreigner/"))
for file in pbar:
    if file.endswith(".csv"):
        pbar.set_description(file)
        temp_foreinger_list.append(file)

print("local_people_2019_list >>", len(local_people_2019_list))
print("local_people_2020_list >>", len(local_people_2020_list))
print("local_people_2021_list >>", len(local_people_2021_list))

print("long_foreigner_list >>", long_foreigner_list)
print("temp_foreinger_list >>", temp_foreinger_list)

  0%|          | 0/352 [00:00<?, ?it/s]

  0%|          | 0/366 [00:00<?, ?it/s]

  0%|          | 0/365 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

local_people_2019_list >> 352
local_people_2020_list >> 366
local_people_2021_list >> 365
long_foreigner_list >> ['LONG_FOREIGNER_20220924.csv']
temp_foreinger_list >> ['TEMP_FOREIGNER_20220924.csv']


In [4]:
test = pd.read_csv("./data/집계동 단위 서울 생활인구/local_people/2019/"+local_people_2019_list[0], na_values="*")
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 459634 entries, 0 to 459633
Data columns (total 33 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   기준일ID            459634 non-null  int64  
 1   시간대구분            459634 non-null  int64  
 2   행정동코드            459634 non-null  int64  
 3   집계구코드            459634 non-null  int64  
 4   총생활인구수           459634 non-null  float64
 5   남자0세부터9세생활인구수    365541 non-null  float64
 6   남자10세부터14세생활인구수  292191 non-null  float64
 7   남자15세부터19세생활인구수  342539 non-null  float64
 8   남자20세부터24세생활인구수  357041 non-null  float64
 9   남자25세부터29세생활인구수  376752 non-null  float64
 10  남자30세부터34세생활인구수  376966 non-null  float64
 11  남자35세부터39세생활인구수  396705 non-null  float64
 12  남자40세부터44세생활인구수  393863 non-null  float64
 13  남자45세부터49세생활인구수  400026 non-null  float64
 14  남자50세부터54세생활인구수  384596 non-null  float64
 15  남자55세부터59세생활인구수  384709 non-null  float64
 16  남자60세부터64세생활인구수  368125 non-null  floa

In [5]:
test.fillna(0, inplace=True)

In [6]:
test.head(-50)

Unnamed: 0,기준일ID,시간대구분,행정동코드,집계구코드,총생활인구수,남자0세부터9세생활인구수,남자10세부터14세생활인구수,남자15세부터19세생활인구수,남자20세부터24세생활인구수,남자25세부터29세생활인구수,...,여자25세부터29세생활인구수,여자30세부터34세생활인구수,여자35세부터39세생활인구수,여자40세부터44세생활인구수,여자45세부터49세생활인구수,여자50세부터54세생활인구수,여자55세부터59세생활인구수,여자60세부터64세생활인구수,여자65세부터69세생활인구수,여자70세이상생활인구수
0,20190811,0,11110515,1101072010001,559.0,26.0,17.0,19.0,25.0,17.0,...,17.0,18.0,17.0,27.0,33.0,19.0,28.0,14.0,11.0,46.0
1,20190811,0,11110515,1101072010002,873.0,22.0,14.0,19.0,27.0,29.0,...,42.0,38.0,32.0,35.0,34.0,43.0,29.0,34.0,16.0,89.0
2,20190811,0,11110515,1101072010003,74.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4.0,4.0,0.0,7.0,0.0,0.0,0.0,4.0,9.0
3,20190811,0,11110515,1101072010004,291.0,10.0,6.0,0.0,12.0,16.0,...,11.0,11.0,7.0,8.0,10.0,11.0,8.0,10.0,0.0,19.0
4,20190811,0,11110515,1101072010005,196.0,6.0,4.0,7.0,7.0,6.0,...,4.0,8.0,8.0,7.0,14.0,5.0,10.0,4.0,6.0,19.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
459579,20190811,23,11740700,1125071020001,596.0,19.0,10.0,15.0,13.0,25.0,...,22.0,21.0,31.0,20.0,27.0,25.0,33.0,26.0,25.0,32.0
459580,20190811,23,11740700,1125071020002,376.0,17.0,8.0,9.0,8.0,9.0,...,10.0,11.0,18.0,16.0,14.0,11.0,19.0,14.0,17.0,27.0
459581,20190811,23,11740700,1125071020003,753.0,34.0,17.0,19.0,18.0,20.0,...,22.0,25.0,28.0,28.0,30.0,24.0,36.0,28.0,28.0,54.0
459582,20190811,23,11740700,1125071020004,1185.0,23.0,11.0,17.0,15.0,17.0,...,47.0,31.0,26.0,39.0,35.0,34.0,50.0,62.0,67.0,171.0


In [8]:
# 기준일ID 컬럼을 date형식으로 변환
# 년, 월 컬럼 생성
test.insert(0, '날짜', pd.to_datetime(test['기준일ID'], format='%Y%m%d'))
test.insert(1, '년', test['날짜'].dt.year)
test.insert(2, '월', test['날짜'].dt.month)

# 주말 여부 컬럼 생성
test.insert(3, "주말", test['날짜'].dt.dayofweek > 4)
test

Unnamed: 0,날짜,년,월,주말,기준일ID,시간대구분,행정동코드,집계구코드,총생활인구수,남자0세부터9세생활인구수,...,여자25세부터29세생활인구수,여자30세부터34세생활인구수,여자35세부터39세생활인구수,여자40세부터44세생활인구수,여자45세부터49세생활인구수,여자50세부터54세생활인구수,여자55세부터59세생활인구수,여자60세부터64세생활인구수,여자65세부터69세생활인구수,여자70세이상생활인구수
0,2019-08-11,2019,8,True,20190811,0,11110515,1101072010001,559.0,26.0,...,17.0,18.0,17.0,27.0,33.0,19.0,28.0,14.0,11.0,46.0
1,2019-08-11,2019,8,True,20190811,0,11110515,1101072010002,873.0,22.0,...,42.0,38.0,32.0,35.0,34.0,43.0,29.0,34.0,16.0,89.0
2,2019-08-11,2019,8,True,20190811,0,11110515,1101072010003,74.0,0.0,...,0.0,4.0,4.0,0.0,7.0,0.0,0.0,0.0,4.0,9.0
3,2019-08-11,2019,8,True,20190811,0,11110515,1101072010004,291.0,10.0,...,11.0,11.0,7.0,8.0,10.0,11.0,8.0,10.0,0.0,19.0
4,2019-08-11,2019,8,True,20190811,0,11110515,1101072010005,196.0,6.0,...,4.0,8.0,8.0,7.0,14.0,5.0,10.0,4.0,6.0,19.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
459629,2019-08-11,2019,8,True,20190811,23,11740700,1125071022201,905.0,37.0,...,26.0,30.0,28.0,31.0,40.0,33.0,46.0,31.0,33.0,77.0
459630,2019-08-11,2019,8,True,20190811,23,11740700,1125071022202,231.0,9.0,...,6.0,7.0,7.0,8.0,10.0,8.0,12.0,8.0,8.0,19.0
459631,2019-08-11,2019,8,True,20190811,23,11740700,1125071022501,715.0,29.0,...,20.0,24.0,22.0,25.0,31.0,26.0,37.0,24.0,26.0,61.0
459632,2019-08-11,2019,8,True,20190811,23,11740700,1125071022701,203.0,15.0,...,4.0,0.0,9.0,4.0,8.0,11.0,10.0,6.0,7.0,11.0


In [12]:
test.groupby(by=['년', '월', '주말', '시간대구분'])

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x293ec4e20>

In [13]:
test

Unnamed: 0,날짜,년,월,주말,기준일ID,시간대구분,행정동코드,집계구코드,총생활인구수,남자0세부터9세생활인구수,...,여자25세부터29세생활인구수,여자30세부터34세생활인구수,여자35세부터39세생활인구수,여자40세부터44세생활인구수,여자45세부터49세생활인구수,여자50세부터54세생활인구수,여자55세부터59세생활인구수,여자60세부터64세생활인구수,여자65세부터69세생활인구수,여자70세이상생활인구수
0,2019-08-11,2019,8,True,20190811,0,11110515,1101072010001,559.0,26.0,...,17.0,18.0,17.0,27.0,33.0,19.0,28.0,14.0,11.0,46.0
1,2019-08-11,2019,8,True,20190811,0,11110515,1101072010002,873.0,22.0,...,42.0,38.0,32.0,35.0,34.0,43.0,29.0,34.0,16.0,89.0
2,2019-08-11,2019,8,True,20190811,0,11110515,1101072010003,74.0,0.0,...,0.0,4.0,4.0,0.0,7.0,0.0,0.0,0.0,4.0,9.0
3,2019-08-11,2019,8,True,20190811,0,11110515,1101072010004,291.0,10.0,...,11.0,11.0,7.0,8.0,10.0,11.0,8.0,10.0,0.0,19.0
4,2019-08-11,2019,8,True,20190811,0,11110515,1101072010005,196.0,6.0,...,4.0,8.0,8.0,7.0,14.0,5.0,10.0,4.0,6.0,19.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
459629,2019-08-11,2019,8,True,20190811,23,11740700,1125071022201,905.0,37.0,...,26.0,30.0,28.0,31.0,40.0,33.0,46.0,31.0,33.0,77.0
459630,2019-08-11,2019,8,True,20190811,23,11740700,1125071022202,231.0,9.0,...,6.0,7.0,7.0,8.0,10.0,8.0,12.0,8.0,8.0,19.0
459631,2019-08-11,2019,8,True,20190811,23,11740700,1125071022501,715.0,29.0,...,20.0,24.0,22.0,25.0,31.0,26.0,37.0,24.0,26.0,61.0
459632,2019-08-11,2019,8,True,20190811,23,11740700,1125071022701,203.0,15.0,...,4.0,0.0,9.0,4.0,8.0,11.0,10.0,6.0,7.0,11.0


In [14]:
cols = {
    '10세 이하': ['남자0세부터9세생활인구수 ','여자0세부터9세생활인구수'],
    '10대': ['남자10세부터14세생활인구수', '남자15세부터19세생활인구수', '여자10세부터14세생활인구수', '여자15세부터19세생활인구수'], 
    '20대': ['남자20세부터24세생활인구수', '남자25세부터29세생활인구수', '여자20세부터24세생활인구수', '여자25세부터29세생활인구수'], 
    '30대': ['남자30세부터34세생활인구수', '남자35세부터39세생활인구수', '여자30세부터34세생활인구수', '여자35세부터39세생활인구수'],
    '40대': ['남자40세부터44세생활인구수', '남자45세부터49세생활인구수', '여자40세부터44세생활인구수', '여자45세부터49세생활인구수'],
    '50대': ['남자50세부터54세생활인구수', '남자55세부터59세생활인구수', '여자50세부터54세생활인구수', '여자55세부터59세생활인구수'],
    '60대': ['남자60세부터64세생활인구수', '남자65세부터69세생활인구수', '여자60세부터64세생활인구수', '여자65세부터69세생활인구수'],
    '70세 이상': ['남자70세이상생활인구수', '여자70세이상생활인구수']
       }

In [16]:
cols.keys

TypeError: 'builtin_function_or_method' object is not iterable

In [None]:
from datetime import datetime
from datetime import date

def is_weekend(d = datetime.today()):
  return d.weekday() > 4

In [11]:
def data_process(file_list, path, cols):
    df_list = []
    pbar = tqdm(file_list)
    for file in pbar:
        pbar.set_description(file) # pbar description에 작업중인 파일명 업데이트
        try:df_temp = pd.read_csv(path + file, encoding="euc-kr", na_values="*")
        except: df_temp = pd.read_csv(path + file, encoding="utf-8", na_values="*")
        
        #결측값 0으로 대체
        df_temp.fillna(0, inplace=True) 
        
        #성별 컬럼, 나이대 컬럼 추가 후 삭제
        df_temp['10대 남성'] = df_temp['남자10세부터14세생활인구수'] + df_temp['남자15세부터19세생활인구수']
        df_temp['10대 여성'] = df_temp['여자10
        
        df_list.append(df_temp)
        

In [4]:
# csv concat
def concat_files(file_list, path):
    df_list = []
    pbar = tqdm(file_list)
    for file in pbar:
        pbar.set_description(file)
        try:df_temp = pd.read_csv(path + file, encoding="euc-kr", na_values="*")
        except: df_temp = pd.read_csv(path + file, encoding="utf-8", na_values="*")
        df_list.append(df_temp)
    
    df = pd.concat(df_list)
    return df 

In [None]:
local_people_2019_df = concat_files(local_people_2019_list, "./data/집계동 단위 서울 생활인구/local_people/2019/")
local_people_2020_df = concat_files(local_people_2020_list, "./data/집계동 단위 서울 생활인구/local_people/2020/")
local_people_2021_df = concat_files(local_people_2021_list, "./data/집계동 단위 서울 생활인구/local_people/2021/")


long_foreigner_df = concat_files(long_foreigner_list, "./data/집계동 단위 서울 생활인구/long_foreigner/")
temp_foreinger_df = concat_files(temp_foreinger_list, "./data/집계동 단위 서울 생활인구/temp_foreigner/")
print("local_people_df.shape >>", local_people_df.shape)
print("long_foreigner_df.shape >>", long_foreigner_df.shape)
print("temp_foreinger_df.shape >>", temp_foreinger_df.shape)

  0%|          | 0/352 [00:00<?, ?it/s]

In [None]:
# 결측값 0으로 대체
local_people_df.fillna(0, inplace=True)
local_people_df.info()

In [None]:
cols = local_people_df.columns
cols = ['남자10세부터14세생활인구수', '남자15세부터19세생활인구수', '남자20세부터24세생활인구수',
       '남자25세부터29세생활인구수', '남자30세부터34세생활인구수', '남자35세부터39세생활인구수',
       '남자40세부터44세생활인구수', '남자45세부터49세생활인구수', '남자50세부터54세생활인구수',
       '남자55세부터59세생활인구수', '남자60세부터64세생활인구수', '남자65세부터69세생활인구수', '남자70세이상생활인구수',
       '여자0세부터9세생활인구수', '여자10세부터14세생활인구수', '여자15세부터19세생활인구수',
       '여자20세부터24세생활인구수', '여자25세부터29세생활인구수', '여자30세부터34세생활인구수',
       '여자35세부터39세생활인구수', '여자40세부터44세생활인구수', '여자45세부터49세생활인구수',
       '여자50세부터54세생활인구수', '여자55세부터59세생활인구수', '여자60세부터64세생활인구수',
       '여자65세부터69세생활인구수', '여자70세이상생활인구수']
cols

In [None]:
for col in tqdm(cols):
    if col

In [None]:
local_people_df['10대 남성'] = local_people_df.남자10세부터14세생활인구수 + local_people_df.남자15세부터19세생활인구수
local_people_df['20대 남성'] = 