In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlencode,quote
import urllib.request
import json

In [3]:
#그래프 패키지 
import matplotlib.pyplot as plt 
%matplotlib inline 

In [4]:
# 한글 문제 해결
import platform

from matplotlib import font_manager, rc
plt.rcParams['axes.unicode_minus'] = False

if platform.system() == 'Darwin':  # 맥OS 
    rc('font', family='AppleGothic')
elif platform.system() == 'Windows':  # 윈도우
    path = "c:/Windows/Fonts/malgun.ttf"
    font_name = font_manager.FontProperties(fname=path).get_name()
    rc('font', family=font_name)
else:
    print('Unknown system...  sorry~~~')

In [5]:
# 동시 출력
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity="all"

## 백신 데이터 정리

In [6]:
## csv 파일 불러
seoul_vc = pd.read_csv('data/vc_seoul.csv').iloc[:112,3:]
busan_vc = pd.read_csv('data/vc_busan.csv').iloc[:112,3:]
daegu_vc = pd.read_csv('data/vc_daegu.csv').iloc[:112,3:]

In [7]:
#날짜변경
date_lis = []
for date in seoul_vc['baseDate']:
    date_lis.append(int(date[:10].replace("-","")))

seoul_vc['baseDate'] = date_lis
busan_vc['baseDate'] = date_lis

In [8]:
# 지역별 주간 누적 (영화 기준 인덱스 수정)
## 서울
tmp_s = seoul_vc[
         (seoul_vc['baseDate']==20210318) | 
         (seoul_vc['baseDate']==20210325) |
         (seoul_vc['baseDate']==20210401) |
         (seoul_vc['baseDate']==20210408) | 
         (seoul_vc['baseDate']==20210415) |
         (seoul_vc['baseDate']==20210422) | 
         (seoul_vc['baseDate']==20210429) |
         (seoul_vc['baseDate']==20210506) |
         (seoul_vc['baseDate']==20210513) | 
         (seoul_vc['baseDate']==20210520) |
         (seoul_vc['baseDate']==20210527) |
         (seoul_vc['baseDate']==20210603) |
         (seoul_vc['baseDate']==20210610) |
         (seoul_vc['baseDate']==20210617) | 
         (seoul_vc['baseDate']==20210624) |
         (seoul_vc['baseDate']==20210630) 
        ]

## 부산
tmp_b = busan_vc[
         (busan_vc['baseDate']==20210318) | 
         (busan_vc['baseDate']==20210314) |
         (busan_vc['baseDate']==20210401) |
         (busan_vc['baseDate']==20210408) | 
         (busan_vc['baseDate']==20210415) |
         (busan_vc['baseDate']==20210422) | 
         (busan_vc['baseDate']==20210429) |
         (busan_vc['baseDate']==20210506) |
         (busan_vc['baseDate']==20210513) | 
         (busan_vc['baseDate']==20210520) |
         (busan_vc['baseDate']==20210527) |
         (busan_vc['baseDate']==20210603) |
         (busan_vc['baseDate']==20210610) |
         (busan_vc['baseDate']==20210617) | 
         (busan_vc['baseDate']==20210624) |
         (busan_vc['baseDate']==20210630)  
        ]

In [9]:
## 불필요한 열 제거
tmp_s = tmp_s.reset_index(drop=True)
tmp_b = tmp_b.reset_index(drop=True)

In [10]:
#주간 접종자 수 계산
s_week_1 = [0]
b_week_1 = [0]
s_week_2 = [0]
b_week_2 = [0]

for i in range(16):
    try:
        s_week_2.append(tmp_s['totalSecondCnt'][i+1]-tmp_s['totalSecondCnt'][i])
        b_week_2.append(tmp_b['totalSecondCnt'][i+1]-tmp_b['totalSecondCnt'][i])
        
        s_week_1.append(tmp_s['totalFirstCnt'][i+1]-tmp_s['totalFirstCnt'][i])
        b_week_1.append(tmp_b['totalFirstCnt'][i+1]-tmp_b['totalFirstCnt'][i])
        
    except:
        pass

In [11]:
#지역별 주간 누적 dict: 날짜,1차누계,2차
dict_seoul_vaccine = {"일자":tmp_s['baseDate'].astype(str).values,
             "1차접종누계":tmp_s['totalFirstCnt'].values,
             "2차접종누계":tmp_s['totalSecondCnt'].values,
             "주간1차접종자수": np.array(s_week_1),
             "주간2차접종자수": np.array(s_week_2)
            }

dict_busan_vaccine = {"일자":tmp_b['baseDate'].astype(str).values,
             "1차접종누계":tmp_b['totalFirstCnt'].values,
             "2차접종누계":tmp_b['totalSecondCnt'].values,
             "주간1차접종자수": np.array(b_week_1),
             "주간2차접종자수": np.array(b_week_2)
               }

In [12]:
## 딕셔너리 형태 데이터 확인 
dict_seoul_vaccine
dict_busan_vaccine

{'일자': array(['20210318', '20210325', '20210401', '20210408', '20210415',
        '20210422', '20210429', '20210506', '20210513', '20210520',
        '20210527', '20210603', '20210610', '20210617', '20210624',
        '20210630'], dtype=object),
 '1차접종누계': array([ 109536,  122249,  141584,  166563,  195071,  294555,  438269,
         563044,  600548,  616705,  650432, 1165736, 1746392, 2554809,
        2802006, 2833050], dtype=int64),
 '2차접종누계': array([     0,   1725,   6227,  12397,  13502,  13543,  27699,  47940,
        112781, 175445, 291118, 346936, 380200, 685532, 813666, 914519],
       dtype=int64),
 '주간1차접종자수': array([     0,  12713,  19335,  24979,  28508,  99484, 143714, 124775,
         37504,  16157,  33727, 515304, 580656, 808417, 247197,  31044],
       dtype=int64),
 '주간2차접종자수': array([     0,   1725,   4502,   6170,   1105,     41,  14156,  20241,
         64841,  62664, 115673,  55818,  33264, 305332, 128134, 100853],
       dtype=int64)}

{'일자': array(['20210314', '20210318', '20210401', '20210408', '20210415',
        '20210422', '20210429', '20210506', '20210513', '20210520',
        '20210527', '20210603', '20210610', '20210617', '20210624',
        '20210630'], dtype=object),
 '1차접종누계': array([  45868,   50988,   73571,   83895,   96082,  121702,  174943,
         223997,  231495,  233540,  254350,  470503,  713446,  996171,
        1089840, 1100426], dtype=int64),
 '2차접종누계': array([     0,      0,    245,   2388,   4161,   4161,   8592,  18249,
         32353,  60033, 118935, 132160, 141696, 230090, 285163, 326829],
       dtype=int64),
 '주간1차접종자수': array([     0,   5120,  22583,  10324,  12187,  25620,  53241,  49054,
          7498,   2045,  20810, 216153, 242943, 282725,  93669,  10586],
       dtype=int64),
 '주간2차접종자수': array([    0,     0,   245,  2143,  1773,     0,  4431,  9657, 14104,
        27680, 58902, 13225,  9536, 88394, 55073, 41666], dtype=int64)}

In [13]:
## 백신 데이터 프레임 생성
seoul_vaccine = pd.DataFrame(dict_seoul_vaccine)
busan_vaccine = pd.DataFrame(dict_busan_vaccine)

In [14]:
# 중간 확인
seoul_vaccine.head()
busan_vaccine.head()

Unnamed: 0,일자,1차접종누계,2차접종누계,주간1차접종자수,주간2차접종자수
0,20210318,109536,0,0,0
1,20210325,122249,1725,12713,1725
2,20210401,141584,6227,19335,4502
3,20210408,166563,12397,24979,6170
4,20210415,195071,13502,28508,1105


Unnamed: 0,일자,1차접종누계,2차접종누계,주간1차접종자수,주간2차접종자수
0,20210314,45868,0,0,0
1,20210318,50988,0,5120,0
2,20210401,73571,245,22583,245
3,20210408,83895,2388,10324,2143
4,20210415,96082,4161,12187,1773


## 영화관 데이터 정리

In [15]:
seuoul_test_data_2020 = pd.read_csv("./data/seoul_movie_2020.csv", thousands=',', encoding='UTF-8')
busan_test_data_2020 = pd.read_csv("./data/busan_movie_2020.csv", thousands=',', encoding='UTF-8')
avg_movie = pd.read_csv("./data/2019_평균자료_영화.csv", thousands=',', encoding='UTF-8')

In [16]:
## 데이터 확인
seuoul_test_data_2020.head()
busan_test_data_2020.head()
avg_movie.head()

Unnamed: 0.1,Unnamed: 0,지역,매출액,관객수,주
0,5,서울시,3361656490,368804,2020-08-16
1,5,서울시,2189550260,247399,2020-08-23
2,5,서울시,1842116650,200100,2020-08-30
3,5,서울시,1733144880,188395,2020-09-06
4,5,서울시,1571785690,172993,2020-09-13


Unnamed: 0.1,Unnamed: 0,지역,매출액,관객수,주
0,7,부산시,1114537230,123943,2020-08-16
1,7,부산시,618924290,71890,2020-08-23
2,7,부산시,467448880,53630,2020-08-30
3,7,부산시,401747560,45878,2020-09-06
4,7,부산시,350543860,40374,2020-09-13


Unnamed: 0.1,Unnamed: 0,상영편수,매출액,관객수
0,서울,62.076923,9735130000.0,1111436.0
1,부산,27.519231,2929068000.0,349188.2


In [17]:
seuoul_test_data_2021_bv = pd.read_csv("./data/2021_~02서울.csv", thousands=',', encoding='UTF-8')
seuoul_test_data_2021_av = pd.read_csv("./data/2021_03~06서울.csv", thousands=',', encoding='UTF-8')
busan_test_data_2021_bv = pd.read_csv("./data/2021_~02부산.csv", thousands=',', encoding='UTF-8')
busan_test_data_2021_av = pd.read_csv("./data/2021_03~06부산.csv", thousands=',', encoding='UTF-8')

In [18]:
## 데이터 확인
seuoul_test_data_2021_bv.head()
seuoul_test_data_2021_av.head()
busan_test_data_2021_bv.head()
busan_test_data_2021_av.head()

Unnamed: 0.1,Unnamed: 0,지역,매출액,관객수,주
0,5,서울시,625176800,71052,2021-01-01
1,5,서울시,446257140,50249,2021-01-08
2,5,서울시,761371720,85940,2021-01-15
3,5,서울시,2050589220,230553,2021-01-22
4,5,서울시,2287327000,246303,2021-01-29


Unnamed: 0.1,Unnamed: 0,지역,매출액,관객수,주
0,5,서울시,2370034110,245237,2021-03-12
1,5,서울시,2370034110,245237,2021-03-12
2,5,서울시,2089378170,221908,2021-03-19
3,5,서울시,2156717370,223590,2021-03-26
4,5,서울시,2394668990,261118,2021-04-02


Unnamed: 0.1,Unnamed: 0,지역,매출액,관객수,주
0,7,부산시,133154520,15720,2021-01-01
1,7,부산시,90327570,10890,2021-01-08
2,7,부산시,139130900,16832,2021-01-15
3,7,부산시,381529810,45703,2021-01-22
4,7,부산시,467990770,52508,2021-01-29


Unnamed: 0.1,Unnamed: 0,지역,매출액,관객수,주
0,7,부산시,513370590,56521,2021-03-12
1,7,부산시,513370590,56521,2021-03-12
2,7,부산시,438756130,48866,2021-03-19
3,7,부산시,425217210,46215,2021-03-26
4,7,부산시,522446130,60290,2021-04-02


In [19]:
## 중복된 0312 데이터 삭제
seuoul_test_data_2021_av.drop([0], inplace=True)
busan_test_data_2021_av.drop([0], inplace=True)

In [20]:
## 필요한 데이터만 추출해서 새로 생성
movie_seoul_2021 = seuoul_test_data_2021_bv.sort_values(by=['주', '매출액'])
movie_busan_2021 = busan_test_data_2021_bv.sort_values(by=['주', '매출액'])

## 서울시/부산시 20년 8월부터 21년 6월 까지 데이터 합치기

In [21]:
## 불필요한 열제거
seuoul_test_data_2020.drop('Unnamed: 0',axis=1, inplace=True)
seuoul_test_data_2021_bv.drop('Unnamed: 0',axis=1, inplace=True)
seuoul_test_data_2021_av.drop('Unnamed: 0',axis=1, inplace=True)
## 불필요한 열제거
busan_test_data_2020.drop('Unnamed: 0',axis=1, inplace=True)
busan_test_data_2021_bv.drop('Unnamed: 0',axis=1, inplace=True)
busan_test_data_2021_av.drop('Unnamed: 0',axis=1, inplace=True)

In [22]:
seuoul_test_data_2020.head()
seuoul_test_data_2021_bv.head()
seuoul_test_data_2021_av.head()
busan_test_data_2020.head()
busan_test_data_2021_bv.head()
busan_test_data_2021_av.head()

Unnamed: 0,지역,매출액,관객수,주
0,서울시,3361656490,368804,2020-08-16
1,서울시,2189550260,247399,2020-08-23
2,서울시,1842116650,200100,2020-08-30
3,서울시,1733144880,188395,2020-09-06
4,서울시,1571785690,172993,2020-09-13


Unnamed: 0,지역,매출액,관객수,주
0,서울시,625176800,71052,2021-01-01
1,서울시,446257140,50249,2021-01-08
2,서울시,761371720,85940,2021-01-15
3,서울시,2050589220,230553,2021-01-22
4,서울시,2287327000,246303,2021-01-29


Unnamed: 0,지역,매출액,관객수,주
1,서울시,2370034110,245237,2021-03-12
2,서울시,2089378170,221908,2021-03-19
3,서울시,2156717370,223590,2021-03-26
4,서울시,2394668990,261118,2021-04-02
5,서울시,1826626560,193633,2021-04-09


Unnamed: 0,지역,매출액,관객수,주
0,부산시,1114537230,123943,2020-08-16
1,부산시,618924290,71890,2020-08-23
2,부산시,467448880,53630,2020-08-30
3,부산시,401747560,45878,2020-09-06
4,부산시,350543860,40374,2020-09-13


Unnamed: 0,지역,매출액,관객수,주
0,부산시,133154520,15720,2021-01-01
1,부산시,90327570,10890,2021-01-08
2,부산시,139130900,16832,2021-01-15
3,부산시,381529810,45703,2021-01-22
4,부산시,467990770,52508,2021-01-29


Unnamed: 0,지역,매출액,관객수,주
1,부산시,513370590,56521,2021-03-12
2,부산시,438756130,48866,2021-03-19
3,부산시,425217210,46215,2021-03-26
4,부산시,522446130,60290,2021-04-02
5,부산시,395182360,43284,2021-04-09


In [23]:
tot_seoul_data = pd.concat([seuoul_test_data_2020, seuoul_test_data_2021_bv, seuoul_test_data_2021_av])
tot_busan_data = pd.concat([busan_test_data_2020, busan_test_data_2021_bv, busan_test_data_2021_av])

In [24]:
tot_seoul_data
tot_busan_data

Unnamed: 0,지역,매출액,관객수,주
0,서울시,3361656490,368804,2020-08-16
1,서울시,2189550260,247399,2020-08-23
2,서울시,1842116650,200100,2020-08-30
3,서울시,1733144880,188395,2020-09-06
4,서울시,1571785690,172993,2020-09-13
5,서울시,1686750342,194747,2020-09-20
6,서울시,4365204790,464557,2020-09-27
7,서울시,2310220710,243950,2020-10-04
8,서울시,1635498260,175172,2020-10-11
9,서울시,2051049940,235614,2020-10-18


Unnamed: 0,지역,매출액,관객수,주
0,부산시,1114537230,123943,2020-08-16
1,부산시,618924290,71890,2020-08-23
2,부산시,467448880,53630,2020-08-30
3,부산시,401747560,45878,2020-09-06
4,부산시,350543860,40374,2020-09-13
5,부산시,369444420,43470,2020-09-20
6,부산시,1270147320,137503,2020-09-27
7,부산시,677889690,74197,2020-10-04
8,부산시,425792690,46497,2020-10-11
9,부산시,517480560,57976,2020-10-18


In [26]:
## 인덱스 리셋 (tot_data와 결함시키기 위해)
seuoul_test_data_2021_av.reset_index(drop=True, inplace=True)
busan_test_data_2021_av.reset_index(drop=True, inplace=True)

In [27]:
## 인덱스 리셋
tot_seoul_data.reset_index(drop=True, inplace=True)
tot_busan_data.reset_index(drop=True, inplace=True)

In [28]:
## seoul_vaccine에 주 컬럼 추가
## busan_vaccine에 주 컬럼 추가
seoul_vaccine['주'] = seuoul_test_data_2021_av['주']
busan_vaccine['주'] = busan_test_data_2021_av['주']

In [29]:
## 불 열 삭제
seoul_vaccine.drop(['일자', '주간1차접종자수','주간2차접종자수'],axis=1, inplace=True)
busan_vaccine.drop(['일자', '주간1차접종자수','주간2차접종자수'],axis=1, inplace=True)

In [30]:
## 최종 데이터 생성
final_seoul_data = pd.merge(tot_seoul_data, seoul_vaccine, how='left')
final_busan_data = pd.merge(tot_busan_data, busan_vaccine, how='left')

In [31]:
## 최종 데이터 확인
final_seoul_data
final_busan_data

Unnamed: 0,지역,매출액,관객수,주,1차접종누계,2차접종누계
0,서울시,3361656490,368804,2020-08-16,,
1,서울시,2189550260,247399,2020-08-23,,
2,서울시,1842116650,200100,2020-08-30,,
3,서울시,1733144880,188395,2020-09-06,,
4,서울시,1571785690,172993,2020-09-13,,
5,서울시,1686750342,194747,2020-09-20,,
6,서울시,4365204790,464557,2020-09-27,,
7,서울시,2310220710,243950,2020-10-04,,
8,서울시,1635498260,175172,2020-10-11,,
9,서울시,2051049940,235614,2020-10-18,,


Unnamed: 0,지역,매출액,관객수,주,1차접종누계,2차접종누계
0,부산시,1114537230,123943,2020-08-16,,
1,부산시,618924290,71890,2020-08-23,,
2,부산시,467448880,53630,2020-08-30,,
3,부산시,401747560,45878,2020-09-06,,
4,부산시,350543860,40374,2020-09-13,,
5,부산시,369444420,43470,2020-09-20,,
6,부산시,1270147320,137503,2020-09-27,,
7,부산시,677889690,74197,2020-10-04,,
8,부산시,425792690,46497,2020-10-11,,
9,부산시,517480560,57976,2020-10-18,,


In [32]:
final_seoul_data.to_csv('./result/finaly_data_서울시.csv')
final_busan_data.to_csv('./result/finaly_data_부산시.csv')