In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager    # 매 번 크롬 드라이버를 설치할 필요없이 자동으로
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.alert import Alert
from tqdm import tqdm_notebook
from urllib.request import urlopen
import time
import requests

## 1) Total_audience 전처리

In [2]:
audience = pd.read_csv("Total_audience.csv", index_col=0, header=[0,1])
audience.rename(columns={"Unnamed: 0_level_1":""}, inplace=True)

In [4]:
audience.head()
audience.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7268 entries, 0 to 7267
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   (연도, )      7268 non-null   object 
 1   (한국, 개봉편수)  7268 non-null   int64  
 2   (한국, 상영편수)  7268 non-null   int64  
 3   (한국, 매출액)   7268 non-null   int64  
 4   (한국, 관객수)   7268 non-null   int64  
 5   (한국, 점유율)   7268 non-null   float64
 6   (외국, 개봉편수)  7268 non-null   int64  
 7   (외국, 상영편수)  7268 non-null   int64  
 8   (외국, 매출액)   7268 non-null   int64  
 9   (외국, 관객수)   7268 non-null   int64  
 10  (외국, 점유율)   7268 non-null   float64
 11  (전체, 개봉편수)  7268 non-null   int64  
 12  (전체, 상영편수)  7268 non-null   int64  
 13  (전체, 매출액)   7268 non-null   int64  
 14  (전체, 관객수)   7268 non-null   int64  
dtypes: float64(2), int64(12), object(1)
memory usage: 908.5+ KB


## 1-1) 필요한 열과 필요한 행만 가져오기

### (1) 필요한 열만 가져오기

In [5]:
audience = audience.iloc[:,[0,11,12,13,14]]      # 연도, 전체 개봉편수, 전체 상영편수, 전체 매출액, 전체 관객수

In [7]:
audience.head()

Unnamed: 0_level_0,연도,전체,전체,전체,전체
Unnamed: 0_level_1,Unnamed: 1_level_1,개봉편수,상영편수,매출액,관객수
0,2004-01-01 00:00:00,1,7,727005000,113837
1,2004-01-02 00:00:00,0,7,642276500,104357
2,2004-01-03 00:00:00,0,7,796519500,125234
3,2004-01-04 00:00:00,0,7,763271000,120203
4,2004-01-05 00:00:00,0,7,409269500,64180


### (2) 필요 없는 행 제거해주기

- 각 월 마다 통계치를 합계해 준 행들이 있음

In [8]:
audience.loc[audience.loc[:,"연도"]=="합계",:]

Unnamed: 0_level_0,연도,전체,전체,전체,전체
Unnamed: 0_level_1,Unnamed: 1_level_1,개봉편수,상영편수,매출액,관객수
31,합계,13,295,19429676000,3076067
61,합계,28,420,19538678000,3113385
93,합계,21,541,11777867500,1845368
124,합계,35,573,20828686000,3307663
156,합계,14,519,24678301700,3817166
...,...,...,...,...,...
7167,합계,132,3157,157645792126,14171771
7199,합계,124,2661,124073209373,11250357
7228,합계,104,2571,69050497975,6421295
7260,합계,145,2738,80022730615,7476215


In [9]:
audience.drop(audience[audience.loc[:,"연도"]=="합계"].index, inplace = True)    # 7268 행 -> 7036 행

In [10]:
audience.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7036 entries, 0 to 7266
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   (연도, )      7036 non-null   object
 1   (전체, 개봉편수)  7036 non-null   int64 
 2   (전체, 상영편수)  7036 non-null   int64 
 3   (전체, 매출액)   7036 non-null   int64 
 4   (전체, 관객수)   7036 non-null   int64 
dtypes: int64(4), object(1)
memory usage: 329.8+ KB


## 1-2) 평일과 주말 데이터를 나누기 위해 "요일" 파생변수 추가

In [11]:
audience['요일'] = pd.to_datetime(audience["연도"],format="%Y-%m-%d %H:%M:%S").dt.day_name() 

In [12]:
audience.head(3)

Unnamed: 0_level_0,연도,전체,전체,전체,전체,요일
Unnamed: 0_level_1,Unnamed: 1_level_1,개봉편수,상영편수,매출액,관객수,Unnamed: 6_level_1
0,2004-01-01 00:00:00,1,7,727005000,113837,Thursday
1,2004-01-02 00:00:00,0,7,642276500,104357,Friday
2,2004-01-03 00:00:00,0,7,796519500,125234,Saturday


## 1-3) 이중 컬럼을 하나의 컬럼으로 합쳐주기

In [16]:
print(audience.columns.get_level_values(0))
len(audience.columns.get_level_values(0))

Index(['연도', '전체', '전체', '전체', '전체', '요일'], dtype='object')


6

In [15]:
print(audience.columns.get_level_values(1))
len(audience.columns.get_level_values(1))

Index(['', '개봉편수', '상영편수', '매출액', '관객수', ''], dtype='object')


6

In [17]:
audience1 = audience.copy()    # 혹시 모를 경우를 대비해 사본으로 만들기

In [18]:
audience1.columns = [audience.columns.get_level_values(0)[i] + audience.columns.get_level_values(1)[i] for
                    i in range(0,len(audience.columns.get_level_values(0)))]

In [21]:
audience1.head(3)

Unnamed: 0,연도,전체개봉편수,전체상영편수,전체매출액,전체관객수,요일
0,2004-01-01 00:00:00,1,7,727005000,113837,Thursday
1,2004-01-02 00:00:00,0,7,642276500,104357,Friday
2,2004-01-03 00:00:00,0,7,796519500,125234,Saturday


In [22]:
audience.head(3)

Unnamed: 0_level_0,연도,전체,전체,전체,전체,요일
Unnamed: 0_level_1,Unnamed: 1_level_1,개봉편수,상영편수,매출액,관객수,Unnamed: 6_level_1
0,2004-01-01 00:00:00,1,7,727005000,113837,Thursday
1,2004-01-02 00:00:00,0,7,642276500,104357,Friday
2,2004-01-03 00:00:00,0,7,796519500,125234,Saturday


In [23]:
audience.to_csv("audience_preprocess1.csv", encoding = "utf-8", index=True, header=True)        # 이중 컬럼
audience1.to_csv("audience1_preprocess1.csv", encoding = "utf-8", index= True, header=True)     # 이중 컬럼 합친 것.

### 이중 컬럼을 사용한 audience_preprocess1 파일을 이용할 때는 다음과 같은 코드로 읽어오면 됨.

In [29]:
audience = pd.read_csv("audience_preprocess1.csv", encoding = "utf-8",index_col = 0, header=[0,1])
audience.rename(columns = {"Unnamed: 1_level_1":"","Unnamed: 6_level_1":""}, inplace=True)
audience.head()

Unnamed: 0_level_0,연도,전체,전체,전체,전체,요일
Unnamed: 0_level_1,Unnamed: 1_level_1,개봉편수,상영편수,매출액,관객수,Unnamed: 6_level_1
0,2004-01-01 00:00:00,1,7,727005000,113837,Thursday
1,2004-01-02 00:00:00,0,7,642276500,104357,Friday
2,2004-01-03 00:00:00,0,7,796519500,125234,Saturday
3,2004-01-04 00:00:00,0,7,763271000,120203,Sunday
4,2004-01-05 00:00:00,0,7,409269500,64180,Monday


## 1-4) 가격을 구분해주기 위해 주말인지/ 평일인지 나누어 주기

- 월요일~ 목요일은 평일 요금을 받음.
- 금요일~ 일요일은 주말 요금을 받음.

In [46]:
audience.head()

Unnamed: 0_level_0,연도,전체,전체,전체,전체,요일
Unnamed: 0_level_1,Unnamed: 1_level_1,개봉편수,상영편수,매출액,관객수,Unnamed: 6_level_1
0,2004-01-01 00:00:00,1,7,727005000,113837,Thursday
1,2004-01-02 00:00:00,0,7,642276500,104357,Friday
2,2004-01-03 00:00:00,0,7,796519500,125234,Saturday


In [50]:
price_map = {'Monday':'week',"Tuesday":'week',"Wednesday":'week', "Thursday":'week',"Friday":'weekend',
             "Saturday":'weekend', "Sunday":'weekend'}  

audience["요금 기준"] = audience["요일"].map(price_map)

In [51]:
audience

Unnamed: 0_level_0,연도,전체,전체,전체,전체,요일,요금 기준
Unnamed: 0_level_1,Unnamed: 1_level_1,개봉편수,상영편수,매출액,관객수,Unnamed: 6_level_1,Unnamed: 7_level_1
0,2004-01-01 00:00:00,1,7,727005000,113837,Thursday,week
1,2004-01-02 00:00:00,0,7,642276500,104357,Friday,weekend
2,2004-01-03 00:00:00,0,7,796519500,125234,Saturday,weekend
3,2004-01-04 00:00:00,0,7,763271000,120203,Sunday,weekend
4,2004-01-05 00:00:00,0,7,409269500,64180,Monday,week
...,...,...,...,...,...,...,...
7262,2023-04-02 00:00:00,0,80,3423512979,336191,Sunday,weekend
7263,2023-04-03 00:00:00,0,87,802358725,81632,Monday,week
7264,2023-04-04 00:00:00,2,91,815279367,82221,Tuesday,week
7265,2023-04-05 00:00:00,6,94,1347838240,134732,Wednesday,week


## 2. Gni 전처리

In [30]:
Gni = pd.read_csv("C:\\Users\\user\\Downloads\\주요지표_연간지표__20230407142721.csv",
                 encoding = "euc-kr", index_col=0)

In [32]:
Gni.T

계정항목별,"국내총생산(명목, 원화표시) (십억원)","국내총생산(명목, 달러표시) (억달러)","국민총소득(명목, 원화표시) (십억원)","국민총소득(명목, 달러표시) (억달러)",요소비용국민소득(명목) (십억원),국민처분가능소득(명목) (십억원),국민총처분가능소득(명목) (십억원),가계총처분가능소득(명목) (십억원),"1인당 국내총생산(명목, 원화표시) (만원)","1인당 국내총생산(명목, 달러표시) (달러)","1인당 국민총소득(명목, 원화표시) (만원)","1인당 국민총소득(명목, 달러표시) (달러)","1인당 가계총처분가능소득(명목, 원화표시) (만원)","1인당 가계총처분가능소득(명목, 달러표시) (달러)"
2003,837365.0,7025.5,834443.2,7001.0,602692.2,689409.2,831292.4,494340.2,1748.4,14669.4,1742.3,14618.2,1032.2,8660.1
2004,908439.2,7936.3,906864.7,7922.5,662698.2,750340.6,904224.4,532359.3,1889.3,16505.5,1886.1,16476.9,1107.2,9672.5
2005,957447.8,9347.2,950685.4,9281.2,691776.2,784904.9,948273.9,556248.7,1987.0,19398.8,1973.0,19261.8,1154.4,11270.2
2006,1005601.5,10524.2,1002664.7,10493.5,728236.3,827011.2,999356.0,581833.5,2076.0,21727.1,2070.0,21663.7,1201.2,12571.1
2007,1089660.2,11726.9,1086897.3,11697.1,792507.9,901343.3,1084398.3,617108.9,2238.2,24087.9,2232.6,24026.8,1267.6,13641.7
2008,1154216.5,10468.2,1154509.7,10470.9,828123.1,946644.4,1154672.2,658676.7,2352.9,21339.9,2353.5,21345.3,1342.7,12178.0
2009,1205347.7,9443.3,1203479.8,9428.7,859286.3,976066.0,1202422.0,681634.6,2444.5,19151.8,2440.7,19122.1,1382.4,10830.5
2010,1322611.2,11438.7,1324586.9,11455.8,953725.5,1080250.2,1319387.5,722576.0,2669.0,23083.3,2673.0,23117.7,1458.2,12611.0
2011,1388937.2,12534.3,1397534.8,12611.9,1002826.4,1135618.1,1393483.2,762753.3,2781.4,25100.4,2798.6,25255.8,1527.4,13784.2
2012,1440111.4,12779.6,1455170.3,12913.3,1041626.9,1177260.3,1450611.4,793887.9,2868.8,25457.5,2898.8,25723.7,1581.5,14033.9


In [33]:
Gni.to_csv("Gni.csv", encoding="utf-8", index=True, header=True)

In [35]:
from datetime import datetime, timedelta

start_date = datetime(2004, 1, 1)
end_date = datetime(2023, 4, 6)
date = pd.date_range(start_date, end_date)

price = pd.DataFrame(date, columns=['date'])
price['연도'] = price['date'].dt.strftime('%Y-%m-%d %H:%M:%S')

In [37]:
del price["date"]

In [38]:
price

Unnamed: 0,연도
0,2004-01-01 00:00:00
1,2004-01-02 00:00:00
2,2004-01-03 00:00:00
3,2004-01-04 00:00:00
4,2004-01-05 00:00:00
...,...
7031,2023-04-02 00:00:00
7032,2023-04-03 00:00:00
7033,2023-04-04 00:00:00
7034,2023-04-05 00:00:00


In [43]:
price['요일'] = pd.to_datetime(price["연도"],format="%Y-%m-%d %H:%M:%S").dt.day_name() 

In [40]:
price

Unnamed: 0,연도,요일
0,2004-01-01 00:00:00,Thursday
1,2004-01-02 00:00:00,Friday
2,2004-01-03 00:00:00,Saturday
3,2004-01-04 00:00:00,Sunday
4,2004-01-05 00:00:00,Monday
...,...,...
7031,2023-04-02 00:00:00,Sunday
7032,2023-04-03 00:00:00,Monday
7033,2023-04-04 00:00:00,Tuesday
7034,2023-04-05 00:00:00,Wednesday


In [42]:
price.loc[price["연도"] == "2004-07-01 00:00:00"]    # 182행
price.loc[price["연도"] == "2009-07-03 00:00:00"]    # 2010행

Unnamed: 0,연도,요일
2010,2009-07-03 00:00:00,Friday


## 프라임 타임 ( 피크 타임 기준 ) 

- 2004년 1월 1일  ~ 7000원

- 2004년 7월 1일,주중 7000원, 주말 8000원
---
- 2009년 7월 3일부터 일반 평일 요금은 8천원, 주말 요금은 9천원 
---
- 2013년 2월 14일부터 평일 요금 9천 / 주말 요금 1만원
- 2016년 3월 3일부터 평일 1만원 / 주말 11000원
- 2018년 4월 11일 평일:11000 / 주말:12000  
- 2020년 10월 26일 좌석 차등제 폐지  평일 : 12000 / 주말 : 13000
- 2021년 4월 2일 평일: 13000/ 주말: 14000

In [44]:
Date = pd.read_csv("C:\\Users\\user\\Downloads\\date.csv", encoding="utf-8")

In [45]:
Date  # week는 평일 / 주말

Unnamed: 0,date,week,day
0,2003-12-01,0,Monday
1,2003-12-02,0,Tuesday
2,2003-12-03,0,Wednesday
3,2003-12-04,0,Thursday
4,2003-12-05,0,Friday
...,...,...,...
6966,2022-12-27,0,Tuesday
6967,2022-12-28,0,Wednesday
6968,2022-12-29,0,Thursday
6969,2022-12-30,0,Friday
