In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from datetime import datetime
import warnings

plt.rc('font', family='malgun gothic')
plt.rc('axes' , unicode_minus = False)
warnings.filterwarnings(action = 'ignore')

In [2]:
df = pd.read_parquet("C:/Users/whileduck/Desktop/Github/Concert-Hall-Price-Model/data/거래까지걸린시간_추가_클래식_데이터.parquet")
seat_information = pd.read_csv("C:/Users/whileduck/Desktop/Github/Concert-Hall-Price-Model/data/seat_information_angle.csv")
display(df.head())
print(df.shape)

Unnamed: 0,seat,층,블록,열,넘버,X,Y,Z,대칭점,좌우시야각,...,원등급추정,누적검색량,콘서트홀클래식_누적검색량,예술의전당클래식_누적검색량,서울클래식_누적검색량,좌석 오픈 여부,거래까지걸린시간,거래까지걸린시간_시간,거래까지걸린시간_등수,표준화 등수 점수
0,1층 A블록1열 1,1층,A블록,1,1,1451,542,-93,1층 E블록1열 9,15.2199,...,2,1050.0,1000.0,0.0,50.0,1,19 days 15:06:00,471.1,38.0,0.339286
1,1층 A블록1열 2,1층,A블록,1,2,1406,555,-93,1층 E블록1열 8,16.945594,...,2,1050.0,1000.0,0.0,50.0,1,19 days 15:06:00,471.1,38.0,0.339286
2,1층 A블록1열 3,1층,A블록,1,3,1361,568,-93,1층 E블록1열 7,18.869853,...,2,1050.0,1000.0,0.0,50.0,1,19 days 15:06:00,471.1,38.0,0.339286
3,1층 A블록1열 4,1층,A블록,1,4,1315,580,-93,1층 E블록1열 6,21.009736,...,2,1050.0,1000.0,0.0,50.0,1,19 days 15:06:00,471.1,38.0,0.339286
4,1층 A블록1열 5,1층,A블록,1,5,1270,591,-93,1층 E블록1열 5,23.314833,...,2,1050.0,1000.0,0.0,50.0,1,19 days 15:06:00,471.1,38.0,0.339286


(378255, 47)


# **데이터 테이블 생성 (공연 별, 좌석 별)**

In [3]:
reservation_grouping = df.groupby('전체공연시간').agg({'좌석 오픈 여부' : np.sum, 
                                    '예매여부' : np.sum, 'price':np.mean,'거래까지걸린시간_시간' : [np.min,np.mean,np.max]}).reset_index()
reservation_grouping['공연 예매율'] = reservation_grouping['예매여부'] / reservation_grouping['좌석 오픈 여부'] # 공연 별 예매율 계산 

reservation_grouping = reservation_grouping.rename(columns = {'price': '공연 별 평균 가격'})

reservation_grouping.columns = [' '.join(col).strip() for col in reservation_grouping.columns.values]

display(reservation_grouping.sample(5))
print(reservation_grouping.shape)

Unnamed: 0,전체공연시간,좌석 오픈 여부 sum,예매여부 sum,공연 별 평균 가격 mean,거래까지걸린시간_시간 amin,거래까지걸린시간_시간 mean,거래까지걸린시간_시간 amax,공연 예매율
146,2023-05-09 19:30:00,2505,1549,68160.103292,14.0,674.951861,1816.3,0.618363
67,2021-07-29 19:30:00,2073,443,9864.559819,33.183333,658.431866,740.066667,0.2137
72,2021-10-01 19:30:00,1896,1216,39724.506579,14.0,835.892352,1267.15,0.64135
47,2020-01-05 17:00:00,2505,914,103851.203501,14.0,1180.328939,3779.85,0.36487
128,2022-12-08 19:30:00,2505,1424,58478.230337,16.0,524.827645,1387.8,0.568463


(151, 8)


In [4]:
# 공연별 예매율 테이블 생성

cols = ['play_date', 'play_st_time','pre_open_date','open_date', 'running_time', 
        'intermission','전체공연시간','콘서트홀클래식_누적검색량','예술의전당클래식_누적검색량', '서울클래식_누적검색량'] # 합병해주자 


pre_table = pd.merge(reservation_grouping,df[cols], on = '전체공연시간', how = 'left').drop_duplicates() # merge 후 중복을 제거 

reservation_table = pre_table.reset_index(drop = True) # index 초기화 

In [5]:
# 공연별 요약 테이블에서 EDA 할 때 용이하도록 날짜도 포함해서 만들도록 하자 

reservation_table['연도'] = pd.to_datetime(reservation_grouping['전체공연시간']).map(lambda x: x.year)
reservation_table['월'] = pd.to_datetime(reservation_grouping['전체공연시간']).map(lambda x: x.month)
reservation_table['연 월'] = pd.to_datetime(reservation_table['연도'].astype(str) + '-' + reservation_table['월'].astype(str)) 
reservation_table['일'] = pd.to_datetime(reservation_grouping['전체공연시간']).map(lambda x: x.day)
reservation_table['시간'] = pd.to_datetime(reservation_grouping['전체공연시간']).map(lambda x: x.hour)
reservation_table['요일'] =  pd.to_datetime(reservation_table['play_date']).map(lambda x: x.weekday())  # 월요일 0 , 화요일 1 .... 일요일 6 으로 맵핑되도록 변경

In [6]:
reservation_table['총_누적_검색량'] = reservation_table['콘서트홀클래식_누적검색량'] + reservation_table['예술의전당클래식_누적검색량'] + reservation_table['서울클래식_누적검색량']

In [7]:
display(reservation_table.sample(5))
print(reservation_table.shape)

Unnamed: 0,전체공연시간,좌석 오픈 여부 sum,예매여부 sum,공연 별 평균 가격 mean,거래까지걸린시간_시간 amin,거래까지걸린시간_시간 mean,거래까지걸린시간_시간 amax,공연 예매율,play_date,play_st_time,...,콘서트홀클래식_누적검색량,예술의전당클래식_누적검색량,서울클래식_누적검색량,연도,월,연 월,일,시간,요일,총_누적_검색량
15,2019-07-02 20:00:00,2505,1967,108487.036096,10.0,1394.0177,4359.15,0.78523,2019-07-02,20:00:00,...,1250.0,0.0,20.0,2019,7,2019-07-01,2,20,1,1270.0
146,2023-05-09 19:30:00,2505,1549,68160.103292,14.0,674.951861,1816.3,0.618363,2023-05-09,19:30:00,...,1060.0,0.0,20.0,2023,5,2023-05-01,9,19,1,1080.0
147,2023-05-21 17:00:00,2505,1918,18981.230448,14.0,1155.064451,1481.233333,0.765669,2023-05-21,17:00:00,...,820.0,80.0,20.0,2023,5,2023-05-01,21,17,6,920.0
107,2022-09-07 19:30:00,2505,897,81853.957637,16.0,967.937793,2371.066667,0.358084,2022-09-07,19:30:00,...,930.0,40.0,110.0,2022,9,2022-09-01,7,19,2,1080.0
4,2019-02-16 17:00:00,2505,1192,6859.060403,10.666667,864.852699,1073.016667,0.475848,2019-02-16,17:00:00,...,1210.0,0.0,20.0,2019,2,2019-02-01,16,17,5,1230.0


(151, 24)


# **좌석 데이터 요약 테이블 생성**

In [8]:
seat_grouping = df.groupby('seat').agg({'예매여부': np.sum, '좌석 오픈 여부': np.sum, 
                                        '원가격추정':np.mean, '거래까지걸린시간_시간':np.mean,
                                        '거래까지걸린시간_등수' : np.mean, '거래까지걸린시간_등수':[np.max, np.min],
                                        '표준화 등수 점수': np.mean}).reset_index()

seat_grouping['좌석 예매율'] = seat_grouping['예매여부'] / seat_grouping['좌석 오픈 여부']
seat_grouping = seat_grouping.rename(columns = {'price': '좌석 별 평균 가격'})


seat_grouping.columns = [' '.join(col).strip() for col in seat_grouping.columns.values]
pre_table = seat_grouping.reset_index(drop = True)

pre_table.sample(5)

Unnamed: 0,seat,예매여부 sum,좌석 오픈 여부 sum,원가격추정 mean,거래까지걸린시간_시간 mean,거래까지걸린시간_등수 amax,거래까지걸린시간_등수 amin,표준화 등수 점수 mean,좌석 예매율
1714,2층 E블록3열 4,76,139,71112.582781,840.036404,942.0,4.0,0.240154,0.546763
500,1층 C블록11열 5,85,140,110973.509934,688.836275,985.0,1.0,0.333827,0.607143
1712,2층 E블록3열 2,85,147,72033.112583,894.76,942.0,3.0,0.253384,0.578231
1203,1층 E블록7열 10,67,143,71046.357616,1077.088557,985.0,7.0,0.168003,0.468531
556,1층 C블록15열 5,92,141,108701.986755,812.055072,985.0,2.0,0.325829,0.652482


In [9]:
# merge 하기 전에 이름을 변경해주자

seat_information = seat_information.rename(columns = {'전체_좌석' : 'seat'})

seat_table = pd.merge(pre_table, seat_information, how = 'left', on = 'seat')

display(seat_table.sample(5))

Unnamed: 0,seat,예매여부 sum,좌석 오픈 여부 sum,원가격추정 mean,거래까지걸린시간_시간 mean,거래까지걸린시간_등수 amax,거래까지걸린시간_등수 amin,표준화 등수 점수 mean,좌석 예매율,층,블록,열,넘버,X,Y,Z,대칭점,무대까지의 거리,좌우시야각,상하시야각
2064,3층 E블록6열 3,58,138,48298.013245,1017.341092,935.0,2.0,0.190387,0.42029,3층,E블록,6,3,-528,3038,1388,3층 C블록6열 10,3381.533971,25.474992,7.480241
252,1층 B블록13열 5,88,149,106205.298013,885.431439,985.0,2.0,0.268545,0.590604,1층,B블록,13,5,929,1818,130,1층 D블록13열 8,2045.743141,38.922108,12.3075
1705,2층 E블록3열 12,72,146,63298.013245,722.443519,967.0,6.0,0.230658,0.493151,2층,E블록,3,12,-1518,1946,717,2층 A블록3열 7,2570.083462,27.283754,14.440119
889,1층 D블록1열 1,88,148,109278.145695,414.847727,749.0,1.0,0.431302,0.594595,1층,D블록,1,1,-484,725,-93,1층 B블록1열 9,876.658428,93.879638,32.540076
1034,1층 E블록10열 9,62,139,70715.231788,929.612903,967.0,2.0,0.190266,0.446043,1층,E블록,10,9,-1649,1375,69,1층 A블록10열 3,2148.158979,25.304345,20.171002


# **추후 특정 기준에 따라 요약 테이블을 생성하기 위해 함수로 만들어두자**

In [10]:
class ConditionSummaryTable:
    
    def __init__(self,seat_information):
        
        self.seat_information = seat_information
        self.seat_information = self.seat_information.rename(columns = {'전체_좌석' : 'seat'})
        
    def reservation_condition_table(self ,cond_df):
        
        
        self.reservation_grouping = cond_df.groupby('전체공연시간').agg({'좌석 오픈 여부' : np.sum, 
                                    '예매여부' : np.sum, 'price':np.mean,'거래까지걸린시간_시간' : [np.min,np.mean,np.max]}).reset_index()
        
        
        self.reservation_grouping['공연 예매율'] = self.reservation_grouping['예매여부'] / self.reservation_grouping['좌석 오픈 여부'] # 공연 별 예매율 계산 

        self.reservation_grouping = self.reservation_grouping.rename(columns = {'price': '공연 별 평균 가격'})

        self.reservation_grouping.columns = [' '.join(col).strip() for col in self.reservation_grouping.columns.values]

        # 공연별 예매율 테이블 생성

        cols = ['play_date', 'play_st_time','pre_open_date','open_date', 'running_time', 
                'intermission','전체공연시간','콘서트홀클래식_누적검색량','예술의전당클래식_누적검색량', '서울클래식_누적검색량'] # 합병해주자 


        pre_table = pd.merge(reservation_grouping,cond_df[cols], on = '전체공연시간', how = 'left').drop_duplicates() # merge 후 중복을 제거 

        self.reservation_table = pre_table.reset_index(drop = True) # index 초기화 
        
        # 공연별 요약 테이블에서 EDA 할 때 용이하도록 날짜도 포함해서 만들도록 하자 

        self.reservation_table['연도'] = pd.to_datetime(self.reservation_table['전체공연시간']).map(lambda x: x.year)
        self.reservation_table['월'] = pd.to_datetime(self.reservation_table['전체공연시간']).map(lambda x: x.month)
        self.reservation_table['연 월'] = pd.to_datetime(self.reservation_table['연도'].astype(str) + '-' + reservation_table['월'].astype(str)) 
        self.reservation_table['일'] = pd.to_datetime(self.reservation_table['전체공연시간']).map(lambda x: x.day)
        self.reservation_table['시간'] = pd.to_datetime(self.reservation_table['전체공연시간']).map(lambda x: x.hour)
        self.reservation_table['요일'] =  pd.to_datetime(self.reservation_table['전체공연시간']).map(lambda x: x.weekday())  # 월요일 0 , 화요일 1 .... 일요일 6 으로 맵핑되도록 변경
        
        return self.reservation_table
    
    def seat_information_condition_table(self,cond_df):
        
        
        self.seat_grouping = cond_df.groupby('seat').agg({'예매여부': np.sum, '좌석 오픈 여부': np.sum, 
                                        '원가격추정':np.mean, '거래까지걸린시간_시간':np.mean,
                                        '거래까지걸린시간_등수' : np.mean, '거래까지걸린시간_등수':[np.max, np.min],
                                        '표준화 등수 점수': np.mean}).reset_index()


        self.seat_grouping['좌석 예매율'] = self.seat_grouping['예매여부'] / self.seat_grouping['좌석 오픈 여부']
        self.seat_grouping = self.seat_grouping.rename(columns = {'price': '좌석 별 평균 가격'})


        self.seat_grouping.columns = [' '.join(col).strip() for col in self.seat_grouping.columns.values]
        
        self.pre_table = self.seat_grouping.reset_index(drop = True)
        

        self.seat_table = pd.merge(pre_table, self.seat_information, how = 'left', on = 'seat')
        
        return self.seat_table

In [11]:
file_path = 'C:/Users/whileduck/Desktop/Github/Concert-Hall-Price-Model/data/'

reservation_table.to_parquet(file_path + 'reservation_table.parquet',index = False)
seat_table.to_parquet(file_path + 'seat_table.parquet', index = False)