In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from datetime import datetime
import warnings

# Plot configuration: use the Malgun Gothic font family (presumably so the Korean
# labels used in this notebook render) and draw minus signs with the ASCII hyphen
# instead of the Unicode minus glyph.
plt.rcParams.update({
    'font.family': 'malgun gothic',
    'axes.unicode_minus': False,
})

# Silence every warning for the rest of the notebook session.
warnings.filterwarnings('ignore')

In [10]:
# Single place to change when the project lives somewhere else on disk.
# The trailing slash is required because file names are appended directly.
DATA_DIR = "C:/Users/whileduck/Desktop/Github/Concert-Hall-Price-Model/data/"

# Row-level records used for every summary table below.
df = pd.read_parquet(DATA_DIR + "거래까지걸린시간_추가_클래식_데이터.parquet")

# Per-seat attributes, merged into the seat summary table later on.
seat_information = pd.read_csv(DATA_DIR + "seat_information_angle_ver2.csv")

display(df.head())
print(df.shape)

Unnamed: 0,seat,층,블록,열,넘버,X,Y,Z,대칭점,좌우시야각,...,원등급추정,누적검색량,콘서트홀클래식_누적검색량,예술의전당클래식_누적검색량,서울클래식_누적검색량,좌석 오픈 여부,거래까지걸린시간,거래까지걸린시간_시간,거래까지걸린시간_등수,표준화 등수 점수
0,1층 A블록1열 1,1층,A블록,1,1,1451,542,-93,1층 E블록1열 9,69.517555,...,2,1050.0,1000.0,0.0,50.0,1,19 days 15:06:00,471.1,38.0,0.339286
1,1층 A블록1열 2,1층,A블록,1,2,1406,555,-93,1층 E블록1열 8,68.459024,...,2,1050.0,1000.0,0.0,50.0,1,19 days 15:06:00,471.1,38.0,0.339286
2,1층 A블록1열 3,1층,A블록,1,3,1361,568,-93,1층 E블록1열 7,67.347261,...,2,1050.0,1000.0,0.0,50.0,1,19 days 15:06:00,471.1,38.0,0.339286
3,1층 A블록1열 4,1층,A블록,1,4,1315,580,-93,1층 E블록1열 6,66.19942,...,2,1050.0,1000.0,0.0,50.0,1,19 days 15:06:00,471.1,38.0,0.339286
4,1층 A블록1열 5,1층,A블록,1,5,1270,591,-93,1층 E블록1열 5,65.044882,...,2,1050.0,1000.0,0.0,50.0,1,19 days 15:06:00,471.1,38.0,0.339286


(415830, 50)


# **데이터 테이블 생성 (공연 별, 좌석 별)**

In [11]:
# Per-performance summary: opened seats, booked seats, mean price and the
# min / mean / max of the hours-until-transaction column.
reservation_grouping = (
    df.groupby('전체공연시간')
    .agg({
        '좌석 오픈 여부': np.sum,
        '예매여부': np.sum,
        'price': np.mean,
        '거래까지걸린시간_시간': [np.min, np.mean, np.max],
    })
    .reset_index()
)

# Booking rate per performance = booked seats / opened seats.
# Columns are still two-level here, hence the (column, aggregation) tuples.
reservation_grouping['공연 예매율'] = (
    reservation_grouping[('예매여부', 'sum')] / reservation_grouping[('좌석 오픈 여부', 'sum')]
)

reservation_grouping = reservation_grouping.rename(columns={'price': '공연 별 평균 가격'})

# Flatten the two-level column index into "<column> <aggregation>" names.
reservation_grouping.columns = [
    ' '.join(levels).strip() for levels in reservation_grouping.columns.values
]

display(reservation_grouping.sample(5))
print(reservation_grouping.shape)

Unnamed: 0,전체공연시간,좌석 오픈 여부 sum,예매여부 sum,공연 별 평균 가격 mean,거래까지걸린시간_시간 amin,거래까지걸린시간_시간 mean,거래까지걸린시간_시간 amax,공연 예매율
97,2022-06-22 19:30:00,2505,1599,8347.091932,14.0,905.891453,1051.25,0.638323
59,2021-01-01 19:30:00,1666,581,33590.361446,11.1,497.866552,898.15,0.348739
69,2021-09-19 17:00:00,1726,1254,52299.84051,208.0,936.759117,1816.916667,0.726535
66,2021-06-24 19:30:00,1829,650,7475.384615,160.016667,1270.747615,1387.316667,0.355385
122,2022-11-15 19:30:00,2505,815,76289.570552,14.0,780.624192,1699.416667,0.325349


(151, 8)


In [12]:
# Build the per-performance table: aggregated metrics plus performance-level metadata.

cols = ['play_date', 'play_st_time','pre_open_date','open_date', 'running_time', 
        'intermission','전체공연시간','콘서트홀클래식_누적검색량','예술의전당클래식_누적검색량', '서울클래식_누적검색량'] # metadata columns to merge in


# De-duplicate the metadata BEFORE merging. Merging against every row of df and
# dropping duplicates afterwards yields the same rows in the same order, but first
# materialises one merged row per row of df.
performance_metadata = df[cols].drop_duplicates()

pre_table = pd.merge(reservation_grouping, performance_metadata, on = '전체공연시간', how = 'left')

reservation_table = pre_table.reset_index(drop = True) # reset the index

In [13]:
# Add calendar features to the per-performance table to make EDA easier.

# Derive the features from reservation_table's own timestamp column. Reading them
# from reservation_grouping (as before) is only correct while both frames have the
# same rows in the same order, because the assignment aligns on the index.
performance_dt = pd.to_datetime(reservation_table['전체공연시간'])

# Element-wise map is kept (rather than the .dt accessor) so the resulting column
# dtypes stay exactly as they were.
reservation_table['연도'] = performance_dt.map(lambda ts: ts.year)
reservation_table['월'] = performance_dt.map(lambda ts: ts.month)
# First day of the performance's month, built from the "<year>-<month>" string.
reservation_table['연 월'] = pd.to_datetime(reservation_table['연도'].astype(str) + '-' + reservation_table['월'].astype(str)) 
reservation_table['일'] = performance_dt.map(lambda ts: ts.day)
reservation_table['시간'] = performance_dt.map(lambda ts: ts.hour)
# Monday = 0, Tuesday = 1, ..., Sunday = 6
reservation_table['요일'] = pd.to_datetime(reservation_table['play_date']).map(lambda ts: ts.weekday())

In [14]:
# Total cumulative search volume = sum of the three keyword-level volumes.
# A missing value in any of the three propagates to the total.
reservation_table['총_누적_검색량'] = (
    reservation_table['콘서트홀클래식_누적검색량']
    .add(reservation_table['예술의전당클래식_누적검색량'])
    .add(reservation_table['서울클래식_누적검색량'])
)

In [15]:
# Preview five random performances and report the table's (rows, columns).
display(reservation_table.sample(n=5))
print(reservation_table.shape)

Unnamed: 0,전체공연시간,좌석 오픈 여부 sum,예매여부 sum,공연 별 평균 가격 mean,거래까지걸린시간_시간 amin,거래까지걸린시간_시간 mean,거래까지걸린시간_시간 amax,공연 예매율,play_date,play_st_time,...,콘서트홀클래식_누적검색량,예술의전당클래식_누적검색량,서울클래식_누적검색량,연도,월,연 월,일,시간,요일,총_누적_검색량
108,2022-09-09 19:30:00,2505,455,94536.263736,11.016667,709.655311,1363.433333,0.181637,2022-09-09,19:30:00,...,940.0,20.0,140.0,2022,9,2022-09-01,9,19,4,1100.0
148,2023-05-23 19:30:00,2505,1698,52592.46172,15.0,935.243375,2012.7,0.677844,2023-05-23,19:30:00,...,850.0,40.0,40.0,2023,5,2023-05-01,23,19,1,930.0
52,2020-06-03 19:30:00,2142,622,6908.360129,326.033333,4275.988585,4557.083333,0.290383,2020-06-03,19:30:00,...,1710.0,0.0,70.0,2020,6,2020-06-01,3,19,2,1780.0
93,2022-05-08 17:00:00,1938,866,85754.04157,11.0,360.121882,905.033333,0.446852,2022-05-08,17:00:00,...,580.0,20.0,30.0,2022,5,2022-05-01,8,17,6,630.0
44,2019-12-25 20:00:00,2505,79,28924.050633,9.633333,298.109072,548.983333,0.031537,2019-12-25,20:00:00,...,2000.0,0.0,90.0,2019,12,2019-12-01,25,20,2,2090.0


(151, 24)


# **좌석 데이터 요약 테이블 생성**

In [16]:
# Per-seat summary across all performances.
# NOTE: the original dict listed '거래까지걸린시간_등수' twice (np.mean, then
# [np.max, np.min]). A dict keeps only the last value for a repeated key, so the
# mean was never computed; the dead entry is removed and the output is unchanged.
seat_grouping = df.groupby('seat').agg({
    '예매여부': np.sum,
    '좌석 오픈 여부': np.sum,
    '원가격추정': np.mean,
    '거래까지걸린시간_시간': np.mean,
    '거래까지걸린시간_등수': [np.max, np.min],
    '표준화 등수 점수': np.mean,
}).reset_index()

# Booking rate per seat = times booked / times opened.
seat_grouping['좌석 예매율'] = seat_grouping['예매여부'] / seat_grouping['좌석 오픈 여부']

# The former rename of 'price' was dropped: no 'price' column is aggregated here,
# so it never matched anything.

# Flatten the two-level column index into "<column> <aggregation>" names.
seat_grouping.columns = [' '.join(col).strip() for col in seat_grouping.columns.values]
pre_table = seat_grouping.reset_index(drop = True)

pre_table.sample(5)

Unnamed: 0,seat,예매여부 sum,좌석 오픈 여부 sum,원가격추정 mean,거래까지걸린시간_시간 mean,거래까지걸린시간_등수 amax,거래까지걸린시간_등수 amin,표준화 등수 점수 mean,좌석 예매율
1460,2층 B블록5열 11,88,156,78132.53012,800.065909,736.0,1.0,0.300998,0.564103
1517,2층 C블록2열 4,94,158,92891.566265,720.807979,985.0,1.0,0.323333,0.594937
2494,합창석 H블록4열 10,30,160,22409.638554,643.247222,942.0,2.0,0.10811,0.1875
1355,2층 A블록7열 4,76,153,50933.73494,749.269298,800.0,4.0,0.264195,0.496732
1577,2층 C블록7열 4,88,156,64277.108434,722.901326,985.0,2.0,0.302438,0.564103


In [17]:
# Align the key column name with the summary table before merging.
seat_information = seat_information.rename(columns={'전체_좌석': 'seat'})

# Left join keeps every summarised seat and attaches its seat attributes.
seat_table = pre_table.merge(seat_information, on='seat', how='left')

display(seat_table.sample(5))

Unnamed: 0,seat,예매여부 sum,좌석 오픈 여부 sum,원가격추정 mean,거래까지걸린시간_시간 mean,거래까지걸린시간_등수 amax,거래까지걸린시간_등수 amin,표준화 등수 점수 mean,좌석 예매율,층,...,넘버,X,Y,Z,대칭점,무대까지의 거리,좌우면적시야각,상하면적시야각,좌우시야각,상하시야각
2325,합창석 G블록1열 4,37,152,23734.939759,789.231081,942.0,3.0,0.158375,0.243421,합창석,...,4,-536,-995,300,합창석 G블록1열 25,1169.325019,53.20593,47.316256,-151.688944,-14.865939
1218,1층 E블록8열 5,87,152,74367.46988,772.632759,942.0,2.0,0.276083,0.572368,1층,...,5,-1405,1242,30,1층 A블록8열 7,1875.497001,30.641652,22.138975,-48.523776,-0.916529
1214,1층 E블록8열 11,84,160,67018.072289,943.084127,967.0,2.0,0.220568,0.525,1층,...,11,-1695,1165,30,1층 A블록8열 1,2056.975936,22.715966,22.666717,-55.498651,-0.835661
97,1층 A블록19열 8,103,161,70662.650602,853.668932,935.0,2.0,0.284256,0.639752,1층,...,8,1456,2328,261,1층 E블록19열 2,2758.195243,26.874375,11.041139,32.023078,-5.429857
199,1층 A블록9열 1,83,165,71445.783133,854.088353,942.0,1.0,0.225739,0.50303,1층,...,1,1722,1256,50,1층 E블록9열 11,2131.97561,22.994338,21.472632,53.893571,-1.343848


# **추후 특정 기준에 따라 요약 테이블을 생성하기 위해 클래스로 만들어두자**

In [18]:
class ConditionSummaryTable:
    """Build per-performance and per-seat summary tables for a subset of the data.

    The two builder methods use only the frame passed to them and the seat
    information given at construction time; they do not read notebook globals.
    """

    def __init__(self,seat_information):
        """Store the seat attribute table, renaming '전체_좌석' to 'seat'.

        Parameters
        ----------
        seat_information : pandas.DataFrame
            Seat attribute table. The input frame itself is not modified.
        """
        self.seat_information = seat_information.rename(columns = {'전체_좌석' : 'seat'})

    def reservation_condition_table(self ,cond_df):
        """Summarise ``cond_df`` per performance ('전체공연시간').

        Parameters
        ----------
        cond_df : pandas.DataFrame
            Row-level records. Must contain the aggregated columns and the
            metadata columns listed in ``cols`` below.

        Returns
        -------
        pandas.DataFrame
            Aggregated metrics (flattened to "<column> <aggregation>" names), the
            booking rate '공연 예매율', the performance metadata and the calendar
            features '연도', '월', '연 월', '일', '시간', '요일'. Also stored on
            ``self.reservation_table``.
        """
        grouped = cond_df.groupby('전체공연시간').agg({
            '좌석 오픈 여부' : np.sum,
            '예매여부' : np.sum,
            'price' : np.mean,
            '거래까지걸린시간_시간' : [np.min, np.mean, np.max],
        }).reset_index()

        # Booking rate per performance = booked seats / opened seats.
        # Columns are still two-level here, hence the (column, aggregation) tuples.
        grouped['공연 예매율'] = grouped[('예매여부', 'sum')] / grouped[('좌석 오픈 여부', 'sum')]

        grouped = grouped.rename(columns = {'price': '공연 별 평균 가격'})

        # Flatten the two-level column index into "<column> <aggregation>" names.
        grouped.columns = [' '.join(col).strip() for col in grouped.columns.values]
        self.reservation_grouping = grouped

        # Performance-level metadata to attach to the aggregated metrics.
        cols = ['play_date', 'play_st_time','pre_open_date','open_date', 'running_time', 
                'intermission','전체공연시간','콘서트홀클래식_누적검색량','예술의전당클래식_누적검색량', '서울클래식_누적검색량']

        # De-duplicate before merging: same rows and order as merge-then-dedup,
        # without building one merged row per input row.
        performance_metadata = cond_df[cols].drop_duplicates()

        # Merge THIS instance's grouping (previously the notebook-global
        # `reservation_grouping` was used, ignoring cond_df's aggregation).
        pre_table = pd.merge(self.reservation_grouping, performance_metadata, on = '전체공연시간', how = 'left')

        self.reservation_table = pre_table.reset_index(drop = True)

        # Calendar features for EDA, all derived from this table's own timestamp.
        performance_dt = pd.to_datetime(self.reservation_table['전체공연시간'])

        self.reservation_table['연도'] = performance_dt.map(lambda ts: ts.year)
        self.reservation_table['월'] = performance_dt.map(lambda ts: ts.month)
        # First day of the performance's month. The month now comes from
        # self.reservation_table (previously the global `reservation_table`).
        self.reservation_table['연 월'] = pd.to_datetime(self.reservation_table['연도'].astype(str) + '-' + self.reservation_table['월'].astype(str)) 
        self.reservation_table['일'] = performance_dt.map(lambda ts: ts.day)
        self.reservation_table['시간'] = performance_dt.map(lambda ts: ts.hour)
        # Monday = 0, Tuesday = 1, ..., Sunday = 6
        self.reservation_table['요일'] = performance_dt.map(lambda ts: ts.weekday())

        return self.reservation_table

    def seat_information_condition_table(self,cond_df):
        """Summarise ``cond_df`` per seat and attach the seat attributes.

        Parameters
        ----------
        cond_df : pandas.DataFrame
            Row-level records. Must contain 'seat' and the aggregated columns.

        Returns
        -------
        pandas.DataFrame
            Aggregated metrics (flattened to "<column> <aggregation>" names), the
            booking rate '좌석 예매율' and the columns of the seat information
            table, left-joined on 'seat'. Also stored on ``self.seat_table``.
        """
        # The original dict listed '거래까지걸린시간_등수' twice; only the last
        # value ([np.max, np.min]) ever took effect, so the dead entry is removed.
        grouped = cond_df.groupby('seat').agg({
            '예매여부': np.sum,
            '좌석 오픈 여부': np.sum,
            '원가격추정': np.mean,
            '거래까지걸린시간_시간': np.mean,
            '거래까지걸린시간_등수': [np.max, np.min],
            '표준화 등수 점수': np.mean,
        }).reset_index()

        # Booking rate per seat = times booked / times opened.
        grouped['좌석 예매율'] = grouped[('예매여부', 'sum')] / grouped[('좌석 오픈 여부', 'sum')]

        # Flatten the two-level column index into "<column> <aggregation>" names.
        grouped.columns = [' '.join(col).strip() for col in grouped.columns.values]
        self.seat_grouping = grouped

        self.pre_table = self.seat_grouping.reset_index(drop = True)

        # Merge THIS instance's summary (previously the notebook-global
        # `pre_table` was merged, so cond_df had no effect on the result).
        self.seat_table = pd.merge(self.pre_table, self.seat_information, how = 'left', on = 'seat')

        return self.seat_table

In [None]:
# Output directory for the summary tables (trailing slash required: file names
# are appended directly).
file_path = 'C:/Users/whileduck/Desktop/Github/Concert-Hall-Price-Model/data/'

# Persist both summary tables as parquet, without writing the index.
reservation_table.to_parquet(f'{file_path}reservation_table.parquet', index=False)
seat_table.to_parquet(f'{file_path}seat_table.parquet', index=False)