In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from datetime import datetime
import warnings

plt.rc('font', family='malgun gothic')
plt.rc('axes' , unicode_minus = False)
warnings.filterwarnings(action = 'ignore')

In [2]:
df = pd.read_parquet("C:/Users/whileduck/Desktop/Github/Concert-Hall-Price-Model/data/거래까지걸린시간_추가_클래식_데이터.parquet")
seat_information = pd.read_csv("C:/Users/whileduck/Desktop/Github/Concert-Hall-Price-Model/data/seat_information_angle_ver2.csv")
display(df.head())
print(df.shape)

Unnamed: 0,seat,층,블록,열,넘버,X,Y,Z,대칭점,좌우시야각,...,원등급추정,누적검색량,콘서트홀클래식_누적검색량,예술의전당클래식_누적검색량,서울클래식_누적검색량,좌석 오픈 여부,거래까지걸린시간,거래까지걸린시간_시간,거래까지걸린시간_등수,표준화 등수 점수
0,1층 A블록1열 1,1층,A블록,1,1,1451,542,-93,1층 E블록1열 9,69.517555,...,2,1050.0,1000.0,0.0,50.0,1,19 days 15:06:00,471.1,38.0,0.339286
1,1층 A블록1열 2,1층,A블록,1,2,1406,555,-93,1층 E블록1열 8,68.459024,...,2,1050.0,1000.0,0.0,50.0,1,19 days 15:06:00,471.1,38.0,0.339286
2,1층 A블록1열 3,1층,A블록,1,3,1361,568,-93,1층 E블록1열 7,67.347261,...,2,1050.0,1000.0,0.0,50.0,1,19 days 15:06:00,471.1,38.0,0.339286
3,1층 A블록1열 4,1층,A블록,1,4,1315,580,-93,1층 E블록1열 6,66.19942,...,2,1050.0,1000.0,0.0,50.0,1,19 days 15:06:00,471.1,38.0,0.339286
4,1층 A블록1열 5,1층,A블록,1,5,1270,591,-93,1층 E블록1열 5,65.044882,...,2,1050.0,1000.0,0.0,50.0,1,19 days 15:06:00,471.1,38.0,0.339286


(415830, 50)


# **데이터 테이블 생성 (공연 별, 좌석 별)**

In [3]:
reservation_grouping = df.groupby('전체공연시간').agg({'좌석 오픈 여부' : np.sum, 
                                    '예매여부' : np.sum, '원가격추정':[np.min , np.mean , np.max],'거래까지걸린시간_시간' : [np.min,np.mean,np.max]}).reset_index()
reservation_grouping['공연 예매율'] = reservation_grouping['예매여부'] / reservation_grouping['좌석 오픈 여부'] # 공연 별 예매율 계산 

reservation_grouping = reservation_grouping.rename(columns = {'price': '공연 별 평균 가격'})

reservation_grouping.columns = [' '.join(col).strip() for col in reservation_grouping.columns.values]

display(reservation_grouping.sample(5))
print(reservation_grouping.shape)

Unnamed: 0,전체공연시간,좌석 오픈 여부 sum,예매여부 sum,원가격추정 amin,원가격추정 mean,원가격추정 amax,거래까지걸린시간_시간 amin,거래까지걸린시간_시간 mean,거래까지걸린시간_시간 amax,공연 예매율
61,2021-03-20 17:00:00,1891,469,0,47285.429142,70000,16.0,426.184684,929.816667,0.248017
74,2021-10-24 17:00:00,1719,753,0,63471.057884,120000,13.0,515.392209,761.7,0.438045
69,2021-09-19 17:00:00,1726,1254,0,73415.169661,110000,208.0,936.759117,1816.916667,0.726535
50,2020-02-22 17:00:00,2505,960,0,67241.516966,120000,11.25,972.628455,1217.583333,0.383234
59,2021-01-01 19:30:00,1666,581,0,55305.389222,80000,11.1,497.866552,898.15,0.348739


(151, 10)


In [4]:
# 공연별 예매율 테이블 생성

cols = ['play_date', 'play_st_time','pre_open_date','open_date', 'running_time', 
        'intermission','전체공연시간','콘서트홀클래식_누적검색량','예술의전당클래식_누적검색량', '서울클래식_누적검색량'] # 합병해주자 


pre_table = pd.merge(reservation_grouping,df[cols], on = '전체공연시간', how = 'left').drop_duplicates() # merge 후 중복을 제거 

reservation_table = pre_table.reset_index(drop = True) # index 초기화 

In [5]:
# 공연별 요약 테이블에서 EDA 할 때 용이하도록 날짜도 포함해서 만들도록 하자 

reservation_table['연도'] = pd.to_datetime(reservation_grouping['전체공연시간']).map(lambda x: x.year)
reservation_table['월'] = pd.to_datetime(reservation_grouping['전체공연시간']).map(lambda x: x.month)
reservation_table['연 월'] = pd.to_datetime(reservation_table['연도'].astype(str) + '-' + reservation_table['월'].astype(str)) 
reservation_table['일'] = pd.to_datetime(reservation_grouping['전체공연시간']).map(lambda x: x.day)
reservation_table['시간'] = pd.to_datetime(reservation_grouping['전체공연시간']).map(lambda x: x.hour)
reservation_table['요일'] =  pd.to_datetime(reservation_table['play_date']).map(lambda x: x.weekday())  # 월요일 0 , 화요일 1 .... 일요일 6 으로 맵핑되도록 변경

In [6]:
reservation_table['총_누적_검색량'] = reservation_table['콘서트홀클래식_누적검색량'] + reservation_table['예술의전당클래식_누적검색량'] + reservation_table['서울클래식_누적검색량']

In [7]:
display(reservation_table.sample(5))
print(reservation_table.shape)

Unnamed: 0,전체공연시간,좌석 오픈 여부 sum,예매여부 sum,원가격추정 amin,원가격추정 mean,원가격추정 amax,거래까지걸린시간_시간 amin,거래까지걸린시간_시간 mean,거래까지걸린시간_시간 amax,공연 예매율,...,콘서트홀클래식_누적검색량,예술의전당클래식_누적검색량,서울클래식_누적검색량,연도,월,연 월,일,시간,요일,총_누적_검색량
149,2023-06-01 19:30:00,2505,931,0,96419.161677,180000,14.0,1157.557859,2228.3,0.371657,...,850.0,40.0,40.0,2023,6,2023-06-01,1,19,3,930.0
32,2019-10-15 20:00:00,2505,2140,0,106956.087824,150000,135.183333,714.241877,1123.716667,0.854291,...,790.0,0.0,50.0,2019,10,2019-10-01,15,20,1,840.0
75,2021-10-26 19:30:00,1660,1224,0,88249.500998,130000,14.016667,226.250694,1003.116667,0.737349,...,740.0,0.0,10.0,2021,10,2021-10-01,26,19,1,750.0
126,2022-12-01 20:00:00,2505,597,0,59682.634731,120000,-32.266667,2029.66407,3716.05,0.238323,...,1030.0,0.0,120.0,2022,12,2022-12-01,1,20,3,1150.0
123,2022-11-19 17:00:00,2505,982,0,62339.321357,110000,15.016667,1770.802919,2105.366667,0.392016,...,750.0,0.0,60.0,2022,11,2022-11-01,19,17,5,810.0


(151, 26)


# **좌석 데이터 요약 테이블 생성**

In [8]:
seat_grouping = df.groupby('seat').agg({'예매여부': np.sum, '좌석 오픈 여부': np.sum, 
                                        '원가격추정':np.mean, '거래까지걸린시간_시간':np.mean,
                                        '거래까지걸린시간_등수' : np.mean, '거래까지걸린시간_등수':[np.max, np.min],
                                        '표준화 등수 점수': np.mean}).reset_index()

seat_grouping['좌석 예매율'] = seat_grouping['예매여부'] / seat_grouping['좌석 오픈 여부']
seat_grouping = seat_grouping.rename(columns = {'price': '좌석 별 평균 가격'})


seat_grouping.columns = [' '.join(col).strip() for col in seat_grouping.columns.values]
pre_table = seat_grouping.reset_index(drop = True)

pre_table.sample(5)

Unnamed: 0,seat,예매여부 sum,좌석 오픈 여부 sum,원가격추정 mean,거래까지걸린시간_시간 mean,거래까지걸린시간_등수 amax,거래까지걸린시간_등수 amin,표준화 등수 점수 mean,좌석 예매율
1248,2층 A블록1열 7,110,160,66204.819277,612.749242,967.0,2.0,0.384906,0.6875
487,1층 C블록10열 6,110,163,111445.783133,618.302576,985.0,1.0,0.403866,0.674847
143,1층 A블록3열 5,96,158,78463.855422,614.225,985.0,1.0,0.309509,0.607595
1684,2층 E블록2열 1,88,155,54548.192771,563.094129,749.0,1.0,0.341953,0.567742
1006,1층 D블록8열 3,85,142,106445.783133,695.105686,942.0,2.0,0.34884,0.598592


In [9]:
# merge 하기 전에 이름을 변경해주자

seat_information = seat_information.rename(columns = {'전체_좌석' : 'seat'})

seat_table = pd.merge(pre_table, seat_information, how = 'left', on = 'seat')

display(seat_table.sample(5))

Unnamed: 0,seat,예매여부 sum,좌석 오픈 여부 sum,원가격추정 mean,거래까지걸린시간_시간 mean,거래까지걸린시간_등수 amax,거래까지걸린시간_등수 amin,표준화 등수 점수 mean,좌석 예매율,층,...,넘버,X,Y,Z,대칭점,무대까지의 거리,좌우면적시야각,상하면적시야각,좌우시야각,상하시야각
2207,3층 N블록1열 15,64,159,35000.0,787.075781,746.0,3.0,0.186529,0.402516,3층,...,15,-1018,3329,1583,3층 M블록1열 1,3824.193248,21.843581,7.503743,-17.003503,-24.452802
1609,2층 D블록3열 6,81,150,85060.240964,851.726749,985.0,2.0,0.229294,0.54,2층,...,6,-674,2498,717,2층 B블록3열 6,2684.840591,31.806457,7.680469,-15.099741,-15.489095
1061,1층 E블록13열 12,68,147,62108.433735,963.227206,967.0,2.0,0.192662,0.462585,1층,...,12,-1838,1618,130,1층 A블록13열 1,2452.155786,22.822824,17.487426,-48.642386,-3.038936
1429,2층 B블록2열 3,87,160,85301.204819,792.665709,985.0,7.0,0.235053,0.54375,2층,...,3,782,2387,675,2층 D블록2열 8,2600.945597,32.36597,8.572955,18.139203,-15.041642
1146,1층 E블록21열 7,99,165,60963.855422,828.944276,985.0,2.0,0.236739,0.6,1층,...,7,-1709,2465,308,1층 A블록21열 1,3015.256208,23.644159,10.959703,-34.733813,-5.86283


# **추후 특정 기준에 따라 요약 테이블을 생성하기 위해 함수로 만들어두자**

In [10]:
class ConditionSummaryTable:
    
    def __init__(self,seat_information):
        
        self.seat_information = seat_information
        self.seat_information = self.seat_information.rename(columns = {'전체_좌석' : 'seat'})
        
    def reservation_condition_table(self ,cond_df):
        
        
        self.reservation_grouping = cond_df.groupby('전체공연시간').agg({'좌석 오픈 여부' : np.sum, 
                                    '예매여부' : np.sum, '원가격추정':[np.min, np.mean, np.max],'거래까지걸린시간_시간' : [np.min,np.mean,np.max]}).reset_index()
        
        
        self.reservation_grouping['공연 예매율'] = self.reservation_grouping['예매여부'] / self.reservation_grouping['좌석 오픈 여부'] # 공연 별 예매율 계산 

        self.reservation_grouping = self.reservation_grouping.rename(columns = {'price': '공연 별 평균 가격'})

        self.reservation_grouping.columns = [' '.join(col).strip() for col in self.reservation_grouping.columns.values]

        # 공연별 예매율 테이블 생성

        cols = ['play_date', 'play_st_time','pre_open_date','open_date', 'running_time', 
                'intermission','전체공연시간','콘서트홀클래식_누적검색량','예술의전당클래식_누적검색량', '서울클래식_누적검색량'] # 합병해주자 


        pre_table = pd.merge(reservation_grouping,cond_df[cols], on = '전체공연시간', how = 'left').drop_duplicates() # merge 후 중복을 제거 

        self.reservation_table = pre_table.reset_index(drop = True) # index 초기화 
        
        # 공연별 요약 테이블에서 EDA 할 때 용이하도록 날짜도 포함해서 만들도록 하자 

        self.reservation_table['연도'] = pd.to_datetime(self.reservation_table['전체공연시간']).map(lambda x: x.year)
        self.reservation_table['월'] = pd.to_datetime(self.reservation_table['전체공연시간']).map(lambda x: x.month)
        self.reservation_table['연 월'] = pd.to_datetime(self.reservation_table['연도'].astype(str) + '-' + reservation_table['월'].astype(str)) 
        self.reservation_table['일'] = pd.to_datetime(self.reservation_table['전체공연시간']).map(lambda x: x.day)
        self.reservation_table['시간'] = pd.to_datetime(self.reservation_table['전체공연시간']).map(lambda x: x.hour)
        self.reservation_table['요일'] =  pd.to_datetime(self.reservation_table['전체공연시간']).map(lambda x: x.weekday())  # 월요일 0 , 화요일 1 .... 일요일 6 으로 맵핑되도록 변경
        
        return self.reservation_table
    
    def seat_information_condition_table(self,cond_df):
        
        
        self.seat_grouping = cond_df.groupby('seat').agg({'예매여부': np.sum, '좌석 오픈 여부': np.sum, 
                                        '원가격추정':np.mean, '거래까지걸린시간_시간':np.mean,
                                        '거래까지걸린시간_등수' : np.mean, '거래까지걸린시간_등수':[np.max, np.min],
                                        '표준화 등수 점수': np.mean}).reset_index()


        self.seat_grouping['좌석 예매율'] = self.seat_grouping['예매여부'] / self.seat_grouping['좌석 오픈 여부']
        self.seat_grouping = self.seat_grouping.rename(columns = {'price': '좌석 별 평균 가격'})


        self.seat_grouping.columns = [' '.join(col).strip() for col in self.seat_grouping.columns.values]
        
        self.pre_table = self.seat_grouping.reset_index(drop = True)
        

        self.seat_table = pd.merge(pre_table, self.seat_information, how = 'left', on = 'seat')
        
        return self.seat_table

In [11]:
file_path = 'C:/Users/whileduck/Desktop/Github/Concert-Hall-Price-Model/data/'

reservation_table.to_parquet(file_path + 'reservation_table.parquet',index = False)
seat_table.to_parquet(file_path + 'seat_table.parquet', index = False)