In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from datetime import datetime
import warnings

# Set the default matplotlib font family to 'malgun gothic'
# (presumably so Korean labels render; confirm the font is installed on the host).
plt.rc('font', family='malgun gothic')
# Draw minus signs on axes with the ASCII hyphen instead of the Unicode minus glyph.
plt.rc('axes' , unicode_minus = False)
# NOTE(review): this silences every warning for the whole session, including
# deprecation warnings raised by pandas/NumPy.
warnings.filterwarnings(action = 'ignore')

In [2]:
# Load the transaction-level data (parquet) and the seat geometry table (csv).
# NOTE(review): both paths are absolute and machine-specific, so this cell only
# runs on the original author's machine; consider a configurable data directory.
df = pd.read_parquet("C:/Users/whileduck/Desktop/Github/Concert-Hall-Price-Model/data/거래까지걸린시간_추가_클래식_데이터.parquet")
seat_information = pd.read_csv("C:/Users/whileduck/Desktop/Github/Concert-Hall-Price-Model/data/seat_information_angle_ver2.csv")
# Preview the first rows, then report the (rows, columns) shape.
display(df.head())
print(df.shape)

Unnamed: 0,seat,층,블록,열,넘버,X,Y,Z,대칭점,좌우시야각,...,원등급추정,누적검색량,콘서트홀클래식_누적검색량,예술의전당클래식_누적검색량,서울클래식_누적검색량,좌석 오픈 여부,거래까지걸린시간,거래까지걸린시간_시간,거래까지걸린시간_등수,표준화 등수 점수
0,1층 A블록1열 1,1층,A블록,1,1,1451,542,-93,1층 E블록1열 9,69.517555,...,2,1050.0,1000.0,0.0,50.0,1,19 days 15:06:00,471.1,38.0,0.339286
1,1층 A블록1열 2,1층,A블록,1,2,1406,555,-93,1층 E블록1열 8,68.459024,...,2,1050.0,1000.0,0.0,50.0,1,19 days 15:06:00,471.1,38.0,0.339286
2,1층 A블록1열 3,1층,A블록,1,3,1361,568,-93,1층 E블록1열 7,67.347261,...,2,1050.0,1000.0,0.0,50.0,1,19 days 15:06:00,471.1,38.0,0.339286
3,1층 A블록1열 4,1층,A블록,1,4,1315,580,-93,1층 E블록1열 6,66.19942,...,2,1050.0,1000.0,0.0,50.0,1,19 days 15:06:00,471.1,38.0,0.339286
4,1층 A블록1열 5,1층,A블록,1,5,1270,591,-93,1층 E블록1열 5,65.044882,...,2,1050.0,1000.0,0.0,50.0,1,19 days 15:06:00,471.1,38.0,0.339286


(378255, 50)


# **데이터 테이블 생성 (공연 별, 좌석 별)**

In [3]:
# Aggregate the raw data to one row per performance, keyed by '전체공연시간':
# open-seat and booked-seat counts, plus min/mean/max of the estimated original
# price and of the time-to-transaction (hours).
reservation_grouping = (
    df.groupby('전체공연시간')
    .agg({
        '좌석 오픈 여부': np.sum,
        '예매여부': np.sum,
        '원가격추정': [np.min, np.mean, np.max],
        '거래까지걸린시간_시간': [np.min, np.mean, np.max],
    })
    .reset_index()
)

# Flatten the two-level column index into '<column> <aggregation>' names.
# The group key has an empty second level, so strip() leaves it as '전체공연시간'.
# NOTE(review): the suffix produced for np.min / np.max ('amin' / 'amax' in the
# recorded output) appears to depend on the installed pandas / NumPy versions.
reservation_grouping.columns = [' '.join(col).strip() for col in reservation_grouping.columns.values]

# Booking rate per performance = booked seats / open seats.
# Computed on the flattened columns so plain Series are divided. The former
# rename of a 'price' column was dropped: no such column exists, so it did nothing.
reservation_grouping['공연 예매율'] = (
    reservation_grouping['예매여부 sum'] / reservation_grouping['좌석 오픈 여부 sum']
)

display(reservation_grouping.sample(5))
print(reservation_grouping.shape)

Unnamed: 0,전체공연시간,좌석 오픈 여부 sum,예매여부 sum,원가격추정 amin,원가격추정 mean,원가격추정 amax,거래까지걸린시간_시간 amin,거래까지걸린시간_시간 mean,거래까지걸린시간_시간 amax,공연 예매율
49,2020-02-11 19:30:00,2505,2012,30000,64898.203593,120000,10.016667,730.840929,1195.733333,0.803194
7,2019-03-14 20:00:00,2505,1741,60000,162225.548902,260000,11.0,554.179743,1915.816667,0.69501
26,2019-09-08 17:00:00,2505,1992,5000,48491.017964,65000,159.833333,362.580924,928.883333,0.79521
128,2022-12-08 19:30:00,2505,1424,5000,73984.031936,100000,16.0,524.827645,1387.8,0.568463
134,2022-12-31 17:00:00,2505,996,5000,70203.592814,120000,-32.25,1359.466332,4433.1,0.397605


(151, 10)


In [4]:
# Build the per-performance table by attaching performance-level attributes
# (dates, running time, cumulative search volumes) to the aggregated figures.

cols = ['play_date', 'play_st_time','pre_open_date','open_date', 'running_time',
        'intermission','전체공연시간','콘서트홀클래식_누적검색량','예술의전당클래식_누적검색량', '서울클래식_누적검색량']

# De-duplicate the attribute rows BEFORE merging. `reservation_grouping` has one
# row per '전체공연시간', so this yields the same rows in the same order as merging
# against the full frame and de-duplicating afterwards, without materialising a
# merge result as long as `df`.
pre_table = pd.merge(reservation_grouping, df[cols].drop_duplicates(), on = '전체공연시간', how = 'left')

reservation_table = pre_table.reset_index(drop = True) # contiguous 0..n-1 index

In [5]:
# Add calendar features to the per-performance table to make EDA easier.
# The timestamps are read from `reservation_table` itself. Reading them from
# `reservation_grouping` relied on both frames sharing the same index labels,
# which silently misaligns rows if the merge ever yields more than one row per
# performance.
performance_time = pd.to_datetime(reservation_table['전체공연시간'])

reservation_table['연도'] = performance_time.dt.year
reservation_table['월'] = performance_time.dt.month
# First day of the performance's month, as a timestamp (year-month bucket).
reservation_table['연 월'] = performance_time.dt.to_period('M').dt.to_timestamp()
reservation_table['일'] = performance_time.dt.day
reservation_table['시간'] = performance_time.dt.hour
# Weekday of the performance date: Monday = 0, Tuesday = 1, ..., Sunday = 6.
reservation_table['요일'] = pd.to_datetime(reservation_table['play_date']).dt.weekday

In [6]:
# Total cumulative search volume: element-wise sum of the three keyword-level
# cumulative search columns (a missing value in any of them makes the total missing).
reservation_table['총_누적_검색량'] = reservation_table['콘서트홀클래식_누적검색량'] + reservation_table['예술의전당클래식_누적검색량'] + reservation_table['서울클래식_누적검색량']

In [7]:
# Show five random rows of the per-performance table and its (rows, columns) shape.
# The sample is unseeded, so the displayed rows differ between runs.
display(reservation_table.sample(5))
print(reservation_table.shape)

Unnamed: 0,전체공연시간,좌석 오픈 여부 sum,예매여부 sum,원가격추정 amin,원가격추정 mean,원가격추정 amax,거래까지걸린시간_시간 amin,거래까지걸린시간_시간 mean,거래까지걸린시간_시간 amax,공연 예매율,...,콘서트홀클래식_누적검색량,예술의전당클래식_누적검색량,서울클래식_누적검색량,연도,월,연 월,일,시간,요일,총_누적_검색량
95,2022-06-09 19:30:00,2325,268,30000,53399.201597,80000,16.016667,650.624192,1052.016667,0.115269,...,470.0,0.0,30.0,2022,6,2022-06-01,9,19,3,500.0
44,2019-12-25 20:00:00,2505,79,5000,42157.684631,70000,9.633333,298.109072,548.983333,0.031537,...,2000.0,0.0,90.0,2019,12,2019-12-01,25,20,2,2090.0
94,2022-05-21 17:00:00,1933,1252,30000,67269.461078,90000,14.0,927.088578,2272.783333,0.647698,...,520.0,0.0,40.0,2022,5,2022-05-01,21,17,5,560.0
19,2019-07-19 20:00:00,2505,1675,5000,62674.650699,150000,66.9,772.142716,1100.133333,0.668663,...,1390.0,20.0,40.0,2019,7,2019-07-01,19,20,4,1450.0
79,2021-11-11 19:30:00,1631,839,40000,71409.181637,100000,13.016667,523.815872,691.483333,0.514408,...,570.0,0.0,40.0,2021,11,2021-11-01,11,19,3,610.0


(151, 26)


# **좌석 데이터 요약 테이블 생성**

In [8]:
# Aggregate the raw data to one row per seat: booked / open counts, mean estimated
# price, mean time-to-transaction (hours), max / min transaction rank, and the
# mean standardised rank score.
# NOTE: the original dict listed '거래까지걸린시간_등수' twice. In a dict literal the
# later value replaces the earlier one, so the np.mean entry was never applied.
# Only the effective [np.max, np.min] entry is kept, which leaves the output
# columns unchanged. Add np.mean to that list if the mean rank is wanted.
seat_grouping = (
    df.groupby('seat')
    .agg({
        '예매여부': np.sum,
        '좌석 오픈 여부': np.sum,
        '원가격추정': np.mean,
        '거래까지걸린시간_시간': np.mean,
        '거래까지걸린시간_등수': [np.max, np.min],
        '표준화 등수 점수': np.mean,
    })
    .reset_index()
)

# Flatten the two-level column index into '<column> <aggregation>' names.
# The group key has an empty second level, so strip() leaves it as 'seat'.
seat_grouping.columns = [' '.join(col).strip() for col in seat_grouping.columns.values]

# Booking rate per seat = times booked / times open.
# Computed on the flattened columns so plain Series are divided. The former
# rename of a 'price' column was dropped: no such column exists, so it did nothing.
seat_grouping['좌석 예매율'] = seat_grouping['예매여부 sum'] / seat_grouping['좌석 오픈 여부 sum']

pre_table = seat_grouping.reset_index(drop = True)

pre_table.sample(5)

Unnamed: 0,seat,예매여부 sum,좌석 오픈 여부 sum,원가격추정 mean,거래까지걸린시간_시간 mean,거래까지걸린시간_등수 amax,거래까지걸린시간_등수 amin,표준화 등수 점수 mean,좌석 예매율
2025,3층 D블록7열 1,75,145,47847.682119,713.014889,955.0,2.0,0.295024,0.517241
646,1층 C블록20열 8,91,148,105695.364238,797.134799,967.0,2.0,0.297466,0.614865
1119,1층 E블록19열 6,68,131,71622.516556,935.65049,967.0,2.0,0.223294,0.519084
248,1층 B블록13열 12,86,135,110993.377483,739.31686,935.0,1.0,0.37088,0.637037
1348,2층 A블록7열 12,71,136,63046.357616,898.632864,985.0,6.0,0.252364,0.522059


In [9]:
# Align the join key name with the aggregated table before merging:
# the seat geometry file calls the seat identifier '전체_좌석'.
seat_information = seat_information.rename(columns={'전체_좌석': 'seat'})

# Left join keeps every aggregated seat row and attaches its geometry columns.
seat_table = pre_table.merge(seat_information, on='seat', how='left')

display(seat_table.sample(5))

Unnamed: 0,seat,예매여부 sum,좌석 오픈 여부 sum,원가격추정 mean,거래까지걸린시간_시간 mean,거래까지걸린시간_등수 amax,거래까지걸린시간_등수 amin,표준화 등수 점수 mean,좌석 예매율,층,...,넘버,X,Y,Z,대칭점,무대까지의 거리,좌우면적시야각,상하면적시야각,좌우시야각,상하시야각
2007,3층 D블록5열 4,68,136,51456.953642,721.054902,749.0,2.0,0.278243,0.5,3층,...,4,124,2953,1328,3층 D블록5열 9,3240.242121,26.959897,7.282811,2.404506,-24.195156
2176,3층 M블록1열 14,64,140,40728.476821,495.51875,749.0,2.0,0.288848,0.457143,3층,...,14,370,3378,1583,3층 N블록1열 2,3748.822882,23.077667,6.718987,6.250818,-24.977725
1726,2층 E블록4열 15,57,137,60165.562914,853.387427,985.0,8.0,0.195913,0.416058,2층,...,15,-1672,1929,760,2층 A블록4열 5,2663.498639,25.228664,14.803632,-40.917775,-16.579112
330,1층 B블록19열 5,80,136,95894.039735,876.967083,985.0,2.0,0.26973,0.588235,1층,...,5,1048,2403,261,1층 D블록19열 10,2634.546261,30.714493,8.750294,23.563052,-5.685521
239,1층 B블록12열 4,83,137,106655.629139,828.689357,967.0,2.0,0.301074,0.605839,1층,...,4,969,1716,109,1층 D블록12열 9,1973.701599,39.508042,13.711404,29.452779,-3.165838


# **추후 특정 기준에 따라 요약 테이블을 생성할 수 있도록 클래스로 만들어두자**

In [10]:
class ConditionSummaryTable:
    """Build per-performance and per-seat summary tables for a subset of the data.

    Both builders use only the frame passed to them and the seat table given at
    construction time; they do not read any notebook-level variables.

    Attributes set by the builders (kept for inspection after a call):
        reservation_grouping: flattened per-performance aggregation.
        reservation_table: per-performance table with attributes and date features.
        seat_grouping / pre_table: flattened per-seat aggregation.
        seat_table: per-seat aggregation joined with the seat information.
    """

    def __init__(self,seat_information):
        """Store the seat information table with its key column named 'seat'.

        Args:
            seat_information: DataFrame describing each seat. A '전체_좌석' column,
                if present, is renamed to 'seat' so it can be used as the join key.
                The caller's frame is not modified.
        """
        self.seat_information = seat_information.rename(columns = {'전체_좌석' : 'seat'})

    def reservation_condition_table(self ,cond_df):
        """Summarise `cond_df` to one row per performance ('전체공연시간').

        Args:
            cond_df: DataFrame containing the aggregated columns ('좌석 오픈 여부',
                '예매여부', '원가격추정', '거래까지걸린시간_시간') and the performance-level
                columns listed in `cols` below.

        Returns:
            DataFrame with the aggregated figures, the booking rate '공연 예매율',
            the performance-level columns, and the date features '연도', '월',
            '연 월', '일', '시간', '요일'. Also stored as `self.reservation_table`.
        """
        grouping = (
            cond_df.groupby('전체공연시간')
            .agg({
                '좌석 오픈 여부': np.sum,
                '예매여부': np.sum,
                '원가격추정': [np.min, np.mean, np.max],
                '거래까지걸린시간_시간': [np.min, np.mean, np.max],
            })
            .reset_index()
        )

        # Flatten the two-level column index into '<column> <aggregation>' names.
        grouping.columns = [' '.join(col).strip() for col in grouping.columns.values]

        # Booking rate per performance = booked seats / open seats.
        grouping['공연 예매율'] = grouping['예매여부 sum'] / grouping['좌석 오픈 여부 sum']
        self.reservation_grouping = grouping

        # Performance-level columns carried over from the input frame.
        cols = ['play_date', 'play_st_time','pre_open_date','open_date', 'running_time',
                'intermission','전체공연시간','콘서트홀클래식_누적검색량','예술의전당클래식_누적검색량', '서울클래식_누적검색량']

        # Merge THIS call's aggregation (previously the notebook-global
        # `reservation_grouping` was used, so `cond_df` was ignored). The attribute
        # rows are de-duplicated first, which gives the same rows as de-duplicating
        # after the merge but keeps the intermediate small.
        pre_table = pd.merge(self.reservation_grouping, cond_df[cols].drop_duplicates(),
                             on = '전체공연시간', how = 'left')

        self.reservation_table = pre_table.reset_index(drop = True)

        # Date features, all derived from this table's own timestamps
        # (the year-month column previously read the month from a notebook global).
        performance_time = pd.to_datetime(self.reservation_table['전체공연시간'])

        self.reservation_table['연도'] = performance_time.dt.year
        self.reservation_table['월'] = performance_time.dt.month
        # First day of the performance's month, as a timestamp.
        self.reservation_table['연 월'] = performance_time.dt.to_period('M').dt.to_timestamp()
        self.reservation_table['일'] = performance_time.dt.day
        self.reservation_table['시간'] = performance_time.dt.hour
        # Monday = 0, Tuesday = 1, ..., Sunday = 6.
        self.reservation_table['요일'] = performance_time.dt.weekday

        return self.reservation_table

    def seat_information_condition_table(self,cond_df):
        """Summarise `cond_df` to one row per seat and attach the seat information.

        Args:
            cond_df: DataFrame containing 'seat' and the aggregated columns
                ('예매여부', '좌석 오픈 여부', '원가격추정', '거래까지걸린시간_시간',
                '거래까지걸린시간_등수', '표준화 등수 점수').

        Returns:
            DataFrame with one row per seat: the aggregated figures, the booking
            rate '좌석 예매율', and the columns of the seat information table
            (left join on 'seat'). Also stored as `self.seat_table`.
        """
        # The original dict listed '거래까지걸린시간_등수' twice; the later value wins
        # in a dict literal, so only [np.max, np.min] was ever applied. The dead
        # np.mean entry is removed, leaving the output columns unchanged.
        grouping = (
            cond_df.groupby('seat')
            .agg({
                '예매여부': np.sum,
                '좌석 오픈 여부': np.sum,
                '원가격추정': np.mean,
                '거래까지걸린시간_시간': np.mean,
                '거래까지걸린시간_등수': [np.max, np.min],
                '표준화 등수 점수': np.mean,
            })
            .reset_index()
        )

        # Flatten the two-level column index into '<column> <aggregation>' names.
        grouping.columns = [' '.join(col).strip() for col in grouping.columns.values]

        # Booking rate per seat = times booked / times open.
        grouping['좌석 예매율'] = grouping['예매여부 sum'] / grouping['좌석 오픈 여부 sum']
        self.seat_grouping = grouping

        self.pre_table = self.seat_grouping.reset_index(drop = True)

        # Join THIS call's aggregation (previously the notebook-global `pre_table`
        # was used, so `cond_df` was ignored).
        self.seat_table = pd.merge(self.pre_table, self.seat_information, how = 'left', on = 'seat')

        return self.seat_table

In [11]:
# Output directory for the summary tables.
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
file_path = 'C:/Users/whileduck/Desktop/Github/Concert-Hall-Price-Model/data/'

# Persist both summary tables as parquet files, without writing the index.
reservation_table.to_parquet(file_path + 'reservation_table.parquet',index = False)
seat_table.to_parquet(file_path + 'seat_table.parquet', index = False)