## 예술의전당 콘서트홀 가격 모델

### 1. 라이브러리

In [347]:
# base
import pandas as pd
import numpy as np
import re

# etc.
import sys
from glob import glob
import warnings
from tqdm import tqdm
import time 
import datetime as dt 
from scipy import stats


# visualization
import matplotlib.pyplot as plt
import koreanize_matplotlib
import plotly.express as px
import seaborn as sns
import matplotlib.colors as mcolors
from matplotlib.cm import ScalarMappable

# M.L
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, silhouette_score
from kneed import KneeLocator

In [348]:
# settings
%matplotlib inline
pd.set_option("display.max_columns", 100)
pd.set_option('display.max_rows', 200)
warnings.filterwarnings(action = 'ignore')


### 2. 데이터셋 로드

In [349]:
# 기존의 제공된 데이터프레임에 빈좌석, 좌표, 무대와의 관계가 추가된 데이터프레임
file_name = "빈좌석_포함_클래식_데이터.parquet"
df = pd.read_parquet(glob(file_name)[0])

# 필요없는 컬럼 삭제
df = df.drop(['membership_type_1', 'membership_type_2', 'membership_type_3', 
              'membership_type_4', 'membership_type_5', 'membership_type_6',], axis=1)

print(f"""Found.. {len(glob(file_name))} file(s) : {glob(file_name)}
Reading.. {glob(file_name)[-1]}
df.shape : {df.shape}""")


Found.. 1 file(s) : ['빈좌석_포함_클래식_데이터.parquet']
Reading.. 빈좌석_포함_클래식_데이터.parquet
df.shape : (405810, 36)


In [350]:
df.loc[df['층'] == '합창석', 'Y'].describe()

count    44388.000000
mean      -793.423358
std        418.705553
min      -1295.000000
25%      -1195.000000
50%       -995.000000
75%       -410.000000
max         91.000000
Name: Y, dtype: float64

### 3. 클래스/함수 : 원가격, 원등급 추정

In [351]:
p_val = 2

class Performance:
    instance_cnt = 0

    
    def __init__(self, df):
        Performance.instance_cnt += 1
        self.instance_cnt = Performance.instance_cnt
        self.df = df
        self.perform_time = df['전체공연시간'].iloc[0]
        self.get_original_price()

    def get_original_price(self):
        """
        discount_type에서 할인율을 추출하고 역산해서 '할인전금액'을 df에 컬럼으로 추가하는 함수
        """
        # 할인이 적용되기 전 가격 추정
        self.df['할인율'] = self.df['discount_type'].str.extract('(\d+)%')
        self.df['할인율'] = self.df['할인율'].fillna(0).astype(int) / 100
        self.df['할인전가격'] = (self.df['price'].fillna(0) // (1 - self.df['할인율'])).round(-2).astype(int)
        self.df['할인전가격'] = self.df['할인전가격'].fillna(0)
        
        # 원가격추정 컬럼 추가 (5000원 단위로)
        self.df['원가격추정'] = ((self.df['할인전가격'] + 2500) // 5000 * 5000).copy()
        self.priced_seat = self.df[self.df['원가격추정'] > 0]
        self.unpriced_seat = self.df[self.df['원가격추정'] == 0]
        self.priced_rate = round((self.priced_seat.shape[0] / self.df.shape[0]), 3)
        self.booked_rate = round(self.df['예매여부'].mean(), 3)

    def get_best_n_neighbors(self):
        """
        knn 가격 추정 모델의 적절한 n_neighbors값을 찾는 함수
        """
        X = self.priced_seat.loc[self.priced_seat['층']!='합창석', ['X', 'Y', 'Z']] 
        y = self.priced_seat.loc[self.priced_seat['층']!='합창석', '원가격추정']
        cv_scores = []
        for n in range(1, min(50, (X.shape[0]*9//10)) + 1):
            model = KNeighborsRegressor(n_neighbors=n, weights='distance', p=p_val)
            scores = cross_val_score(model, X, y, cv=10, scoring='neg_mean_squared_error')
            cv_scores.append(scores.mean())
            self.best_n_neighbors = [i for i in range(1, 51)][np.argmax(cv_scores)]
            self.knn_mse = max(cv_scores) * -1

    def estimate_price(self):
        """
        knn모델에 추정된 n_neighbors값을 적용해서 판매되지 않은 티켓의 가격을 추정하는 함수
        """
        # 가격을 하나도 알 수 없는 경우 종료
        if self.priced_rate == 0:
            self.best_n_neighbors = 1
            self.knn_mse = 0
            self.mean_price = self.max_price = self.min_price = 0
            return
        
        # 1) 합창석이 아닌 좌석 : knn모델로 가젹 추정
        self.get_best_n_neighbors()  # 적절한 n_neighbors값 찾기
        X = self.priced_seat.loc[self.priced_seat['층']!='합창석', ['X', 'Y', 'Z']] 
        y = self.priced_seat.loc[self.priced_seat['층']!='합창석', '원가격추정']
        model = KNeighborsRegressor(n_neighbors=self.best_n_neighbors, weights='distance', p=p_val)
        model.fit(X, y)
        y_pred = model.predict(self.unpriced_seat.loc[self.unpriced_seat['층']!='합창석', ['X', 'Y', 'Z']])
        self.df.loc[(self.df['원가격추정']==0) & (self.df['층']!='합창석'), '원가격추정'] = y_pred
        
        # 2) 합창석 좌석 : 블록별 판매된 티켓의 최빈값으로 맵핑
        temp_dict = self.priced_seat.loc[self.priced_seat['층']=='합창석'].groupby('블록')['원가격추정'].agg(
            lambda x: x.mode().iloc[0] if not x.empty else None).to_dict()
        self.df.loc[self.df['층']=='합창석', '원가격추정'] = self.df.loc[self.df['층']=='합창석', '블록'].map(temp_dict)

        # 3) 가격 보정 : 이상치 보정, 5000원 단위로 표현
        threshold = 1.75  # 이상치 판단 기준
        self.df['원가격추정'] = self.df['원가격추정'].fillna(0)
        self.df['층블록'] = self.df['층'].astype(str) + self.df['블록'].astype(str)
        self.df['z_score'] = self.df.groupby('층블록')['원가격추정'].transform(lambda x: np.abs(stats.zscore(x))).round(2).fillna(0)
        temp_dict = self.df.groupby(['층', '블록'])['원가격추정'].median().to_dict()
        self.df.loc[self.df['z_score'] > threshold, '원가격추정'] = self.df.loc[self.df['z_score'] > threshold, '층블록'].map(temp_dict)
        self.df = self.df.drop('층블록', axis=1)
        self.df['원가격추정'] = ((self.df['원가격추정'].fillna(0) + 2500) // 5000 * 5000).astype(int)
 
    def estimate_cluster_kmeans(self):
        # 중복되는 값이 없도록 난수를 더해서 노이즈 만들기
        self.df['rand'] = np.random.rand(self.df.shape[0])
        self.df['원가격추정_rand'] = self.df['원가격추정'] + self.df['rand']

        # 군집화 모델 생성 (K-means)
        X = self.df[['원가격추정_rand']]
        inertia = []
        k_range = range(1, 11)
        for k in k_range:
            kmeans = KMeans(n_clusters=k, random_state=42, n_init='auto')
            kmeans.fit(X)
            inertia.append(kmeans.inertia_)
        
        # 적절한 K값 찾기 (elbow point)        
        kneedle = KneeLocator(k_range[1:], inertia[1:], curve='convex', direction='decreasing')
        self.best_k = kneedle.elbow

        # 찾은 K값을 적용해서 원등급 추정하기
        kmeans = KMeans(n_clusters=self.best_k, random_state=42, n_init='auto')
        self.df['원등급추정'] = kmeans.fit_predict(X)
        cluster_means = self.df.groupby('원등급추정').agg({'seat':'count', '원가격추정':'mean', '예매여부':'mean'}).reset_index()
        cluster_means = cluster_means.sort_values(by='원가격추정')
        grade_mapping = {grade: idx for idx, grade in enumerate(cluster_means['원등급추정'])}
        self.df = self.df.drop(['원가격추정_rand', 'rand'], axis=1)

        # 원등급, 등급별 가격의 비율, 군집분석의 실루엣점수
        self.df['원등급추정'] = (self.df['원등급추정'].map(grade_mapping) + 1)
        temp_min = min((x for x in cluster_means['원가격추정'] if x > 0), default=None)
        self.class_price_ratio = list((cluster_means['원가격추정'] / temp_min).round(2)) if temp_min else [0 for _ in range(len(cluster_means))]
        self.class_price = list(cluster_means['원가격추정'].round().astype(int))
        self.class_seats_cnt = list(cluster_means['seat'])
        self.class_booked_ratio = list(cluster_means['예매여부'].round(4))
        self.silhouette_score = silhouette_score(self.df[['원가격추정']], self.df['원등급추정'])
        self.mean_price = self.df['원가격추정'].mean().round(2)
        self.max_price = self.df['원가격추정'].max()
        self.min_price = self.df['원가격추정'].min()

    def px3dscatter(self, col_name):
        # 결과 시각화
        fig = px.scatter_3d(self.df, x='X', y='Y', z='Z', color=col_name, 
                            hover_name='seat', hover_data=['예매여부', '할인전가격', '원가격추정', 'z_score', '원등급추정'],
                            width=800, height=600)
        fig.update_traces(marker={'size': 1})
        fig.show()

### 4. 원가격/등급 추정 군집분석 결과

In [352]:
# 전체 데이터를 공연시간을 기준으로 공연별 분할
공연시간_list = sorted(df['전체공연시간'].unique())
공연별_df_list = [df[df['전체공연시간'] == 공연시간] for 공연시간 in 공연시간_list]

# 공연별 군집분석 결과 컬럼
instance_cnt_list = []
perform_time_list = []
priced_rate_list = []
booked_rate_list = []
best_n_neighbors_list = []
knn_mse_list = []
best_k_list = []
silhouette_score_list = []
mean_price_list = []
max_price_list = []
min_price_list = []

class_price_ratio_list = []
class_price_list = []
class_seats_cnt_list = []
class_booked_ratio_list = []
df_list = []

In [356]:
# class > 군집분석 적용
for 공연별_df in tqdm(공연별_df_list):
    p = Performance(공연별_df)
    p.estimate_price() # knn => 원가격 추정
    p.estimate_cluster_kmeans() # k-means => 원등급 추정

    instance_cnt_list.append(p.instance_cnt)
    perform_time_list.append(p.perform_time)
    priced_rate_list.append(p.priced_rate)
    booked_rate_list.append(p.booked_rate)
    best_n_neighbors_list.append(p.best_n_neighbors)
    knn_mse_list.append(p.knn_mse)
    best_k_list.append(p.best_k)
    silhouette_score_list.append(p.silhouette_score)
    mean_price_list.append(p.mean_price)
    max_price_list.append(p.max_price)
    min_price_list.append(p.min_price)
    class_price_ratio_list.append(p.class_price_ratio)
    class_price_list.append(p.class_price)
    class_seats_cnt_list.append(p.class_seats_cnt)
    class_booked_ratio_list.append(p.class_booked_ratio)
    df_list.append(p.df)

    # 시각화
    # print(p.perform_time)
    # p.px3dscatter('원등급추정')

# 군집결과 데이터프레임으로 합치기
df_result = pd.concat(df_list, axis=0)
df_summary = pd.DataFrame({
    '공연시간' : perform_time_list,
    '금액명시비율' : priced_rate_list,
    '예약율' : booked_rate_list,
    'knn_n_neighbors' : best_n_neighbors_list,
    'knn_mse' : knn_mse_list,
    'kmeans_군집수' : best_k_list,
    'kmeans_실루엣' : silhouette_score_list,
    '평균가격' : mean_price_list,
    '최소가격' : min_price_list,
    '최대가격' : max_price_list,
    '등급별가격비율' : class_price_ratio_list,
    '등급별가격' : class_price_list,
    '등급별좌석수' : class_seats_cnt_list,
    '등급별예매율' : class_booked_ratio_list
})

100%|██████████| 162/162 [04:42<00:00,  1.74s/it]


### 5. 결과 저장

* 총 162개의 클래식공연 중 11개의 공연은 가격이 전혀 명시되지 않았음.  => 해당 공연의 데이터에서는 원가격, 원등급 추정이 불가함.    
* 앞으로의 EDA를 위해 151개의 공연 데이터만 사용하기로 함.

#### 5-1) 공연별 요약 데이터 (df_summary)

In [357]:
# 공연별 (추정된) 가격, 등급 정보 요약 (군집분석 불가한 11개 공연 데이터는 제외)
공연별_원가격추정군집분석_결과요약 = df_summary.loc[df_summary['금액명시비율']!=0].reset_index().drop('index', axis=1)

# 컬럼 분리
공연별_원가격추정군집분석_결과요약['등급별가격_1'] = 공연별_원가격추정군집분석_결과요약['등급별가격비율'].map(lambda x: x[0])
공연별_원가격추정군집분석_결과요약['등급별가격_2'] = 공연별_원가격추정군집분석_결과요약['등급별가격비율'].map(lambda x: x[1])
공연별_원가격추정군집분석_결과요약['등급별가격_3'] = 공연별_원가격추정군집분석_결과요약['등급별가격비율'].map(lambda x: x[2] if len(x)>2 else np.nan)
공연별_원가격추정군집분석_결과요약['등급별가격_4'] = 공연별_원가격추정군집분석_결과요약['등급별가격비율'].map(lambda x: x[3] if len(x)>3 else np.nan)
공연별_원가격추정군집분석_결과요약['등급별가격_5'] = 공연별_원가격추정군집분석_결과요약['등급별가격비율'].map(lambda x: x[4] if len(x)>4 else np.nan)

# 데이터프레임 저장
공연별_원가격추정군집분석_결과요약.to_csv('공연별_원가격추정군집분석_결과요약.csv', index=False)
공연별_원가격추정군집분석_결과요약

Unnamed: 0,공연시간,금액명시비율,예약율,knn_n_neighbors,knn_mse,kmeans_군집수,kmeans_실루엣,평균가격,최소가격,최대가격,등급별가격비율,등급별가격,등급별좌석수,등급별예매율,등급별가격_1,등급별가격_2,등급별가격_3,등급별가격_4,등급별가격_5
0,2018-11-25 17:00:00,0.026,0.62,2,312589000.0,4,0.899692,60700.6,0,100000,"[0.0, 1.0, 1.46, 2.11]","[0, 47294, 69216, 99750]","[403, 872, 389, 841]","[0.1935, 0.4106, 0.7404, 0.9869]",0.0,1.0,1.46,2.11,
1,2018-12-08 17:00:00,0.021,0.862,2,50225820.0,4,0.790457,51369.26,0,100000,"[0.0, 1.0, 1.81, 3.27]","[0, 30392, 55156, 99409]","[357, 510, 1122, 516]","[0.2185, 0.9529, 0.9724, 0.9767]",0.0,1.0,1.81,3.27,
2,2019-01-11 20:00:00,0.386,0.636,6,246925300.0,4,0.858119,90982.04,0,140000,"[0.0, 1.0, 1.59, 2.18]","[0, 63702, 101025, 139158]","[188, 863, 771, 683]","[0.484, 0.5388, 0.6265, 0.8097]",0.0,1.0,1.59,2.18,
3,2019-02-05 20:00:00,0.023,0.858,1,307000000.0,4,0.730581,67648.7,0,150000,"[1.0, 10.56, 21.74, 32.62]","[4599, 48558, 100000, 150000]","[424, 1137, 586, 358]","[0.3373, 0.9569, 0.9573, 1.0]",1.0,10.56,21.74,32.62,
4,2019-02-16 17:00:00,0.157,0.476,12,28242750.0,5,0.748526,24381.24,0,50000,"[1.0, 7.42, 14.16, 22.49, 32.32]","[1519, 11267, 21511, 34162, 49078]","[428, 525, 579, 382, 591]","[0.1799, 0.4686, 0.5855, 0.5838, 0.5195]",1.0,7.42,14.16,22.49,32.32
5,2018-11-25 17:00:00,0.026,0.62,2,312589000.0,4,0.899692,60700.6,0,100000,"[0.0, 1.0, 1.46, 2.11]","[0, 47294, 69216, 99750]","[403, 872, 389, 841]","[0.1935, 0.4106, 0.7404, 0.9869]",0.0,1.0,1.46,2.11,
6,2018-12-08 17:00:00,0.021,0.862,2,50225820.0,4,0.790457,51369.26,0,100000,"[0.0, 1.0, 1.81, 3.27]","[0, 30392, 55156, 99409]","[357, 510, 1122, 516]","[0.2185, 0.9529, 0.9724, 0.9767]",0.0,1.0,1.81,3.27,
7,2019-01-11 20:00:00,0.386,0.636,6,246925300.0,4,0.858119,90982.04,0,140000,"[0.0, 1.0, 1.59, 2.18]","[0, 63702, 101025, 139158]","[188, 863, 771, 683]","[0.484, 0.5388, 0.6265, 0.8097]",0.0,1.0,1.59,2.18,
8,2019-02-05 20:00:00,0.023,0.858,1,307000000.0,4,0.730581,67648.7,0,150000,"[1.0, 10.56, 21.74, 32.62]","[4599, 48558, 100000, 150000]","[424, 1137, 586, 358]","[0.3373, 0.9569, 0.9573, 1.0]",1.0,10.56,21.74,32.62,
9,2019-02-16 17:00:00,0.157,0.476,12,28242750.0,5,0.748526,24381.24,0,50000,"[1.0, 7.42, 14.16, 22.49, 32.32]","[1519, 11267, 21511, 34162, 49078]","[428, 525, 579, 382, 591]","[0.1799, 0.4686, 0.5855, 0.5838, 0.5195]",1.0,7.42,14.16,22.49,32.32


#### 5-2) 공연별 결과 데이터

In [358]:
# 전체공연좌석별 예매 데이터
공연별_결과_list = []
for 공연시간 in list(df_summary.loc[df_summary['금액명시비율']!=0, '공연시간']):
    공연별_결과_list.append(df_result[df_result['전체공연시간'] == 공연시간])

df_result = pd.concat(공연별_결과_list, axis=0, ignore_index=True)
print(f"""df_result 
memory usage: {round(df_result.memory_usage(deep=False).sum() / 1024**2, 1)}+ MB""")
df_result.to_parquet('전체공연_원가격원등급추정결과.parquet', index=False)

df_result 
memory usage: 122.2+ MB


In [359]:
df_result

Unnamed: 0,seat,층,블록,열,넘버,X,Y,Z,대칭점,좌우시야각,상하시야각,좌우면적시야각,상하면적시야각,무대까지의 거리,예매여부,age,gender,tran_date,tran_time,play_date,play_st_time,price,ticket_cancel,discount_type,performance_code,pre_open_date,open_date,running_time,intermission,member_yn,공연연도,공연월,공연일,공연연월,전체공연시간,전체거래시간,할인율,할인전가격,원가격추정,z_score,원등급추정
0,1층 A블록1열 1,1층,A블록,1,1,1451,542,-93,1층 E블록1열 9,69.517555,3.436010,15.219900,33.948421,1551.713247,1,,,2018-11-16,15:06:00,2018-11-25,17:00:00,0.0,0.0,기획사판매,715.0,2018-10-28,2018-10-29,120.0,15.0,N,2018.0,11.0,2018-11-25,2018-11-01,2018-11-25 17:00:00,2018-11-16 15:06:00,0.00,0,50000,0.94,2
1,1층 A블록1열 2,1층,A블록,1,2,1406,555,-93,1층 E블록1열 8,68.459024,3.520697,16.945594,34.331746,1514.433888,1,,,2018-11-16,15:06:00,2018-11-25,17:00:00,0.0,0.0,기획사판매,715.0,2018-10-28,2018-10-29,120.0,15.0,N,2018.0,11.0,2018-11-25,2018-11-01,2018-11-25 17:00:00,2018-11-16 15:06:00,0.00,0,50000,0.94,2
2,1층 A블록1열 3,1층,A블록,1,3,1361,568,-93,1층 E블록1열 7,67.347261,3.608334,18.869853,34.695385,1477.698887,1,,,2018-11-16,15:06:00,2018-11-25,17:00:00,0.0,0.0,기획사판매,715.0,2018-10-28,2018-10-29,120.0,15.0,N,2018.0,11.0,2018-11-25,2018-11-01,2018-11-25 17:00:00,2018-11-16 15:06:00,0.00,0,50000,0.94,2
3,1층 A블록1열 4,1층,A블록,1,4,1315,580,-93,1층 E블록1열 6,66.199420,3.702327,21.009736,35.069575,1440.234009,1,,,2018-11-16,15:06:00,2018-11-25,17:00:00,0.0,0.0,기획사판매,715.0,2018-10-28,2018-10-29,120.0,15.0,N,2018.0,11.0,2018-11-25,2018-11-01,2018-11-25 17:00:00,2018-11-16 15:06:00,0.00,0,50000,0.94,2
4,1층 A블록1열 5,1층,A블록,1,5,1270,591,-93,1층 E블록1열 5,65.044882,3.798387,23.314833,35.426958,1403.862529,1,,,2018-11-16,15:06:00,2018-11-25,17:00:00,0.0,0.0,기획사판매,715.0,2018-10-28,2018-10-29,120.0,15.0,N,2018.0,11.0,2018-11-25,2018-11-01,2018-11-25 17:00:00,2018-11-16 15:06:00,0.00,0,50000,0.94,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
415825,2층 BOX6 2,2층,BOX,6,2,-1432,505,383,2층 BOX1 1,-70.574617,-14.156604,21.173142,34.169657,1565.994253,1,60.0,M,2023-03-26,16:33:00,2023-06-06,19:30:00,15000.0,0.0,장애인/국가유공자 할인50%,1508.0,2023-03-25,2023-03-26,100.0,15.0,Y,2023.0,6.0,2023-06-06,2023-06-01,2023-06-06 19:30:00,2023-03-26 16:33:00,0.50,30000,30000,0.23,2
415826,2층 BOX6 3,2층,BOX,6,3,-1500,565,428,2층 BOX1 4,-69.360281,-14.950280,21.144470,32.190452,1659.038577,1,60.0,M,2023-03-26,16:33:00,2023-06-06,19:30:00,15000.0,0.0,장애인/국가유공자 할인50%,1508.0,2023-03-25,2023-03-26,100.0,15.0,Y,2023.0,6.0,2023-06-06,2023-06-01,2023-06-06 19:30:00,2023-03-26 16:33:00,0.50,30000,30000,0.23,2
415827,2층 BOX6 4,2층,BOX,6,4,-1454,617,428,2층 BOX1 3,-67.006224,-15.161505,23.354990,31.913243,1636.456232,1,50.0,F,2023-05-31,01:57:00,2023-06-06,19:30:00,28000.0,0.0,그린회원 할인5%,1508.0,2023-03-25,2023-03-26,100.0,15.0,Y,2023.0,6.0,2023-06-06,2023-06-01,2023-06-06 19:30:00,2023-05-31 01:57:00,0.05,29500,30000,0.23,2
415828,2층 BOX6 5,2층,BOX,6,5,-1522,677,483,2층 BOX1 6,-66.020049,-16.169769,23.078793,30.097223,1734.388077,0,,,NaT,,2023-06-06,19:30:00,,0.0,,1508.0,2023-03-25,2023-03-26,100.0,15.0,,2023.0,6.0,2023-06-06,2023-06-01,2023-06-06 19:30:00,NaT,0.00,0,30000,0.23,2
