## 예술의전당 콘서트홀 가격 모델

### 1. 라이브러리

In [1]:
# base
import pandas as pd
import numpy as np
import re

# etc.
import sys
from glob import glob
import warnings
from tqdm import tqdm
import time 
import datetime as dt 
from scipy import stats


# visualization
import matplotlib.pyplot as plt
import koreanize_matplotlib
import plotly.express as px
import seaborn as sns
import matplotlib.colors as mcolors
from matplotlib.cm import ScalarMappable

# M.L
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.linear_model import LinearRegression
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import mean_squared_error, silhouette_score
from kneed import KneeLocator

In [5]:
# settings
%matplotlib inline
pd.set_option("display.max_columns", 100)
pd.set_option('display.max_rows', 200)
warnings.filterwarnings(action = 'ignore')


### 2. 데이터셋 로드

In [3]:
# 기존의 제공된 데이터프레임에 빈좌석, 좌표, 무대와의 관계가 추가된 데이터프레임
file_path = '../data/빈좌석_포함_클래식_데이터.parquet'
df = pd.read_parquet(glob(file_path)[0])

# 필요없는 컬럼 삭제
df = df.drop(['membership_type_1', 'membership_type_2', 'membership_type_3', 
              'membership_type_4', 'membership_type_5', 'membership_type_6',], axis=1)

print(f"""Found.. {len(glob(file_path))} file(s) : {glob(file_path)}
Reading.. {glob(file_path)[-1]}
df.shape : {df.shape}""")


Found.. 1 file(s) : ['../data/빈좌석_포함_클래식_데이터.parquet']
Reading.. ../data/빈좌석_포함_클래식_데이터.parquet
df.shape : (405810, 36)


### 3. 클래스/함수 : 원가격, 원등급 추정

In [10]:
# knn regression
class Performance:
    instance_cnt = 0
    
    def __init__(self, df):
        Performance.instance_cnt += 1
        self.instance_cnt = Performance.instance_cnt
        self.p_val = 2
        self.df = df
        self.perform_time = df['전체공연시간'].iloc[0]
        self.get_original_price()
        self.best_n_neighbors_1 = None
        self.best_n_neighbors_2 = None

    def get_original_price(self):
        """
        discount_type에서 할인율을 추출하고 역산해서 '할인전금액'을 df에 컬럼으로 추가하는 함수
        """
        # 할인이 적용되기 전 가격 추정
        self.df['할인율'] = self.df['discount_type'].str.extract('(\d+)%')
        self.df['할인율'] = self.df['할인율'].fillna(0).astype(int) / 100
        self.df['price'] = self.df['price'].fillna(0)
        self.df['할인전가격'] = (self.df['price'].fillna(0) // (1 - self.df['할인율'])).round(-2).astype(int)
        self.df['할인전가격'] = self.df['할인전가격'].fillna(0)
        
        # 원가격추정 컬럼 추가 (5000원 단위로)
        self.df['원가격추정'] = ((self.df['할인전가격'] + 2500) // 5000 * 5000).copy()
        self.priced_seat = self.df[(self.df['원가격추정'] > 0)]
        self.unpriced_seat = self.df[(self.df['원가격추정'] == 0)]
        self.priced_rate = round((self.priced_seat.shape[0] / self.df.shape[0]), 3)
        self.booked_rate = round(self.df['예매여부'].mean(), 3)

    def get_best_n_neighbors_1(self):
        """
        일반석 : knn 가격 추정 모델의 적절한 n_neighbors값을 찾는 함수
        """
        X = self.priced_seat.loc[self.priced_seat['층'] != '합창석', ['X', 'Y', 'Z']] 
        y = self.priced_seat.loc[self.priced_seat['층'] != '합창석', '원가격추정'] // 1000

        cv_scores = []
        for n in range(1, min(50, (X.shape[0]*9//10)) + 1):
            model = KNeighborsRegressor(n_neighbors=n, weights='distance', p=self.p_val)
            scores = cross_val_score(model, X, y, cv=10, scoring='neg_mean_squared_error')
            cv_scores.append(scores.mean())
        
        self.best_n_neighbors_1 = np.argmax(cv_scores) + 1

    def get_best_n_neighbors_2(self):
        """
        합창석 : knn 가격 추정 모델의 적절한 n_neighbors값을 찾는 함수
        """
        X = self.priced_seat.loc[self.priced_seat['층'] == '합창석', ['X', 'Y', 'Z']] 
        y = self.priced_seat.loc[self.priced_seat['층'] == '합창석', '원가격추정'] // 1000

        cv_scores = []
        n_splits = min(len(X), 5) 
        if len(X) < 2:
            # 샘플 수가 2개 미만인 경우 예외 처리
            self.best_n_neighbors_2 = 1
            return

        for n in range(1, min(50, len(X)) + 1):
            model = KNeighborsRegressor(n_neighbors=n, weights='distance', p=self.p_val)
            scores = cross_val_score(model, X, y, cv=n_splits, scoring='neg_mean_squared_error')
            cv_scores.append(scores.mean())
        
        self.best_n_neighbors_2 = np.argmax(cv_scores) + 1 if cv_scores else 1
            
    def estimate_price(self):
        """
        일반석 : knn모델에 추정된 n_neighbors값을 적용해서 판매되지 않은 티켓의 가격을 추정하는 함수
        """
        # 가격을 하나도 알 수 없는 경우 종료
        if self.priced_rate == 0:
            self.best_n_neighbors_1 = 1
            self.best_n_neighbors_2 = 1
            self.mean_price = self.max_price = self.min_price = 0
            return
        

        # 1) 합창석이 아닌 좌석 
        self.get_best_n_neighbors_1()
        #   1-1) 3층에 데이터가 있을 경우 : 1, 2, 3층 전체로 knn수행
        if self.priced_seat.loc[self.priced_seat['층']=='3층'].shape[0] != 0 :
            X_train = self.priced_seat.loc[self.priced_seat['층'] != '합창석', ['X', 'Y', 'Z']] 
            y_train = self.priced_seat.loc[self.priced_seat['층'] != '합창석', '원가격추정'] // 1000
            model = KNeighborsRegressor(n_neighbors=self.best_n_neighbors_1, weights='distance', p=self.p_val)
            model.fit(X_train, y_train)
            y_pred = model.predict(self.unpriced_seat.loc[self.unpriced_seat['층'] != '합창석', ['X', 'Y', 'Z']])
            self.df.loc[(self.df['층'] != '합창석') & (self.df['원가격추정'] == 0), '원가격추정'] = y_pred * 1000
    
        #   1-2) 3층은 가격데이터가 아예 없을 경우
        else :
            # 1, 2층은 knn으로 추정
            X = self.priced_seat.loc[(self.priced_seat['층'] != '합창석') & (self.priced_seat['층'] != '3층'), ['X', 'Y', 'Z']] 
            y = self.priced_seat.loc[(self.priced_seat['층'] != '합창석') & (self.priced_seat['층'] != '3층'), '원가격추정'] // 1000
            model = KNeighborsRegressor(n_neighbors=self.best_n_neighbors_1, weights='distance', p=self.p_val)
            model.fit(X, y)
            y_pred = model.predict(self.unpriced_seat.loc[(self.unpriced_seat['층'] != '합창석') & (self.unpriced_seat['층'] != '3층'), ['X', 'Y', 'Z']])
            self.df.loc[(self.df['층'] != '합창석') & (self.df['층'] != '3층') & (self.df['원가격추정'] == 0), '원가격추정'] = y_pred * 1000

            # 3층은 1, 2층 가격 비율로 적용
            temp_price_1 = self.df.loc[(self.df['블록'] == 'A블록') & ((self.df['층']=='1층') | (self.df['층']=='1층'))].groupby('층').agg({'원가격추정':'mean'}).iloc[0,0]
            temp_price_2 = self.df.loc[(self.df['블록'] == 'A블록') & ((self.df['층']=='2층') | (self.df['층']=='2층'))].groupby('층').agg({'원가격추정':'mean'}).iloc[0,0]
            temp_price_3 = temp_price_2**2 // temp_price_1
            self.df.loc[(self.df['층'] == '3층'), '원가격추정'] = temp_price_3

            # 혹시 1 2 3층의 추정결과가 2500원 이하일 경우 2500원으로
            self.df.loc[(self.df['층'] != '합창석') & (self.df['원가격추정'] < 2500), '원가격추정'] = 2500


        # 2) 합창석 좌석
        if self.unpriced_seat.loc[self.unpriced_seat['층']=='합창석'].shape[0] != 0:
            #   2-1) 합창석 자리를 하나도 모를 경우
            if self.priced_seat.loc[self.priced_seat['층']=='합창석'].shape[0] == 0:
                lowest_price = self.df.loc[(self.df['층']=='3층'), '원가격추정'].min()
                self.df.loc[self.df['층']=='합창석', '원가격추정'] = lowest_price

            #   2-2) 일부 자리는 알 경우, 자체적으로 knn 수행으로 가격 추정
            else:
                self.get_best_n_neighbors_2()
                X = self.priced_seat.loc[self.priced_seat['층'] == '합창석', ['X', 'Y', 'Z']] 
                y = self.priced_seat.loc[self.priced_seat['층'] == '합창석', '원가격추정'] // 1000
                model = KNeighborsRegressor(n_neighbors=self.best_n_neighbors_2, weights='distance', p=self.p_val)
                model.fit(X, y)
                y_pred = model.predict(self.unpriced_seat.loc[self.unpriced_seat['층'] == '합창석', ['X', 'Y', 'Z']])
                self.df.loc[(self.df['층'] == '합창석') & (self.df['원가격추정'] == 0), '원가격추정'] = y_pred * 1000


        # 3) 블록별로 이상치 보정하기
        threshold = 1.75  # 이상치 판단 기준
        self.df['층블록'] = self.df['층'].astype(str) + self.df['블록'].astype(str)
        self.df['z_score'] = self.df.groupby('층블록')['원가격추정'].transform(lambda x: np.abs(stats.zscore(x))).round(2).fillna(0)
        temp_dict = self.df[self.df['층'] != '합창석'].groupby(['층', '블록'])['원가격추정'].median().to_dict()
        mask = (self.df['층'] != '합창석') & (self.df['z_score'] > threshold)
        self.df.loc[mask, '원가격추정'] = self.df[mask].apply(lambda row: temp_dict.get((row['층'], row['블록']), row['원가격추정']), axis=1)
        self.df = self.df.drop('층블록', axis=1)
        self.df['원가격추정'] = ((self.df['원가격추정'].fillna(2500) + 2500) // 5000 * 5000).astype(int)


    def estimate_cluster_kmeans(self):
        # 중복되는 값이 없도록 난수를 더해서 노이즈 만들기
        self.df['rand'] = np.random.rand(self.df.shape[0])
        self.df['원가격추정_rand'] = self.df['원가격추정'] + self.df['rand']

        # 군집화 모델 생성 (K-means)
        X = self.df[['원가격추정_rand']]
        inertia = []
        k_range = range(1, 11)
        for k in k_range:
            kmeans = KMeans(n_clusters=k, random_state=42, n_init='auto')
            kmeans.fit(X)
            inertia.append(kmeans.inertia_)
        
        # 적절한 K값 찾기 (elbow point)        
        kneedle = KneeLocator(k_range[1:], inertia[1:], curve='convex', direction='decreasing')
        self.best_k = kneedle.elbow

        # 찾은 K값을 적용해서 원등급 추정하기
        while self.best_k <= 6:
            kmeans = KMeans(n_clusters=self.best_k, random_state=42, n_init='auto')
            self.df['원등급추정'] = kmeans.fit_predict(X)
            cluster_means = self.df.groupby('원등급추정').agg({'seat':'count', '원가격추정':'mean', '예매여부':'mean'}).reset_index()           
            cluster_means = cluster_means.sort_values(by='원가격추정')
            grade_mapping = {grade: idx for idx, grade in enumerate(cluster_means['원등급추정'])}
            self.df['원등급추정'] = (self.df['원등급추정'].map(grade_mapping) + 1)
            temp_min = min((x for x in cluster_means['원가격추정'] if x > 0), default=None)
            self.class_price_ratio = list((cluster_means['원가격추정'] / temp_min).round(2)) if temp_min else [0 for _ in range(len(cluster_means))]
            self.class_price = list(cluster_means['원가격추정'].round().astype(int))
            self.class_seats_cnt = list(cluster_means['seat'])

            # 합창석의 예매되지 않은 좌서들이 그룹핑에 섞여있으면 k+=1 해서 다시 군집분석 수행        
            if (self.class_price[0] != 0) and (self.class_seats_cnt[0] > 274) and (self.df['원가격추정'].min() == 0):
                self.best_k += 1
            else:
                break
    
            
        # 원등급, 등급별 가격의 비율, 군집분석의 실루엣점수
        self.df = self.df.drop(['원가격추정_rand', 'rand'], axis=1)
        self.class_booked_ratio = list(cluster_means['예매여부'].round(4))
        self.silhouette_score = silhouette_score(self.df[['원가격추정']], self.df['원등급추정'])
        self.mean_price = self.df['원가격추정'].mean().round(2)
        self.max_price = self.df['원가격추정'].max()
        self.min_price = self.df['원가격추정'].min()

    def px3dscatter(self, col_name):
        # 결과 시각화
        fig = px.scatter_3d(self.df, x='X', y='Y', z='Z', color=col_name, 
                            hover_name='seat', hover_data=['예매여부', '할인전가격', '원가격추정', 'z_score', '원등급추정'],
                            width=800, height=600)
        fig.update_traces(marker={'size': 1})
        fig.show()

### 4. 원가격/등급 추정 군집분석 결과

In [11]:
# 전체 데이터를 공연시간을 기준으로 공연별 분할
공연시간_list = sorted(df['전체공연시간'].unique())
공연별_df_list = [df[df['전체공연시간'] == 공연시간] for 공연시간 in 공연시간_list]

# 공연별 군집분석 결과 컬럼
instance_cnt_list = []
perform_time_list = []
priced_rate_list = []
booked_rate_list = []
best_n_neighbors_1_list = []
best_n_neighbors_2_list = []
# knn_mse_list = []
best_k_list = []
silhouette_score_list = []
mean_price_list = []
max_price_list = []
min_price_list = []

class_price_ratio_list = []
class_price_list = []
class_seats_cnt_list = []
class_booked_ratio_list = []
df_list = []

# class > 군집분석 적용
for 공연별_df in tqdm(공연별_df_list):
    p = Performance(공연별_df)
    p.estimate_price() # knn => 원가격 추정
    p.estimate_cluster_kmeans() # k-means => 원등급 추정

    instance_cnt_list.append(p.instance_cnt)
    perform_time_list.append(p.perform_time)
    priced_rate_list.append(p.priced_rate)
    booked_rate_list.append(p.booked_rate)
    best_n_neighbors_1_list.append(p.best_n_neighbors_1)
    best_n_neighbors_2_list.append(p.best_n_neighbors_2)
    # knn_mse_list.append(p.knn_mse)
    best_k_list.append(p.best_k)
    silhouette_score_list.append(p.silhouette_score)
    mean_price_list.append(p.mean_price)
    max_price_list.append(p.max_price)
    min_price_list.append(p.min_price)
    class_price_ratio_list.append(p.class_price_ratio)
    class_price_list.append(p.class_price)
    class_seats_cnt_list.append(p.class_seats_cnt)
    class_booked_ratio_list.append(p.class_booked_ratio)
    df_list.append(p.df)

# 군집결과 데이터프레임으로 합치기
df_result = pd.concat(df_list, axis=0)
df_summary = pd.DataFrame({
    '공연시간' : perform_time_list,
    '금액명시비율' : priced_rate_list,
    '예약율' : booked_rate_list,
    'knn_n_neighbors_1' : best_n_neighbors_1_list,
    'knn_n_neighbors_2' : best_n_neighbors_2_list,
    # 'knn_mse' : knn_mse_list,
    'kmeans_군집수' : best_k_list,
    'kmeans_실루엣' : silhouette_score_list,
    '평균가격' : mean_price_list,
    '최소가격' : min_price_list,
    '최대가격' : max_price_list,
    '등급별가격비율' : class_price_ratio_list,
    '등급별가격' : class_price_list,
    '등급별좌석수' : class_seats_cnt_list,
    '등급별예매율' : class_booked_ratio_list
})

100%|██████████| 162/162 [02:38<00:00,  1.02it/s]


### 5. 결과 저장

* 총 162개의 클래식공연 중 11개의 공연은 가격이 전혀 명시되지 않았음.  => 해당 공연의 데이터에서는 원가격, 원등급 추정이 불가함.    
* 앞으로의 EDA를 위해 151개의 공연 데이터만 사용하기로 함.

#### 5-1) 공연별 요약 데이터 (df_summary)

In [13]:
# 공연별 (추정된) 가격, 등급 정보 요약 (군집분석 불가한 11개 공연 데이터는 제외)
공연별_원가격추정군집분석_결과요약 = df_summary.loc[df_summary['금액명시비율']!=0].reset_index().drop('index', axis=1)

# # 컬럼 분리
공연별_원가격추정군집분석_결과요약['등급별가격_1'] = 공연별_원가격추정군집분석_결과요약['등급별가격비율'].map(lambda x: x[0])
공연별_원가격추정군집분석_결과요약['등급별가격_2'] = 공연별_원가격추정군집분석_결과요약['등급별가격비율'].map(lambda x: x[1])
공연별_원가격추정군집분석_결과요약['등급별가격_3'] = 공연별_원가격추정군집분석_결과요약['등급별가격비율'].map(lambda x: x[2] if len(x)>2 else np.nan)
공연별_원가격추정군집분석_결과요약['등급별가격_4'] = 공연별_원가격추정군집분석_결과요약['등급별가격비율'].map(lambda x: x[3] if len(x)>3 else np.nan)
공연별_원가격추정군집분석_결과요약['등급별가격_5'] = 공연별_원가격추정군집분석_결과요약['등급별가격비율'].map(lambda x: x[4] if len(x)>4 else np.nan)

# 데이터프레임 저장
공연별_원가격추정군집분석_결과요약.to_csv('공연별_원가격추정군집분석_결과요약.csv', index=False)
공연별_원가격추정군집분석_결과요약

Unnamed: 0,공연시간,금액명시비율,예약율,knn_n_neighbors_1,knn_n_neighbors_2,kmeans_군집수,kmeans_실루엣,평균가격,최소가격,최대가격,등급별가격비율,등급별가격,등급별좌석수,등급별예매율,등급별가격_1,등급별가격_2,등급별가격_3,등급별가격_4,등급별가격_5
0,2018-11-25 17:00:00,0.026,0.62,2,,4,0.954535,67840.32,30000,100000,"[1.0, 1.67, 2.34, 3.29]","[30349, 50539, 70896, 100000]","[401, 844, 374, 886]","[0.1097, 0.4017, 0.7914, 0.9876]",1.0,1.67,2.34,3.29,
1,2018-12-08 17:00:00,0.021,0.862,2,,4,0.876801,55277.45,20000,100000,"[1.0, 1.9, 2.77, 3.85]","[25974, 49397, 71921, 100000]","[765, 920, 315, 505]","[0.6065, 0.9641, 1.0, 0.9762]",1.0,1.9,2.77,3.85,
2,2019-01-11 20:00:00,0.386,0.636,6,50.0,3,0.851291,98778.44,50000,140000,"[1.0, 1.61, 2.22]","[62612, 100939, 139255]","[913, 820, 772]","[0.5093, 0.6354, 0.785]",1.0,1.61,2.22,,
3,2019-02-05 20:00:00,0.023,0.858,1,1.0,4,0.82929,71992.02,15000,150000,"[1.0, 2.39, 4.12, 6.18]","[24276, 57940, 100000, 150000]","[718, 801, 628, 358]","[0.5947, 0.9513, 0.9602, 1.0]",1.0,2.39,4.12,6.18,
4,2019-02-16 17:00:00,0.157,0.476,12,2.0,4,0.77552,26950.1,5000,50000,"[1.0, 2.0, 3.16, 4.53]","[10791, 21559, 34109, 48937]","[847, 622, 387, 649]","[0.3235, 0.5868, 0.584, 0.5039]",1.0,2.0,3.16,4.53,
5,2019-02-23 17:00:00,0.062,0.543,1,,4,1.0,39261.48,20000,70000,"[1.0, 1.5, 2.5, 3.5]","[20000, 30000, 50000, 70000]","[712, 731, 608, 454]","[0.0899, 0.513, 0.8339, 0.9141]",1.0,1.5,2.5,3.5,
6,2019-03-09 17:00:00,0.394,0.742,6,20.0,4,0.851387,67770.46,5000,110000,"[1.0, 1.66, 2.61, 3.53]","[30747, 51188, 80170, 108532]","[582, 686, 617, 620]","[0.6306, 0.7143, 0.8185, 0.8]",1.0,1.66,2.61,3.53,
7,2019-03-14 20:00:00,0.688,0.695,18,1.0,4,0.923653,162225.55,60000,260000,"[1.0, 1.86, 2.51, 4.29]","[60075, 111701, 150583, 257767]","[603, 432, 532, 938]","[0.8955, 0.6944, 0.6842, 0.5725]",1.0,1.86,2.51,4.29,
8,2019-05-03 20:00:00,0.171,0.18,11,4.0,4,0.704202,66738.52,30000,100000,"[1.0, 1.69, 2.41, 2.9]","[34282, 57960, 82634, 99378]","[766, 642, 317, 780]","[0.1997, 0.1713, 0.1514, 0.1795]",1.0,1.69,2.41,2.9,
9,2019-05-08 20:00:00,0.29,0.499,11,6.0,4,0.743745,85590.82,5000,110000,"[1.0, 2.16, 3.04, 4.05]","[26029, 56110, 79111, 105512]","[272, 347, 422, 1464]","[0.1507, 0.4092, 0.2701, 0.651]",1.0,2.16,3.04,4.05,


In [16]:
공연별_원가격추정군집분석_결과요약.describe()

Unnamed: 0,금액명시비율,예약율,knn_n_neighbors_1,knn_n_neighbors_2,kmeans_군집수,kmeans_실루엣,평균가격,최소가격,최대가격,등급별가격_1,등급별가격_2,등급별가격_3,등급별가격_4,등급별가격_5
count,151.0,151.0,151.0,96.0,151.0,151.0,151.0,151.0,151.0,151.0,151.0,151.0,129.0,15.0
mean,0.200119,0.462483,9.178808,10.947917,3.953642,0.81452,75421.831987,27880.794702,113476.821192,1.0,2.025364,2.94755,4.07,7.360667
std,0.162038,0.227569,10.292448,14.911931,0.494473,0.170965,34062.343305,18712.180401,49146.018413,0.0,1.223167,2.020279,3.023574,3.151681
min,0.005,0.032,1.0,1.0,3.0,0.0,9453.09,5000.0,10000.0,1.0,1.0,1.0,1.0,2.74
25%,0.0665,0.3015,3.0,1.0,4.0,0.734178,54509.98,10000.0,85000.0,1.0,1.485,1.985,2.59,4.89
50%,0.168,0.465,6.0,4.0,4.0,0.845171,70900.2,30000.0,110000.0,1.0,1.67,2.34,3.16,6.66
75%,0.3065,0.628,11.0,12.25,4.0,0.930638,87494.01,40000.0,130000.0,1.0,2.03,3.07,4.14,10.205
max,0.718,0.981,50.0,50.0,5.0,1.0,225870.26,75000.0,320000.0,1.0,9.73,16.38,23.11,11.9


#### 5-2) 공연별 결과 데이터

In [15]:
# 전체공연좌석별 예매 데이터
공연별_결과_list = []
for 공연시간 in list(df_summary.loc[df_summary['금액명시비율']!=0, '공연시간']):
    공연별_결과_list.append(df_summary[df_summary['전체공연시간'] == 공연시간])

df_result = pd.concat(공연별_결과_list, axis=0, ignore_index=True)
print(f"""df_result 
memory usage: {round(df_result.memory_usage(deep=False).sum() / 1024**2, 1)}+ MB""")
df_result.to_parquet('전체공연_원가격원등급추정결과.parquet', index=False)

df_result 
memory usage: 111.1+ MB
