## 예술의전당 콘서트홀 가격 모델

### 1. 라이브러리

In [161]:
# base
import pandas as pd
import numpy as np
import re

# etc.
import sys
from glob import glob
import warnings
from tqdm import tqdm
import time 
import datetime as dt 
from scipy import stats


# visualization
import matplotlib.pyplot as plt
import koreanize_matplotlib
import plotly.express as px
import seaborn as sns
import matplotlib.colors as mcolors
from matplotlib.cm import ScalarMappable

# M.L
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.linear_model import LinearRegression
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import mean_squared_error, silhouette_score
from kneed import KneeLocator

In [4]:
# settings
%matplotlib inline
pd.set_option("display.max_columns", 100)
pd.set_option('display.max_rows', 200)
warnings.filterwarnings(action = 'ignore')


### 2. 데이터셋 로드

In [11]:
# 기존의 제공된 데이터프레임에 빈좌석, 좌표, 무대와의 관계가 추가된 데이터프레임
file_path = '../data/빈좌석_포함_클래식_데이터.parquet'
df = pd.read_parquet(glob(file_path)[0])

# 필요없는 컬럼 삭제
df = df.drop(['membership_type_1', 'membership_type_2', 'membership_type_3', 
              'membership_type_4', 'membership_type_5', 'membership_type_6',], axis=1)

print(f"""Found.. {len(glob(file_path))} file(s) : {glob(file_path)}
Reading.. {glob(file_path)[-1]}
df.shape : {df.shape}""")


Found.. 1 file(s) : ['../data/빈좌석_포함_클래식_데이터.parquet']
Reading.. ../data/빈좌석_포함_클래식_데이터.parquet
df.shape : (405810, 36)


### 3. 클래스/함수 : 원가격, 원등급 추정

In [236]:
# knn regression
class Performance_reg:
    instance_cnt = 0
    
    def __init__(self, df):
        Performance_reg.instance_cnt += 1
        self.instance_cnt = Performance_reg.instance_cnt
        self.p_val = 2
        self.df = df
        self.perform_time = df['전체공연시간'].iloc[0]
        self.get_original_price()

    def get_original_price(self):
        """
        discount_type에서 할인율을 추출하고 역산해서 '할인전금액'을 df에 컬럼으로 추가하는 함수
        """
        # 할인이 적용되기 전 가격 추정
        self.df['할인율'] = self.df['discount_type'].str.extract('(\d+)%')
        self.df['할인율'] = self.df['할인율'].fillna(0).astype(int) / 100
        self.df['할인전가격'] = (self.df['price'].fillna(0) // (1 - self.df['할인율'])).round(-2).astype(int)
        self.df['할인전가격'] = self.df['할인전가격'].fillna(0)
        
        # 원가격추정 컬럼 추가 (5000원 단위로)
        self.df['원가격추정'] = ((self.df['할인전가격'] + 2500) // 5000 * 5000).copy()
        self.priced_seat = self.df[self.df['원가격추정'] > 0]
        self.unpriced_seat = self.df[self.df['원가격추정'] == 0]
        self.priced_rate = round((self.priced_seat.shape[0] / self.df.shape[0]), 3)
        self.booked_rate = round(self.df['예매여부'].mean(), 3)

    def get_best_n_neighbors(self):
        """
        knn 가격 추정 모델의 적절한 n_neighbors값을 찾는 함수
        """
        X = self.priced_seat.loc[self.priced_seat['층']!='합창석', ['X', 'Y', 'Z']] 
        y = self.priced_seat.loc[self.priced_seat['층']!='합창석', '원가격추정'] // 1000
        cv_scores = []
        for n in range(1, min(50, (X.shape[0]*9//10)) + 1):
            model = KNeighborsRegressor(n_neighbors=n, weights='distance', p=p_val)
            scores = cross_val_score(model, X, y, cv=10, scoring='neg_mean_squared_error')
            cv_scores.append(scores.mean())
            self.best_n_neighbors = np.argmax(cv_scores) + 1
            self.knn_mse = max(cv_scores) * -1

    def estimate_price(self):
        """
        knn모델에 추정된 n_neighbors값을 적용해서 판매되지 않은 티켓의 가격을 추정하는 함수
        """
        # 가격을 하나도 알 수 없는 경우 종료
        if self.priced_rate == 0:
            self.best_n_neighbors = 1
            self.knn_mse = 0
            self.mean_price = self.max_price = self.min_price = 0
            return
        
        # 1) 합창석이 아닌 좌석 : knn모델로 가젹 추정
        self.get_best_n_neighbors()  # 적절한 n_neighbors값 찾기
        X = self.priced_seat.loc[self.priced_seat['층']!='합창석', ['X', 'Y', 'Z']] 
        y = self.priced_seat.loc[self.priced_seat['층']!='합창석', '원가격추정'] // 1000
        model = KNeighborsRegressor(n_neighbors=self.best_n_neighbors, weights='distance', p=p_val)
        model.fit(X, y)
        y_pred = model.predict(self.unpriced_seat.loc[self.unpriced_seat['층']!='합창석', ['X', 'Y', 'Z']])
        self.df.loc[(self.df['원가격추정']==0) & (self.df['층']!='합창석'), '원가격추정'] = y_pred * 1000
        self.df.loc[(self.df['층']!='합창석') & (self.df['원가격추정'] < 2500), '원가격추정'] = 2500
        
        # 2) 합창석이 아닌 좌석 : 이상치 보정
        threshold = 1.75  # 이상치 판단 기준
        self.df['원가격추정'] = self.df['원가격추정'].fillna(0)
        self.df['층블록'] = self.df['층'].astype(str) + self.df['블록'].astype(str)
        self.df['z_score'] = self.df.groupby('층블록')['원가격추정'].transform(lambda x: np.abs(stats.zscore(x))).round(2).fillna(0)
        temp_dict = self.df[self.df['층'] != '합창석'].groupby(['층', '블록'])['원가격추정'].median().to_dict()
        mask = (self.df['층'] != '합창석') & (self.df['z_score'] > threshold)
        self.df.loc[mask, '원가격추정'] = self.df[mask].apply(lambda row: temp_dict.get((row['층'], row['블록']), row['원가격추정']), axis=1)
        self.df = self.df.drop('층블록', axis=1)

        # 3) 합창석 좌석 : 블록별 판매된 티켓의 최빈값으로 맵핑
        temp_dict = self.priced_seat.loc[self.priced_seat['층']=='합창석'].groupby('블록')['원가격추정'].agg(
            lambda x: x.mode().iloc[0] if not x.empty else None).to_dict()
        self.df.loc[self.df['층']=='합창석', '원가격추정'] = self.df.loc[self.df['층']=='합창석', '블록'].map(temp_dict)

        # 4) 가격 단위 보정 : 5000원 단위로 표현
        self.df['원가격추정'] = ((self.df['원가격추정'].fillna(0) + 2500) // 5000 * 5000).astype(int)
 
    def estimate_cluster_kmeans(self):
        # 중복되는 값이 없도록 난수를 더해서 노이즈 만들기
        self.df['rand'] = np.random.rand(self.df.shape[0])
        self.df['원가격추정_rand'] = self.df['원가격추정'] + self.df['rand']

        # 군집화 모델 생성 (K-means)
        X = self.df[['원가격추정_rand']]
        inertia = []
        k_range = range(1, 11)
        for k in k_range:
            kmeans = KMeans(n_clusters=k, random_state=42, n_init='auto')
            kmeans.fit(X)
            inertia.append(kmeans.inertia_)
        
        # 적절한 K값 찾기 (elbow point)        
        kneedle = KneeLocator(k_range[1:], inertia[1:], curve='convex', direction='decreasing')
        self.best_k = kneedle.elbow

        # 찾은 K값을 적용해서 원등급 추정하기
        while self.best_k <= 6:
            kmeans = KMeans(n_clusters=self.best_k, random_state=42, n_init='auto')
            self.df['원등급추정'] = kmeans.fit_predict(X)
            cluster_means = self.df.groupby('원등급추정').agg({'seat':'count', '원가격추정':'mean', '예매여부':'mean'}).reset_index()           
            cluster_means = cluster_means.sort_values(by='원가격추정')
            grade_mapping = {grade: idx for idx, grade in enumerate(cluster_means['원등급추정'])}
            self.df['원등급추정'] = (self.df['원등급추정'].map(grade_mapping) + 1)
            temp_min = min((x for x in cluster_means['원가격추정'] if x > 0), default=None)
            self.class_price_ratio = list((cluster_means['원가격추정'] / temp_min).round(2)) if temp_min else [0 for _ in range(len(cluster_means))]
            self.class_price = list(cluster_means['원가격추정'].round().astype(int))
            self.class_seats_cnt = list(cluster_means['seat'])

            # 합창석의 예매되지 않은 좌서들이 그룹핑에 섞여있으면 k+=1 해서 다시 군집분석 수행        
            if (self.class_price[0] != 0) and (self.class_seats_cnt[0] > 274) and (self.df['원가격추정'].min() == 0):
                self.best_k += 1
            else:
                break
    
            
        # 원등급, 등급별 가격의 비율, 군집분석의 실루엣점수
        self.df = self.df.drop(['원가격추정_rand', 'rand'], axis=1)
        self.class_booked_ratio = list(cluster_means['예매여부'].round(4))
        self.silhouette_score = silhouette_score(self.df[['원가격추정']], self.df['원등급추정'])
        self.mean_price = self.df['원가격추정'].mean().round(2)
        self.max_price = self.df['원가격추정'].max()
        self.min_price = self.df['원가격추정'].min()

    def px3dscatter(self, col_name):
        # 결과 시각화
        fig = px.scatter_3d(self.df, x='X', y='Y', z='Z', color=col_name, 
                            hover_name='seat', hover_data=['예매여부', '할인전가격', '원가격추정', 'z_score', '원등급추정'],
                            width=800, height=600)
        fig.update_traces(marker={'size': 1})
        fig.show()

In [207]:
# knn classifier
class Performance_clf:
    instance_cnt = 0
    
    def __init__(self, df):
        Performance_clf.instance_cnt += 1
        self.instance_cnt = Performance_clf.instance_cnt
        self.df = df
        self.perform_time = df['전체공연시간'].iloc[0]
        self.get_original_price()

    def get_original_price(self):
        """
        discount_type에서 할인율을 추출하고 역산해서 '할인전금액'을 df에 컬럼으로 추가하는 함수
        """
        # 할인이 적용되기 전 가격 추정
        self.df['할인율'] = self.df['discount_type'].str.extract('(\d+)%')
        self.df['할인율'] = self.df['할인율'].fillna(0).astype(int) / 100
        self.df['할인전가격'] = (self.df['price'].fillna(0) // (1 - self.df['할인율'])).round(-2).astype(int)
        self.df['할인전가격'] = self.df['할인전가격'].fillna(0)
        
        # 원가격추정 컬럼 추가 (5000원 단위로)
        self.df['원가격추정'] = ((self.df['할인전가격'] + 2500) // 5000 * 5000).copy()
        self.priced_seat = self.df[self.df['원가격추정'] > 0]
        self.unpriced_seat = self.df[self.df['원가격추정'] == 0]
        self.priced_rate = round((self.priced_seat.shape[0] / self.df.shape[0]), 3)
        self.booked_rate = round(self.df['예매여부'].mean(), 3)

    def get_best_n_neighbors(self):
        """
        knn 가격 추정 모델의 적절한 n_neighbors값을 찾는 함수
        """
        X = self.priced_seat.loc[self.priced_seat['층']!='합창석', ['X', 'Y', 'Z']] 
        y = self.priced_seat.loc[self.priced_seat['층']!='합창석', '원가격추정'] // 1000
        cv_scores = []
        
        # 클래스 수 계산
        num_classes = len(np.unique(y))
        
        # 클래스 수가 2개 이상인 경우에만 교차 검증 수행
        if num_classes > 1:
            cv_value = min(num_classes, 10)
            
            skf = StratifiedKFold(n_splits=cv_value, shuffle=True, random_state=42)  # 클래스에 따라 분할
            for n in range(1, min(50, (X.shape[0] * 9 // 10)) + 1):
                model = KNeighborsClassifier(n_neighbors=n)
                scores = cross_val_score(model, X, y, cv=skf, scoring='neg_mean_squared_error')  # StratifiedKFold를 사용하여 교차 검증
                cv_scores.append(-scores.mean())
            self.best_n_neighbors = np.argmax(cv_scores) + 1
            self.knn_mse = np.mean(cv_scores)
        else:
            self.best_n_neighbors = 1
            self.knn_mse = 0

    def estimate_price(self):
        """
        knn모델에 추정된 n_neighbors값을 적용해서 판매되지 않은 티켓의 가격을 추정하는 함수
        """
        # 가격을 하나도 알 수 없는 경우 종료
        if self.priced_rate == 0:
            self.best_n_neighbors = 1
            self.knn_mse = 0
            self.mean_price = self.max_price = self.min_price = 0
            return
        
        # 1) 합창석이 아닌 좌석 : knn모델로 가젹 추정
        self.get_best_n_neighbors()  # 적절한 n_neighbors값 찾기
        X = self.priced_seat.loc[self.priced_seat['층'] != '합창석', ['X', 'Y', 'Z']]
        y = self.priced_seat.loc[self.priced_seat['층'] != '합창석', '원가격추정'] // 1000  # 가격대(등급)를 정수로 변환
        model = KNeighborsClassifier(n_neighbors=self.best_n_neighbors, weights='distance')
        model.fit(X, y)
        y_pred = model.predict(self.unpriced_seat.loc[self.unpriced_seat['층'] != '합창석', ['X', 'Y', 'Z']])
        self.df.loc[(self.df['원가격추정'] == 0) & (self.df['층'] != '합창석'), '원가격추정'] = y_pred * 1000  # 다시 가격으로 변환
        self.df.loc[self.df['원가격추정'] < 2500, '원가격추정'] = 2500

        # 3) 합창석 좌석 : 블록별 판매된 티켓의 최빈값으로 맵핑
        temp_dict = self.priced_seat.loc[self.priced_seat['층']=='합창석'].groupby('블록')['원가격추정'].agg(
            lambda x: x.mode().iloc[0] if not x.empty else None).to_dict()
        self.df.loc[self.df['층']=='합창석', '원가격추정'] = self.df.loc[self.df['층']=='합창석', '블록'].map(temp_dict)

        # 4) 가격 단위 보정 : 5000원 단위로 표현
        self.df['원가격추정'] = ((self.df['원가격추정'].fillna(0) + 2500) // 5000 * 5000).astype(int)
 
    def estimate_cluster_kmeans(self):
        # 중복되는 값이 없도록 난수를 더해서 노이즈 만들기
        self.df['rand'] = np.random.rand(self.df.shape[0])
        self.df['원가격추정_rand'] = self.df['원가격추정'] + self.df['rand']

        # 군집화 모델 생성 (K-means)
        X = self.df[['원가격추정_rand']]
        inertia = []
        k_range = range(1, 11)
        for k in k_range:
            kmeans = KMeans(n_clusters=k, random_state=42, n_init='auto')
            kmeans.fit(X)
            inertia.append(kmeans.inertia_)
        
        # 적절한 K값 찾기 (elbow point)        
        kneedle = KneeLocator(k_range[1:], inertia[1:], curve='convex', direction='decreasing')
        self.best_k = kneedle.elbow

        # 찾은 K값을 적용해서 원등급 추정하기
        while self.best_k <= 6:
            kmeans = KMeans(n_clusters=self.best_k, random_state=42, n_init='auto')
            self.df['원등급추정'] = kmeans.fit_predict(X)
            cluster_means = self.df.groupby('원등급추정').agg({'seat':'count', '원가격추정':'mean', '예매여부':'mean'}).reset_index()           
            cluster_means = cluster_means.sort_values(by='원가격추정')
            grade_mapping = {grade: idx for idx, grade in enumerate(cluster_means['원등급추정'])}
            self.df['원등급추정'] = (self.df['원등급추정'].map(grade_mapping) + 1)
            temp_min = min((x for x in cluster_means['원가격추정'] if x > 0), default=None)
            self.class_price_ratio = list((cluster_means['원가격추정'] / temp_min).round(2)) if temp_min else [0 for _ in range(len(cluster_means))]
            self.class_price = list(cluster_means['원가격추정'].round().astype(int))
            self.class_seats_cnt = list(cluster_means['seat'])

            # 합창석의 예매되지 않은 좌서들이 그룹핑에 섞여있으면 k+=1 해서 다시 군집분석 수행        
            if (self.class_price[0] != 0) and (self.class_seats_cnt[0] > 274) and (self.df['원가격추정'].min() == 0):
                self.best_k += 1
            else:
                break
    
            
        # 원등급, 등급별 가격의 비율, 군집분석의 실루엣점수
        self.df = self.df.drop(['원가격추정_rand', 'rand'], axis=1)
        self.class_booked_ratio = list(cluster_means['예매여부'].round(4))
        self.silhouette_score = silhouette_score(self.df[['원가격추정']], self.df['원등급추정'])
        self.mean_price = self.df['원가격추정'].mean().round(2)
        self.max_price = self.df['원가격추정'].max()
        self.min_price = self.df['원가격추정'].min()

    def px3dscatter(self, col_name):
        # 결과 시각화
        fig = px.scatter_3d(self.df, x='X', y='Y', z='Z', color=col_name, 
                            hover_name='seat', hover_data=['예매여부', '할인전가격', '원가격추정', '원등급추정'],
                            width=800, height=600)
        fig.update_traces(marker={'size': 1})
        fig.show()

### 4. 원가격/등급 추정 군집분석 결과

In [214]:
# 전체 데이터를 공연시간을 기준으로 공연별 분할
공연시간_list = sorted(df['전체공연시간'].unique())
공연별_df_list = [df[df['전체공연시간'] == 공연시간] for 공연시간 in 공연시간_list]

# 공연별 군집분석 결과 컬럼
instance_cnt_list = []
perform_time_list = []
priced_rate_list = []
booked_rate_list = []
best_n_neighbors_list = []
knn_mse_list = []
best_k_list = []
silhouette_score_list = []
mean_price_list = []
max_price_list = []
min_price_list = []

class_price_ratio_list = []
class_price_list = []
class_seats_cnt_list = []
class_booked_ratio_list = []
df_list = []

# class > 군집분석 적용
for 공연별_df in tqdm(공연별_df_list):
    p = Performance_reg(공연별_df)
    p.estimate_price() # knn => 원가격 추정
    p.estimate_cluster_kmeans() # k-means => 원등급 추정

    instance_cnt_list.append(p.instance_cnt)
    perform_time_list.append(p.perform_time)
    priced_rate_list.append(p.priced_rate)
    booked_rate_list.append(p.booked_rate)
    best_n_neighbors_list.append(p.best_n_neighbors)
    knn_mse_list.append(p.knn_mse)
    best_k_list.append(p.best_k)
    silhouette_score_list.append(p.silhouette_score)
    mean_price_list.append(p.mean_price)
    max_price_list.append(p.max_price)
    min_price_list.append(p.min_price)
    class_price_ratio_list.append(p.class_price_ratio)
    class_price_list.append(p.class_price)
    class_seats_cnt_list.append(p.class_seats_cnt)
    class_booked_ratio_list.append(p.class_booked_ratio)
    df_list.append(p.df)

# 군집결과 데이터프레임으로 합치기
df_result_reg = pd.concat(df_list, axis=0)
df_summary_reg = pd.DataFrame({
    '공연시간' : perform_time_list,
    '금액명시비율' : priced_rate_list,
    '예약율' : booked_rate_list,
    'knn_n_neighbors' : best_n_neighbors_list,
    'knn_mse' : knn_mse_list,
    'kmeans_군집수' : best_k_list,
    'kmeans_실루엣' : silhouette_score_list,
    '평균가격' : mean_price_list,
    '최소가격' : min_price_list,
    '최대가격' : max_price_list,
    '등급별가격비율' : class_price_ratio_list,
    '등급별가격' : class_price_list,
    '등급별좌석수' : class_seats_cnt_list,
    '등급별예매율' : class_booked_ratio_list
})

100%|██████████| 162/162 [04:02<00:00,  1.50s/it]


In [209]:
# 전체 데이터를 공연시간을 기준으로 공연별 분할
공연시간_list = sorted(df['전체공연시간'].unique())
공연별_df_list = [df[df['전체공연시간'] == 공연시간] for 공연시간 in 공연시간_list]

# 공연별 군집분석 결과 컬럼
instance_cnt_list = []
perform_time_list = []
priced_rate_list = []
booked_rate_list = []
best_n_neighbors_list = []
knn_mse_list = []
best_k_list = []
silhouette_score_list = []
mean_price_list = []
max_price_list = []
min_price_list = []

class_price_ratio_list = []
class_price_list = []
class_seats_cnt_list = []
class_booked_ratio_list = []
df_list = []

# class > 군집분석 적용
for 공연별_df in tqdm(공연별_df_list):
    p = Performance_clf(공연별_df)
    p.estimate_price() # knn => 원가격 추정
    p.estimate_cluster_kmeans() # k-means => 원등급 추정

    instance_cnt_list.append(p.instance_cnt)
    perform_time_list.append(p.perform_time)
    priced_rate_list.append(p.priced_rate)
    booked_rate_list.append(p.booked_rate)
    best_n_neighbors_list.append(p.best_n_neighbors)
    knn_mse_list.append(p.knn_mse)
    best_k_list.append(p.best_k)
    silhouette_score_list.append(p.silhouette_score)
    mean_price_list.append(p.mean_price)
    max_price_list.append(p.max_price)
    min_price_list.append(p.min_price)
    class_price_ratio_list.append(p.class_price_ratio)
    class_price_list.append(p.class_price)
    class_seats_cnt_list.append(p.class_seats_cnt)
    class_booked_ratio_list.append(p.class_booked_ratio)
    df_list.append(p.df)

    # 시각화
    # print(p.perform_time)
    # p.px3dscatter('원등급추정')

# 군집결과 데이터프레임으로 합치기
df_result_clf = pd.concat(df_list, axis=0)
df_summary_clf = pd.DataFrame({
    '공연시간' : perform_time_list,
    '금액명시비율' : priced_rate_list,
    '예약율' : booked_rate_list,
    'knn_n_neighbors' : best_n_neighbors_list,
    'knn_mse' : knn_mse_list,
    'kmeans_군집수' : best_k_list,
    'kmeans_실루엣' : silhouette_score_list,
    '평균가격' : mean_price_list,
    '최소가격' : min_price_list,
    '최대가격' : max_price_list,
    '등급별가격비율' : class_price_ratio_list,
    '등급별가격' : class_price_list,
    '등급별좌석수' : class_seats_cnt_list,
    '등급별예매율' : class_booked_ratio_list
})

100%|██████████| 162/162 [04:49<00:00,  1.78s/it]


### 5. 결과 저장

* 총 162개의 클래식공연 중 11개의 공연은 가격이 전혀 명시되지 않았음.  => 해당 공연의 데이터에서는 원가격, 원등급 추정이 불가함.    
* 앞으로의 EDA를 위해 151개의 공연 데이터만 사용하기로 함.

#### 5-1) 공연별 요약 데이터 (df_summary)

In [216]:
# 공연별 (추정된) 가격, 등급 정보 요약 (군집분석 불가한 11개 공연 데이터는 제외)
공연별_원가격추정군집분석_결과요약 = df_summary_reg.loc[df_summary_reg['금액명시비율']!=0].reset_index().drop('index', axis=1)

# # 컬럼 분리
공연별_원가격추정군집분석_결과요약['등급별가격_1'] = 공연별_원가격추정군집분석_결과요약['등급별가격비율'].map(lambda x: x[0])
공연별_원가격추정군집분석_결과요약['등급별가격_2'] = 공연별_원가격추정군집분석_결과요약['등급별가격비율'].map(lambda x: x[1])
공연별_원가격추정군집분석_결과요약['등급별가격_3'] = 공연별_원가격추정군집분석_결과요약['등급별가격비율'].map(lambda x: x[2] if len(x)>2 else np.nan)
공연별_원가격추정군집분석_결과요약['등급별가격_4'] = 공연별_원가격추정군집분석_결과요약['등급별가격비율'].map(lambda x: x[3] if len(x)>3 else np.nan)
공연별_원가격추정군집분석_결과요약['등급별가격_5'] = 공연별_원가격추정군집분석_결과요약['등급별가격비율'].map(lambda x: x[4] if len(x)>4 else np.nan)

# 데이터프레임 저장
공연별_원가격추정군집분석_결과요약.to_csv('공연별_원가격추정군집분석_reg_결과요약.csv', index=False)
공연별_원가격추정군집분석_결과요약_reg = 공연별_원가격추정군집분석_결과요약

#### 5-2) 공연별 결과 데이터

In [211]:
# 전체공연좌석별 예매 데이터
공연별_결과_list = []
for 공연시간 in list(df_summary_clf.loc[df_summary_clf['금액명시비율']!=0, '공연시간']):
    공연별_결과_list.append(df_result_clf[df_result_clf['전체공연시간'] == 공연시간])

df_result_clf = pd.concat(공연별_결과_list, axis=0, ignore_index=True)
print(f"""df_result 
memory usage: {round(df_result_clf.memory_usage(deep=False).sum() / 1024**2, 1)}+ MB""")
df_result_clf.to_parquet('전체공연_원가격원등급추정결과.parquet', index=False)

df_result 
memory usage: 108.2+ MB


In [234]:
df_result_clf

Unnamed: 0,seat,층,블록,열,넘버,X,Y,Z,대칭점,좌우시야각,상하시야각,좌우면적시야각,상하면적시야각,무대까지의 거리,예매여부,age,gender,tran_date,tran_time,play_date,play_st_time,price,ticket_cancel,discount_type,performance_code,pre_open_date,open_date,running_time,intermission,member_yn,공연연도,공연월,공연일,공연연월,전체공연시간,전체거래시간,할인율,할인전가격,원가격추정,원등급추정
0,1층 A블록1열 1,1층,A블록,1,1,1451,542,-93,1층 E블록1열 9,69.517555,3.436010,15.219900,33.948421,1551.713247,1,,,2018-11-16,15:06:00,2018-11-25,17:00:00,0.0,0.0,기획사판매,715.0,2018-10-28,2018-10-29,120.0,15.0,N,2018.0,11.0,2018-11-25,2018-11-01,2018-11-25 17:00:00,2018-11-16 15:06:00,0.00,0,50000,2
1,1층 A블록1열 2,1층,A블록,1,2,1406,555,-93,1층 E블록1열 8,68.459024,3.520697,16.945594,34.331746,1514.433888,1,,,2018-11-16,15:06:00,2018-11-25,17:00:00,0.0,0.0,기획사판매,715.0,2018-10-28,2018-10-29,120.0,15.0,N,2018.0,11.0,2018-11-25,2018-11-01,2018-11-25 17:00:00,2018-11-16 15:06:00,0.00,0,50000,2
2,1층 A블록1열 3,1층,A블록,1,3,1361,568,-93,1층 E블록1열 7,67.347261,3.608334,18.869853,34.695385,1477.698887,1,,,2018-11-16,15:06:00,2018-11-25,17:00:00,0.0,0.0,기획사판매,715.0,2018-10-28,2018-10-29,120.0,15.0,N,2018.0,11.0,2018-11-25,2018-11-01,2018-11-25 17:00:00,2018-11-16 15:06:00,0.00,0,50000,2
3,1층 A블록1열 4,1층,A블록,1,4,1315,580,-93,1층 E블록1열 6,66.199420,3.702327,21.009736,35.069575,1440.234009,1,,,2018-11-16,15:06:00,2018-11-25,17:00:00,0.0,0.0,기획사판매,715.0,2018-10-28,2018-10-29,120.0,15.0,N,2018.0,11.0,2018-11-25,2018-11-01,2018-11-25 17:00:00,2018-11-16 15:06:00,0.00,0,50000,2
4,1층 A블록1열 5,1층,A블록,1,5,1270,591,-93,1층 E블록1열 5,65.044882,3.798387,23.314833,35.426958,1403.862529,1,,,2018-11-16,15:06:00,2018-11-25,17:00:00,0.0,0.0,기획사판매,715.0,2018-10-28,2018-10-29,120.0,15.0,N,2018.0,11.0,2018-11-25,2018-11-01,2018-11-25 17:00:00,2018-11-16 15:06:00,0.00,0,50000,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
378250,2층 BOX6 2,2층,BOX,6,2,-1432,505,383,2층 BOX1 1,-70.574617,-14.156604,21.173142,34.169657,1565.994253,1,60.0,M,2023-03-26,16:33:00,2023-06-06,19:30:00,15000.0,0.0,장애인/국가유공자 할인50%,1508.0,2023-03-25,2023-03-26,100.0,15.0,Y,2023.0,6.0,2023-06-06,2023-06-01,2023-06-06 19:30:00,2023-03-26 16:33:00,0.50,30000,30000,2
378251,2층 BOX6 3,2층,BOX,6,3,-1500,565,428,2층 BOX1 4,-69.360281,-14.950280,21.144470,32.190452,1659.038577,1,60.0,M,2023-03-26,16:33:00,2023-06-06,19:30:00,15000.0,0.0,장애인/국가유공자 할인50%,1508.0,2023-03-25,2023-03-26,100.0,15.0,Y,2023.0,6.0,2023-06-06,2023-06-01,2023-06-06 19:30:00,2023-03-26 16:33:00,0.50,30000,30000,2
378252,2층 BOX6 4,2층,BOX,6,4,-1454,617,428,2층 BOX1 3,-67.006224,-15.161505,23.354990,31.913243,1636.456232,1,50.0,F,2023-05-31,01:57:00,2023-06-06,19:30:00,28000.0,0.0,그린회원 할인5%,1508.0,2023-03-25,2023-03-26,100.0,15.0,Y,2023.0,6.0,2023-06-06,2023-06-01,2023-06-06 19:30:00,2023-05-31 01:57:00,0.05,29500,30000,2
378253,2층 BOX6 5,2층,BOX,6,5,-1522,677,483,2층 BOX1 6,-66.020049,-16.169769,23.078793,30.097223,1734.388077,0,,,NaT,,2023-06-06,19:30:00,,0.0,,1508.0,2023-03-25,2023-03-26,100.0,15.0,,2023.0,6.0,2023-06-06,2023-06-01,2023-06-06 19:30:00,NaT,0.00,0,30000,2


In [217]:
# 전체공연좌석별 예매 데이터
공연별_결과_list = []
for 공연시간 in list(df_summary_reg.loc[df_summary_reg['금액명시비율']!=0, '공연시간']):
    공연별_결과_list.append(df_result_reg[df_result_reg['전체공연시간'] == 공연시간])

df_result_reg = pd.concat(공연별_결과_list, axis=0, ignore_index=True)
print(f"""df_result 
memory usage: {round(df_result_reg.memory_usage(deep=False).sum() / 1024**2, 1)}+ MB""")
df_result_reg.to_parquet('전체공연_원가격원등급추정결과_reg.parquet', index=False)

df_result 
memory usage: 111.1+ MB


In [233]:
df_result_reg

Unnamed: 0,seat,층,블록,열,넘버,X,Y,Z,대칭점,좌우시야각,상하시야각,좌우면적시야각,상하면적시야각,무대까지의 거리,예매여부,age,gender,tran_date,tran_time,play_date,play_st_time,price,ticket_cancel,discount_type,performance_code,pre_open_date,open_date,running_time,intermission,member_yn,공연연도,공연월,공연일,공연연월,전체공연시간,전체거래시간,할인율,할인전가격,원가격추정,z_score,원등급추정
0,1층 A블록1열 1,1층,A블록,1,1,1451,542,-93,1층 E블록1열 9,69.517555,3.436010,15.219900,33.948421,1551.713247,1,,,2018-11-16,15:06:00,2018-11-25,17:00:00,0.0,0.0,기획사판매,715.0,2018-10-28,2018-10-29,120.0,15.0,N,2018.0,11.0,2018-11-25,2018-11-01,2018-11-25 17:00:00,2018-11-16 15:06:00,0.00,0,50000,0.94,2
1,1층 A블록1열 2,1층,A블록,1,2,1406,555,-93,1층 E블록1열 8,68.459024,3.520697,16.945594,34.331746,1514.433888,1,,,2018-11-16,15:06:00,2018-11-25,17:00:00,0.0,0.0,기획사판매,715.0,2018-10-28,2018-10-29,120.0,15.0,N,2018.0,11.0,2018-11-25,2018-11-01,2018-11-25 17:00:00,2018-11-16 15:06:00,0.00,0,50000,0.94,2
2,1층 A블록1열 3,1층,A블록,1,3,1361,568,-93,1층 E블록1열 7,67.347261,3.608334,18.869853,34.695385,1477.698887,1,,,2018-11-16,15:06:00,2018-11-25,17:00:00,0.0,0.0,기획사판매,715.0,2018-10-28,2018-10-29,120.0,15.0,N,2018.0,11.0,2018-11-25,2018-11-01,2018-11-25 17:00:00,2018-11-16 15:06:00,0.00,0,50000,0.94,2
3,1층 A블록1열 4,1층,A블록,1,4,1315,580,-93,1층 E블록1열 6,66.199420,3.702327,21.009736,35.069575,1440.234009,1,,,2018-11-16,15:06:00,2018-11-25,17:00:00,0.0,0.0,기획사판매,715.0,2018-10-28,2018-10-29,120.0,15.0,N,2018.0,11.0,2018-11-25,2018-11-01,2018-11-25 17:00:00,2018-11-16 15:06:00,0.00,0,50000,0.94,2
4,1층 A블록1열 5,1층,A블록,1,5,1270,591,-93,1층 E블록1열 5,65.044882,3.798387,23.314833,35.426958,1403.862529,1,,,2018-11-16,15:06:00,2018-11-25,17:00:00,0.0,0.0,기획사판매,715.0,2018-10-28,2018-10-29,120.0,15.0,N,2018.0,11.0,2018-11-25,2018-11-01,2018-11-25 17:00:00,2018-11-16 15:06:00,0.00,0,50000,0.94,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
378250,2층 BOX6 2,2층,BOX,6,2,-1432,505,383,2층 BOX1 1,-70.574617,-14.156604,21.173142,34.169657,1565.994253,1,60.0,M,2023-03-26,16:33:00,2023-06-06,19:30:00,15000.0,0.0,장애인/국가유공자 할인50%,1508.0,2023-03-25,2023-03-26,100.0,15.0,Y,2023.0,6.0,2023-06-06,2023-06-01,2023-06-06 19:30:00,2023-03-26 16:33:00,0.50,30000,30000,0.23,2
378251,2층 BOX6 3,2층,BOX,6,3,-1500,565,428,2층 BOX1 4,-69.360281,-14.950280,21.144470,32.190452,1659.038577,1,60.0,M,2023-03-26,16:33:00,2023-06-06,19:30:00,15000.0,0.0,장애인/국가유공자 할인50%,1508.0,2023-03-25,2023-03-26,100.0,15.0,Y,2023.0,6.0,2023-06-06,2023-06-01,2023-06-06 19:30:00,2023-03-26 16:33:00,0.50,30000,30000,0.23,2
378252,2층 BOX6 4,2층,BOX,6,4,-1454,617,428,2층 BOX1 3,-67.006224,-15.161505,23.354990,31.913243,1636.456232,1,50.0,F,2023-05-31,01:57:00,2023-06-06,19:30:00,28000.0,0.0,그린회원 할인5%,1508.0,2023-03-25,2023-03-26,100.0,15.0,Y,2023.0,6.0,2023-06-06,2023-06-01,2023-06-06 19:30:00,2023-05-31 01:57:00,0.05,29500,30000,0.23,2
378253,2층 BOX6 5,2층,BOX,6,5,-1522,677,483,2층 BOX1 6,-66.020049,-16.169769,23.078793,30.097223,1734.388077,0,,,NaT,,2023-06-06,19:30:00,,0.0,,1508.0,2023-03-25,2023-03-26,100.0,15.0,,2023.0,6.0,2023-06-06,2023-06-01,2023-06-06 19:30:00,NaT,0.00,0,30000,0.23,2


In [218]:
공연별_원가격추정군집분석_결과요약_reg.describe()

Unnamed: 0,금액명시비율,예약율,knn_n_neighbors,knn_mse,kmeans_군집수,kmeans_실루엣,평균가격,최소가격,최대가격,등급별가격_1,등급별가격_2,등급별가격_3,등급별가격_4,등급별가격_5
count,151.0,151.0,151.0,151.0,151.0,151.0,151.0,151.0,151.0,151.0,151.0,151.0,132.0,32.0
mean,0.200119,0.462483,9.178808,229.399172,4.198675,0.82372,73810.94755,13211.92053,113476.821192,0.536424,3.49702,5.706424,8.313788,22.067812
std,0.162038,0.227569,10.292448,247.456795,0.848684,0.141875,34233.326244,20771.329366,49146.018413,0.500331,6.416412,10.866406,15.596469,29.70762
min,0.005,0.032,1.0,-0.0,3.0,0.0,9049.9,0.0,10000.0,0.0,1.0,1.0,1.0,2.28
25%,0.0665,0.3015,3.0,62.857452,4.0,0.732659,52622.755,0.0,85000.0,0.0,1.0,1.63,2.32,3.4475
50%,0.168,0.465,6.0,145.646596,4.0,0.848499,67305.39,0.0,110000.0,1.0,1.42,2.0,2.89,12.445
75%,0.3065,0.628,11.0,278.984749,4.0,0.930769,87615.77,27500.0,130000.0,1.0,1.995,2.98,4.065,30.3775
max,0.718,0.981,50.0,1121.102815,7.0,1.0,227097.8,70000.0,320000.0,1.0,48.83,91.82,112.57,154.38


In [223]:
공연별_원가격추정군집분석_결과요약_reg.describe() - 공연별_원가격추정군집분석_결과요약_clf.describe()

Unnamed: 0,금액명시비율,예약율,knn_n_neighbors,knn_mse,kmeans_군집수,kmeans_실루엣,평균가격,최소가격,최대가격,등급별가격_1,등급별가격_2,등급별가격_3,등급별가격_4,등급별가격_5
count,0.0,0.0,0.0,17.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-13.0
mean,0.0,0.0,-33.821192,-83.902575,-0.112583,-0.13476,114.314305,5496.688742,-596.02649,0.006623,-5.890066,-13.091126,-26.145909,-85.934854
std,0.0,0.0,-2.754646,-107.082906,-0.039651,0.028192,220.752553,5852.268858,417.50529,-0.000441,-31.779792,-65.084423,-126.536466,-291.23234
min,0.0,0.0,0.0,-0.0,0.0,-0.003992,0.0,0.0,0.0,0.0,0.0,0.0,-0.44,-0.22
25%,0.0,0.0,-40.0,-25.878907,0.0,-0.221649,-1076.845,0.0,-5000.0,0.0,0.0,-0.28,-0.3575,-2.8225
50%,0.0,0.0,-43.0,-16.216356,0.0,-0.145019,650.7,0.0,0.0,0.0,0.0,-0.35,-0.65,3.445
75%,0.0,0.0,-39.0,-85.445796,-1.0,-0.069231,-711.575,20000.0,0.0,0.0,-0.005,-1.03,-1.9425,10.3775
max,0.0,0.0,0.0,-562.98542,0.0,0.0,-1205.59,10000.0,0.0,0.0,-307.63,-497.68,-808.8,-1145.85


In [232]:
temp = 공연별_원가격추정군집분석_결과요약_reg
print(temp.loc[(temp['최소가격']==0) & (temp['등급별가격'][0]!=0)].shape)

(98, 19)


In [231]:
temp = 공연별_원가격추정군집분석_결과요약_clf
temp.loc[(temp['최소가격']==0) & (temp['등급별가격'][0]!=0)].shape

(98, 19)