## 예술의전당 콘서트홀 가격 모델

### 1. 라이브러리

In [1]:
# base
import pandas as pd
import numpy as np
import re

# etc.
import sys
from glob import glob
import warnings
from tqdm import tqdm
import time 
import datetime as dt 

# visualization
import matplotlib.pyplot as plt
import koreanize_matplotlib
import plotly.express as px
import seaborn as sns
import matplotlib.colors as mcolors
from matplotlib.cm import ScalarMappable

# M.L
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, silhouette_score
from kneed import KneeLocator

In [2]:
# settings
%matplotlib inline
pd.set_option("display.max_columns", 100)
pd.set_option('display.max_rows', 100)
warnings.filterwarnings(action = 'ignore')

rc = {
    "axes.facecolor": "#F6F6F6",
    "figure.facecolor": "#F6F6F6",
    "axes.edgecolor": "#000000",
    "grid.color": "#EBEBE7",
    "font.family": "malgun gothic",
    "axes.labelcolor": "#000000",
    "xtick.color": "#000000",
    "ytick.color": "#000000",
    "grid.alpha": 0.4
}

sns.set(rc=rc)


### 2. 데이터셋 로드

In [3]:
# 기존의 제공된 데이터프레임에 빈좌석, 좌표, 무대와의 관계가 추가된 데이터프레임
file_name = "빈좌석_포함_클래식_데이터.csv"
df = pd.read_csv(glob(file_name)[0], low_memory=False)

# 필요없는 컬럼 삭제
df = df.drop(['membership_type_1', 'membership_type_2', 'membership_type_3', 
              'membership_type_4', 'membership_type_5', 'membership_type_6',], axis=1)

print(f"""Found.. {len(glob(file_name))} file(s) : {glob(file_name)}
Reading.. {glob(file_name)[-1]}
df.shape : {df.shape}""")


Found.. 1 file(s) : ['빈좌석_포함_클래식_데이터.csv']
Reading.. 빈좌석_포함_클래식_데이터.csv
df.shape : (405810, 34)


### 3. 클래스/함수 : 원가격, 원등급 추정

In [57]:
class Performance:
    instance_cnt = 0

    def __init__(self, df):
        Performance.instance_cnt += 1
        self.instance_cnt = Performance.instance_cnt
        self.df = df
        self.perform_time = df['전체공연시간'].iloc[0]
        self.get_original_price()
    

    def get_original_price(self):
        """
        discount_type에서 할인율을 추출하고 역산해서 '할인전금액'을 df에 컬럼으로 추가하는 함수
        """
        self.df['할인율'] = self.df['discount_type'].str.extract('(\d+)%')
        self.df['할인율'] = self.df['할인율'].fillna(0).astype(int) / 100
        self.df['할인전가격'] = (self.df['price'].fillna(0) // (1 - self.df['할인율'])).round(-2).astype(int)
        self.df['원가격추정'] = self.df['할인전가격'].copy()
        self.priced_seat = self.df[self.df['원가격추정'] > 0]
        self.unpriced_seat = self.df[self.df['원가격추정'] == 0]
        self.priced_rate = round((self.priced_seat.shape[0] / self.df.shape[0]), 3)
        self.booked_rate = round(self.df['예매여부'].mean(), 3)


    def get_best_n_neighbors(self):
        """
        knn가격 추정 모델의 적절한 n_neighbors값을 찾는 함수
        """
        X, y = self.priced_seat[['X', 'Y', 'Z']], self.priced_seat['원가격추정']
        cv_scores = []
        for n in range(1, min(50, (X.shape[0]*9//10)) + 1):
            model = KNeighborsRegressor(n_neighbors=n, weights='distance', p=2)
            scores = cross_val_score(model, X, y, cv=10, scoring='neg_mean_squared_error')
            cv_scores.append(scores.mean())
            self.best_n_neighbors = [i for i in range(1, 51)][np.argmax(cv_scores)]
            self.knn_mse = max(cv_scores) * -1


    def estimate_price(self):
        """
        knn모델에 추정된 n_neighbors값을 적용해서 판매되지 않은 티켓의 가격을 추정하는 함수
        """
        # 가격을 하나도 알 수 없는 경우 종료
        if self.priced_rate == 0:
            self.best_n_neighbors = 1
            self.knn_mse = 0
            self.mean_price = self.max_price = self.min_price = 0
            return
        
        # priced_rate != 0 이면 n_neighbors값 찾기
        self.get_best_n_neighbors()

        # 찾은 n_neighbors 값으로 knn 가격추정
        X, y = self.priced_seat[['X', 'Y', 'Z']], self.priced_seat['원가격추정']
        model = KNeighborsRegressor(n_neighbors=self.best_n_neighbors, weights='distance', p=2)
        model.fit(X, y)
        y_pred = model.predict(self.unpriced_seat[['X', 'Y', 'Z']]).round(-2)
        self.df.loc[self.df['원가격추정']==0, '원가격추정'] = y_pred
        
        self.mean_price = self.df['원가격추정'].mean().round(2)
        self.max_price = int(self.df['원가격추정'].max())
        self.min_price = int(self.df['원가격추정'].min())

    def estimate_cluster_kmeans(self):
        # 중복되는 값이 없도록 난수를 더해서 노이즈 만들기
        self.df['rand'] = np.random.rand(self.df.shape[0])
        self.df['원가격추정_rand'] = self.df['원가격추정'] + self.df['rand']

        # 군집화 모델 생성 (K-means)
        X = self.df[['원가격추정_rand']]
        inertia = []
        k_range = range(1, 11)
        for k in k_range:
            kmeans = KMeans(n_clusters=k, random_state=42, n_init='auto')
            kmeans.fit(X)
            inertia.append(kmeans.inertia_)
        
        # 적절한 K값 찾기 (elbow point)        
        kneedle = KneeLocator(k_range[1:], inertia[1:], curve='convex', direction='decreasing')
        self.best_k = kneedle.elbow

        # 찾은 K값을 적용해서 원등급 추정하기
        kmeans = KMeans(n_clusters=self.best_k, random_state=42, n_init='auto')
        self.df['원등급추정'] = kmeans.fit_predict(X)
        cluster_means = self.df.groupby('원등급추정').agg({'seat':'count', '원가격추정':'mean', '예매여부':'mean'}).reset_index()
        cluster_means = cluster_means.sort_values(by='원가격추정')
        grade_mapping = {grade: idx for idx, grade in enumerate(cluster_means['원등급추정'])}
        self.df = self.df.drop(['원가격추정_rand', 'rand'], axis=1)

        # 원등급, 등급별 가격의 비율, 군집분석의 실루엣점수
        self.df['원등급추정'] = (self.df['원등급추정'].map(grade_mapping) + 1)
        self.class_price_ratio = list((cluster_means['원가격추정'] / min(cluster_means['원가격추정'])).round(2))
        self.class_price = list(cluster_means['원가격추정'].round().astype(int))
        self.class_seats_cnt = list(cluster_means['seat'])
        self.class_booked_ratio = list(cluster_means['예매여부'].round(4))
        self.silhouette_score = silhouette_score(self.df[['원가격추정']], self.df['원등급추정'])


    def px3dscatter(self, col_name):
        # 결과 시각화
        fig = px.scatter_3d(self.df, x='X', y='Y', z='Z', color=col_name, 
                            hover_name='seat', hover_data=['예매여부', '원가격추정', '원등급추정'],
                            width=800, height=600)
        fig.update_traces(marker={'size': 1})
        fig.show()

### 4. 원가격/등급 추정 군집분석 결과

In [62]:
# 전체 데이터를 공연시간을 기준으로 공연별 분할
공연시간_list = sorted(df['전체공연시간'].unique())
공연별_df_list = [df[df['전체공연시간'] == 공연시간] for 공연시간 in 공연시간_list]

# 공연별 군집분석 결과 컬럼
instance_cnt_list = []
perform_time_list = []
priced_rate_list = []
booked_rate_list = []
best_n_neighbors_list = []
knn_mse_list = []
best_k_list = []
silhouette_score_list = []
mean_price_list = []
max_price_list = []
min_price_list = []
class_price_ratio_list = []
class_price_list = []
class_seats_cnt_list = []
class_booked_ratio_list = []
df_list = []

In [63]:
# class > 군집분석 적용
for 공연별_df in tqdm(공연별_df_list):
    p = Performance(공연별_df)
    p.estimate_price() # knn => 원가격 추정
    p.estimate_cluster_kmeans() # k-means => 원등급 추정

    instance_cnt_list.append(p.instance_cnt)
    perform_time_list.append(p.perform_time)
    priced_rate_list.append(p.priced_rate)
    booked_rate_list.append(p.booked_rate)
    best_n_neighbors_list.append(p.best_n_neighbors)
    knn_mse_list.append(p.knn_mse)
    best_k_list.append(p.best_k)
    silhouette_score_list.append(p.silhouette_score)
    mean_price_list.append(p.mean_price)
    max_price_list.append(p.max_price)
    min_price_list.append(p.min_price)
    class_price_ratio_list.append(p.class_price_ratio)
    class_price_list.append(p.class_price)
    class_seats_cnt_list.append(p.class_seats_cnt)
    class_booked_ratio_list.append(p.class_booked_ratio)
    df_list.append(p.df)

# 군집결과 데이터프레임으로 합치기
df_result = pd.concat(df_list, axis=0)
df_summary = pd.DataFrame({
    '공연시간' : perform_time_list,
    '금액명시비율' : priced_rate_list,
    '예약율' : booked_rate_list,
    'knn_n_neighbors' : best_n_neighbors_list,
    'knn_mse' : knn_mse_list,
    'kmeans_군집수' : best_k_list,
    'kmeans_실루엣' : silhouette_score_list,
    '평균가격' : mean_price_list,
    '최소가격' : min_price_list,
    '최대가격' : max_price_list,
    '등급별가격비율' : class_price_ratio_list,
    '등급별가격' : class_price_list,
    '등급별좌석수' : class_seats_cnt_list,
    '등급별예매율' : class_booked_ratio_list
})

100%|██████████| 162/162 [05:35<00:00,  2.07s/it]


### 5. 결과 저장

* 총 162개의 클래식공연 중 11개의 공연은 가격이 전혀 명시되지 않았음.  => 해당 공연의 데이터에서는 원가격, 원등급 추정이 불가함.    
* 앞으로의 EDA를 위해 151개의 공연 데이터만 사용하기로 함.

#### 5-1) 공연별 요약 데이터 (df_summary)

In [64]:
# 공연별 (추정된) 가격, 등급 정보 요약 (군집분석 불가한 11개 공연 데이터는 제외)
공연별_원가격추정군집분석_결과요약 = df_summary.loc[df_summary['금액명시비율']!=0].reset_index().drop('index', axis=1)
공연별_원가격추정군집분석_결과요약.to_csv('공연별_원가격추정군집분석_결과요약.csv', index=False)
공연별_원가격추정군집분석_결과요약

Unnamed: 0,공연시간,금액명시비율,예약율,knn_n_neighbors,knn_mse,kmeans_군집수,kmeans_실루엣,평균가격,최소가격,최대가격,등급별가격비율,등급별가격,등급별좌석수,등급별예매율
0,2018-11-25 17:00:00,0.026,0.620,2,3.125890e+08,4,0.912130,68896.33,30000,100000,"[1.0, 1.41, 1.97, 2.79]","[35800, 50444, 70535, 99826]","[261, 985, 414, 845]","[0.1724, 0.3645, 0.7705, 0.9834]"
1,2018-12-08 17:00:00,0.021,0.862,2,5.022582e+07,4,0.898037,59729.06,20000,100000,"[1.0, 1.67, 2.46, 3.37]","[29672, 49614, 72999, 100000]","[567, 1021, 354, 563]","[0.8907, 0.8364, 0.887, 0.8632]"
2,2019-01-11 20:00:00,0.386,0.636,10,2.070638e+08,3,0.811674,97404.03,14000,140000,"[1.0, 1.61, 2.19]","[62788, 101197, 137655]","[927, 862, 716]","[0.5102, 0.6195, 0.817]"
3,2019-02-05 20:00:00,0.023,0.858,1,6.707500e+08,4,0.830528,72728.54,15000,150000,"[1.0, 2.3, 3.98, 5.97]","[25106, 57625, 100000, 150000]","[708, 821, 586, 390]","[0.6412, 0.9074, 0.9573, 1.0]"
4,2019-02-16 17:00:00,0.157,0.476,12,3.252762e+07,4,0.692012,28535.53,5000,50000,"[1.0, 2.0, 3.11, 4.55]","[10657, 21324, 33131, 48502]","[542, 770, 558, 635]","[0.5129, 0.4558, 0.4068, 0.5291]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
146,2023-05-09 19:30:00,0.541,0.618,33,1.791959e+08,4,0.782476,83675.69,40000,110000,"[1.0, 1.57, 2.07, 2.49]","[43626, 68560, 90282, 108622]","[457, 492, 713, 843]","[0.5908, 0.5976, 0.5063, 0.7402]"
147,2023-05-21 17:00:00,0.308,0.766,48,4.229043e+08,4,0.616708,59994.25,5000,90000,"[1.0, 2.06, 2.9, 3.67]","[23213, 47719, 67209, 85184]","[463, 394, 1093, 555]","[0.6523, 0.8096, 0.7109, 0.9369]"
148,2023-05-23 19:30:00,0.565,0.678,1,1.959048e+08,4,0.997531,81084.31,29400,110000,"[1.0, 2.0, 3.0, 3.67]","[29945, 60000, 89975, 109964]","[439, 468, 692, 906]","[0.5011, 0.6368, 0.7428, 0.7351]"
149,2023-06-01 19:30:00,0.365,0.372,8,5.249261e+08,5,0.729774,108115.53,10000,180000,"[1.0, 3.85, 6.08, 9.01, 11.36]","[15624, 60193, 95030, 140729, 177518]","[191, 660, 596, 444, 614]","[0.5131, 0.3667, 0.3003, 0.2545, 0.487]"


#### 5-2) 공연별 결과 데이터

In [65]:
# 전체공연좌석별 예매 데이터
공연별_결과_list = []
for 공연시간 in list(df_summary.loc[df_summary['금액명시비율']!=0, '공연시간']):
    공연별_결과_list.append(df_result[df_result['전체공연시간'] == 공연시간])

df_result = pd.concat(공연별_결과_list, axis=0, ignore_index=True)

In [79]:
print(f"""df_result 
memory usage: {round(df_result.memory_usage(deep=False).sum() / 1024**2, 1)}+ MB""")

df_result 
memory usage: 109.7+ MB


In [80]:
class DataExploratioin:
    '''
    데이터 탐색 시 사용 가능한 Class 

    기존 존재하는 프레임워크들을 이용하여 자주 이용하는 프레임워크들을 활용하여 나만의 분석 툴을 만들려고 함 

    데이터 요약, 결측값 처리 등의 내용이 담겨있는 class 
    '''

    def __init__(self, data):
        self.data = data

    def summarize(self):
        '''
        데이터를 초창기에 요약해주는 method
        '''

        cols = self.data.columns

        size = round(sys.getsizeof(self.data) / 1024 ** 2, 2)

        print(f'data size : {size}MB')

        self.result = pd.DataFrame()

        self.result['Dtype'] = self.data.dtypes.values
        self.result['Count'] = self.data.count().values
        self.result['Nunique'] = self.data.nunique().values
        self.result['Missing value'] = self.data.isna().sum().values
        self.result['Missing %'] = [str(round(
            missing / len(self.data), 2) * 100) + '%' for missing in self.result['Missing value']]
        self.result['Most Freq Value'] = self.data.mode().iloc[0].values

        freq_prop = []

        for i, col in enumerate(cols):

            raw_data = self.data.loc[~self.data[col].isna(), col]
            freq_value = self.result['Most Freq Value'].iloc[i]

            prop = np.mean(
                np.array(raw_data == freq_value)
            )

            prop_str = str(round(np.mean(prop) * 100, 1)) + '%'

            if prop_str == 'nan%':
                freq_prop.append(self.result['Missing %'].iloc[i])
            else:
                freq_prop.append(prop_str)

        self.result['Most Freq Value %'] = freq_prop

        self.result['Min'] = self.data.describe(include='all').T['min'].values
        self.result['Max'] = self.data.describe(include='all').T['max'].values
        self.result['Mean'] = self.data.describe(
            include='all').T['mean'].values
        self.result['Median'] = self.data.describe(
            include='all').T['50%'].values
        
        memory = (self.data.memory_usage(deep = True) // 1024 **2).values[1:] # index 의 usage 는 제외하고 보자 

        
        self.result['MB'] = [str(m) + ' mb' for m in memory]
        self.result = self.result.set_index(cols)

        self.result = self.result.fillna('-')

        display(self.result)
    
    
    def progress_bar(self,iterable, total_blocks = 10):
        
        total_items = len(iterable)
        block_size = total_items // total_blocks
        
        for i, item in enumerate(iterable, start=1):
            if i % block_size == 0 or i == total_items:
                progress = (i / total_items) * 100
                blocks = int(progress / (100 / total_blocks))
                empty_blocks = total_blocks - blocks
                progress_bar = '■' * blocks + '▢' * empty_blocks
                print(f"\rProgress: [{progress_bar}] {progress:.2f}%", end='', flush=True)
            yield item
            time.sleep(0.0000001)
    
    def reduce_size(self):
                
        original_size = round(sys.getsizeof(self.data) / 1024 ** 2,2)
        
        df = self.data.copy()
        
        for col in self.progress_bar(df.columns):
            
            dtp = df[col].dtype
            
            if dtp == 'object':
                df[col] = df[col].astype('category')
            
            else: # numeric type이면 
                if df[col].min() >= 0 : # 부호가 없다면 unit 으로 변경해줘도 된다.
                    max_value = max(df[col])
                    
                    bits = [8,16,32,64]
                    
                    for bit in bits: # 최소한의 비트로 표현 될 수 있게 dtype 변경 
                        if max_value < 2 ** bit:
                            # 결측치가 있는 경우 astype 으로 변경하지 못하니 결측치를 채워준 후 변경하고 다시 결측치를 채우자 
                            df[col] = df[col].fillna(2 ** bit - 1)
                            df[col] = df[col].astype(f'uint{bit}')
                            df[col] = df[col].replace(2 ** bit - 1, np.NaN)
                            break
                        
                else: # 부호가 있다면 int type 으로 바꿔주자 
                    
                    max_value = max(abs(min(df[col])), max(df[col]))
                    
                    bits = [8,16,32,64]
                    
                    for bit in bits:
                        if max_value < 2 ** bit:
                            df[col] = df[col].fillna(2 ** bit - 1)
                            df[col] = df[col].astype(f'int{bit}')
                            df[col] = df[col].replace(2 ** bit - 1, np.NaN)
                            break
                        
        print('\n')
                        
        after_size = round(sys.getsizeof(df) / 1024 ** 2,2)
        
        # 바꾼 후 결과 보여주기 
        after = DataExploratioin(df)
        after.summarize()
        
        print(f'\n {original_size}MB -> {after_size}MB')
            
        return df

In [81]:
df_result = DataExploratioin(df_result).reduce_size()
df_result.to_csv('전체공연_원가격원등급추정결과.csv', index=False)

Progress: [■■■■■■■■■■] 100.00%

data size : 32.66MB


Unnamed: 0,Dtype,Count,Nunique,Missing value,Missing %,Most Freq Value,Most Freq Value %,Min,Max,Mean,Median,MB
seat,category,378255,2505,0,0.0%,1층 A블록10열 1,0.0%,-,-,-,-,1 mb
층,category,378255,4,0,0.0%,1층,49.3%,-,-,-,-,0 mb
블록,category,378255,11,0,0.0%,C블록,16.8%,-,-,-,-,0 mb
열,uint8,378255,22,0,0.0%,4.0,10.7%,1.0,22.0,7.928543,6.0,0 mb
넘버,uint8,378255,37,0,0.0%,1.0,8.3%,1.0,37.0,7.326946,7.0,0 mb
X,int16,378255,1705,0,0.0%,0.0,0.7%,-1900.0,1900.0,0.0,0.0,0 mb
Y,int16,378255,1011,0,0.0%,-1295.0,1.5%,-1295.0,3479.0,1702.224351,1967.0,0 mb
Z,int16,378255,64,0,0.0%,400.0,3.3%,-93.0,1643.0,496.49022,332.0,0 mb
대칭점,category,378255,2505,0,0.0%,1층 A블록10열 1,0.0%,-,-,-,-,1 mb
좌우시야각,uint8,378255,85,0,0.0%,25.0,7.9%,15.0,109.0,35.172056,30.0,0 mb



 486.7MB -> 32.66MB
