# **검색량과 데이터 테이블 합치기**

In [1]:
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np 
import matplotlib.colors as mcolors
from matplotlib.cm import ScalarMappable
import time 
import sys 
import warnings 

# Global plotting theme: white figure/axes backgrounds, black axes, labels and
# ticks, and a light grey, semi-transparent grid.
rc = {
    "axes.facecolor": "#FFFFFF",
    "figure.facecolor": "#FFFFFF",
    "axes.edgecolor": "#000000",
    "grid.color": "#CCCCCC",
    # NOTE(review): presumably chosen so Korean labels render; confirm the font
    # is installed on the machine running the notebook.
    "font.family": "malgun gothic",
    "axes.labelcolor": "#000000",
    "xtick.color": "#000000",
    "ytick.color": "#000000",
    "grid.alpha": 0.4,
}

# Apply the overrides through seaborn's global theme.
sns.set(rc = rc)
# Draw negative tick labels with the ASCII hyphen-minus instead of the Unicode minus sign.
plt.rc('axes', unicode_minus = False)
# Silence every warning for the rest of the session.
warnings.filterwarnings(action = 'ignore')

In [2]:
# Base table; the file name says it is classical-concert data that includes empty seats.
df = pd.read_csv("C:/Users/whileduck/Desktop/Github/Concert-Hall-Price-Model/data/빈좌석_포함_클래식_데이터.csv",low_memory = False)

# Load the search-volume tables for three keywords

# keyword "예술의전당 콘서트홀" (Seoul Arts Center Concert Hall)
# keyword "예술의전당 클래식" (Seoul Arts Center classical)
# keyword "서울 클래식" (Seoul classical)

concerthall_keyword = pd.read_excel("C:/Users/whileduck/Desktop/Github/Concert-Hall-Price-Model/data/키워드사운드_예술의전당 콘서트홀_검색량.xlsx")
seoulartcenter_keyword = pd.read_excel("C:/Users/whileduck/Desktop/Github/Concert-Hall-Price-Model/data/키워드사운드_예술의전당 클래식_검색량.xlsx")
seoulclassic_keyword = pd.read_excel("C:/Users/whileduck/Desktop/Github/Concert-Hall-Price-Model/data/키워드사운드_서울 클래식_검색량.xlsx")

In [3]:
# Preview the first rows of the concert-hall keyword table to check its columns.
display(concerthall_keyword.head())

Unnamed: 0,날짜,키워드,PC 검색량,모바일 검색량,총 검색량
0,2016-01-01,예술의전당 콘서트홀,10,30,40
1,2016-01-02,예술의전당 콘서트홀,20,40,60
2,2016-01-03,예술의전당 콘서트홀,30,30,60
3,2016-01-04,예술의전당 콘서트홀,60,40,100
4,2016-01-05,예술의전당 콘서트홀,60,40,100


##### **공연 최초 예매일 전 일주일 간의 누적 검색량을 검색량으로 생각하도록 하자**

In [4]:
# Distinct non-missing values of the pre_open_date column.
pre_open_date = df['pre_open_date'].dropna().unique().tolist()

# Distinct open_date values taken only from rows whose pre_open_date is missing.
open_date = df['open_date'][df['pre_open_date'].isna()].unique().tolist()

# Both lists back to back; a date present in both lists appears twice.
total_open = pre_open_date + open_date

In [5]:
# Compare the number of distinct performance times with the number of collected opening dates.
print('유니크한 전체 공연 시간 :',df['전체공연시간'].nunique())
print('유니크한 전체 오픈 시간 : ', len(total_open))

유니크한 전체 공연 시간 : 162
유니크한 전체 오픈 시간 :  150


##### **공연 별로 중복된 티켓 예매일이 존재한다**

In [6]:
# Divide the row counts per date by the number of distinct seats so the result
# reads as a count per opening date instead of a raw row count.
# NOTE(review): this assumes every performance contributes exactly one row per
# seat; confirm against the source table.
n_seats = df['seat'].nunique()

display(df['pre_open_date'].value_counts() / n_seats)
display(df['open_date'].value_counts() / n_seats)

2021-07-02    3.0
2022-07-30    3.0
2021-09-19    2.0
2020-08-23    2.0
2021-09-11    2.0
             ... 
2019-12-20    1.0
2019-11-16    1.0
2019-11-29    1.0
2019-10-14    1.0
2023-03-25    1.0
Name: pre_open_date, Length: 78, dtype: float64

2021-07-03    3.0
2022-07-31    3.0
2021-09-12    3.0
2018-10-29    2.0
2022-06-26    2.0
             ... 
2019-09-28    1.0
2019-11-30    1.0
2019-08-03    1.0
2019-10-25    1.0
2023-03-26    1.0
Name: open_date, Length: 138, dtype: float64

# **각 공연 별로 첫 예매 시작 일주일 전의 누적 검색량을 데이터프레임에 병합하기**

In [7]:
def keyword_calculator(data, concerthall, artcenter_classic, seoul_classic, days):
    """Attach windowed cumulative search volumes to every performance row.

    Rows are grouped by ``전체공연시간``. For each group the reference date is
    the first ``open_date`` of the group when any of its rows has a missing
    ``pre_open_date``, otherwise the first ``pre_open_date``. For each keyword
    table, the ``총 검색량`` values whose ``날짜`` lies in the closed interval
    ``[reference - days, reference]`` are summed and written to the group's
    rows. Both ends are inclusive, so the window covers ``days + 1`` dates.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``전체공연시간``, ``pre_open_date`` and ``open_date``.
    concerthall, artcenter_classic, seoul_classic : pandas.DataFrame
        Keyword tables with a ``날짜`` column (anything ``pd.to_datetime``
        accepts) and a ``총 검색량`` column.
    days : int
        Number of days to look back from the reference date.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with ``누적검색량`` (always 0) plus
        ``콘서트홀클래식_누적검색량``, ``예술의전당클래식_누적검색량`` and
        ``서울클래식_누적검색량``. The input frame is not modified.
    """
    data = data.copy()
    # Placeholder column that is never updated below; kept so the set of
    # output columns stays the same for existing consumers.
    data['누적검색량'] = 0

    targets = (
        ('콘서트홀클래식_누적검색량', concerthall),
        ('예술의전당클래식_누적검색량', artcenter_classic),
        ('서울클래식_누적검색량', seoul_classic),
    )
    # Parse every table's dates once and keep them paired with that table's
    # own volumes: each table has to be filtered by its own dates, because
    # the tables are not guaranteed to share row order or row count.
    prepared = [
        (column, pd.to_datetime(frame['날짜']), frame['총 검색량'])
        for column, frame in targets
    ]

    for unq_date in data['전체공연시간'].unique():
        cond = data['전체공연시간'] == unq_date
        if not cond.any():
            # A missing performance time never equals itself, so there are
            # no rows to fill for it.
            continue

        group = data.loc[cond]
        if group['pre_open_date'].isna().any():
            # At least one row has no pre-sale date: use the general opening.
            base_day = group['open_date'].unique()[0]
        else:
            base_day = group['pre_open_date'].unique()[0]

        end_day = pd.to_datetime(base_day)
        if not pd.isna(end_day):
            end_day = end_day.normalize()  # compare on the calendar date only
        start_day = end_day - pd.Timedelta(days=days)

        for column, dates, volumes in prepared:
            # A missing reference date compares False everywhere, giving 0.
            in_window = (dates >= start_day) & (dates <= end_day)
            data.loc[cond, column] = volumes[in_window].sum()

    return data
    

In [8]:
# Add the three cumulative search-volume columns, looking back 7 days from
# each performance's reference opening date.
result = keyword_calculator(data = df,
                            concerthall = concerthall_keyword,
                            artcenter_classic = seoulartcenter_keyword,
                            seoul_classic = seoulclassic_keyword,
                            days = 7)

In [9]:
# Spot-check five random rows and the overall shape of the merged frame.
display(result.sample(5))
print(result.shape)

Unnamed: 0,seat,층,블록,열,넘버,X,Y,Z,대칭점,좌우시야각,...,공연연도,공연월,공연일,공연연월,전체공연시간,전체거래시간,누적검색량,콘서트홀클래식_누적검색량,예술의전당클래식_누적검색량,서울클래식_누적검색량
287696,3층 M블록2열 10,3층,M블록,2,10,579,3468,1643,3층 N블록2열 6,22,...,2022,8,2022-08-20,2022-08-01,2022-08-20 17:00:00,2022-08-15 19:35:00,0,920.0,40.0,100.0
285442,3층 E블록6열 7,3층,E블록,6,7,-721,3025,1388,3층 C블록6열 6,25,...,2022,7,2022-07-21,2022-07-01,2022-07-21 20:00:00,,0,620.0,0.0,80.0
397591,2층 B블록5열 5,2층,B블록,5,5,808,2686,802,2층 D블록5열 8,28,...,2023,5,2023-05-21,2023-05-01,2023-05-21 17:00:00,2023-05-16 15:37:00,0,820.0,80.0,20.0
151573,합창석 G블록1열 24,합창석,G블록,1,24,484,-995,300,합창석 G블록1열 5,54,...,2020,9,2020-09-12,2020-09-01,2020-09-12 17:00:00,,0,890.0,0.0,20.0
395050,2층 E블록4열 6,2층,E블록,4,6,-1333,2287,760,2층 A블록4열 14,27,...,2023,5,2023-05-09,2023-05-01,2023-05-09 19:30:00,2023-03-27 14:11:00,0,1060.0,0.0,20.0


(405810, 44)


In [10]:
# Summary statistics of the three cumulative search-volume columns.
result[['콘서트홀클래식_누적검색량', '예술의전당클래식_누적검색량', '서울클래식_누적검색량']].describe()

Unnamed: 0,콘서트홀클래식_누적검색량,예술의전당클래식_누적검색량,서울클래식_누적검색량
count,405810.0,405810.0,405810.0
mean,898.888889,11.111111,51.54321
std,384.631129,18.358591,34.651232
min,110.0,0.0,0.0
25%,600.0,0.0,20.0
50%,890.0,0.0,40.0
75%,1180.0,20.0,70.0
max,2000.0,80.0,170.0


**3가지 검색 키워드 중 '예술의전당 콘서트홀' 키워드의 누적 검색량이 가장 많은 것으로 나타났다**

In [11]:
class DataExploratioin:
    """Small exploration toolkit wrapped around a pandas DataFrame.

    Provides a per-column summary table (``summarize``), a console progress
    indicator for loops (``progress_bar``) and a lossless dtype down-casting
    routine (``reduce_size``).
    """

    def __init__(self, data):
        # The DataFrame every method operates on; it is never modified in place.
        self.data = data

    @staticmethod
    def _show(obj):
        """Render ``obj`` with the notebook ``display`` hook, or print it."""
        try:
            display(obj)
        except NameError:
            # Plain Python session: ``display`` is only injected by IPython.
            print(obj)

    @staticmethod
    def _shrink_column(series):
        """Return ``series`` in the smallest dtype that keeps every value.

        * object / string columns become ``category``;
        * integer columns, and float columns holding only finite whole
          numbers, become the narrowest fixed-width integer type that
          contains both their minimum and maximum;
        * everything else (bool, datetime, category, fractional floats,
          columns with missing values) is returned unchanged, because no
          narrower plain dtype can represent it without altering values.
        """
        dtype = series.dtype
        types = pd.api.types

        if types.is_object_dtype(dtype) or isinstance(dtype, pd.StringDtype):
            return series.astype('category')

        is_int = types.is_integer_dtype(dtype)
        is_float = types.is_float_dtype(dtype)
        if not (is_int or is_float):
            return series
        if series.empty or series.isna().any():
            # Plain integer dtypes cannot hold NaN, so leave the column as is.
            return series
        if is_float:
            values = series.to_numpy(dtype='float64')
            if not np.isfinite(values).all() or (values != np.floor(values)).any():
                return series

        # Python ints avoid overflow when comparing against 64-bit limits.
        low, high = int(series.min()), int(series.max())
        if low >= 0:
            candidates = (np.uint8, np.uint16, np.uint32, np.uint64)
        else:
            candidates = (np.int8, np.int16, np.int32, np.int64)

        for candidate in candidates:
            limits = np.iinfo(candidate)
            if limits.min <= low and high <= limits.max:
                return series.astype(candidate)
        return series

    def summarize(self):
        """Print the frame size and show one summary row per column.

        The table is stored on ``self.result`` and contains dtype, non-null
        count, distinct count, missing count and percentage, most frequent
        value and its share among non-missing values, min / max / mean /
        median where ``describe`` provides them, and memory in whole MB.
        Cells without a value are shown as ``'-'``.
        """
        data = self.data
        cols = data.columns
        n_rows = len(data)

        size = round(sys.getsizeof(data) / 1024 ** 2, 2)
        print(f'data size : {size}MB')

        missing = data.isna().sum().values
        # Round the percentage itself so no float artefacts leak into the text.
        missing_pct = [
            f'{round(float(m) / n_rows * 100, 1)}%' if n_rows else '0.0%'
            for m in missing
        ]

        modes = data.mode()
        if len(modes):
            most_freq = modes.iloc[0].values
        else:
            # No rows at all: there is no most frequent value to report.
            most_freq = np.full(len(cols), np.nan, dtype=object)

        freq_prop = []
        for i, col in enumerate(cols):
            present = data[col].dropna()
            if present.empty:
                # Column is entirely missing: fall back to the missing share.
                freq_prop.append(missing_pct[i])
                continue
            share = float(np.mean(np.array(present == most_freq[i])))
            freq_prop.append(f'{round(share * 100, 1)}%')

        # One describe() call; reindex so the statistics exist even when the
        # frame has no numeric column.
        stats = data.describe(include='all').T.reindex(
            columns=['min', 'max', 'mean', '50%'])

        # Drop the first entry, which is the index's own memory usage.
        memory = (data.memory_usage(deep = True) // 1024 ** 2).values[1:]

        result = pd.DataFrame(
            {
                'Dtype': data.dtypes.values,
                'Count': data.count().values,
                'Nunique': data.nunique().values,
                'Missing value': missing,
                'Missing %': missing_pct,
                'Most Freq Value': most_freq,
                'Most Freq Value %': freq_prop,
                'Min': stats['min'].values,
                'Max': stats['max'].values,
                'Mean': stats['mean'].values,
                'Median': stats['50%'].values,
                'MB': [f'{m} mb' for m in memory],
            },
            index=cols,
        )

        self.result = result.fillna('-')
        self._show(self.result)

    def progress_bar(self, iterable, total_blocks = 10):
        """Yield the items of ``iterable`` while printing a text progress bar.

        ``iterable`` must support ``len``. The bar is redrawn on the same
        line roughly once per block and always for the last item.
        """
        total_items = len(iterable)
        # At least 1 so that fewer items than blocks does not divide by zero.
        block_size = max(1, total_items // total_blocks)

        for i, item in enumerate(iterable, start=1):
            if i % block_size == 0 or i == total_items:
                progress = (i / total_items) * 100
                # Integer arithmetic keeps the block count free of float error.
                blocks = (i * total_blocks) // total_items
                progress_bar = '■' * blocks + '▢' * (total_blocks - blocks)
                print(f"\rProgress: [{progress_bar}] {progress:.2f}%", end='', flush=True)
            yield item

    def reduce_size(self):
        """Return a copy of the data with every column down-cast where lossless.

        See ``_shrink_column`` for the per-column rules. After converting,
        the summary of the new frame and the before/after size are printed.
        ``self.data`` itself is left untouched.
        """
        original_size = round(sys.getsizeof(self.data) / 1024 ** 2, 2)

        df = self.data.copy()

        for col in self.progress_bar(df.columns):
            df[col] = self._shrink_column(df[col])

        print('\n')

        after_size = round(sys.getsizeof(df) / 1024 ** 2, 2)

        # Show the summary of the converted frame.
        after = type(self)(df)
        after.summarize()

        print(f'\n {original_size}MB -> {after_size}MB')

        return df

In [14]:
# Down-cast column dtypes to shrink the frame; reduce_size also prints a
# summary of the converted data and the before/after size.
result = DataExploratioin(result).reduce_size()

Progress: [■■■■■■■■■■] 100.00%

data size : 37.76MB


Unnamed: 0,Dtype,Count,Nunique,Missing value,Missing %,Most Freq Value,Most Freq Value %,Min,Max,Mean,Median,MB
seat,category,405810,2505,0,0.0%,1층 A블록10열 1,0.0%,-,-,-,-,1 mb
층,category,405810,4,0,0.0%,1층,49.3%,-,-,-,-,0 mb
블록,category,405810,11,0,0.0%,C블록,16.8%,-,-,-,-,0 mb
열,uint8,405810,22,0,0.0%,4.0,10.7%,1.0,22.0,7.928543,6.0,0 mb
넘버,uint8,405810,37,0,0.0%,1.0,8.3%,1.0,37.0,7.326946,7.0,0 mb
X,int16,405810,1705,0,0.0%,0.0,0.7%,-1900.0,1900.0,0.0,0.0,0 mb
Y,int16,405810,1011,0,0.0%,-1295.0,1.5%,-1295.0,3479.0,1702.224351,1967.0,0 mb
Z,int16,405810,64,0,0.0%,400.0,3.3%,-93.0,1643.0,496.49022,332.0,0 mb
대칭점,category,405810,2505,0,0.0%,1층 A블록10열 1,0.0%,-,-,-,-,1 mb
좌우시야각,uint8,405810,85,0,0.0%,25.0,7.9%,15.0,109.0,35.172056,30.0,0 mb



 596.22MB -> 37.76MB


In [13]:
# Directory that receives the enriched data set.
file_path = 'C:/Users/whileduck/Desktop/Github/Concert-Hall-Price-Model/data/'

# Write the frame without the index column.
result.to_csv(file_path + '키워드_검색량_추가_클래식_데이터.csv',index = False)