# **검색량과 데이터 테이블 합치기**

In [1]:
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np 
import matplotlib.colors as mcolors
from matplotlib.cm import ScalarMappable
import time 
import sys 
import warnings 

# Global plotting theme: white figure/axes background, black edges, labels and
# ticks, and a light grey, mostly transparent grid.
# "malgun gothic" is presumably chosen so Korean labels render -- confirm the
# font is installed on the machine running this notebook.
rc = {
    "axes.facecolor": "#FFFFFF",
    "figure.facecolor": "#FFFFFF",
    "axes.edgecolor": "#000000",
    "grid.color": "#CCCCCC",
    "font.family": "malgun gothic",
    "axes.labelcolor": "#000000",
    "xtick.color": "#000000",
    "ytick.color": "#000000",
    "grid.alpha": 0.4,
}

# Apply the theme through seaborn.
sns.set(rc=rc)

# Turn off matplotlib's Unicode minus sign for axis tick labels.
plt.rcParams["axes.unicode_minus"] = False

# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")

In [2]:
# Main table; per its file name it holds the estimated original price / original
# grade results for all performances.
df = pd.read_parquet("C:/Users/whileduck/Desktop/Github/Concert-Hall-Price-Model/data/전체공연_원가격원등급추정결과.parquet")

# Load search-volume data for three keywords:
#   - "예술의전당 콘서트홀" (Seoul Arts Center Concert Hall)
#   - "예술의전당 클래식"   (Seoul Arts Center classical)
#   - "서울 클래식"         (Seoul classical)
# NOTE: the paths are absolute and machine-specific; adjust them before running elsewhere.

concerthall_keyword = pd.read_excel("C:/Users/whileduck/Desktop/Github/Concert-Hall-Price-Model/data/키워드사운드_예술의전당 콘서트홀_검색량.xlsx")
seoulartcenter_keyword = pd.read_excel("C:/Users/whileduck/Desktop/Github/Concert-Hall-Price-Model/data/키워드사운드_예술의전당 클래식_검색량.xlsx")
seoulclassic_keyword = pd.read_excel("C:/Users/whileduck/Desktop/Github/Concert-Hall-Price-Model/data/키워드사운드_서울 클래식_검색량.xlsx")

In [3]:
# Preview the first rows of the concert-hall keyword table to check its columns.
display(concerthall_keyword.head())

Unnamed: 0,날짜,키워드,PC 검색량,모바일 검색량,총 검색량
0,2016-01-01,예술의전당 콘서트홀,10,30,40
1,2016-01-02,예술의전당 콘서트홀,20,40,60
2,2016-01-03,예술의전당 콘서트홀,30,30,60
3,2016-01-04,예술의전당 콘서트홀,60,40,100
4,2016-01-05,예술의전당 콘서트홀,60,40,100


##### **공연 최초 예매일 전 일주일 간의 누적 검색량을 검색량으로 생각하도록 하자**

In [4]:
# Distinct pre-sale dates (rows without a pre-sale date are dropped first).
pre_open_date = df['pre_open_date'].dropna().unique().tolist()

# Distinct general-sale dates, taken only from rows that have no pre-sale date.
open_date = df['open_date'][df['pre_open_date'].isna()].unique().tolist()

# Plain concatenation: a date present in both lists appears twice.
total_open = [*pre_open_date, *open_date]

In [5]:
# Compare the number of distinct performance times with the number of collected
# ticket-open dates. total_open is a concatenation of two lists, so its length
# is not de-duplicated across pre-sale and general-sale dates.
print('유니크한 전체 공연 시간 :',df['전체공연시간'].nunique())
print('유니크한 전체 오픈 시간 : ', len(total_open))

유니크한 전체 공연 시간 : 151
유니크한 전체 오픈 시간 :  140


##### **여러 공연이 동일한 티켓 예매일을 공유하는 경우가 존재한다**

In [6]:
# Row counts per date, divided by 2505.
# NOTE(review): 2505 is presumably the number of rows per performance -- the
# outputs in this notebook show 378255 rows and 151 distinct performance times,
# and 378255 / 151 = 2505 -- so each quotient would be the number of
# performances sharing that date. Confirm every performance has 2505 rows.
display(df['pre_open_date'].value_counts() / 2505)
display(df['open_date'].value_counts() / 2505)

2022-07-30    3.0
2021-07-02    3.0
2023-02-25    2.0
2021-09-11    2.0
2021-09-19    2.0
             ... 
2021-04-24    1.0
2021-06-25    1.0
2021-06-04    1.0
2021-08-06    1.0
2023-03-25    1.0
Name: pre_open_date, Length: 72, dtype: float64

2022-07-31    3.0
2021-07-03    3.0
2018-10-29    2.0
2019-08-26    2.0
2023-02-26    2.0
             ... 
2020-10-02    1.0
2020-08-24    1.0
2020-05-24    1.0
2019-11-23    1.0
2023-03-26    1.0
Name: open_date, Length: 132, dtype: float64

# **각 공연 별로 첫 예매 시작 일주일 전의 누적 검색량을 데이터프레임에 병합하기**

In [7]:
def keyword_calculator(data, concerthall, artcenter_classic, seoul_classic, days):
    """Attach cumulative search volumes around the ticket-open date of each performance.

    Rows of ``data`` are grouped by ``'전체공연시간'``. For each group the
    reference date is the group's first ``'open_date'`` when any
    ``'pre_open_date'`` in the group is missing, otherwise its first
    ``'pre_open_date'``. The ``'총 검색량'`` values of each keyword table are
    summed over the closed window ``[reference - days, reference]`` and the
    sum is written to every row of the group.

    Both window ends are inclusive, so the window covers ``days + 1`` calendar
    days and includes the reference (ticket-open) date itself.

    Args:
        data: DataFrame with '전체공연시간', 'pre_open_date' and 'open_date'.
        concerthall: Keyword table with '날짜' and '총 검색량' columns; its
            sum is stored in '콘서트홀클래식_누적검색량'.
        artcenter_classic: Same layout; stored in '예술의전당클래식_누적검색량'.
        seoul_classic: Same layout; stored in '서울클래식_누적검색량'.
        days: Number of days to look back from the reference date.

    Returns:
        A copy of ``data`` with the three per-keyword columns added and a
        '누적검색량' column initialised to 0. The input frame is not modified.
    """
    data = data.copy()
    # Placeholder total; this function never updates it.
    data['누적검색량'] = 0

    keyword_tables = (
        ('콘서트홀클래식_누적검색량', concerthall),
        ('예술의전당클래식_누적검색량', artcenter_classic),
        ('서울클래식_누적검색량', seoul_classic),
    )
    # Parse every table's dates once, outside the per-performance loop.
    # Each table keeps its OWN date column: a mask built from one table is
    # only valid for another if both share the same index and row order.
    parsed_tables = [
        (column, pd.to_datetime(table['날짜']), table['총 검색량'])
        for column, table in keyword_tables
    ]

    # Missing performance keys never match `==`, so they are skipped up front.
    for unq_date in data['전체공연시간'].dropna().unique():
        cond = data['전체공연시간'] == unq_date
        group = data.loc[cond, ['pre_open_date', 'open_date']]

        if group['pre_open_date'].isna().any():
            # No pre-sale recorded for this performance: use the general sale date.
            base_days = group['open_date'].unique()[0]
        else:
            base_days = group['pre_open_date'].unique()[0]

        end_days = pd.to_datetime(base_days)
        if pd.notna(end_days):
            # Keep only the calendar date (YYYY-MM-DD) as the window end.
            end_days = end_days.normalize()
        # A missing reference date stays NaT, which matches no row (sum of 0).
        start_days = end_days - pd.Timedelta(days=days)

        for column, dates, volumes in parsed_tables:
            in_window = (dates >= start_days) & (dates <= end_days)
            data.loc[cond, column] = volumes[in_window].sum()

    return data
    

In [8]:
# Build the enriched table: for every performance, sum each keyword's search
# volume from 7 days before its ticket-open date through that date.
result = keyword_calculator(data = df,
                            concerthall = concerthall_keyword,
                            artcenter_classic = seoulartcenter_keyword,
                            seoul_classic = seoulclassic_keyword,
                            days = 7)

In [9]:
# Spot-check five random rows and print the overall (rows, columns) shape.
display(result.sample(5))
print(result.shape)

Unnamed: 0,seat,X,Y,Z,층,블록,열,넘버,무대까지의 거리,좌우면적시야각,...,전체거래시간,할인율,할인전가격,원가격추정,z_score,원등급추정,누적검색량,콘서트홀클래식_누적검색량,예술의전당클래식_누적검색량,서울클래식_누적검색량
266848,합창석 G블록2열 5,-250,-1095,350,합창석,G블록,2,5,1176.445919,54.200573,...,2022-08-20 10:29:00,0.0,30000,30000,0.0,1,0,1380.0,0.0,130.0
209638,2층 A블록4열 17,1220,2406,760,2층,A블록,4,17,2802.648034,28.113073,...,NaT,0.0,0,115000,0.43,3,0,880.0,20.0,60.0
280902,1층 D블록7열 2,-568,1289,12,1층,D블록,7,2,1408.647933,59.492289,...,NaT,0.0,0,150000,0.95,4,0,780.0,80.0,90.0
154746,2층 A블록7열 11,1459,2591,888,2층,A블록,7,11,3103.305657,24.715418,...,NaT,0.0,0,40000,0.86,2,0,290.0,0.0,0.0
129910,3층 N블록1열 14,-968,3336,1583,3층,N블록,1,14,3817.303891,21.967866,...,NaT,0.0,0,5000,0.0,1,0,120.0,0.0,20.0


(378255, 49)


In [10]:
# Summary statistics of the three per-keyword cumulative search-volume columns.
result[['콘서트홀클래식_누적검색량', '예술의전당클래식_누적검색량', '서울클래식_누적검색량']].describe()

Unnamed: 0,콘서트홀클래식_누적검색량,예술의전당클래식_누적검색량,서울클래식_누적검색량
count,378255.0,378255.0,378255.0
mean,891.788079,11.125828,52.18543
std,388.367934,18.397514,35.395524
min,110.0,0.0,0.0
25%,580.0,0.0,20.0
50%,880.0,0.0,40.0
75%,1140.0,20.0,70.0
max,2000.0,80.0,170.0


In [11]:
# Overall cumulative search volume: element-wise sum of the three per-keyword
# columns. A NaN in any component makes the total NaN for that row.
result['누적검색량'] = (
    result['콘서트홀클래식_누적검색량']
    .add(result['예술의전당클래식_누적검색량'])
    .add(result['서울클래식_누적검색량'])
)

**3가지 검색 키워드 중 '예술의전당 콘서트홀' 키워드의 검색량이 가장 많은 것으로 나타났다**

In [12]:
# Output directory (absolute, machine-specific path).
file_path = 'C:/Users/whileduck/Desktop/Github/Concert-Hall-Price-Model/data/'

# Persist the enriched table as parquet without writing the DataFrame index.
result.to_parquet(file_path + '키워드_검색량_추가_클래식_데이터.parquet',index = False)