# **검색량과 데이터 테이블 합치기**

In [1]:
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np 
import matplotlib.colors as mcolors
from matplotlib.cm import ScalarMappable

# Plot theme shared by every figure in this notebook: white background,
# black axes/labels/ticks, faint grey grid, and the "malgun gothic" font family.
rc = {
    # backgrounds
    "figure.facecolor": "#FFFFFF",
    "axes.facecolor": "#FFFFFF",
    # axes, labels and ticks
    "axes.edgecolor": "#000000",
    "axes.labelcolor": "#000000",
    "xtick.color": "#000000",
    "ytick.color": "#000000",
    # grid
    "grid.color": "#CCCCCC",
    "grid.alpha": 0.4,
    # text
    "font.family": "malgun gothic",
}

sns.set(rc=rc)

# Must come after sns.set(), which rewrites rcParams: draw minus signs with
# the ASCII hyphen instead of the Unicode minus glyph.
plt.rcParams["axes.unicode_minus"] = False

In [22]:
# Directory that holds every input file of this notebook. It is declared once
# so the four loads below share a single location instead of repeating the
# absolute path in each literal.
file_path = 'C:/Users/whileduck/Desktop/Github/Concert-Hall-Price-Model/data/'

# Classical-concert table (file name: "classical data including empty seats").
df = pd.read_csv(file_path + '빈좌석_포함_클래식_데이터.csv', low_memory=False)

# Search-volume data for three keywords is added on top of that table:
#   - "예술의전당 콘서트홀" (Seoul Arts Center concert hall)
#   - "예술의전당 클래식"   (Seoul Arts Center classical)
#   - "서울 클래식"         (Seoul classical)
concerthall_keyword = pd.read_excel(file_path + '키워드사운드_예술의전당 콘서트홀_검색량.xlsx')
seoulartcenter_keyword = pd.read_excel(file_path + '키워드사운드_예술의전당 클래식_검색량.xlsx')
seoulclassic_keyword = pd.read_excel(file_path + '키워드사운드_서울 클래식_검색량.xlsx')

In [23]:
# Preview the first five rows of the concert-hall keyword table.
display(concerthall_keyword.head(n=5))

Unnamed: 0,날짜,키워드,PC 검색량,모바일 검색량,총 검색량
0,2016-01-01,예술의전당 콘서트홀,10,30,40
1,2016-01-02,예술의전당 콘서트홀,20,40,60
2,2016-01-03,예술의전당 콘서트홀,30,30,60
3,2016-01-04,예술의전당 콘서트홀,60,40,100
4,2016-01-05,예술의전당 콘서트홀,60,40,100


##### **공연 최초 예매일 전 일주일 간의 누적 검색량을 검색량으로 생각하도록 하자**

In [24]:
# Distinct pre-sale opening dates (rows that have one).
pre_open_date = df['pre_open_date'].dropna().unique().tolist()

# Distinct general-sale opening dates, taken only from rows without a pre-sale date.
open_date = df['open_date'][df['pre_open_date'].isna()].unique().tolist()

# Plain concatenation of the two lists: each list is unique on its own, but a
# date that occurs in both of them appears twice here.
total_open = [*pre_open_date, *open_date]

In [25]:
# Compare the number of distinct performance times with the number of
# collected opening dates.
n_performance_times = df['전체공연시간'].nunique()
n_open_dates = len(total_open)

print('유니크한 전체 공연 시간 :', n_performance_times)
print('유니크한 전체 오픈 시간 : ', n_open_dates)

유니크한 전체 공연 시간 : 162
유니크한 전체 오픈 시간 :  150


##### **공연 별로 중복된 티켓 예매일이 존재한다**

In [26]:
# Row counts per opening date, scaled down by 2505.
# NOTE(review): 2505 looks like the number of rows per performance
# (405810 rows / 162 performance times) - confirm before reusing elsewhere.
display(df['pre_open_date'].value_counts().div(2505))
display(df['open_date'].value_counts().div(2505))

2021-07-02    3.0
2022-07-30    3.0
2021-09-19    2.0
2020-08-23    2.0
2021-09-11    2.0
             ... 
2019-12-20    1.0
2019-11-16    1.0
2019-11-29    1.0
2019-10-14    1.0
2023-03-25    1.0
Name: pre_open_date, Length: 78, dtype: float64

2021-07-03    3.0
2022-07-31    3.0
2021-09-12    3.0
2018-10-29    2.0
2022-06-26    2.0
             ... 
2019-09-28    1.0
2019-11-30    1.0
2019-08-03    1.0
2019-10-25    1.0
2023-03-26    1.0
Name: open_date, Length: 138, dtype: float64

# **각 공연 별로 첫 예매 시작 일주일 전의 누적 검색량을 데이터프레임에 병합하기**

In [37]:
def keyword_calculator(data, concerthall, artcenter_classic, seoul_classic, days):
    """Attach cumulative keyword search volumes to the rows of each performance.

    Rows sharing one value of ``전체공연시간`` are treated as one performance.
    Its reference date is the first ``open_date`` of those rows when any of
    them has a missing ``pre_open_date``, otherwise the first
    ``pre_open_date``. For every keyword table, ``총 검색량`` is summed over the
    rows whose ``날짜`` lies between ``reference - days`` and ``reference``
    (both ends inclusive), and the total is written to all rows of the
    performance.

    NOTE(review): because both ends are inclusive the window covers
    ``days + 1`` calendar dates and includes the opening day itself - confirm
    that this is the intended definition of "the week before opening".

    Args:
        data: DataFrame with the columns ``전체공연시간``, ``pre_open_date``
            and ``open_date``. It is not modified.
        concerthall: Keyword table with ``날짜`` and ``총 검색량`` columns; its
            totals go to ``콘서트홀클래식_누적검색량``.
        artcenter_classic: Same layout; totals go to
            ``예술의전당클래식_누적검색량``.
        seoul_classic: Same layout; totals go to ``서울클래식_누적검색량``.
        days: Number of days to look back from the reference date.

    Returns:
        A copy of ``data`` with the three cumulative-volume columns added,
        plus the legacy ``누적검색량`` column filled with 0.
    """
    data = data.copy()
    # Never filled by this function; kept only so the returned columns stay
    # the same for downstream consumers.
    data['누적검색량'] = 0

    keyword_tables = {
        '콘서트홀클래식_누적검색량': concerthall,
        '예술의전당클래식_누적검색량': artcenter_classic,
        '서울클래식_누적검색량': seoul_classic,
    }
    # Parse every table's dates once, outside the loop. Each table keeps its
    # OWN date series: a boolean mask built from one table must not be reused
    # on another, because boolean Series indexing aligns on index labels, not
    # on dates, and would select the wrong rows when the tables differ.
    prepared = {
        column: (pd.to_datetime(table['날짜']), table['총 검색량'])
        for column, table in keyword_tables.items()
    }

    # Rows without a performance time cannot be grouped, so they are skipped
    # and keep NaN in the new columns.
    for unq_date in data['전체공연시간'].dropna().unique():
        cond = data['전체공연시간'] == unq_date

        if data.loc[cond, 'pre_open_date'].isna().any():
            # No pre-sale for (some row of) this performance: use general sale.
            base_days = data.loc[cond, 'open_date'].unique()[0]
        else:
            base_days = data.loc[cond, 'pre_open_date'].unique()[0]

        # Only the calendar date matters; drop any time-of-day component.
        end_days = pd.to_datetime(base_days).normalize()
        start_days = end_days - pd.Timedelta(days=days)

        for column, (dates, volumes) in prepared.items():
            in_window = (dates >= start_days) & (dates <= end_days)
            data.loc[cond, column] = volumes[in_window].sum()

    return data
    

In [38]:
# Add the three cumulative search-volume columns, looking back 7 days from
# each performance's first opening date.
result = keyword_calculator(
    df,
    concerthall_keyword,
    seoulartcenter_keyword,
    seoulclassic_keyword,
    days=7,
)

In [40]:
# Spot-check five random rows of the merged table, then report its dimensions.
display(result.sample(n=5))
print(result.shape)

Unnamed: 0,seat,층,블록,열,넘버,X,Y,Z,대칭점,좌우시야각,...,공연연도,공연월,공연일,공연연월,전체공연시간,전체거래시간,누적검색량,콘서트홀클래식_누적검색량,예술의전당클래식_누적검색량,서울클래식_누적검색량
309858,2층 C블록4열 7,2층,C블록,4,7,-24,2624,760,2층 C블록4열 6,32,...,2022,10,2022-10-04,2022-10-01,2022-10-04 19:30:00,2022-09-30 11:30:00,0,780.0,80.0,90.0
200559,1층 B블록4열 2,1층,B블록,4,2,929,952,-42,1층 D블록4열 9,50,...,2021,10,2021-10-13,2021-10-01,2021-10-13 19:30:00,,0,620.0,0.0,10.0
69906,3층 B블록5열 2,3층,B블록,5,2,1293,2594,1328,3층 F블록5열 6,24,...,2019,8,2019-08-25,2019-08-01,2019-08-25 17:00:00,,0,1340.0,0.0,60.0
397717,2층 E블록6열 16,2층,E블록,6,16,-1742,2146,845,2층 A블록6열 4,23,...,2023,5,2023-05-21,2023-05-01,2023-05-21 17:00:00,2023-05-15 15:34:00,0,820.0,80.0,20.0
241196,1층 E블록13열 12,1층,E블록,13,12,-1838,1618,130,1층 A블록13열 1,22,...,2021,12,2021-12-19,2021-12-01,2021-12-19 17:00:00,2021-12-15 12:16:00,0,450.0,0.0,20.0


(405810, 44)


In [44]:
# Summary statistics of the three cumulative search-volume columns.
result.loc[:, ['콘서트홀클래식_누적검색량', '예술의전당클래식_누적검색량', '서울클래식_누적검색량']].describe()

Unnamed: 0,콘서트홀클래식_누적검색량,예술의전당클래식_누적검색량,서울클래식_누적검색량
count,405810.0,405810.0,405810.0
mean,898.888889,11.111111,51.54321
std,384.631129,18.358591,34.651232
min,110.0,0.0,0.0
25%,600.0,0.0,20.0
50%,890.0,0.0,40.0
75%,1180.0,20.0,70.0
max,2000.0,80.0,170.0


**3가지 검색 키워드 중 '예술의전당 콘서트홀' 키워드의 누적 검색량이 가장 많은 것으로 나타났다**

In [12]:
# Write the table with the added search-volume columns next to the input
# data, leaving the DataFrame index out of the file.
file_path = 'C:/Users/whileduck/Desktop/Github/Concert-Hall-Price-Model/data/'
output_csv = file_path + '키워드_검색량_추가_클래식_데이터.csv'

result.to_csv(output_csv, index=False)