In [120]:
import pandas as pd
import os

parent_dir = 'data/raw/hotels'
files = [os.path.join(parent_dir, file) for file in os.listdir(parent_dir)]
dfs = [pd.read_csv(file, delimiter=';', encoding='utf-8') for file in files]
df_hotels = pd.concat(dfs)

df_hotels.to_csv('data/raw/raw_hotels_combined.csv', index=False)

df_hotels.shape

(211187, 9)

In [55]:
df_hotels.duplicated(df_hotels.columns[:-2]).sum()

52579

In [56]:
df_hotels.drop_duplicates(df_hotels.columns[:-2], inplace=True)

In [57]:
df_hotels.reset_index(drop=True, inplace=True)

In [62]:
def delete_columns_as_row(value):
    '현재 14개의 row에 column명이 들어가 있음. 왜 그런지는 모르겠음'
    try:
        int(value)
        return True
    except:
        return False

In [105]:
df_hotels = df_hotels[df_hotels['page_idx'].map(delete_columns_as_row)].reset_index(drop=True)

In [65]:
def strip_url(url: str):
    '호텔 url 도로명 주소 부분까지만 strip'
    if isinstance(url, str):
        to_return = url[:url.find('html') + 4]
    else:
        to_return = 'str 아님'
    return to_return

In [66]:
urls = df_hotels['url'].map(strip_url)
urls_unique = urls.unique()

In [72]:
import requests
from bs4 import BeautifulSoup
import json

data = []
for idx, url in enumerate(urls_unique):
    try:
        res = requests.get(url)
        soup = BeautifulSoup(res.content, 'html.parser')
        json_str = soup.find('script', type='application/ld+json').text
        json_dict = json.loads(json_str)
        datapoint = pd.DataFrame([[json_dict['address']['addressLocality'], json_dict['address']['postalCode'], url]], columns=['local_address', 'postal_code', 'short_url'])
        print(f'{len(urls_unique)} 중 {idx}개: {datapoint.values}', end='\r')
    except:
        datapoint = pd.DataFrame([['wrong', 'wrong', 'url']])
    data.append(datapoint)

addresses = pd.concat(data, ignore_index=True)

[['애월읍 애월해안로 255' '63045']]]]62']]6']]'63643']]

In [91]:
addresses = addresses.iloc[:,:2]
addresses = pd.concat((addresses, pd.Series(urls_unique, name='short_url')), axis=1)
addresses

Unnamed: 0,local_address,postal_code,short_url
0,성산읍 동류암로 20,63640,https://www.booking.com/hotel/kr/playce-camp-j...
1,동문로 42,63591,https://www.booking.com/hotel/kr/kenny-story-i...
2,김정문화로41번길 10-8,63566,https://www.booking.com/hotel/kr/tamara-jeju.k...
3,이어도로 684,63564,https://www.booking.com/hotel/kr/kensington-re...
4,중앙로 304,63222,https://www.booking.com/hotel/kr/ramada-jeju-c...
...,...,...,...
206,제주 구좌읍 해맞이해안로 1590-6,63362,https://www.booking.com/hotel/kr/k-turn-and-re...
207,성산읍 해맞이해안로 2614,63638,https://www.booking.com/hotel/kr/jeju-addirang...
208,태평로 353번길 14,63595,https://www.booking.com/hotel/kr/m-stay.ko.html
209,안덕면 사계남로 102,63528,https://www.booking.com/hotel/kr/sun-and-moon-...


In [93]:
df_hotels['short_url'] = urls
df_hotels = df_hotels.merge(addresses, on='short_url', how='left')
df_hotels

Unnamed: 0,url,hotel_name,region,ratings,price,start_date,end_date,page_idx,cap_idx,short_url,local_address,postal_code
0,https://www.booking.com/hotel/kr/playce-camp-j...,플레이스 캠프 제주,"성산읍, 서귀포",8.1,51146,2023-07-24 00:00:00,2023-07-25 00:00:00,0,0,https://www.booking.com/hotel/kr/playce-camp-j...,성산읍 동류암로 20,63640
1,https://www.booking.com/hotel/kr/kenny-story-i...,호텔 케니 서귀포,"서귀포시, 서귀포",8.1,37909,2023-07-24 00:00:00,2023-07-25 00:00:00,0,0,https://www.booking.com/hotel/kr/kenny-story-i...,동문로 42,63591
2,https://www.booking.com/hotel/kr/tamara-jeju.k...,타마라 제주 호텔,"서귀포시, 서귀포",8.2,34422,2023-07-24 00:00:00,2023-07-25 00:00:00,0,0,https://www.booking.com/hotel/kr/tamara-jeju.k...,김정문화로41번길 10-8,63566
3,https://www.booking.com/hotel/kr/kensington-re...,켄싱턴리조트 서귀포,서귀포,7.3,259015,2023-07-24 00:00:00,2023-07-25 00:00:00,0,0,https://www.booking.com/hotel/kr/kensington-re...,이어도로 684,63564
4,https://www.booking.com/hotel/kr/ramada-jeju-c...,라마다 제주시티호텔,"제주 시내, 제주",8.0,96512,2023-07-24 00:00:00,2023-07-25 00:00:00,0,0,https://www.booking.com/hotel/kr/ramada-jeju-c...,중앙로 304,63222
...,...,...,...,...,...,...,...,...,...,...,...,...
158603,https://www.booking.com/hotel/kr/jejupureunhot...,제주 푸른 호텔,"서귀포시, 서귀포",8.9,34314,2023-08-24 00:00:00,2023-08-25 00:00:00,3,3,https://www.booking.com/hotel/kr/jejupureunhot...,서호중로 47,63568
158604,https://www.booking.com/hotel/kr/the-island-ma...,브라운 스위트 호텔 & 리조트,"성산읍, 서귀포",7.1,52673,2023-08-24 00:00:00,2023-08-25 00:00:00,3,3,https://www.booking.com/hotel/kr/the-island-ma...,성산읍 고성오조로 94,63639
158605,https://www.booking.com/hotel/kr/ramada-plaza-...,라마다 프라자 호텔,"제주 시내, 제주",8.0,217412,2023-08-24 00:00:00,2023-08-25 00:00:00,3,3,https://www.booking.com/hotel/kr/ramada-plaza-...,탑동로 66,63165
158606,https://www.booking.com/hotel/kr/the-suites-je...,스위트 호텔 제주,"중문 해수욕장, 서귀포",8.1,167700,2023-08-24 00:00:00,2023-08-25 00:00:00,3,3,https://www.booking.com/hotel/kr/the-suites-je...,중문관광로72번길 67,63535


In [94]:
df_hotels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 158608 entries, 0 to 158607
Data columns (total 12 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   url            158608 non-null  object
 1   hotel_name     158608 non-null  object
 2   region         158608 non-null  object
 3   ratings        158608 non-null  object
 4   price          158608 non-null  object
 5   start_date     158608 non-null  object
 6   end_date       158608 non-null  object
 7   page_idx       158608 non-null  object
 8   cap_idx        158608 non-null  object
 9   short_url      158608 non-null  object
 10  local_address  158607 non-null  object
 11  postal_code    158607 non-null  object
dtypes: object(12)
memory usage: 14.5+ MB


In [None]:
df_hotels['ratings'] = df_hotels['ratings'].astype(float).map(lambda x: x/2)
df_hotels['price'] = df_hotels['price'].astype(int)
df_hotels.drop(['page_idx', 'cap_idx'], axis=1, inplace=True)
df_hotels.head()

In [117]:
df_hotels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 158607 entries, 0 to 158606
Data columns (total 12 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   url            158607 non-null  object 
 1   hotel_name     158607 non-null  object 
 2   region         158607 non-null  object 
 3   ratings        158607 non-null  float64
 4   price          158607 non-null  int32  
 5   start_date     158607 non-null  object 
 6   end_date       158607 non-null  object 
 7   page_idx       158607 non-null  int32  
 8   cap_idx        158607 non-null  int32  
 9   short_url      158607 non-null  object 
 10  local_address  158607 non-null  object 
 11  postal_code    158607 non-null  object 
dtypes: float64(1), int32(3), object(8)
memory usage: 12.7+ MB


In [119]:
df_hotels.to_csv('data/preprocessed/preprocessed_hotels_incomplete.csv', index=False)

In [1]:
import pandas as pd

raw_path = 'data/raw/raw_hotels_combined.csv'
pre_path = 'data/preprocessed/preprocessed_hotels_incomplete.csv'
df_raw = pd.read_csv(raw_path, encoding='utf-8')
df_pre = pd.read_csv(pre_path)

In [5]:
df_pre = df_pre.merge(df_raw[['url', 'cap_idx']], on='url')
df_pre

Unnamed: 0,url,hotel_name,region,ratings,price,start_date,end_date,short_url,local_address,postal_code,cap_idx
0,https://www.booking.com/hotel/kr/playce-camp-j...,플레이스 캠프 제주,"성산읍, 서귀포",4.05,51146,2023-07-24 00:00:00,2023-07-25 00:00:00,https://www.booking.com/hotel/kr/playce-camp-j...,성산읍 동류암로 20,63640,0
1,https://www.booking.com/hotel/kr/playce-camp-j...,플레이스 캠프 제주,"성산읍, 서귀포",4.05,51146,2023-07-24 00:00:00,2023-07-25 00:00:00,https://www.booking.com/hotel/kr/playce-camp-j...,성산읍 동류암로 20,63640,0
2,https://www.booking.com/hotel/kr/kenny-story-i...,호텔 케니 서귀포,"서귀포시, 서귀포",4.05,37909,2023-07-24 00:00:00,2023-07-25 00:00:00,https://www.booking.com/hotel/kr/kenny-story-i...,동문로 42,63591,0
3,https://www.booking.com/hotel/kr/kenny-story-i...,호텔 케니 서귀포,"서귀포시, 서귀포",4.05,37909,2023-07-24 00:00:00,2023-07-25 00:00:00,https://www.booking.com/hotel/kr/kenny-story-i...,동문로 42,63591,0
4,https://www.booking.com/hotel/kr/tamara-jeju.k...,타마라 제주 호텔,"서귀포시, 서귀포",4.10,34422,2023-07-24 00:00:00,2023-07-25 00:00:00,https://www.booking.com/hotel/kr/tamara-jeju.k...,김정문화로41번길 10-8,63566,0
...,...,...,...,...,...,...,...,...,...,...,...
211169,https://www.booking.com/hotel/kr/jejupureunhot...,제주 푸른 호텔,"서귀포시, 서귀포",4.45,34314,2023-08-24 00:00:00,2023-08-25 00:00:00,https://www.booking.com/hotel/kr/jejupureunhot...,서호중로 47,63568,3
211170,https://www.booking.com/hotel/kr/the-island-ma...,브라운 스위트 호텔 & 리조트,"성산읍, 서귀포",3.55,52673,2023-08-24 00:00:00,2023-08-25 00:00:00,https://www.booking.com/hotel/kr/the-island-ma...,성산읍 고성오조로 94,63639,3
211171,https://www.booking.com/hotel/kr/ramada-plaza-...,라마다 프라자 호텔,"제주 시내, 제주",4.00,217412,2023-08-24 00:00:00,2023-08-25 00:00:00,https://www.booking.com/hotel/kr/ramada-plaza-...,탑동로 66,63165,3
211172,https://www.booking.com/hotel/kr/the-suites-je...,스위트 호텔 제주,"중문 해수욕장, 서귀포",4.05,167700,2023-08-24 00:00:00,2023-08-25 00:00:00,https://www.booking.com/hotel/kr/the-suites-je...,중문관광로72번길 67,63535,3


In [7]:
df_pre.drop(['url', 'short_url', 'local_address', 'postal_code'], axis=1, inplace=True)
df_pre.head()

Unnamed: 0,hotel_name,region,ratings,price,start_date,end_date,cap_idx
0,플레이스 캠프 제주,"성산읍, 서귀포",4.05,51146,2023-07-24 00:00:00,2023-07-25 00:00:00,0
1,플레이스 캠프 제주,"성산읍, 서귀포",4.05,51146,2023-07-24 00:00:00,2023-07-25 00:00:00,0
2,호텔 케니 서귀포,"서귀포시, 서귀포",4.05,37909,2023-07-24 00:00:00,2023-07-25 00:00:00,0
3,호텔 케니 서귀포,"서귀포시, 서귀포",4.05,37909,2023-07-24 00:00:00,2023-07-25 00:00:00,0
4,타마라 제주 호텔,"서귀포시, 서귀포",4.1,34422,2023-07-24 00:00:00,2023-07-25 00:00:00,0


In [10]:
df_pre['capacity'] = df_pre['cap_idx'].astype(int).map(lambda x: x+1)
df_pre.drop('cap_idx', axis=1, inplace=True)
df_pre.head()

Unnamed: 0,hotel_name,region,ratings,price,start_date,end_date,capacity
0,플레이스 캠프 제주,"성산읍, 서귀포",4.05,51146,2023-07-24 00:00:00,2023-07-25 00:00:00,1
1,플레이스 캠프 제주,"성산읍, 서귀포",4.05,51146,2023-07-24 00:00:00,2023-07-25 00:00:00,1
2,호텔 케니 서귀포,"서귀포시, 서귀포",4.05,37909,2023-07-24 00:00:00,2023-07-25 00:00:00,1
3,호텔 케니 서귀포,"서귀포시, 서귀포",4.05,37909,2023-07-24 00:00:00,2023-07-25 00:00:00,1
4,타마라 제주 호텔,"서귀포시, 서귀포",4.1,34422,2023-07-24 00:00:00,2023-07-25 00:00:00,1


In [11]:
new_path = 'data/preprocessed/preprocessed_hotels_need_region.csv'
df_pre.to_csv(new_path, index=False)