In [1]:
# !pip install matplotlib
# !pip install pandas
# !pip install seaborn
# !pip install numpy 



In [3]:
# 기본 패키지 불러오기 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns 
import matplotlib as mpl
from matplotlib import font_manager, rc

# 그래프를 노트북 안에 그리기 위해 설정
%matplotlib inline

# Path to the Malgun Gothic font file used for Korean text in plots.
font_path = '../../../../data/malgun.ttf'

# Register the font file with matplotlib's font manager. Reading the family
# name alone does not make the font discoverable: without this step the family
# only resolves if the font also happens to be installed system-wide.
font_manager.fontManager.addfont(font_path)

# Read the family name stored inside the font file.
font_name = font_manager.FontProperties(fname=font_path).get_name()

# Make that family the default font for every figure.
mpl.rc('font', family=font_name)

# Draw negative signs with the ASCII hyphen instead of the Unicode minus.
mpl.rc('axes', unicode_minus=False)

In [4]:
# Load the London Airbnb listings and keep only the columns used in the analysis.

path = '../../../../data/Airbnb_London/listings.csv'

# Column selection history:
#   - 22 of the 25 first-pass columns were kept, plus 1 more (listing availability).
#   - 2024-05-29: added 'number_of_reviews_ltm' and 'last_review' to see whether
#     a listing has recent reviews.
#   - 2024-05-30: added the listing URL column.
columns_selected = [
    'id', 'listing_url', 'host_id', 'host_is_superhost',
    'neighbourhood_cleansed', 'property_type', 'room_type',
    'accommodates', 'bathrooms', 'bedrooms', 'beds', 'amenities',
    'price', 'has_availability',
    'number_of_reviews', 'number_of_reviews_ltm', 'number_of_reviews_l30d',
    'last_review',
    'review_scores_rating', 'review_scores_accuracy',
    'review_scores_cleanliness', 'review_scores_checkin',
    'review_scores_communication', 'review_scores_location',
    'review_scores_value', 'reviews_per_month',
]

# Candidate columns that are currently not selected:
# 'host_response_time', 'host_response_rate', 'host_acceptance_rate',
# 'host_total_listings_count', 'host_has_profile_pic', 'host_identity_verified',
# 'minimum_nights', 'maximum_nights'

# Read the CSV, restrict it to the selected columns (in the order listed above)
# and switch to the Korean column names used by the rest of the notebook.
london_lists = (
    pd.read_csv(path)
    .loc[:, columns_selected]
    .rename(columns={
        'id': '숙소_id',
        'listing_url': '숙소_url',
        'host_id': '호스트_id',
        'host_is_superhost': '슈퍼호스트',
        'neighbourhood_cleansed': '숙소_지역',
        'property_type': '숙소_특징',
        'room_type': '숙소_유형',
        'accommodates': '수용_인원수',
        'bathrooms': '욕실수',
        'bedrooms': '침실수',
        'beds': '침대수',
        'amenities': '편의시설',
        'price': '숙소_가격',
        'has_availability': '예약_가능여부',
        'number_of_reviews': '리뷰수',
        'number_of_reviews_ltm': '12개월_리뷰수',
        'number_of_reviews_l30d': '30일_리뷰수',
        'last_review': '마지막_리뷰',
        'review_scores_rating': '리뷰점수',
        'review_scores_accuracy': '숙소_정확성_리뷰점수',
        'review_scores_cleanliness': '숙소_청결도_리뷰점수',
        'review_scores_checkin': '숙소_체크인_리뷰점수',
        'review_scores_communication': '숙소_소통_리뷰점수',
        'review_scores_location': '숙소_위치_리뷰점수',
        'review_scores_value': '숙소_가격_리뷰점수',
        'reviews_per_month': '평균_리뷰수',
    })
)

# Independent snapshots of the renamed frame, taken before any filtering.
temp = london_lists.copy()
raw = london_lists.copy()

# Display the (rows, columns) shape of the selected data.
london_lists.shape



(90852, 26)

In [5]:
# --- Reviews ---------------------------------------------------------------
# Mask of listings that have no reviews at all.
condition_review_0 = london_lists['리뷰수'].eq(0)

# Keep the zero-review listings in a separate frame.
london_lists_review_0 = london_lists.loc[condition_review_0]

# Mask of listings whose review count is not zero.
condition_review = london_lists['리뷰수'].ne(0)

# Continue with the reviewed listings only and snapshot them.
london_lists = london_lists.loc[condition_review]
temp = london_lists.copy()

# --- Room type -------------------------------------------------------------
# Only 'Entire home/apt' and 'Private room' listings are kept.
condition_room_entirehomeapt = london_lists['숙소_유형'].eq('Entire home/apt')
condition_room_privateroom = london_lists['숙소_유형'].eq('Private room')
london_lists = london_lists.loc[condition_room_entirehomeapt | condition_room_privateroom]

# --- Availability ----------------------------------------------------------
# Drop rows whose availability flag is missing.
london_lists = london_lists.loc[london_lists["예약_가능여부"].notna()]

# --- Price -----------------------------------------------------------------
# Masks for rows with / without a price.
condition_price_notnull = london_lists['숙소_가격'].notna()
condition_price_null = london_lists['숙소_가격'].isna()

# Split the frame by price availability.
london_lists_price = london_lists.loc[condition_price_notnull]
london_lists_price_null = london_lists.loc[condition_price_null]

# From here on, work with the priced listings.
london_lists = london_lists_price

# --- Remaining missing values ----------------------------------------------
# Drop rows missing the superhost flag, any of the bathroom/bedroom/bed counts,
# or any of the six per-category review scores.
london_lists = london_lists.dropna(subset=[
    '슈퍼호스트',
    '욕실수', '침실수', '침대수',
    '숙소_정확성_리뷰점수', '숙소_청결도_리뷰점수', '숙소_체크인_리뷰점수',
    '숙소_소통_리뷰점수', '숙소_위치_리뷰점수', '숙소_가격_리뷰점수',
])

# Drop listings whose 12-month review count is zero.
london_lists = london_lists.loc[london_lists['12개월_리뷰수'].ne(0)]

# Convert the price text to float: strip the leading '$' and remove the
# thousands separators.
london_lists = london_lists.assign(**{
    '숙소_가격': london_lists['숙소_가격'].str.lstrip('$').str.replace(',', '').astype('float')
})

# Drop listings priced at exactly zero.
london_lists = london_lists.loc[london_lists['숙소_가격'].ne(0.0)]

london_lists.shape

(38769, 26)

이상치 처리

In [6]:
#수용 인원수 이상치 제거 

import pandas as pd


# 이상치를 제거하는 함수 정의
def remove_price_outliers(df, price_column, type):
    """Drop upper-tail outliers of ``price_column`` for one room type, in place.

    The upper fence is ``Q3 + k * IQR``, where the quartiles are computed over
    the whole of ``df`` (all room types), and ``k`` is 3 for the price column
    '숙소_가격' and 1.5 for any other column. Only rows whose '숙소_유형' equals
    ``type`` and whose value lies above the fence are removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a '숙소_유형' column and ``price_column``. It is MODIFIED IN
        PLACE; existing callers discard the return value and rely on this.
    price_column : str
        Name of the numeric column to inspect.
    type : str
        Room type to clean, compared against '숙소_유형'. The name shadows the
        builtin but is kept so keyword callers continue to work.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object after the rows were dropped. (This used to be
        ``None``, because ``DataFrame.drop(..., inplace=True)`` returns None.)
    """
    room_type = type  # local alias so the builtin is not shadowed any further

    # Price uses a wider fence (3 * IQR) than the other columns (1.5 * IQR).
    iqr_multiplier = 3 if price_column == '숙소_가격' else 1.5

    q1 = df[price_column].quantile(0.25)
    q3 = df[price_column].quantile(0.75)
    upper_bound = q3 + iqr_multiplier * (q3 - q1)

    # Rows of the requested room type that lie above the fence.
    outlier_mask = (df['숙소_유형'] == room_type) & (df[price_column] > upper_bound)

    # Rows are dropped by index label, as before.
    df.drop(index=df.loc[outlier_mask].index, inplace=True)
    return df
# Remove upper outliers from `london_lists` in place: guest capacity first,
# then price, each handled per room type. The call order is kept because the
# quartiles are recomputed on the frame as it shrinks.
for column in ('수용_인원수', '숙소_가격'):
    for room_type in ('Entire home/apt', 'Private room'):
        remove_price_outliers(london_lists, column, room_type)
london_lists.shape  # expected to be 36407 rows


(36407, 26)

In [7]:
# Display the listing URL column of the cleaned frame (input for the crawler below).
london_lists['숙소_url']

0                     https://www.airbnb.com/rooms/312761
1                      https://www.airbnb.com/rooms/13913
2                      https://www.airbnb.com/rooms/15400
3                     https://www.airbnb.com/rooms/159736
4                     https://www.airbnb.com/rooms/165336
                               ...                       
90451    https://www.airbnb.com/rooms/1112236129628471308
90568    https://www.airbnb.com/rooms/1112524736745357245
90590    https://www.airbnb.com/rooms/1113042462528003601
90648    https://www.airbnb.com/rooms/1113966989586525761
90735    https://www.airbnb.com/rooms/1113540860743654582
Name: 숙소_url, Length: 36407, dtype: object

크롤링

리뷰 크롤링

In [9]:
import pandas as pd
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import time
import random
from bs4 import BeautifulSoup
import re



# Fix (pin) the web browser window size
# driver.set_window_size(1920, 1080)  

def airbnb_reviews(url):
    """Scrape the recent reviews of a single Airbnb listing page.

    Opens ``url`` in a new Chrome session, closes the translation pop-up if one
    shows up, opens the "Show all ... reviews" modal, scrolls the modal until
    its height stops growing, and parses the loaded reviews.

    Parameters
    ----------
    url : str
        Listing URL. The button lookup and the date filter match English text
        ('Show all', 'reviews', 'April 2024', ...), so the page has to be
        rendered in English for anything to be found.

    Returns
    -------
    dict
        ``{review_text: date_label}`` for reviews whose date label contains
        "April 2024", "May 2024" or "weeks ago". Reviews with identical text
        collapse into one entry because the text is the key. An empty dict is
        returned when the review button or the review modal cannot be found,
        or when the review panel is missing from the parsed page.
    """
    random_sec = random.uniform(1, 5)

    # Raw string: '\M' is not a valid escape sequence; the value is unchanged.
    chrome_driver_path = r'C:/Users\Master/.wdm/drivers/chromedriver/win64/125.0.6422.141/chromedriver-win32/chromedriver.exe'
    chrome_service = Service(chrome_driver_path)

    # Chrome options (required for the session to start reliably).
    chrome_options = Options()
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")

    chrome_options.add_experimental_option("detach", True)
    chrome_options.add_experimental_option("excludeSwitches", ['enable-logging'])
    driver = webdriver.Chrome(service=chrome_service, options=chrome_options)

    # Scrolls the review panel's own content to the bottom and fires a scroll
    # event so that the next batch of reviews is requested.
    scroll_to_bottom_js = """
        arguments[0].scrollTop = arguments[0].scrollHeight;
        arguments[0].dispatchEvent(new Event('scroll'));
    """

    # The browser is always closed in `finally`, including on unexpected
    # errors, so a long crawl does not accumulate orphaned Chrome processes.
    try:
        driver.set_window_size(1920, 1080)
        driver.get(url)

        # Random pause between page loads.
        time.sleep(random_sec)

        # Close the translation pop-up. Best effort: it is not always shown.
        try:
            translation_modal_close_button = WebDriverWait(driver, random_sec).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, 'button[aria-label="Close"]'))
            )
            translation_modal_close_button.click()
            time.sleep(1)
        except Exception:
            print("번역 모달 창 실패")

        # Click the "Show all N reviews" button.
        try:
            review_button = WebDriverWait(driver, random_sec).until(
                EC.element_to_be_clickable((By.XPATH, "//button[contains(text(), 'Show all') and contains(text(), 'reviews')]" ))
            )
            review_button.click()
        except Exception:
            # Return an empty result instead of calling exit(): in a notebook
            # exit() shuts the kernel down and would abort a whole crawl.
            print('리뷰 버튼 실패')
            return {}

        # Wait for the review modal's scrollable panel.
        try:
            scroll_panel = WebDriverWait(driver, random_sec).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, 'div[data-testid="pdp-reviews-modal-scrollable-panel"]'))
            )
        except Exception:
            print('리뷰 모달창 실패')
            return {}

        # Initial forced scroll.
        driver.execute_script(scroll_to_bottom_js, scroll_panel)
        time.sleep(1)

        # Keep scrolling until the panel height stops growing, i.e. no more
        # reviews are being appended.
        last_height = driver.execute_script("return arguments[0].scrollHeight", scroll_panel)
        while True:
            # scrollIntoView(false) only aligns the panel inside its parent and
            # does not move the panel's content, so the same scrollTop script
            # as the initial scroll is used here.
            driver.execute_script(scroll_to_bottom_js, scroll_panel)
            time.sleep(1)  # give the newly requested reviews time to load

            new_height = driver.execute_script("return arguments[0].scrollHeight", scroll_panel)
            if new_height == last_height:
                break
            last_height = new_height

        # Parse the fully loaded page.
        soup = BeautifulSoup(driver.page_source, 'html.parser')
    finally:
        driver.quit()

    # Panel that holds the reviews.
    review_container = soup.find('div', {'data-testid' : "pdp-reviews-modal-scrollable-panel"})
    if not review_container:
        return {}

    # Review bodies and their "rating · date" headers, matched by position.
    # NOTE(review): the generated class names are brittle, and zip() assumes
    # both lists line up one-to-one - confirm against the live page.
    review_tags = review_container.find_all('span', class_='lrl13de atm_kd_19r6f69_24z95b atm_kd_19r6f69_1xbvphn_1oszvuo dir dir-ltr')
    review_date = review_container.find_all('div',class_= "s78n3tv atm_c8_1w0928g atm_g3_1dd5bz5 atm_cs_9dzvea atm_9s_1txwivl atm_h_1h6ojuz dir dir-ltr")

    reviews_dates = {}

    # Text after the '·' separator, e.g. '3 weeks ago'.
    date_pattern = re.compile(r'·\s*(.*)$')

    for review, date in zip(review_tags, review_date):
        review_text = review.get_text().strip()
        date_text = date.get_text().strip()

        # Fall back to the whole header when there is no '·' separator.
        date_match = date_pattern.search(date_text)
        date_info = date_match.group(1) if date_match else date_text

        # Keep only reviews dated April 2024, May 2024 or "... weeks ago".
        # NOTE(review): "1 week ago" and "... days ago" do not match this
        # filter - confirm whether that is intended.
        if "April 2024" in date_info or "May 2024" in date_info or "weeks ago" in date_info:
            reviews_dates[review_text] = date_info

    return reviews_dates

  

# 결과값 데이터프레임화 
# results_df = pd.DataFrame(results, columns=['호스트_id', '숙소_url', 'etc'])

# # csv파일로 저장
# results_df.to_csv('scraped_data.csv', index=False)

# print("Data saved to 'scraped_data_1.csv'")


In [11]:
# Try the scraper on a single listing. The URL carries locale=en, presumably
# so that the page text matches the English strings the scraper looks for.
reviews_dates = airbnb_reviews('https://www.airbnb.com/rooms/165336?&locale=en')
print(reviews_dates)

{"It was really a pleasure staying in Nathan's apartment, the location is very convenient: below the house is the metro station and the bus stop is perfect for reaching all points of the city. Nathan was a helpful host and always quick to respond, he also introduced us to the planned vehicle strike ( very useful to organize our return). We found a clean apartment and everything you need to prepare our little girl's food. We were very happy with this experience and recommend the apartment to everyone!": '3 weeks ago', 'Lovely stay in a beautiful neighborhood. Equipped with everything you need! Nathan himself is a very warm host. Would stay again!': 'April 2024', 'Great place to stay for a cpuple of days. The apartment has everything you need & is a great starting point to explore the city.Nathan is a great host, very responsive and friendly.': 'April 2024'}


In [12]:
# Turn the {review text: date label} dict into a two-column frame and display it.
df = pd.DataFrame(list(reviews_dates.items()), columns=['Review', 'Date'])
df

Unnamed: 0,Review,Date
0,It was really a pleasure staying in Nathan's a...,3 weeks ago
1,Lovely stay in a beautiful neighborhood. Equip...,April 2024
2,Great place to stay for a cpuple of days. The ...,April 2024


In [None]:
# Collect the recent reviews of every cleaned listing into one dict
# ({review text: date label}).
all_reviews = {}

for url in london_lists['숙소_url']:
    # Request the English page: the scraper matches English button and date
    # text, and the single-listing trial call also passes locale=en.
    reviews = airbnb_reviews(f'{url}?locale=en')
    # dict has no .extend(); merge the per-listing result with .update().
    all_reviews.update(reviews)
df = pd.DataFrame(list(all_reviews.items()), columns=['Review', 'Date'])

print(len(df))
print(df)

작성일자 가져오기

In [22]:
import pandas as pd
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import time
import random
from bs4 import BeautifulSoup
import re

# Stand-alone run of the review-date scraping for one listing. The result is
# left in `reviews`, a list of "rating · date" header strings (empty when the
# review modal could not be opened).
random_sec = random.uniform(1, 5)

# Raw string: '\M' is not a valid escape sequence; the value is unchanged.
chrome_driver_path = r'C:/Users\Master/.wdm/drivers/chromedriver/win64/125.0.6422.141/chromedriver-win32/chromedriver.exe'
chrome_service = Service(chrome_driver_path)

# Chrome options (required for the session to start reliably).
chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

chrome_options.add_experimental_option("detach", True)
chrome_options.add_experimental_option("excludeSwitches", ['enable-logging'])
driver = webdriver.Chrome(service=chrome_service, options=chrome_options)

url = 'https://www.airbnb.com/rooms/165336?&locale=en'

# Scrolls the review panel's own content to the bottom and fires a scroll
# event so that the next batch of reviews is requested.
scroll_to_bottom_js = """
    arguments[0].scrollTop = arguments[0].scrollHeight;
    arguments[0].dispatchEvent(new Event('scroll'));
"""

# Defined up front so later cells can always read it.
reviews = []

# The browser is always closed in `finally`, including on unexpected errors.
try:
    driver.set_window_size(1920, 1080)
    driver.get(url)

    time.sleep(random_sec)

    # Close the translation pop-up. Best effort: it is not always shown.
    try:
        translation_modal_close_button = WebDriverWait(driver, random_sec).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, 'button[aria-label="Close"]'))
        )
        translation_modal_close_button.click()
        time.sleep(1)
    except Exception:
        print("번역 모달 창 실패")

    # Open the review modal. On failure the cell ends with an empty `reviews`
    # instead of calling exit(), which would shut the notebook kernel down.
    scroll_panel = None
    try:
        review_button = WebDriverWait(driver, random_sec).until(
            EC.element_to_be_clickable((By.XPATH, "//button[contains(text(), 'Show all') and contains(text(), 'reviews')]" ))
        )
        review_button.click()
    except Exception:
        print('리뷰 버튼 실패')
    else:
        # Wait for the modal's scrollable panel.
        try:
            scroll_panel = WebDriverWait(driver, random_sec).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, 'div[data-testid="pdp-reviews-modal-scrollable-panel"]'))
            )
        except Exception:
            print('리뷰 모달창 실패')

    if scroll_panel is not None:
        # Initial forced scroll.
        driver.execute_script(scroll_to_bottom_js, scroll_panel)
        time.sleep(1)

        # Keep scrolling until the panel height stops growing.
        last_height = driver.execute_script("return arguments[0].scrollHeight", scroll_panel)
        while True:
            # scrollIntoView(false) does not move the panel's content, so the
            # same scrollTop script as the initial scroll is used.
            driver.execute_script(scroll_to_bottom_js, scroll_panel)
            time.sleep(1)  # give the newly requested reviews time to load

            new_height = driver.execute_script("return arguments[0].scrollHeight", scroll_panel)
            if new_height == last_height:
                break
            last_height = new_height

        # Parse the fully loaded page.
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Panel that holds the reviews.
        review_container = soup.find('div', {'data-testid' : "pdp-reviews-modal-scrollable-panel"})

        if review_container:
            review_date = review_container.find_all('div',class_= "s78n3tv atm_c8_1w0928g atm_g3_1dd5bz5 atm_cs_9dzvea atm_9s_1txwivl atm_h_1h6ojuz dir dir-ltr")

            # A bs4 Tag has no .strip(); `tag.strip` resolves to a child-tag
            # lookup that returns None, so calling it raises TypeError. Take
            # the tag's text first.
            reviews = [date.get_text().strip() for date in review_date]
finally:
    driver.quit()



: 

In [11]:
# Print the date headers collected into `reviews`.
print(reviews)

['Rating, 5 stars,  · 3 weeks ago', 'Rating, 5 stars,  · April 2024', 'Rating, 5 stars,  · April 2024', 'Rating, 5 stars,  · March 2024', 'Rating, 5 stars,  · March 2024', 'Rating, 3 stars,  · January 2024', 'Rating, 5 stars,  · January 2024', 'Rating, 3 stars,  · December 2023', 'Rating, 5 stars,  · December 2023', 'Rating, 5 stars,  · December 2023', 'Rating, 5 stars,  · December 2023', 'Rating, 4 stars,  · November 2023', 'Rating, 5 stars,  · October 2023', 'Rating, 5 stars,  · September 2023', 'Rating, 5 stars,  · August 2023', 'Rating, 5 stars,  · August 2023', 'Rating, 5 stars,  · August 2023', 'Rating, 4 stars,  · August 2023', 'Rating, 4 stars,  · August 2023', 'Rating, 5 stars,  · July 2023', 'Rating, 5 stars,  · July 2023', 'Rating, 5 stars,  · July 2023', 'Rating, 5 stars,  · July 2023', 'Rating, 5 stars,  · July 2023']


In [21]:
# Display the current contents of `reviews`.
reviews

['Rating, 5 stars,  · 3 weeks ago']

In [19]:
import re



# Regex for relative date labels such as '3 weeks ago'.
date_pattern = re.compile(r'\b\d+\s\w+\sago\b')

# `review` was never assigned at notebook level, so on a fresh kernel this cell
# raised NameError. Use the first scraped header from `reviews`, which matches
# the recorded output; an empty string is used when nothing was scraped.
review = reviews[0] if reviews else ''

date_info = date_pattern.search(review)

# Print the matched date label, or a notice when there is none.
if date_info:
    print(date_info.group())  # '3 weeks ago'
else:
    print("Date information not found")

3 weeks ago
