In [1]:
# 기본 패키지 불러오기 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns 
import matplotlib as mpl
from matplotlib import font_manager, rc

# 그래프를 노트북 안에 그리기 위해 설정
%matplotlib inline

# 글꼴 경로 지정
font_path = '../../data/malgun.ttf'  # 윈도우에 설치된 맑은 고딕 폰트 경로

# 폰트 이름 얻어오기
font_name = font_manager.FontProperties(fname=font_path).get_name()

# matplotlib의 rc(run command) 기능을 이용하여 글꼴 설정
mpl.rc('font', family=font_name)

# 유니코드에서  음수 부호 설정
mpl.rc('axes', unicode_minus=False)

In [2]:
# 데이터 불러오기

path = '../../data/Airbnb_London/listings.csv'

london_lists = pd.read_csv(path)

# 1차 선별한 컬럼 25개 중 22개 2차 선별 + 1개(숙소_예약가능_여부)
# 최근 리뷰가 작성되었는지를 보기 위해 2개 컬럼 추가_240529 >> 'number_of_reviews_ltm' // 'last_review'
# URL 컬럼 london_lists에 적용_240530

columns_selected = ['id', 'listing_url', 'host_id', 'host_is_superhost', 'neighbourhood_cleansed', 'property_type', \
    'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'amenities', 'price', 'has_availability', 'number_of_reviews', 'number_of_reviews_ltm', 'number_of_reviews_l30d', 'last_review', 'review_scores_rating', 'review_scores_accuracy', \
    'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication', 'review_scores_location', 'review_scores_value', 'reviews_per_month']

# 'host_response_time', 'host_response_rate', 'host_acceptance_rate', 'host_total_listings_count', 'host_has_profile_pic', 'host_identity_verified', 'minimum_nights', 'maximum_nights', 'has_availability',

# 선별한 컬럼만 적용
london_lists = london_lists[columns_selected]

# 변수 정리 
london_lists = london_lists.rename(columns={
    'id': '숙소_id',
    'listing_url' : '숙소_url',
    'host_id': '호스트_id',
    # 'host_response_time': '답변_평균시간',
    # 'host_response_rate': '문의_응답률',
    # 'host_acceptance_rate': '예약_수락률',
    'host_is_superhost': '슈퍼호스트',
    # 'host_total_listings_count': '숙소_수',
    # 'host_has_profile_pic' : '프로필_사진',
    # 'host_identity_verified' : '호스트_신원',
    'neighbourhood_cleansed': '숙소_지역',
    'property_type' : '숙소_특징',
    'room_type': '숙소_유형',
    'accommodates': '수용_인원수',
    'bathrooms': '욕실수',
    'bedrooms': '침실수',
    'beds': '침대수',
    'amenities': '편의시설',
    'price': '숙소_가격',
    # 'minimum_nights': '최소_숙박일',
    # 'maximum_nights': '최대_숙박일',
    'has_availability' : '예약_가능여부',
    'number_of_reviews': '리뷰수',
    'number_of_reviews_ltm' : '12개월_리뷰수',
    'number_of_reviews_l30d': '30일_리뷰수',
    'last_review' : '마지막_리뷰',
    'review_scores_rating': '리뷰점수',
    'review_scores_accuracy': '숙소_정확성_리뷰점수',
    'review_scores_cleanliness': '숙소_청결도_리뷰점수',
    'review_scores_checkin': '숙소_체크인_리뷰점수',
    'review_scores_communication': '숙소_소통_리뷰점수',
    'review_scores_location': '숙소_위치_리뷰점수',
    'review_scores_value': '숙소_가격_리뷰점수',
    'reviews_per_month': '평균_리뷰수'
 
})

temp = london_lists.copy()
raw = london_lists.copy()

# 컬럼명 확인
london_lists.shape



(90852, 26)

In [3]:
# 리뷰수가 0인 컬럼 조건 
condition_review_0 = london_lists['리뷰수'] == 0

# 별도의 DF로 저장
london_lists_review_0 = london_lists[condition_review_0]

# 리뷰 개수가 0이 아닌 컬럼 조건 설정
condition_review = london_lists['리뷰수'] != 0

# 리뷰 개수가 0이 아닌 컬럼으로 df 업데이트 
london_lists = london_lists[condition_review]
temp = london_lists.copy()

#숙소 유형 제거 
# 조건 설정
condition_room_entirehomeapt = london_lists['숙소_유형'] == 'Entire home/apt'
condition_room_privateroom = london_lists['숙소_유형'] == 'Private room'

# 데이터 제거
london_lists = london_lists[(condition_room_entirehomeapt | condition_room_privateroom)]

# 예약가능여부 결측치 제거
london_lists = london_lists[london_lists["예약_가능여부"].notnull()]


# 가격이 null값/notnull값 조건 생성 
condition_price_notnull = london_lists['숙소_가격'].notnull()
condition_price_null = london_lists['숙소_가격'].isnull()

# 가격이 null값/null값이 아닌 df 분리
london_lists_price = london_lists[condition_price_notnull] 
london_lists_price_null = london_lists[condition_price_null]

# 변수명 정리 널값이 아닌 데이터
london_lists = london_lists_price

#슈퍼호스트 결측치 제거
london_lists = london_lists.dropna(subset=['슈퍼호스트'])

#욕실,침대,침실수 결측츠제거
london_lists = london_lists.dropna(subset=['욕실수', '침실수', '침대수'])

#리뷰점수 결측치제거
london_lists = london_lists.dropna(subset=['숙소_정확성_리뷰점수', '숙소_청결도_리뷰점수', '숙소_체크인_리뷰점수', '숙소_소통_리뷰점수', '숙소_위치_리뷰점수', '숙소_가격_리뷰점수'])

#12개월 리뷰수 0인것 제거 
london_lists = london_lists[london_lists['12개월_리뷰수'] != 0]

london_lists['숙소_가격'] = london_lists['숙소_가격'].str.lstrip('$').str.replace(',', '').astype('float')

#가격이 0 인값 제거 
london_lists = london_lists[london_lists['숙소_가격'] != 0.0]

london_lists.shape

(38769, 26)

In [4]:
#수용 인원수 이상치 제거 

import pandas as pd


# 이상치를 제거하는 함수 정의
def remove_price_outliers(df, price_column, type):
    
    condition = (df['숙소_유형'] == type)
    
    if price_column != '숙소_가격':
        Q1 = df[price_column].quantile(0.25)
        Q3 = df[price_column].quantile(0.75)
        IQR = Q3 - Q1
        upper_bound = Q3 + 1.5 * IQR

        
        outlier = df[price_column][condition] > upper_bound
        upper_outlier = df[condition][outlier]
        
        upper_outlier_indices = upper_outlier.index
        london_lists_cleaned = df.drop(index=upper_outlier_indices, inplace=True)
        df = london_lists_cleaned
        return df 
    else:
        Q1 = df[price_column].quantile(0.25)
        Q3 = df[price_column].quantile(0.75)
        IQR = Q3 - Q1
        upper_bound = Q3 + 3 * IQR

        
        outlier = df[price_column][condition] > upper_bound
        upper_outlier = df[condition][outlier]
        
        upper_outlier_indices = upper_outlier.index
        london_lists_cleaned = df.drop(index=upper_outlier_indices, inplace=True)
        df = london_lists_cleaned
        return df 
# 숙소 가격의 이상치 제거
remove_price_outliers(london_lists, '수용_인원수', 'Entire home/apt')
remove_price_outliers(london_lists, '수용_인원수', 'Private room')
remove_price_outliers(london_lists, '숙소_가격', 'Entire home/apt')
remove_price_outliers(london_lists, '숙소_가격', 'Private room')
london_lists.shape # 36407개가 되어야함


(36407, 26)

In [5]:
london_lists['숙소_url']

0                     https://www.airbnb.com/rooms/312761
1                      https://www.airbnb.com/rooms/13913
2                      https://www.airbnb.com/rooms/15400
3                     https://www.airbnb.com/rooms/159736
4                     https://www.airbnb.com/rooms/165336
                               ...                       
90451    https://www.airbnb.com/rooms/1112236129628471308
90568    https://www.airbnb.com/rooms/1112524736745357245
90590    https://www.airbnb.com/rooms/1113042462528003601
90648    https://www.airbnb.com/rooms/1113966989586525761
90735    https://www.airbnb.com/rooms/1113540860743654582
Name: 숙소_url, Length: 36407, dtype: object

In [6]:
# import pandas as pd
# from selenium import webdriver
# from selenium.webdriver.chrome.service import Service
# from selenium.webdriver.chrome.options import Options
# from selenium.webdriver.common.by import By
# from selenium.webdriver.common.keys import Keys
# import time
# from bs4 import BeautifulSoup


# # 욕실, 침실, 침대수가 nan값인 숙소_id, 숙소_url 데이터의 값 7806개 
# lodging_urls = "https://www.airbnb.com/rooms/312761"

# #크롬 option 설정 꼭 입력!
# chrome_options = Options()
# chrome_options.add_experimental_option("detach", True)
# chrome_options.add_experimental_option("excludeSwitches", ['enable-logging'])
# driver = webdriver.Chrome(options=chrome_options)

# #웹 브라우저 사이즈 고장
# driver.set_window_size(1920, 1080)  



#     listing_id, url = url_info
#     driver.get(url)
#     time.sleep(2) #로딩될 시간 지정 2초 

#     # bs4 설정 
#     html = driver.page_source
#     soup = BeautifulSoup(html, 'html.parser')

#     # 번역창 닫기
#     try:
#         driver.find_element(By.CSS_SELECTOR, 'button[aria-label="닫기"]').click()
#     except:
#         pass  # 번역창이 없는 경우를 대비한 예외 처리
    
    
#     # #스크롤 내리기
#     for c in range(0,5):
#         driver.find_element(By.TAG_NAME,'body').send_keys(Keys.PAGE_DOWN)
#         time.sleep(1)
        
#     # 리뷰 모두 보기 버튼 클릭
#     try:
#         driver.find_element(By.XPATH, '//div[contains(@class, "sh47dkx atm")]').click()
#         time.sleep(1)
#     except:
#         pass  # 리뷰 모두 보기 버튼이 없는 경우를 대비한 예외 처리
    
#     # 리뷰 크롤링
#     try:
#         review_container = driver.find_element(By.CSS_SELECTOR, 'div[data-testid="pdp-reviews-modal-scrollable-panel"]')
#         review_tags = review_container.find_elements(By.CSS_SELECTOR, 'div.r1are2x1')
#         reviews = []
#         for review_tag in review_tags:
#             review_text_tag = review_tag.find_element(By.CSS_SELECTOR, 'span.lrl13de-atm')
#             review_text = review_text_tag.text.strip() if review_text_tag else "No review text found"
#             reviews.append(review_text)
#             time.sleep(1) 
#     except:
#         reviews = ["No review container found"]

#     return listing_id, url, reviews

      
              
        
# results = []
# for index, url_info in enumerate(lodging_urls[:10]):  # 1000개만 해봄
#     print(f"{index + 1}번째 크롤링 중..")
#     result = scrape_airbnb(index, url_info)
#     results.append(result)
#     print(result)
#     time.sleep(6)  

# # 다 돌리면 크롤링 끄기
  

# # 결과값 데이터프레임화 
# # results_df = pd.DataFrame(results, columns=['호스트_id', '숙소_url', 'etc'])

# # # csv파일로 저장
# # results_df.to_csv('scraped_data.csv', index=False)

# # print("Data saved to 'scraped_data_1.csv'")


In [2]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import time
from bs4 import BeautifulSoup




#크롬 option 설정 꼭 입력!
chrome_options = Options()
chrome_options.add_experimental_option("detach", True)
chrome_options.add_experimental_option("excludeSwitches", ['enable-logging'])
driver = webdriver.Chrome(options=chrome_options)

#웹 브라우저 사이즈 고장
driver.set_window_size(1920, 1080)  

url= 'https://www.airbnb.com/rooms/312761?source_impression_id=p3_1717237307_P30C1Pl_smoKOLn7'
driver.get(url)
time.sleep(2) #로딩될 시간 지정 2초 

#html 다운로드 및 bs4 로 읽기 
from bs4 import BeautifulSoup

html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')


# 번역창 닫기
try:
    driver.find_element(By.CSS_SELECTOR, 'button[aria-label="닫기"]').click()
except:
    pass  # 번역창이 없는 경우를 대비한 예외 처리
time.sleep(2)

# #스크롤 내리기
for c in range(0,4):
    driver.find_element(By.TAG_NAME,'body').send_keys(Keys.PAGE_DOWN)
    time.sleep(1)
    
# 리뷰 모두 보기 버튼 클릭
try:
    driver.find_element(By.XPATH, '//div[contains(@class, "sh47dkx atm")]').click()
    
except:
    pass  # 리뷰 모두 보기 버튼이 없는 경우를 대비한 예외 처리
    time.sleep(5)
    
# 모달 창이 열릴 때까지 대기
try:
    scroll_panel = WebDriverWait(driver, 1).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, 'div[data-testid="pdp-reviews-modal-scrollable-panel"]'))
    )
except Exception as e:
    print("Failed to load the reviews modal:", e)
    driver.quit()
    exit()
    

# 처음에 강제로 스크롤 내리기
driver.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight", scroll_panel)
time.sleep(1)  # 스크롤 후 로딩을 위해 잠시 대기

# 무한 스크롤을 통해 모든 리뷰를 로드
last_height = driver.execute_script("return arguments[0].scrollHeight", scroll_panel)

while True:
    # 스크롤을 내리는 동작 (패널의 마지막 부분으로 이동)
    driver.execute_script("arguments[0].scrollIntoView(false);", scroll_panel)
    time.sleep(2)  # 스크롤 후 로딩을 위해 잠시 대기

    # 새로운 스크롤 높이를 가져오기
    new_height = driver.execute_script("return arguments[0].scrollHeight", scroll_panel)
    
    if new_height == last_height:
        break
    last_height = new_height

# 페이지 소스를 다시 BeautifulSoup으로 파싱
soup = BeautifulSoup(driver.page_source, 'html.parser')

# 리뷰 크롤링
review_container = soup.find('div', {'data-testid': 'pdp-reviews-modal-scrollable-panel'})
if review_container:
    review_tags = review_container.find_all('div', class_='r1are2x1')
    reviews = []
    for review_tag in review_tags:
        review_text_tag = review_tag.find_element(By.CSS_SELECTOR, "body > div:nth-child(19) > div > div > section > div > div > div.p1psejvv.atm_9s_1bgihbq.dir.dir-ltr > div > div._ctwstv > div > div > div > div > div > div > section > div > div._vghwkew > div > section > div._1tqgvho > div > div:nth-child(1) > div:nth-child(2) > div.r1bctolv.atm_c8_1sjzizj.atm_g3_1dgusqm.atm_26_lfmit2_13uojos.atm_5j_1y44olf_13uojos.atm_l8_1s2714j_13uojos.dir.dir-ltr > div > span > span.lrl13de.atm")
        if review_text_tag:
            review_text = review_text_tag.get_text().strip()
        else:
            review_text = "No review text found"
        
        reviews.append({'text': review_text})
else:
    reviews = []

driver.quit()



print("Reviews:")
for review in reviews:
    print(f"Stars: {review['stars']}, Review: {review['text']}")

print(len(reviews))

review = soup.select('body > ul')
print(review)

TypeError: 'NoneType' object is not callable

In [2]:
# !pip install webdriver_manager




In [3]:
from selenium import webdriver 
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager 

options = Options()
options.add_experimental_option('detach', True)
options.add_experimental_option('excludeSwitches', ['enable-logging'])

chrome_driver = ChromeDriverManager().install()

print(chrome_driver)

C:\Users\Master\.wdm\drivers\chromedriver\win64\125.0.6422.141\chromedriver-win32/chromedriver.exe
