In [1]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import time

# 아마존 케이뷰티 - 더페이스샵 리뷰 페이지 URL
base_url = 'https://www.amazon.in/Face-Shop-Ceramide-Moisturizing-Skincare/product-reviews/B091DC8Z3Y'

# 사용자 에이전트 및 기타 헤더 설정
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate, br',
    'Referer': 'https://www.amazon.in/',
    'Connection': 'keep-alive',
    'DNT': '1',
    'Upgrade-Insecure-Requests': '1',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1'
}

# 리뷰 데이터 수집 함수
def get_reviews(url, headers):
    review_data = []

    while True:
        try:
            response = requests.get(url, headers=headers)
            if response.status_code == 200:
                soup = BeautifulSoup(response.content, 'html.parser')
                reviews = soup.find_all('div', {'data-hook': 'review'})

                for review in reviews:
                    # 비디오 또는 사진이 포함된 리뷰 건너뛰기
                    if review.find('div', {'class': 'review-image-tile-section'}):
                        delete_things = review.find
                        continue

                    # 리뷰어 이름
                    reviewer_name = review.find('span', {'class': 'a-profile-name'}).text.strip()
                    if len(reviewer_name) > 15:
                       reviewer_name = reviewer_name[:15]+'*'
                    if not reviewer_name:
                        continue

                    # 별점 추출
                    star_rating = review.find('i', {'data-hook': 'review-star-rating'}).text
                    star_rating = int(re.search(r'(\d+)', star_rating).group(1))
                    star_display = '★' * star_rating + '☆' * (5 - star_rating)
                    if not star_rating:
                        continue

                    # 리뷰 날짜에서 'Reviewed in India on' 제거
                    review_date = review.find('span', {'data-hook': 'review-date'}).text.strip()
                    review_date = re.sub(r'Reviewed in India on ', '', review_date)
                    if not review_date:
                        continue

                    # 리뷰 내용 (None 방지)
                    review_text = review.find('span', {'data-hook': 'review-body'}).text.strip()
                    if not review_text:
                        continue

                    # 리뷰 데이터를 리스트에 추가
                    review_data.append([reviewer_name, star_display, review_date, review_text])

                # 다음 페이지로 이동
                next_page = soup.find('li', {'class': 'a-last'})
                if next_page and next_page.find('a'):
                    next_url = 'https://www.amazon.in' + next_page.find('a')['href']
                    url = next_url
                    time.sleep(1)  # 요청 간 간격 조절
                else:
                    break
            else:
                print(f"Failed to retrieve data: {response.status_code}")
                break

        except Exception as e:
            print(f"Error occurred: {e}")
            break

    return review_data

# 고정된 폭으로 출력
def print_reviews_fixed_width(df):
    print(f"{'Reviewer Name':<30}{'Rating':<15}{'Date':<20}{'Review':<80}")
    print("=" * 140)
    for index, row in df.iterrows():
        print(f"{row['Reviewer Name']:<30}{row['Rating']:<10}{row['Date']:<20}{row['Review']:<80}")

# 데이터 수집 후 출력
if __name__ == "__main__":
    reviews = get_reviews(base_url, headers)
    df = pd.DataFrame(reviews, columns=['Reviewer Name', 'Rating', 'Date', 'Review'])
    print(f"총 리뷰 개수: {df.shape[0]}")
    print_reviews_fixed_width(df)


Error occurred: 'NoneType' object has no attribute 'text'
총 리뷰 개수: 52
Reviewer Name                 Rating         Date                Review                                                                          
Lipika M.                     ★★★★★     26 August 2024      I love the face shop. It suits me like nothing                                  
Anisha Salunkhe               ★★★★☆     8 July 2024         Overall good product. Gives u a soft skin.                                      
Not good there *              ★★★★★     22 April 2024       Nice products I tried first time and it’s suites my skin this product give me smooth and soft skin
Isht Singh                    ★★★☆☆     12 September 2024   I doubt it's original.                                                          
purana                        ★★★★★     8 April 2024        Products have a subtle scent and makes my face smooth. Loved the kit with complete skincare. More kits of faceshop products is encouraged
Pooj

In [None]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import time

# 올리브영 리뷰 페이지 URL
base_url = 'https://www.oliveyoung.co.kr/store/goods/getGoodsDetail.do?goodsNo=A000000210380&t_page=%ED%86%B5%ED%95%A9%EA%B2%80%EC%83%89%EA%B2%B0%EA%B3%BC%ED%8E%98%EC%9D%B4%EC%A7%80&t_click=%EA%B2%80%EC%83%89%EC%83%81%ED%92%88%EC%83%81%EC%84%B8&t_search_name=%EB%8D%94%ED%8E%98%EC%9D%B4%EC%8A%A4%EC%83%B5&t_number=9&dispCatNo=1000001000200010003&trackingCd=Result_9'

# 헤더 설정
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Language': 'ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7',
    'Connection': 'keep-alive',
    'Referer': 'https://www.oliveyoung.co.kr/store/main/main.do',
    'Upgrade-Insecure-Requests': '1',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-User': '?1',
    'Sec-Fetch-Dest': 'document'
}

# 리뷰 데이터 수집 함수 수정
def get_reviews(url, headers):
    review_data = []
    reviewer_bloggers = []

    page = 1
    while True:
        try:
            response = requests.get(url, headers=headers)
            if response.status_code == 200:
                soup = BeautifulSoup(response.content, 'html.parser')
                reviews = soup.find_all('div', {'class': 'review_wrap'})

                for review in reviews:
                    review_text = review.find('p', {'class': 'desc'}).text if review.find('p', {'class': 'desc'}) else ""

                    if "체험단" in review_text or "판매자에게 제품비의 일부를 지원 받았지만" in review_text:
                        reviewer_bloggers.append(review_text)
                        continue

                    # 리뷰어 이름
                    reviewer_name_tag = review.find('span', {'class': 'id'})
                    reviewer_name = reviewer_name_tag.text.strip() if reviewer_name_tag else None

                    # 별점
                    star_rating_tag = review.find('span', {'class': 'score'})
                    star_rating = int(re.search(r'(\d+)', star_rating_tag.text).group(1)) if star_rating_tag else None
                    star_display = '★' * star_rating + '☆' * (5 - star_rating) if star_rating else None

                    # 날짜
                    review_date_tag = review.find('span', {'class': 'date'})
                    review_date = review_date_tag.text.strip() if review_date_tag else None

                    # 리뷰 내용
                    review_text_tag = review.find('p', {'class': 'desc'})
                    review_text = review_text_tag.text.strip() if review_text_tag else None

                    # 리뷰 데이터가 모두 유효한지 확인 후 추가 (옵션 제외)
                    if reviewer_name and star_display and review_date and review_text:
                        review_data.append([reviewer_name, star_display, review_date, review_text])

                # 다음 페이지로 이동 (페이지 +1 로직)
                page += 1
                next_url = re.sub(r't_number=\d+', f't_number={page}', url)

                time.sleep(1)  # 요청 간 간격 조절
            else:
                print(f"Failed to retrieve data: {response.status_code}")
                break

        except Exception as e:
            print(f"Error occurred: {e}")
            break

    return review_data, reviewer_bloggers

# 고정된 폭으로 출력
def print_reviews_fixed_width(df):
    print(f"{'Reviewer Name':<20}{'Rating':<10}{'Date':<15}{'Review':<80}")
    print("=" * 130)
    for index, row in df.iterrows():
        print(f"{row['Reviewer Name']:<20}{row['Rating']:<10}{row['Date']:<15}{row['Review']:<80}")

# 데이터 수집 후 출력
if __name__ == "__main__":
    reviews, reviewer_bloggers = get_reviews(base_url, headers)

    # 일반 리뷰
    df = pd.DataFrame(reviews, columns=['Reviewer Name', 'Rating', 'Date', 'Review'])

    # 전체 리뷰 개수
    total_reviews_count = len(reviews) + len(reviewer_bloggers)
    print(f"총 리뷰 개수: {total_reviews_count}")
    print(f"실제 리뷰 개수(체험단 제외): {df.shape[0]}")

    # 실제 리뷰 출력
    print_reviews_fixed_width(df)
    print(df)
