https://www.maybelline.com.au/

In [1]:
import math
import re
import time
from datetime import datetime, timedelta

import pandas as pd
from dateutil.relativedelta import relativedelta
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.options import Options

# URL of the product page whose reviews are scraped
base_url = "https://www.maybelline.com.au/all-products/face-makeup/face-powder/fit-me-matte-poreless-powder?variant=porcelain"

# Set up EdgeOptions
edge_options = Options()
edge_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.54")
# Chromium's window-size switch takes "width,height"; the previous "1920x1080"
# form is not the documented format for a headed browser window.
edge_options.add_argument("window-size=1920,1080")
edge_options.add_argument("disable-extensions")

# Create the Edge WebDriver (no headless flag is set, so a visible window opens)
# and load the product page.
driver = webdriver.Edge(options=edge_options)
driver.get(base_url)

In [2]:
# Dump the raw text of every review element currently rendered, for inspection.
review_blocks = driver.find_elements(By.CSS_SELECTOR, "li.bv-content-review")

for block_number, block in enumerate(review_blocks, start=1):
    header = f"---- Block {block_number} ----"
    print(header)
    print(block.text.strip())

In [3]:
def _find_or_none(block, css_selector):
    """Return the first descendant of ``block`` matching ``css_selector``, or None if absent."""
    try:
        return block.find_element(By.CSS_SELECTOR, css_selector)
    except NoSuchElementException:
        return None


def extract_review(block):
    """Extract the rating, date, title and body text from one review element.

    Parameters
    ----------
    block
        An element exposing ``find_element`` (here: one ``li.bv-content-review``
        element returned by the driver).

    Returns
    -------
    dict
        Keys ``"rating"``, ``"date"``, ``"title"`` and ``"review"``. A field
        whose element is not found inside ``block`` is left as ``''``.

    Notes
    -----
    Only ``NoSuchElementException`` is treated as "field missing". Any other
    error (including ``KeyboardInterrupt``) now propagates instead of being
    silently swallowed by a bare ``except``.
    """
    rating = date = title = review = ''

    # rating: taken from the title attribute, e.g. "5 out of 5 stars."
    rating_element = _find_or_none(block, "abbr[title*='out of 5 stars']")
    if rating_element is not None:
        # get_attribute may return None; fall back to '' rather than failing on .strip()
        rating = (rating_element.get_attribute("title") or "").strip()

    # date: drop the "·" separator that is part of the timestamp text
    date_element = _find_or_none(block, "span.bv-content-datetime-stamp")
    if date_element is not None:
        date = date_element.text.strip().replace("·", "").strip()

    # title
    title_element = _find_or_none(block, "h3.bv-content-title")
    if title_element is not None:
        title = title_element.text.strip()

    # review: remove the promotion disclaimer, then cut everything from the
    # first "Originally posted on" / "RESPONSE FROM" marker onwards
    review_element = _find_or_none(block, "div.bv-content-summary-body-text")
    if review_element is not None:
        review = review_element.text.strip()
        review = review.replace("[This review was collected as part of a promotion.]", "").strip()
        for phrase in ["Originally posted on", "RESPONSE FROM"]:
            if phrase in review:
                review = review.split(phrase)[0].strip()

    return {
        "rating": rating,
        "date": date,
        "title": title,
        "review": review
    }

In [4]:
# Parse every review element currently on the page and preview the result.
review_blocks = driver.find_elements(By.CSS_SELECTOR, "li.bv-content-review")
data = list(map(extract_review, review_blocks))
df = pd.DataFrame(data)
df.head()

In [5]:
# Check
# for i, r in enumerate(review_blocks):
#     review_data = extract_review(r)
#     print(f"---- Block {i+1} ----")
#     print(f"Rating: {review_data['rating']}")
#     print(f"Date: {review_data['date']}")
#     print(f"Title: {review_data['title']}")
#     print(f"Review: {review_data['review']}")
#     print()

In [7]:
# Sanity check: show the label of the pagination link that wraps the "next" span.
next_button = driver.find_element(
    By.CSS_SELECTOR,
    "a.bv-content-btn-pages:has(span.bv-content-btn-pages-next)",
).text
next_button

'Next Reviews\n►'

In [8]:
MAX_REVIEWS = 363        # stop once this many unique reviews have been collected
MAX_STALLED_PAGES = 3    # give up after this many consecutive pages with nothing new
seen_reviews = set()     # title+review keys already stored, used for de-duplication
data = []                # one dict per unique review, in the order encountered

driver.get(base_url)
time.sleep(2)  # fixed wait for the review widget to render
page = 1
stalled_pages = 0

# try/finally guarantees the browser is closed even if scraping fails or is interrupted.
try:
    while True:
        review_blocks = driver.find_elements(By.CSS_SELECTOR, "li.bv-content-review")
        new_reviews = 0

        for block in review_blocks:
            # Do not collect past the cap in the middle of a page.
            if len(seen_reviews) >= MAX_REVIEWS:
                break
            review_data = extract_review(block)
            key = review_data["title"] + review_data["review"]
            # Skip blocks with neither title nor text, and reviews already stored.
            if key and key not in seen_reviews:
                seen_reviews.add(key)
                data.append(review_data)
                new_reviews += 1

        print(f"Page {page}: reviews collected {new_reviews}")

        if len(seen_reviews) >= MAX_REVIEWS:
            # print(f"Reached MAX_REVIEWS = {MAX_REVIEWS}. Stopping.")
            break

        # Guard against looping forever when the Next link is still present
        # but clicking it no longer yields any new reviews.
        stalled_pages = stalled_pages + 1 if new_reviews == 0 else 0
        if stalled_pages >= MAX_STALLED_PAGES:
            print(f"No new reviews on {MAX_STALLED_PAGES} consecutive pages. Stopping.")
            break

        # Click the Next button; its absence means the last page was reached.
        try:
            next_button = driver.find_element(By.CSS_SELECTOR, "a.bv-content-btn-pages:has(span.bv-content-btn-pages-next)")
        except NoSuchElementException:
            break
        driver.execute_script("arguments[0].scrollIntoView(true);", next_button)
        # Click through JavaScript rather than WebElement.click().
        driver.execute_script("arguments[0].click();", next_button)
        time.sleep(2)  # fixed wait for the next page of reviews to load
        page += 1
finally:
    driver.quit()

print(f"✅ Total reviews collected: {len(seen_reviews)}")

Page 1: reviews collected 8
Page 2: reviews collected 30
Page 3: reviews collected 30
Page 4: reviews collected 30
Page 5: reviews collected 29
Page 6: reviews collected 30
Page 7: reviews collected 30
Page 8: reviews collected 30
Page 9: reviews collected 30
Page 10: reviews collected 30
Page 11: reviews collected 30
Page 12: reviews collected 30
Page 13: reviews collected 26
✅ Total reviews collected: 363


In [9]:
# Turn the collected review dicts into a table and preview the first rows.
df = pd.DataFrame(data)
df.head(n=5)

Unnamed: 0,rating,date,title,review
0,5 out of 5 stars.,an hour ago,GREAT COVERAGE,The texture is so smooth. It’s mattifies my oi...
1,2 out of 5 stars.,6 hours ago,CRUSTY!,"Wow, this product made my under eyes look cake..."
2,5 out of 5 stars.,17 days ago,👍🏻,I’ve been using the Maybelline Fit Me Matte + ...
3,5 out of 5 stars.,13 days ago,DO YOU NEED TO TRY THIS POWDER!,I try Fit Me Matte + Poreless Powder by Maybel...
4,1 out of 5 stars.,5 years ago,HORRIBLE FEEL AND LOOK,I normally use the Maybelline stay Better Skin...


In [10]:
def convert_to_date(text, today=None):
    """Convert a relative timestamp such as '3 days ago' to a 'YYYY-MM-DD' string.

    Parameters
    ----------
    text
        The timestamp text, e.g. ``'an hour ago'``, ``'6 hours ago'``,
        ``'a year ago'``. Matching is case-insensitive and ignores
        surrounding whitespace.
    today : datetime, optional
        Reference moment the offset is subtracted from. Defaults to
        ``datetime.today()`` at call time.

    Returns
    -------
    str or None
        The resulting date formatted as ``'%Y-%m-%d'``. A value that is already
        in ``YYYY-MM-DD`` form is returned unchanged, so applying the function
        twice to the same column does not wipe it. ``None`` is returned for
        non-string input and for text that matches neither form.
    """
    if not isinstance(text, str):
        return None

    text = text.strip().lower()

    # Already converted: pass through so the conversion is idempotent.
    if re.fullmatch(r"\d{4}-\d{2}-\d{2}", text):
        return text

    # "a"/"an" stand for a count of 1; seconds and minutes are accepted too so
    # very recent timestamps are not silently turned into None.
    match = re.match(r"(an?|\d+)\s+(second|minute|hour|day|week|month|year)s?\s+ago", text)
    if not match:
        return None

    count_text, unit = match.groups()
    count = 1 if count_text in ("a", "an") else int(count_text)

    if today is None:
        today = datetime.today()

    if unit in ("month", "year"):
        # Calendar units have no fixed length, so they need relativedelta.
        moment = today - relativedelta(**{unit + 's': count})
    else:
        # Fixed-length units are plain timedelta arithmetic.
        moment = today - timedelta(**{unit + 's': count})

    return moment.strftime('%Y-%m-%d')

In [11]:
# Replace each relative timestamp in the date column with its converted value.
df['date'] = df['date'].map(convert_to_date)

In [12]:
# Tag every row with where the reviews came from and which product they describe.
for column_name, constant in {
    'source': 'maybelline.com.au',
    'product': 'MAYBELLINE FIT ME MATTE + PORELESS POWDER',
}.items():
    df[column_name] = constant

In [13]:
# Preview the first five rows after the date conversion and the added columns.
df.head(n=5)

Unnamed: 0,rating,date,title,review,source,product
0,5 out of 5 stars.,2025-05-26,GREAT COVERAGE,The texture is so smooth. It’s mattifies my oi...,maybelline.com.au,MAYBELLINE FIT ME MATTE + PORELESS POWDER
1,2 out of 5 stars.,2025-05-26,CRUSTY!,"Wow, this product made my under eyes look cake...",maybelline.com.au,MAYBELLINE FIT ME MATTE + PORELESS POWDER
2,5 out of 5 stars.,2025-05-09,👍🏻,I’ve been using the Maybelline Fit Me Matte + ...,maybelline.com.au,MAYBELLINE FIT ME MATTE + PORELESS POWDER
3,5 out of 5 stars.,2025-05-13,DO YOU NEED TO TRY THIS POWDER!,I try Fit Me Matte + Poreless Powder by Maybel...,maybelline.com.au,MAYBELLINE FIT ME MATTE + PORELESS POWDER
4,1 out of 5 stars.,2020-05-26,HORRIBLE FEEL AND LOOK,I normally use the Maybelline stay Better Skin...,maybelline.com.au,MAYBELLINE FIT ME MATTE + PORELESS POWDER


In [14]:
# Count missing values per column.
df.isnull().sum()

rating     0
date       0
title      0
review     0
source     0
product    0
dtype: int64

In [15]:
# Take the product label from the first row and use it in the output file name.
# It is read into a variable first because reusing the same quote character
# inside an f-string expression is a SyntaxError before Python 3.12.
product_name = df['product'].iloc[0]
df.to_excel(f"1_{product_name}.xlsx", index=False)