https://www.rimmellondon.com/en-gb

In [1]:
import math
import re
import time
from datetime import datetime, timedelta

import pandas as pd
from dateutil.relativedelta import relativedelta
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.options import Options
from selenium.webdriver.support.ui import WebDriverWait

# URL of the product page whose reviews are scraped.
url = "https://www.rimmellondon.com/en-gb/face/powders/natural-finish-pressed-powder#:~:text=Our%20first%20clean%20makeup%20that%20works%2C%20no%20compromises.,lasting%20%26%20instantly%20mattifies%20skin%20and%20reduces%20shine."

# Set up EdgeOptions
edge_options = Options()
edge_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.54")
# Chromium documents the window-size switch as "width,height" (comma separated);
# the previous "1920x1080" spelling does not follow that format.
edge_options.add_argument("window-size=1920,1080")
edge_options.add_argument("disable-extensions")


# Create the Edge WebDriver and load the page.
# No headless argument is added above, so the browser runs with a visible window.
driver = webdriver.Edge(options=edge_options)
driver.get(url)

In [2]:
# Find the shadow host (the element that owns a shadow root).
# The element may not be in the DOM yet right after driver.get() returns, so
# poll for up to 15 seconds instead of failing on the first lookup.
shadow_host = WebDriverWait(driver, 15).until(
    lambda d: d.find_element(By.CSS_SELECTOR, '#reviews > div > div > div > div')
)

# Access the shadow root through JavaScript
shadow_root = driver.execute_script('return arguments[0].shadowRoot', shadow_host)
print(f"shadow_root exists: {shadow_root is not None}")

shadow_root exists: True


In [3]:
# next_button = (shadow_root.find_element(By.CLASS_NAME, 'next')).text
# next_button

In [4]:
# relate = shadow_root.find_elements(By.CLASS_NAME, 'jWGhqr')
# dates = [date.text for date in relate]
# dates

In [5]:
# content = shadow_root.find_elements(By.CLASS_NAME, 'dYSVlA')
# reviews = [review.text for review in content]
# reviews

In [6]:
# review_blocks = (shadow_root.find_element(By.CLASS_NAME, 'jPLiFm')).text
# review_blocks

In [7]:
# Dump the raw text of every review element on the current page so the line
# layout that extract_review relies on can be inspected by eye.
review_blocks = shadow_root.find_elements(By.CLASS_NAME, 'jPLiFm')

for position, element in enumerate(review_blocks, start=1):
    print(f"---- Block {position} ----")
    print(element.text.strip())

---- Block 1 ----
Rika26
Review1
Votes0
5 out of 5 stars.
Amazing high performing powder
Rika26
3 months ago
Absolutely love this, use daily.. loving the small list of ingredients(perfect for people like me who is conscious of ingredients) ..in my opinion the next best alternative I have used to be talc free and natural is bare minerals which is very expensive compared to this.. it saves me a lot of money.. I noticed that the powder tend to break easily when I carry it in my hand bag.. but I am happy to compromise given the price tag, performance and don’t change the formula and add lots of other things please..maybe you could do a version with better, secured packaging with a mirror for a a slightly higher price but also have cheaper refills :)
Yes, I recommend this product.
Originally posted on boots.com
---- Block 2 ----
fortunatac_8750
Review1
Votes0
3 out of 5 stars.
Economica
fortunatac_8750
4 months ago
Una buona cipria, low cost soprattutto. Finish matte, la utilizzo su tutto i

In [8]:
# Function that extracts the structured fields from one review block.

# A whole line that is a relative date such as "3 months ago" or "a day ago".
_RELATIVE_DATE_LINE = re.compile(
    r"(?:\d+|an?)\s+(?:second|minute|hour|day|week|month|year)s?\s+ago",
    re.IGNORECASE,
)


def extract_review(block):
    """Parse the text of one review element into rating, title, date and review.

    The element's ``text`` is split into lines and each field is located
    relative to an anchor line:

    * ``rating`` - the first line containing ``"out of 5 stars"``.
    * ``title``  - the line immediately after the rating line.
    * ``date``   - the first line that is, as a whole, a relative date
      ("3 months ago", "a day ago", ...). If no line has that shape, the first
      line merely containing "ago" is used, which was the previous behaviour.
    * ``review`` - the line immediately after the date line.

    Args:
        block: Any object exposing a ``text`` attribute with newline-separated
            text.

    Returns:
        dict: Keys ``"rating"``, ``"title"``, ``"date"`` and ``"review"``.
        Every value is a stripped string; a field that cannot be located is
        ``''``.
    """
    lines = [line.strip() for line in block.text.strip().split("\n")]

    def first_index(predicate):
        # Index of the first line satisfying predicate, or None.
        return next((i for i, line in enumerate(lines) if predicate(line)), None)

    def line_at(index):
        # Line at index, or '' when the anchor is missing or index is past the end.
        if index is None or index >= len(lines):
            return ''
        return lines[index]

    # Rating looks like "X out of 5 stars."; the title follows it.
    rating_idx = first_index(lambda line: "out of 5 stars" in line)
    rating = line_at(rating_idx)
    title = line_at(rating_idx + 1) if rating_idx is not None else ''

    # Prefer a line that is entirely a relative date, so that names or titles
    # that merely contain the letters "ago" (e.g. "Santiago") are not mistaken
    # for the date. Fall back to the looser substring test otherwise.
    date_idx = first_index(lambda line: _RELATIVE_DATE_LINE.fullmatch(line) is not None)
    if date_idx is None:
        date_idx = first_index(lambda line: "ago" in line.lower())
    date = line_at(date_idx)

    # The review body is the line right after the date.
    review = line_at(date_idx + 1) if date_idx is not None else ''

    return {
        "rating": rating,
        "title": title,
        "date": date,
        "review": review
    }

In [9]:
# NOTE(review): MAX_REVIEWS is not read anywhere in this cell; the loop stops
# on its own conditions below. Kept in case other code refers to it.
MAX_REVIEWS = 129
seen_reviews = set()   # keys of reviews already stored, used for de-duplication
data = []              # one dict per unique review, in the order encountered
page_number = 1
prev_count = 0         # number of unique reviews after the previous page

while True:
    print(f"Page {page_number}...")
    # Fixed pause before reading the page (presumably to let the widget render).
    time.sleep(3)

    # Look the host and its shadow root up again on every iteration rather
    # than reusing references obtained before the "next" click.
    shadow_host = driver.find_element(By.CSS_SELECTOR, '#reviews > div > div > div > div')
    shadow_root = driver.execute_script('return arguments[0].shadowRoot', shadow_host)
    review_blocks = shadow_root.find_elements(By.CLASS_NAME, 'jPLiFm')

    for r in review_blocks:
        review_data = extract_review(r)
        # Key on every extracted field as a tuple. Concatenating only
        # title + review treated different reviews with the same wording
        # (or a different title/body split) as one and dropped them.
        review_key = (
            review_data["rating"],
            review_data["title"],
            review_data["date"],
            review_data["review"],
        )

        if review_key not in seen_reviews:
            seen_reviews.add(review_key)
            data.append(review_data)

    print(f"Reviews collected: {len(seen_reviews)}")

    # Stop when this page added no new reviews.
    if len(seen_reviews) == prev_count:
        break
    prev_count = len(seen_reviews)

    # Click "next" to move to the following page; stop if there is no such button.
    try:
        next_button = shadow_root.find_element(By.CLASS_NAME, 'next')
        driver.execute_script("arguments[0].scrollIntoView(true);", next_button)
        driver.execute_script("arguments[0].click();", next_button)
        page_number += 1
    except NoSuchElementException:
        break

df = pd.DataFrame(data)
print(f"\nTotal unique reviews collected: {len(df)}")

Page 1...
Reviews collected: 8
Page 2...
Reviews collected: 38
Page 3...
Reviews collected: 68
Page 4...
Reviews collected: 98
Page 5...
Reviews collected: 127
Page 6...
Reviews collected: 127

Total unique reviews collected: 127


In [10]:
# Preview the first five scraped reviews (dates are still relative text here).
df.head(n=5)

Unnamed: 0,rating,title,date,review
0,5 out of 5 stars.,Amazing high performing powder,3 months ago,"Absolutely love this, use daily.. loving the s..."
1,3 out of 5 stars.,Economica,4 months ago,"Una buona cipria, low cost soprattutto. Finish..."
2,3 out of 5 stars.,Nice product ruined by poor packaging,5 months ago,Nice product but the packaging is awful - like...
3,5 out of 5 stars.,Love this Talc free powder,5 months ago,I was searching for a talc free powder that wa...
4,2 out of 5 stars.,Plastic sponge is painful!,6 months ago,I just bought the Rimmel Kind and Free face po...


In [11]:
def convert_to_date(text, today=None):
    """Convert a review date string to ``'YYYY-MM-DD'``.

    Accepted inputs (case-insensitive, surrounding whitespace ignored):

    * relative dates: ``"<N> <unit>s ago"`` or ``"a/an <unit> ago"`` where the
      unit is second, minute, hour, day, week, month or year;
    * absolute dates: ``"12 May 2024"``, ``"12 Jan 2024"`` or an ISO date
      ``"2024-05-12"``. Accepting ISO means the function can be applied to its
      own output without turning it into ``None``.

    Args:
        text: The date string. Anything that is not a ``str`` yields ``None``.
        today: Reference ``datetime`` that relative dates are subtracted from.
            Defaults to ``datetime.today()`` at call time.

    Returns:
        str | None: The date formatted as ``'%Y-%m-%d'``, or ``None`` when the
        input is not a string or matches none of the supported formats.
    """
    if not isinstance(text, str):
        return None

    text = text.strip().lower()
    if today is None:
        today = datetime.today()

    # Relative form, e.g. "3 days ago", "a year ago", "an hour ago".
    match = re.match(
        r"(\d+|an?)\s+(second|minute|hour|day|week|month|year)s?\s+ago", text
    )
    if match:
        amount, unit = match.groups()
        count = int(amount) if amount.isdigit() else 1  # "a"/"an" means one
        try:
            if unit in ("month", "year"):
                # Month/year lengths vary, so use calendar-aware arithmetic.
                delta = relativedelta(**{unit + 's': count})
            else:
                delta = timedelta(**{unit + 's': count})
            return (today - delta).strftime('%Y-%m-%d')
        except (OverflowError, ValueError):
            # Count so large that the resulting date is not representable.
            return None

    # Absolute form, e.g. "12 May 2024".
    for fmt in ("%d %B %Y", "%d %b %Y", "%Y-%m-%d"):
        try:
            return datetime.strptime(text, fmt).strftime('%Y-%m-%d')
        except ValueError:
            continue
    return None

In [12]:
# Replace each date string with the value convert_to_date returns for it.
# The column is overwritten in place, so re-running this cell passes the
# already-converted values through convert_to_date again.
df['date'] = df['date'].map(convert_to_date)

In [13]:
# Provenance columns: the same source site and product label on every row.
df['source'], df['product'] = 'rimmellondon.com', '1RIMMEL KIND & FREE™ PRESSED POWDER'

In [14]:
# Preview the first five rows after date conversion and the added source/product columns.
df.head(n=5)

Unnamed: 0,rating,title,date,review,source,product
0,5 out of 5 stars.,Amazing high performing powder,2025-02-22,"Absolutely love this, use daily.. loving the s...",rimmellondon.com,1RIMMEL KIND & FREE™ PRESSED POWDER
1,3 out of 5 stars.,Economica,2025-01-22,"Una buona cipria, low cost soprattutto. Finish...",rimmellondon.com,1RIMMEL KIND & FREE™ PRESSED POWDER
2,3 out of 5 stars.,Nice product ruined by poor packaging,2024-12-22,Nice product but the packaging is awful - like...,rimmellondon.com,1RIMMEL KIND & FREE™ PRESSED POWDER
3,5 out of 5 stars.,Love this Talc free powder,2024-12-22,I was searching for a talc free powder that wa...,rimmellondon.com,1RIMMEL KIND & FREE™ PRESSED POWDER
4,2 out of 5 stars.,Plastic sponge is painful!,2024-11-22,I just bought the Rimmel Kind and Free face po...,rimmellondon.com,1RIMMEL KIND & FREE™ PRESSED POWDER


In [15]:
# Write the reviews to "1_<product>.xlsx", taking the product name from the first row.
# The f-string is double-quoted so the single-quoted column key inside it is
# valid syntax on Python versions before 3.12 as well (reusing the same quote
# type inside an f-string is only allowed from 3.12 on).
df.to_excel(f"1_{df['product'].iloc[0]}.xlsx", index=False)