In [1]:
import time
import re
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

### Jakarta

In [None]:
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
# NOTE(review): `total` is not read by any code visible in this notebook
# (collect_review_times_capped uses its own local of the same name) — confirm
# before removing.
total = 0
# ============================== SETTINGS ==============================
EXCEL_IN  = "jakarta_fitness_itog.xlsx"   # input workbook; must contain a 'reviews_link' column
# EXCEL_OUT = "train_data_reviews.xlsx"
WAIT_SEC  = 20                            # timeout (seconds) for the shared WebDriverWait
HEADLESS  = False                         # True -> run Chrome with --headless=new
MAX_REVIEWS = 40   # cap: inspect at most N top reviews (no re-sorting of the feed)

options = webdriver.ChromeOptions()
if HEADLESS:
    options.add_argument("--headless=new")
options.add_argument("--start-maximized")
options.add_argument("--disable-gpu")
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--no-sandbox")
options.add_argument("--ignore-certificate-errors")
options.add_argument("--ignore-ssl-errors")
options.add_argument("--lang=ru")
# The previous `add_argument("Accept-Language=...")` was not a Chrome command-line
# switch, so the preferred-language list was never applied. The Accept-Language
# header is controlled through the `intl.accept_languages` profile preference.
options.add_experimental_option(
    "prefs", {"intl.accept_languages": "ru-RU,ru,en-US,en"}
)

try:
    driver = webdriver.Chrome(options=options)
    print("Браузер успешно запущен")
except Exception as e:
    # Report the launch failure, then re-raise: nothing below can work without a driver.
    print(f"Ошибка при запуске браузера: {e}")
    raise
wait = WebDriverWait(driver, WAIT_SEC)

# Load the existing progress workbook and turn every column into a plain list so
# that main() can append to it. main() appends to the keys "num", "placeid",
# "oldest_text" and "oldest_days", so the file must already exist with those columns.
data = pd.read_excel('progress_backup.xlsx')
progress_log = {column: list(values) for column, values in data.items()}

import time  # already imported in the first cell; harmless re-import

# Reference point for the elapsed-time report printed after the scrape.
start = time.perf_counter()

# ============================== ХЕЛПЕРЫ ==============================
def js_click(el):
    """Click ``el`` by running a JavaScript ``click()`` on it through the driver."""
    click_script = "arguments[0].click();"
    driver.execute_script(click_script, el)

def handle_consent():
    """Dismiss the consent banner if one becomes clickable within 5 seconds.

    Best effort: any failure (no banner, timeout, click error) is ignored.
    """
    # Button whose inner <span> carries one of the known accept captions (RU/EN).
    consent_xpath = (
        '//button[.//span[contains(text(),"Принять") or contains(text(),"Согласен") '
        'or contains(text(),"Accept all") or contains(text(),"I agree")]]'
    )
    try:
        short_wait = WebDriverWait(driver, 5)
        accept_btn = short_wait.until(
            EC.element_to_be_clickable((By.XPATH, consent_xpath))
        )
        js_click(accept_btn)
        time.sleep(0.3)
    except Exception:
        return

def open_reviews_panel():
    """Open the reviews panel and wait until at least one review is present.

    First tries to click the "Отзывы"/"Reviews" button directly. If that fails
    for any reason, clicks the place title link and retries the button.
    Exceptions raised by the fallback or by the final waits propagate.
    """
    reviews_button = (
        By.XPATH,
        '//button[contains(@aria-label,"Отзывы") or contains(@aria-label,"Reviews")]',
    )
    try:
        button = wait.until(EC.element_to_be_clickable(reviews_button))
        driver.execute_script("arguments[0].scrollIntoView({block:'center'});", button)
        time.sleep(0.2)
        js_click(button)
    except Exception:
        # Fallback: click the place header first, then look for the button again.
        title = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "a[href*='/place/'] h1"))
        )
        js_click(title)
        button = wait.until(EC.element_to_be_clickable(reviews_button))
        js_click(button)

    # Wait for the dialog, then for at least one review card.
    for css in ('div[role="dialog"]', 'div[data-review-id]'):
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, css)))

def get_scrollable_container():
    """Return the element to scroll when loading more reviews.

    Looks up the class-based selector first; if that lookup raises, falls back
    to the ``div[role="dialog"]`` element (whose lookup error, if any, propagates).
    """
    feed_selector = 'div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde'
    try:
        container = driver.find_element(By.CSS_SELECTOR, feed_selector)
    except Exception:
        container = driver.find_element(By.CSS_SELECTOR, 'div[role="dialog"]')
    return container

def extract_when_from_review(review_el):
    """Return the relative-age text ('N … назад' / 'N … ago') of a review card.

    Three locators are tried in order; for the first element found, its
    ``aria-label`` is preferred over its visible text. A locator that raises or
    yields an empty string is skipped.

    Returns:
        The stripped text, or ``None`` when no locator produced a non-empty value.
    """
    locators = (
        # 1) the most common case — span.rsqaWe
        (By.CSS_SELECTOR, 'span.rsqaWe'),
        (By.XPATH, './/span[contains(@aria-label,"назад") or contains(@aria-label,"ago")]'),
        (By.XPATH, './/span[contains(text(),"назад") or contains(text(),"ago") or contains(text(),"yesterday") or contains(text(),"today")]'),
    )
    for strategy, selector in locators:
        try:
            span = review_el.find_element(strategy, selector)
            label = span.get_attribute("aria-label") or span.text or ""
            label = label.strip()
        except Exception:
            continue
        if label:
            return label
    return None

def collect_review_times_capped(max_reviews=MAX_REVIEWS):
    """Scroll the review feed and collect the 'when written' text of the top reviews.

    Args:
        max_reviews: stop as soon as this many distinct reviews have a text.

    Returns:
        List of texts in the order the reviews were first seen. Collection also
        ends after three consecutive passes without a new review, or after 200
        passes.
    """
    container = get_scrollable_container()
    when_by_id = {}  # review_id -> when_text
    idle_rounds = 0
    prev_size = 0
    loop_budget = 200

    for _ in range(loop_budget):
        for card in driver.find_elements(By.CSS_SELECTOR, 'div[data-review-id]'):
            review_id = card.get_attribute('data-review-id')
            if not review_id or review_id in when_by_id:
                continue
            when_text = extract_when_from_review(card)
            if not when_text:
                continue
            when_by_id[review_id] = when_text
            if len(when_by_id) >= max_reviews:
                return list(when_by_id.values())

        # Cap not reached yet: track whether this pass found anything new.
        current_size = len(when_by_id)
        if current_size != prev_size:
            idle_rounds = 0
            prev_size = current_size
        else:
            idle_rounds += 1

        # Jump to the bottom of the container and pause before re-reading the cards.
        driver.execute_script(
            "arguments[0].scrollTop = arguments[0].scrollHeight;", container
        )
        time.sleep(1.8)

        # Several passes in a row with nothing new — stop.
        if idle_rounds >= 3:
            break

    return list(when_by_id.values())


# ============================== ПАРСИНГ ДАТ В ДНИ ==============================
def parse_to_days(text: str) -> int:
    """Convert a relative-age phrase into an approximate number of days.

    Handles Russian and English phrases such as '7 лет назад', 'год назад',
    'неделю назад', 'вчера', '3 months ago', 'a week ago', 'today'.

    Rules:
      * 'сегодня'/'today' -> 0, 'вчера'/'yesterday' -> 1;
      * the first run of digits is the count; with no digits the count is 1;
      * years count as 365 days, months as 30, weeks as 7;
      * hours and minutes map to 0;
      * empty, ``None`` or unrecognised text maps to 0.
    """
    phrase = (text or "").strip().lower()

    # Digit-free shortcuts.
    for words, value in ((("сегодня", "today"), 0), (("вчера", "yesterday"), 1)):
        if any(word in phrase for word in words):
            return value

    digits = re.search(r'(\d+)', phrase)
    count = 1 if digits is None else int(digits.group(1))

    # Ordered (substrings, days-per-unit) rules; the first matching rule wins.
    # Russian forms come first, then English ones.
    unit_rules = (
        (("лет", "год", "года"), 365),
        (("месяц", "месяца", "месяцев"), 30),
        (("недел", "неделю"), 7),
        (("день", "дня", "дней", "сут"), 1),
        (("час", "часа", "часов"), 0),
        (("минут", "минуту", "minute"), 0),
        (("year",), 365),
        (("month",), 30),
        (("week",), 7),
        (("day",), 1),
        (("hour",), 0),
    )
    for needles, days_per_unit in unit_rules:
        if any(needle in phrase for needle in needles):
            return count * days_per_unit

    # Unknown format: 0, so it never wins when the caller picks the maximum.
    return 0


# ============================== ЛОГИКА ОДНОГО PLACE ==============================

def get_oldest_review(place_id: str):
    """Open a place on Google Maps and report its oldest visible review age.

    Args:
        place_id: value substituted into the ``place_id:`` query of the Maps URL.

    Returns:
        ``(text, days)`` for the collected review whose text parses to the most
        days; ``(None, None)`` when no review times were collected;
        ``('bug', 0)`` when any exception occurred along the way.
    """
    place_url = f"https://www.google.com/maps/place/?q=place_id:{place_id}&hl=ru"
    try:
        driver.get(place_url)
        wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
        handle_consent()

        # The <h1> wait is optional: a timeout here is not treated as a failure.
        try:
            wait.until(EC.presence_of_element_located((By.TAG_NAME, "h1")))
        except Exception:
            pass

        open_reviews_panel()
        when_texts = collect_review_times_capped(MAX_REVIEWS)
        if when_texts:
            oldest = max(when_texts, key=parse_to_days)
            return oldest, parse_to_days(oldest)
        return None, None

    except Exception:
        # Any error is reported to the caller as the 'bug' sentinel.
        return 'bug', 0

# ============================== ОСНОВНОЙ ЦИКЛ ==============================
def extract_place_id(url: str):
    """Return the ``placeid=``/``place_id=`` query value from ``url``.

    The value runs up to the next ``&``. Returns ``None`` when ``url`` is not a
    string or holds no such parameter.
    """
    if isinstance(url, str):
        found = re.search(r'(?:placeid|place_id)=([^&]+)', url)
        if found:
            return found.group(1)
    return None

def main():
    """Scrape the oldest-review age for every row of EXCEL_IN and log the results.

    Reads EXCEL_IN, derives a place id from each ``reviews_link`` value, calls
    get_oldest_review for rows that have one, prints one ``idx,pid,text,days``
    line per row and appends the same values to the module-level
    ``progress_log``. The log is written to ``progress_backup.xlsx`` every 10
    rows and once more when the loop ends — including when it ends because of an
    exception or a KeyboardInterrupt, so rows scraped since the last checkpoint
    are not lost.
    """
    df = pd.read_excel(EXCEL_IN)
    df['placeid'] = df['reviews_link'].apply(extract_place_id)

    try:
        for idx, pid in enumerate(df['placeid'], start=1):
            if pid:
                t, d = get_oldest_review(pid)
            else:
                # No place id could be extracted: log an empty text and 0 days.
                t, d = '', 0

            # Progress line for the notebook output.
            print(f"{idx},{pid},{t},{d}")
            progress_log["num"].append(idx)
            progress_log["placeid"].append(pid)
            progress_log["oldest_text"].append(t)
            progress_log["oldest_days"].append(d)

            # Checkpoint every 10 rows.
            if idx % 10 == 0:
                pd.DataFrame(progress_log).to_excel("progress_backup.xlsx", index=False)
    finally:
        # Final save; also runs on interruption. If the interruption landed
        # between the four appends above, the columns differ in length and this
        # call raises instead of saving.
        pd.DataFrame(progress_log).to_excel("progress_backup.xlsx", index=False)

    print("Готово! Файл сохранён!")

# Run the scrape; whether it succeeds or raises, try to shut the browser down.
if __name__ == "__main__":
    try:
        main()
    finally:
        try:
            driver.quit()
        except Exception:
            # Best effort: errors while quitting the driver are ignored.
            pass
# Seconds elapsed since `start` was taken. These lines sit outside the guard above.
elapsed = time.perf_counter() - start   # NOTE(review): original note read "turned off" — unclear what was disabled
print(f"{elapsed:.3f} секунд")
# NOTE(review): 4296 looks like a hard-coded item count used for a per-item
# average; it is not derived from the data loaded here — confirm the value.
print(f"{elapsed/4296:.3f} секунд")

In [6]:
# Scraped results, with the first row of the backup workbook dropped.
rev = pd.read_excel("progress_backup.xlsx").iloc[1:]
# Copy the three result columns onto the source table. pandas matches the rows
# by index label, not by position.
# NOTE(review): after dropping the first row, `rev` presumably starts at label 1,
# so `itog` row 0 would get NaN and, if the dropped row was a placeholder, the
# remaining rows may be shifted by one relative to `itog` — confirm the alignment.
itog = pd.read_excel(EXCEL_IN).assign(
    placeid=rev['placeid'],
    oldest_text=rev['oldest_text'],
    oldest_days=rev['oldest_days'],
)

In [7]:
# Write the merged table back over the input workbook (EXCEL_IN is overwritten),
# then export the same table as CSV; the row index is left out of both files.
itog.to_excel(excel_writer=EXCEL_IN, index=False)
itog.to_csv(path_or_buf="jakarta_fitness_itog.csv", index=False)

In [12]:
# Number of rows in the merged table.
itog.shape[0]

2154