In [0]:
# Notebook settings, keyed by the name under which each one is later
# published: the directory that receives the scraped CSV and the Yahoo! News
# comments URL to scrape.
config_vars = dict(
    store_path="/Workspace/Users/junweiwu.me@gmail.com/databricks_talk-to-the-city-reports/scatter/pipeline/inputs",
    news_url="https://news.yahoo.co.jp/articles/425744badd8aafa1db8dbb047d09f59b4360c968/comments",
)

In [0]:
import os

# Publish every setting as a Databricks text widget (so it can be overridden
# from the notebook UI or by job parameters) and mirror the widget's CURRENT
# value into the process environment for the cells that read it back.
for key, default_value in config_vars.items():
    dbutils.widgets.text(key, default_value)
    # Read the value back from the widget instead of reusing the default;
    # otherwise any override entered in the widget would be silently ignored.
    os.environ[key] = dbutils.widgets.get(key)

In [0]:
# import requests
# from bs4 import BeautifulSoup
# import time
# import pandas as pd

# def fetch_yahoo_comments(base_url):
#     all_comments = []
#     page = 1
    
#     while True:
#         try:
#             url = f"{base_url}?page={page}"
#             print(f"Fetching page {page}...")

#             r = requests.get(url)
#             r.encoding = r.apparent_encoding
            
#             soup = BeautifulSoup(r.text, "html.parser")

#             comments = soup.find_all('p', class_='sc-169yn8p-10 hYFULX')
#             ids = soup.find_all('a', class_="sc-169yn8p-7 cJjfcA")
            
#             if not comments:
#                 print(f"No more comments found at page {page}")
#                 break
 
#             for comment, id_elem in zip(comments, ids):
#                 comment_id = id_elem.get('data-comment-id', '')  
#                 comment_text = comment.text.strip()
#                 all_comments.append({
#                     'comment-id': comment_id,
#                     'comment-body': comment_text
#                 })
            
#             print(f"Found {len(comments)} comments on page {page}")
            
#             time.sleep(1)
#             page += 1
            
#         except Exception as e:
#             print(f"Error occurred on page {page}: {str(e)}")
#             break

#     df = pd.DataFrame(all_comments)
#     print(f"Total comments collected: {len(df)}")
#     return df


In [0]:
# import os
# store_path = os.getenv("store_path")
# news_url = os.getenv("news_url")

# comments_df = fetch_yahoo_comments(news_url)

# comments_df.to_csv(f'{store_path}/yahoo-news-comment.csv', index=False)

In [0]:
import os
# Pull the two pipeline settings out of the process environment; either one is
# None when the corresponding variable has not been set.
store_path, news_url = (
    os.environ.get(setting_name) for setting_name in ("store_path", "news_url")
)

In [0]:
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd
import concurrent.futures
from typing import List, Dict
from threading import Event

def fetch_page(url: str, page: int, stop_event: Event, timeout: float = 30.0) -> List[Dict]:
    """Fetch one page of comments and return them as a list of records.

    Args:
        url: Base comments URL; ``?page=<page>`` is appended to it.
        page: Page number to request.
        stop_event: Shared flag; when it is already set the request is skipped.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A list of ``{'comment-id': ..., 'comment-body': ...}`` dicts, one per
        comment paragraph found on the page. An empty list is returned when
        the stop flag is set, when the page contains no comments, or when any
        error occurs (the error is printed, not raised).
    """
    try:
        if stop_event.is_set():
            return []

        page_url = f"{url}?page={page}"
        print(f"Fetching page {page}...")

        # A timeout keeps a stalled connection from blocking a worker forever.
        r = requests.get(page_url, timeout=timeout)
        # Treat HTTP error responses as errors instead of parsing an error page.
        r.raise_for_status()
        r.encoding = r.apparent_encoding

        soup = BeautifulSoup(r.text, "html.parser")

        comments = soup.find_all('p', class_='sc-169yn8p-10 hYFULX')
        ids = soup.find_all('a', class_="sc-169yn8p-7 gibKWW")

        if not comments:
            print(f"No comments found on page {page}")
            return []

        if len(ids) < len(comments):
            print(
                f"Only {len(ids)} id anchors for {len(comments)} comments on page {page}; "
                "using fallback ids for the rest"
            )

        page_comments = []
        # Iterate over the comments (not zip(comments, ids)) so that a comment
        # is never dropped just because its id anchor was not matched.
        for index, comment in enumerate(comments):
            fallback_id = f'p{page}_{index}'
            if index < len(ids):
                comment_id = ids[index].get('data-comment-id', fallback_id)
            else:
                comment_id = fallback_id
            page_comments.append({
                'comment-id': comment_id,
                'comment-body': comment.text.strip()
            })

        print(f"Found {len(page_comments)} comments on page {page}")
        # Pause after a successful fetch to throttle requests from this worker.
        time.sleep(1)
        return page_comments

    except Exception as e:
        # Best-effort scraping: report the failure and let the caller treat
        # this page as empty.
        print(f"Error on page {page}: {str(e)}")
        return []

def fetch_yahoo_comments(base_url: str, max_workers: int = 4) -> pd.DataFrame:
    """Scrape all comment pages concurrently and return them as a DataFrame.

    Pages are requested through ``fetch_page`` on a thread pool, keeping
    ``max_workers`` requests outstanding. Scraping stops once three empty
    results have been seen in a row (in completion order).

    Args:
        base_url: Base comments URL passed through to ``fetch_page``.
        max_workers: Number of worker threads / concurrently requested pages.

    Returns:
        A DataFrame with columns ``comment-id`` and ``comment-body``. Rows are
        sorted by the scraped comment id and ``comment-id`` is then replaced
        by a 1-based sequence number. The frame is empty (but still has both
        columns) when no comments were found.
    """
    all_comments: List[Dict] = []
    page = 1
    consecutive_empty_pages = 0
    max_empty_pages = 3
    stop_event = Event()

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = set()

        # Prime the pool with one page per worker.
        for _ in range(max_workers):
            futures.add(executor.submit(fetch_page, base_url, page, stop_event))
            page += 1

        # Keep looping until every submitted future has been consumed, even
        # after the stop flag is set, so comments from pages that were already
        # in flight are not thrown away.
        while futures:
            done, futures = concurrent.futures.wait(
                futures,
                return_when=concurrent.futures.FIRST_COMPLETED
            )

            for future in done:
                page_comments = future.result()

                if page_comments:
                    all_comments.extend(page_comments)
                    consecutive_empty_pages = 0
                elif not stop_event.is_set():
                    consecutive_empty_pages += 1
                    print(f"Empty pages count: {consecutive_empty_pages}")

                    if consecutive_empty_pages >= max_empty_pages:
                        print("Reached maximum consecutive empty pages, stopping...")
                        stop_event.set()

            # Replace each finished request with the next page unless stopping.
            if not stop_event.is_set():
                for _ in range(len(done)):
                    futures.add(executor.submit(fetch_page, base_url, page, stop_event))
                    page += 1

    # Explicit columns keep the sort below from raising KeyError when nothing
    # was collected.
    df = pd.DataFrame(all_comments, columns=['comment-id', 'comment-body'])
    print(f"Total comments collected: {len(df)}")
    df = df.sort_values('comment-id')
    df['comment-id'] = range(1, len(df) + 1)
    return df

if __name__ == "__main__":
    # Scrape every comment page for the configured article, then write the
    # result as a UTF-8 CSV (without the index column) into the store directory.
    df = fetch_yahoo_comments(news_url, max_workers=4)
    output_csv = f"{store_path}/yahoo-news-comment.csv"
    df.to_csv(output_csv, encoding='utf-8', index=False)

In [0]:
# Preview the first ten scraped comments as the cell's displayed output.
df.head(n=10)