In [1]:
import json
import os
import re
from random import randint

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm  # Import tqdm for progress bars

import util

In [2]:
# Helpers and entry point for parsing a single saved topic page
def _badge_count(post_box, badge_class):
    """Return the first whitespace-separated token of a badge's <span> text.

    Falls back to 0 when the badge (or its <span>) is not present in
    ``post_box`` or its text is empty.
    """
    try:
        return post_box.find('div', class_=badge_class).find('span').text.split()[0]
    except Exception:
        return 0


def process_topic(topic_path):
    """Parse one locally saved topic HTML file into a flat record.

    Args:
        topic_path: Path of the saved HTML file of a forum topic.

    Returns:
        A dict with the keys 'user ID', 'Topic Name', 'Content', 'Posts',
        'Reviews', 'Helpful Votes', 'Date Created' and 'Des' (a non-empty list
        of breadcrumb texts, ``[None]`` when no breadcrumb was found).
        Individual fields that cannot be extracted are None (badge counts
        default to '0'). Returns None, after printing a message, when the file
        cannot be read/parsed or contains no ``firstPostBox`` element.
    """
    try:
        with open(topic_path, 'r', encoding='utf-8') as topic_html:
            comment_soup = BeautifulSoup(topic_html, 'html.parser')

        td = comment_soup.find('div', class_='firstPostBox')
        if td is None:
            # Without the first post there is nothing to extract; report and
            # drop the page explicitly instead of failing on a later lookup.
            print(f'Error processing topic at {topic_path}: firstPostBox not found')
            return None

        # Topic body: every <p> tag (markup included) joined into one string.
        try:
            topic_content = ' '.join(str(p) for p in td.find_all('p'))
        except Exception:
            topic_content = None

        # Publication time as displayed on the page.
        try:
            topic_time = td.find('div', class_='postDate').text
        except Exception:
            topic_time = None

        # User id: third piece of the avatar link's href when split on '/' or '?'.
        try:
            user_id = re.split(r'[/?]', td.find('div', class_='avatar').find('a').get('href'))[2]
        except Exception:
            user_id = None

        # Author badge counters (posts, reviews, helpful votes).
        posts = _badge_count(td, 'postBadge badge')
        reviews = _badge_count(td, 'reviewerBadge badge')
        helpful_votes = _badge_count(td, 'helpfulVotesBadge badge')

        # A relative time ("... ago") is replaced by the dateCreated value from
        # the page's JSON-LD block when available. The None guard keeps a
        # missing postDate from discarding the whole record.
        if topic_time and "ago" in topic_time:
            try:
                script_tag = comment_soup.find('script', type="application/ld+json")
                json_data = json.loads(script_tag.string)
                topic_time = json_data['mainEntity']['dateCreated']
            except Exception:
                pass  # best effort: keep the relative time shown on the page

        # Breadcrumb trail: every non-empty <span>, minus the "... Travel Forum" entries.
        des = []
        breadcrumb_div = comment_soup.find('div', class_='ppr_rup ppr_priv_trip_planner_breadcrumbs')
        if breadcrumb_div:
            for item in breadcrumb_div.find_all('span'):
                text = item.text.strip()
                if text == '' or text.endswith("Travel Forum"):
                    continue
                des.append(text)
        # Keep 'Des' non-empty so every record has at least one Des column.
        if not des:
            des = [None]

        # The keywords meta tag is comma-separated; everything from the third
        # entry onward is used as the topic name (the first two entries were
        # read as city and province by the earlier code but never returned).
        try:
            keywords_content = comment_soup.find('meta', attrs={'name': 'keywords'})
            keyword_parts = keywords_content.get('content').split(',')
            topic_name = ', '.join(keyword_parts[2:]).strip()
        except Exception:
            topic_name = None

        return {
            'user ID': user_id,
            'Topic Name': topic_name,
            'Content': topic_content.strip() if topic_content else None,
            'Posts': str(posts),
            'Reviews': str(reviews),
            'Helpful Votes': str(helpful_votes),
            'Date Created': topic_time,
            'Des': des
        }

    except Exception as e:
        print(f'Error processing topic at {topic_path}: {e}')
        return None

In [6]:
# Read the Excel file; its 'state', 'url1' and 'url2' columns are consumed row by row further down in this cell
df = pd.read_excel('forum_pages.xlsx')
def generate_urls(url1, url2, step=20):
    """Expand two paginated URLs into every page URL between their offsets.

    Both URLs must contain a ``-o<digits>-`` segment holding the page offset.
    The result reuses the text of ``url1`` around that segment and contains one
    URL per offset from the smaller to the larger offset in increments of
    ``step``; the larger offset is always included, even when it is not
    reachable by whole steps.

    Args:
        url1: URL carrying one end of the offset range; also the template.
        url2: URL carrying the other end of the offset range.
        step: Offset increment between consecutive pages (default 20).

    Returns:
        List of URLs in ascending offset order.

    Raises:
        ValueError: If a URL has no ``-o<digits>-`` segment or ``step`` < 1.
    """
    if step < 1:
        raise ValueError(f"step must be a positive integer, got {step!r}")

    # Non-greedy prefix: split at the FIRST '-o<digits>-' segment and keep the
    # whole remainder as suffix, so hyphens in the page name are preserved.
    offset_pattern = re.compile(r'^(?P<prefix>.*?)-o(?P<offset>\d+)-(?P<suffix>.*)$')

    def _parse(url):
        match = offset_pattern.match(url)
        if match is None:
            raise ValueError(f"URL has no '-o<offset>-' page segment: {url!r}")
        return match.group('prefix'), int(match.group('offset')), match.group('suffix')

    prefix, start_page, suffix = _parse(url1)
    _, end_page, _ = _parse(url2)

    # Accept the two ends in either order.
    if start_page > end_page:
        start_page, end_page = end_page, start_page

    # start_page <= end_page here, so the range always contains start_page.
    offsets = list(range(start_page, end_page + 1, step))
    if offsets[-1] != end_page:
        offsets.append(end_page)

    return [f"{prefix}-o{offset}-{suffix}" for offset in offsets]
# For every spreadsheet row, expand its url1/url2 pair into the full list of
# page URLs and keep it together with the state it belongs to.
states_urls = [
    {
        "state": state_name,
        "urls": generate_urls(first_url, last_url)
    }
    for state_name, first_url, last_url in zip(df['state'], df['url1'], df['url2'])
]

In [8]:
# Base columns shared by every record; the variable-length Des_* columns are
# appended per state once the longest breadcrumb of that state is known.
df_columns = ['user ID', 'Topic Name', 'Content', 'Posts', 'Reviews', 'Helpful Votes', 'Date Created']

# Create the export folder up front so a missing directory cannot make
# to_excel fail only after a whole state has been parsed.
os.makedirs('./dataproducts', exist_ok=True)

for state_url in states_urls:
    state = state_url['state']
    all_topics_data = []  # one dict per successfully parsed topic
    max_des_length = 0    # widest breadcrumb ('Des') list seen for this state

    # Each listing page is read from its local copy under ./<state>/
    for url in tqdm(state_url['urls'], desc=f"{state} Processing pages", unit="page"):
        page_path = f"./{state}/{url.replace('https://www.tripadvisor.com/', '')}"
        with open(page_path, 'r', encoding='utf-8') as html:
            soup = BeautifulSoup(html, 'html.parser')

        # Clickable forum cells; a cell counts only if it contains a link.
        forum_cells = soup.find_all('td', class_=lambda x: "forumcol" in x if x else False, onclick=True)
        mask = [len(cell.find_all('a')) != 0 for cell in forum_cells]

        # Topic links are paired with the cells above purely by position.
        # NOTE(review): this assumes the n-th topic link belongs to the n-th
        # forum cell; more links than cells raises IndexError - confirm
        # against the saved page markup.
        topic_links = soup.find_all('a', onclick=lambda x: 'setPID(34603)' in x if x else False)
        topic_path_list = [
            f"./{state}" + link.get('href').strip()
            for position, link in enumerate(topic_links)
            if mask[position]
        ]

        for topic_path in topic_path_list:
            topic_data = process_topic(topic_path)
            if not topic_data:
                continue  # process_topic already reported the failure

            # Spread the breadcrumb list over numbered Des_1..Des_n columns.
            des_data = topic_data.pop('Des')
            max_des_length = max(max_des_length, len(des_data))
            for des_index, des_item in enumerate(des_data, start=1):
                topic_data[f'Des_{des_index}'] = des_item
            all_topics_data.append(topic_data)

    # Build the frame once per state and fix the column order/width.
    all_topics_df = pd.DataFrame(all_topics_data)
    des_columns = [f'Des_{i+1}' for i in range(max_des_length)]
    all_columns = df_columns + des_columns
    all_topics_df = all_topics_df.reindex(columns=all_columns, fill_value=None)
    all_topics_df.to_excel('./dataproducts/' + state + '.xlsx', index=False)

NevadaProcessing pages: 100%|██████████| 151/151 [19:32<00:00,  7.77s/page]
GeorgiaProcessing pages: 100%|██████████| 31/31 [03:28<00:00,  6.72s/page]
