In [1]:
import os
import json
import pandas as pd
from openai import OpenAI
import random
from tqdm import tqdm


In [None]:
def sample_reviews(path, n=1000, seed=42):
    """Draw a reproducible random sample of non-empty reviews from a JSON Lines file.

    The file is read twice: once to count its lines so the progress bar has a
    total, and once to parse the records.

    Args:
        path: Path to a UTF-8 text file holding one JSON document per line.
        n: Maximum number of reviews to return. If fewer usable reviews
            exist, all of them are returned.
        seed: Value passed to ``random.seed`` before sampling so the same
            file always yields the same sample. This reseeds the global
            ``random`` generator.

    Returns:
        A list of at most ``n`` review dicts whose ``"text"`` value is a
        string containing at least one non-whitespace character.
    """
    # First pass: count lines so tqdm can report progress against a total.
    with open(path, 'rt', encoding='utf-8') as f:
        total_lines = sum(1 for _ in f)

    reviews = []
    # Second pass: parse each line while showing the progress bar.
    with open(path, 'rt', encoding='utf-8') as f:
        for line in tqdm(f, total=total_lines, desc="Processing reviews"):
            try:
                review = json.loads(line)
            except json.JSONDecodeError:
                continue  # skip lines that are not valid JSON
            # A line can be valid JSON without being a review object (e.g.
            # ``null`` or a list), and "text" may hold a non-string value.
            # Treat both as bad rows instead of failing with AttributeError.
            if not isinstance(review, dict):
                continue
            text = review.get("text")
            if isinstance(text, str) and text.strip():  # drop empty reviews
                reviews.append(review)

    print(f"总评论数: {len(reviews)}")

    # Fixed seed keeps the sample reproducible across runs.
    random.seed(seed)
    sampled = random.sample(reviews, min(n, len(reviews)))
    return sampled


In [None]:
# Draw up to 1,000 non-empty reviews from the Montana review dump.
sampled_reviews = sample_reviews("data/review-Montana_10.json", n=1000)

# Build a DataFrame from the sample and record each review's word count
# (number of whitespace-separated tokens in the stringified text).
df = pd.DataFrame(sampled_reviews)
df["review_length"] = df["text"].map(lambda text: len(str(text).split()))

In [None]:
# reviews_df is another name for the sampled review frame (same object, not a copy).
reviews_df = df

# Read the places metadata one JSON document per line, reporting (by 1-based
# line number) any line that fails to parse and carrying on with the rest.
places_data = []
with open("data/meta-Montana.json", "rt", encoding="utf-8") as meta_file:
    for line_no, raw_line in enumerate(meta_file, start=1):
        try:
            place = json.loads(raw_line)
        except json.JSONDecodeError:
            print(f"⚠️ 跳过坏行 {line_no}")
        else:
            places_data.append(place)

places_df = pd.DataFrame(places_data)

In [None]:
# Make sure gmap_id is a string in both tables so the join keys compare equal.
reviews_df['gmap_id'] = reviews_df['gmap_id'].astype(str)
places_df['gmap_id'] = places_df['gmap_id'].astype(str)

# Left join: every review row is kept, and place columns are filled in where
# a matching gmap_id exists. Places are de-duplicated on gmap_id first so a
# review cannot match more than one place row.
merged_df = pd.merge(
    reviews_df,
    places_df.drop_duplicates(subset=["gmap_id"]),
    on="gmap_id",
    how="left",
)

print(f"合并后数据量: {len(merged_df)}")

In [None]:
# Persist the merged review/place table as CSV, leaving out the row index.
merged_df.to_csv(
    path_or_buf="data/reviews_with_places_1000_Montana.csv",
    index=False,
)
print("已保存 reviews_with_places_1000_Montana.csv")

In [None]:
# Report how many rows lack a place description after the merge.
# NOTE(review): `description` is expected to come from the places metadata,
# so it is read from merged_df rather than the review-only `df` — confirm
# against the meta file's schema.
print("总行数:", len(merged_df))
print("description 缺失数量:", merged_df['description'].isnull().sum())
print("description 非缺失数量:", merged_df['description'].notnull().sum())

<h1>清理merge数据</h1>

In [7]:
import os

# Show where the kernel is running and what sits next to the notebook, as a
# sanity check before resolving the relative data paths used below.
print("Current working directory:", os.getcwd())
print("Files in current directory:", os.listdir(os.curdir))

Current working directory: /Users/yuanyusi/Desktop/ML-for-Trustworthy-Location-Reviews/src
Files in current directory: ['filter_invalid.py', 'merge.py', 'data_clean.py', 'data_clean.ipynb', 'prompt_labelling.ipynb', 'merge_all.ipynb', 'prompt_labelling.py']


In [15]:
# Load the merged dataset that the following cells clean up.
df = pd.read_csv(filepath_or_buffer="../data/out/merged_all.csv")

In [None]:
# Drop the columns that are not kept in the cleaned output. errors="ignore"
# keeps this cell re-runnable: the cleaned frame is written back to the same
# CSV it was read from, so on a second run these columns are already gone.
df = df.drop(columns=['name', 'index', 'rating', 'error'], errors="ignore")

# Keep only rows that actually carry a predicted label. pd.read_csv parses
# empty CSV fields as NaN by default, and NaN != "" evaluates to True, so the
# empty-string comparison alone would let unlabeled rows through.
has_label = df['predicted_label'].notna() & (df['predicted_label'] != "")
df = df[has_label]

In [17]:
# Write the cleaned frame back over the CSV it was loaded from (no index column).
df.to_csv(path_or_buf="../data/out/merged_all.csv", index=False)