In [4]:
import pandas as pd
import os

# Map each raw input CSV to the file name its cleaned copy is saved under.
files = {
    "cafe_brunch_bakery_desc.csv": "cafe.csv",
    "restaurant_english_clean_desc.csv": "restaurant.csv",
    "milktea_juice_english_clean.csv": "milk_juice.csv",
    "melbourne_cbd_bars.csv": "bars.csv"
}


def strip_description_suffix(description):
    """Remove the '|...' tail from each description and trim whitespace.

    Missing values stay missing. Casting the column straight to ``str``
    turns NaN into the literal text "nan", which would then be written to
    the output file as if it were a real description.

    Parameters
    ----------
    description : pandas.Series
        Raw description column.

    Returns
    -------
    pandas.Series
        Cleaned text, with NaN wherever the input was missing.
    """
    cleaned = (
        description.astype(str)
        .str.replace(r"\|.*$", "", regex=True)
        .str.strip()
    )
    # Put the missing values back instead of keeping their "nan" text form.
    return cleaned.where(description.notna())


def drop_missing_opening_hours(df, col):
    """Return the rows of ``df`` whose ``col`` value is neither NaN nor blank.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to filter.
    col : str
        Name of the opening-hours column.

    Returns
    -------
    pandas.DataFrame
        The rows that have a non-empty opening-hours value.
    """
    has_hours = df[col].notna() & (df[col].astype(str).str.strip() != "")
    return df[has_hours]


for src, dst in files.items():
    # Skip inputs that are not present instead of aborting the whole run.
    if not os.path.exists(src):
        print(f"⚠️ 文件未找到: {src}")
        continue

    print(f"🔹 正在处理 {src} ...")

    df = pd.read_csv(src)

    # Clean the description text.
    if 'description' in df.columns:
        df['description'] = strip_description_suffix(df['description'])
    else:
        print("  ⚠️ 未找到 'description' 列。")

    # Drop rows without opening hours; the column name is matched
    # case-insensitively and may be singular or plural.
    open_cols = [c for c in df.columns if c.lower() in ["openinghour", "openinghours"]]
    if open_cols:
        before = len(df)
        df = drop_missing_opening_hours(df, open_cols[0])
        removed = before - len(df)
        print(f"  ✅ 删除 {removed} 条 openinghour 为空的数据。")
    else:
        print("  ⚠️ 未找到 'openinghour' 列。")

    # Save the cleaned table (UTF-8 with BOM).
    df.to_csv(dst, index=False, encoding="utf-8-sig")
    print(f"  💾 已保存为 {dst} （剩余 {len(df)} 条）\n")

print("🎉 全部处理完成！")



🔹 正在处理 cafe_brunch_bakery_desc.csv ...
  ✅ 删除 0 条 openinghour 为空的数据。
  💾 已保存为 cafe.csv （剩余 85 条）

🔹 正在处理 restaurant_english_clean_desc.csv ...
  ✅ 删除 26 条 openinghour 为空的数据。
  💾 已保存为 restaurant.csv （剩余 464 条）

🔹 正在处理 milktea_juice_english_clean.csv ...
  ✅ 删除 2 条 openinghour 为空的数据。
  💾 已保存为 milk_juice.csv （剩余 70 条）

🔹 正在处理 melbourne_cbd_bars.csv ...
  ✅ 删除 0 条 openinghour 为空的数据。
  💾 已保存为 bars.csv （剩余 51 条）

🎉 全部处理完成！


In [5]:
import pandas as pd

# ===== Input files (edit to match your own file names) =====
# NOTE(review): these are the raw inputs, not the cleaned copies written by
# the earlier cell, so descriptions still carry any '|...' tail -- confirm
# this is intended.
files = [
    "cafe_brunch_bakery_desc.csv",
    "restaurant_english_clean_desc.csv",
    "milktea_juice_english_clean.csv",
    "melbourne_cbd_bars.csv"
]


def collect_descriptions(paths):
    """Read each CSV and stack its place_id/description columns.

    Column names are lower-cased before lookup. A ``category`` column is
    added from the file name without directory or extension, so paths such
    as ``./data/cafe.csv`` still yield ``cafe``. Files that cannot be read
    or that lack a required column are reported and skipped.

    Parameters
    ----------
    paths : iterable of str
        CSV files to read.

    Returns
    -------
    pandas.DataFrame
        Columns ``place_id``, ``description`` and ``category``; empty when
        no file contributed any rows.
    """
    from pathlib import Path  # local import: this cell only imports pandas

    frames = []
    for f in paths:
        try:
            df = pd.read_csv(f)

            # Normalise header case so 'Description' and 'description' match.
            df.columns = [c.lower() for c in df.columns]

            if "description" in df.columns and "place_id" in df.columns:
                temp = df[["place_id", "description"]].copy()
                temp["category"] = Path(f).stem
                frames.append(temp)
            else:
                print(f"⚠️ 文件 {f} 缺少必要字段，已跳过。")
        except Exception as e:
            # Best effort: one unreadable file must not stop the others.
            print(f"❌ 读取 {f} 时出错：{e}")

    if not frames:
        return pd.DataFrame(columns=["place_id", "description", "category"])
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(frames, ignore_index=True)


def drop_blank_descriptions(desc):
    """Return the rows whose description is neither NaN nor whitespace-only."""
    present = desc.dropna(subset=["description"])
    # astype(str) keeps the .str accessor valid for non-string columns.
    return present[present["description"].astype(str).str.strip() != ""]


# ===== Merge, then drop empty descriptions =====
all_desc = drop_blank_descriptions(collect_descriptions(files))

# ===== Write the merged table =====
all_desc.to_csv("desc.csv", index=False, encoding="utf-8-sig")

print(f"✅ 已生成 desc.csv，共 {len(all_desc)} 条记录。")


✅ 已生成 desc.csv，共 678 条记录。


In [9]:
import pandas as pd
import re
from collections import Counter

# === 1. Load the merged descriptions from desc.csv ===
df = pd.read_csv("desc.csv")


def normalize_text(text):
    """Lower-case ``text`` and reduce it to space-separated a-z words.

    Accented letters are folded to their base letter before the a-z filter
    runs, so "café" is kept as "cafe" instead of being cut down to "caf".
    Links starting with "http" are removed, every remaining non-letter
    becomes a space, and runs of whitespace collapse to a single space.

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    str
        Cleaned text with no leading or trailing whitespace.
    """
    import unicodedata  # local import: not among this cell's imports

    text = text.lower()
    text = re.sub(r"http\S+", "", text)
    # Decompose accented characters, then drop the combining marks.
    text = unicodedata.normalize("NFKD", text)
    text = "".join(ch for ch in text if not unicodedata.combining(ch))
    text = re.sub(r"[^a-z\s]", " ", text)
    return re.sub(r"\s+", " ", text).strip()


# === 2-3. Join every description into one string and clean it ===
# dropna() keeps a missing value from being counted as the word "nan".
text = normalize_text(" ".join(df["description"].dropna().astype(str)))

# === 4. Tokenise ===
words = text.split()

# === 5. Remove stop words (common words that carry no meaning here) ===
# NOTE: "it's" and "us" can never match. Apostrophes are turned into spaces
# before tokenising, and words of two letters or fewer are dropped below.
stopwords = {
    "the","and","for","with","you","this","that","was","are","but","not","have",
    "had","were","been","from","they","it's","its","very",
    "place","service","your","our","us","would",
    "staff","menu","price","really","more",
    "just","well","one","two","bit","lot","can","get","out","there","here","will",
    "when","then","too","also","back","because","after","before","still","got",
    "make","made","some","come","going","time","around","area","much","their",
    "other","only","first","last","day","night","ever","every",
    "always","never","which","who","what","where","why","how","all","definitely","try","recommend",
}

words_clean = [w for w in words if w not in stopwords and len(w) > 2]

# === 6. Count word frequencies ===
word_counts = Counter(words_clean)
word_freq_df = pd.DataFrame(list(word_counts.items()), columns=["word", "count"])
# A stable sort keeps tied words in first-seen order, so reruns match.
word_freq_df = word_freq_df.sort_values(by="count", ascending=False, kind="stable")

# === 7. Save the result ===
word_freq_df.to_csv("word_freq.csv", index=False, encoding="utf-8-sig")

print(f"✅ 清洗完成，共 {len(word_freq_df)} 个唯一单词，已保存为 word_freq.csv")


✅ 清洗完成，共 4988 个唯一单词，已保存为 word_freq.csv


In [2]:
import pandas as pd

# Read the word-frequency table. keep_default_na=False stops pandas from
# parsing words such as "null" or "nan" as missing values, which would
# otherwise be written back as blank rows.
df = pd.read_csv("word_freq.csv", keep_default_na=False)

# Capitalize the first letter of each word.
df["word"] = df["word"].str.capitalize()

# Sort by 'count' in descending order; the stable sort keeps tied words in
# the order they had in the file.
df = df.sort_values("count", ascending=False, kind="stable")

# Renumber the rows after sorting.
df = df.reset_index(drop=True)

# Save the changes back to the same file.
df.to_csv("word_freq.csv", index=False)


<!-- Tableau Public embed for the view named in the 'name' param below. -->
<div class='tableauPlaceholder' id='viz1761546625032' style='position: relative'>
  <noscript><a href='#'><img alt='Dashboard 1 ' src='https:&#47;&#47;public.tableau.com&#47;static&#47;images&#47;Wo&#47;Word_17615465535280&#47;Dashboard1&#47;1_rss.png' style='border: none' /></a></noscript>
  <object class='tableauViz' style='display:none;'>
    <param name='host_url' value='https%3A%2F%2Fpublic.tableau.com%2F' />
    <param name='embed_code_version' value='3' />
    <param name='site_root' value='' />
    <param name='name' value='Word_17615465535280&#47;Dashboard1' />
    <param name='tabs' value='no' />
    <param name='toolbar' value='yes' />
    <param name='static_image' value='https:&#47;&#47;public.tableau.com&#47;static&#47;images&#47;Wo&#47;Word_17615465535280&#47;Dashboard1&#47;1.png' />
    <param name='animate_transition' value='yes' />
    <param name='display_static_image' value='yes' />
    <param name='display_spinner' value='yes' />
    <param name='display_overlay' value='yes' />
    <param name='display_count' value='yes' />
    <param name='language' value='en-US' />
    <param name='filter' value='publish=yes' />
  </object>
</div>
<script type='text/javascript'>
  var divElement = document.getElementById('viz1761546625032');
  var vizElement = divElement.getElementsByTagName('object')[0];
  // Containers wider than 500px get the fixed 1000x827 layout (the original
  // "> 800" and "> 500" branches set the same size); narrower ones use the
  // full container width at 727px tall.
  if (divElement.offsetWidth > 500) {
    vizElement.style.width = '1000px';
    vizElement.style.height = '827px';
  } else {
    vizElement.style.width = '100%';
    vizElement.style.height = '727px';
  }
  // Insert the Tableau viz script immediately before the <object> element.
  var scriptElement = document.createElement('script');
  scriptElement.src = 'https://public.tableau.com/javascripts/api/viz_v1.js';
  vizElement.parentNode.insertBefore(scriptElement, vizElement);
</script>