从 data_raw/final_merged_all_news.csv 文件中先去除重复记录，再随机挑选 10000 条内容（不足 10000 条时全部保留），存为 data_raw/news_10000_rows.csv

In [None]:
import pandas as pd

# File locations: the merged news dump to read and the sampled subset to write.
input_file = 'data_raw/final_merged_all_news.csv'
output_file = 'data_raw/news_10000_rows.csv'

# The whole pipeline runs inside a single try so that any failure is reported
# as a printed message rather than a traceback.
try:
    # Load the CSV, then drop rows that exactly repeat an earlier row.
    df = pd.read_csv(input_file)
    df_unique = df.drop_duplicates()

    # Sampling is only possible when at least 10000 unique rows exist;
    # otherwise every unique row is kept as-is.
    has_enough_rows = len(df_unique) >= 10000
    if not has_enough_rows:
        print(f"文件中的不重复记录数少于 10000 条，仅 {len(df_unique)} 条记录，将全部保存。")
    # A fixed random_state makes the draw reproducible across runs.
    df_sample = (
        df_unique.sample(n=10000, random_state=42)
        if has_enough_rows
        else df_unique
    )

    # Write the selected rows to a new CSV, omitting the DataFrame index.
    df_sample.to_csv(output_file, index=False)
    print(f"✅ 已随机选择 {len(df_sample)} 条不重复记录并保存到 {output_file}")

except FileNotFoundError:
    print(f"❌ 错误: 输入文件未找到 {input_file}")
except Exception as e:
    print(f"❌ 发生错误: {e}")



In [None]:
# Sanity check: load the merged source CSV and report how many data rows it has.
file_to_check = 'data_raw/final_merged_all_news.csv'
df = pd.read_csv(file_to_check)
row_count_message = f"文件 '{file_to_check}' 的实际数据条数是: {len(df)}"
print(row_count_message)

In [None]:
import pandas as pd

# Finds news articles whose CONTENT appears more than once, saves them to a
# CSV, and prints summary statistics.

# File locations. ALIYUN_OSS_PATH is a prefix prepended to every path; it is
# empty here, so the paths are used exactly as written.
ALIYUN_OSS_PATH = ''
FINAL_CHINA_NEWS_FILE = ALIYUN_OSS_PATH + 'data_processed/final_china_news.csv'
DUPLICATE_NEWS_FILE = ALIYUN_OSS_PATH + 'data_processed/duplicate_china_news.csv'

# Load the data
print("Loading data...")
df_china_news = pd.read_csv(FINAL_CHINA_NEWS_FILE, low_memory=False)
print(f"Data loaded successfully, total {len(df_china_news)} news articles")

# Add a column with the character length of each article. A missing CONTENT
# value counts as length 0; stringifying it first would report len("nan") == 3.
df_china_news['content_length'] = df_china_news['CONTENT'].map(
    lambda text: 0 if pd.isna(text) else len(str(text))
)

# Flag every row whose CONTENT occurs more than once (keep=False marks all
# occurrences, not just the repeats).
# NOTE(review): rows with missing CONTENT appear to be grouped together as
# duplicates of one another here, while value_counts() below ignores them —
# confirm whether that is intended.
print("Finding duplicate news articles...")
duplicate_mask = df_china_news.duplicated(subset=['CONTENT'], keep=False)
df_duplicates = df_china_news[duplicate_mask]

# Sort by CONTENT (then DATE) so identical articles end up next to each other
df_duplicates = df_duplicates.sort_values(['CONTENT', 'DATE']).reset_index(drop=True)

print(f"Found {len(df_duplicates)} duplicate news articles")

# Save to a CSV file
print(f"Saving duplicate news to {DUPLICATE_NEWS_FILE}...")
df_duplicates.to_csv(DUPLICATE_NEWS_FILE, index=False, encoding='utf-8')
print("Save completed!")

# Print summary statistics
print("\n=== Duplicate News Statistics ===")
print(f"Total news: {len(df_china_news)}")
print(f"Duplicate news: {len(df_duplicates)}")
print(f"Unique news: {len(df_china_news) - len(df_duplicates)}")
print(f"Actually unique content: {len(df_china_news.CONTENT.unique())}")

# Show the most frequently repeated articles. Only contents that occur more
# than once are listed; without this filter, a dataset with fewer than five
# repeated contents would list unique articles as "Duplicated 1 times".
duplicate_counts = df_china_news['CONTENT'].value_counts()
repeated_counts = duplicate_counts[duplicate_counts > 1]
if not repeated_counts.empty:
    print("\n=== Most Frequently Duplicated News ===")
    for rank, (content, count) in enumerate(repeated_counts.head(5).items(), start=1):
        # Use the first row carrying this content for its date and length
        sample_entry = df_china_news[df_china_news['CONTENT'] == content].iloc[0]
        date = sample_entry['DATE']
        length = sample_entry['content_length']
        print(f"{rank}. Duplicated {count} times")
        print(f"   Date: {date}")
        print(f"   Content length: {length} characters")
        # str() keeps the preview working even if CONTENT was parsed as a
        # non-string type.
        print(f"   Content preview: {str(content)[:100]}...")

In [None]:
import pandas as pd

# Finds abnormally short articles (fewer than 50 characters) in the raw news
# data, saves them to a CSV, and prints summary statistics with a few samples.

# File locations. ALIYUN_OSS_PATH is a prefix prepended to every path; it is
# empty here, so the paths are used exactly as written.
ALIYUN_OSS_PATH = ''
RAW_NEWS_FILE = ALIYUN_OSS_PATH + 'data_raw/final_merged_all_news.csv'
SHORT_ARTICLES_FILE = ALIYUN_OSS_PATH + 'data_processed/short_articles.csv'

# Load the data
print("Loading raw news data...")
df_raw_news = pd.read_csv(RAW_NEWS_FILE, low_memory=False)
print(f"Data loaded successfully, total {len(df_raw_news)} news articles")

# Column names assumed for the article text and the publication date
CONTENT_COLUMN = 'CONTENT'
DATE_COLUMN = 'DATE'

# Character length of each article. A missing value counts as length 0;
# stringifying it first would report len("nan") == 3.
df_raw_news['content_length'] = df_raw_news[CONTENT_COLUMN].map(
    lambda text: 0 if pd.isna(text) else len(str(text))
)

# Select abnormally short articles (fewer than 50 characters)
short_articles_mask = df_raw_news['content_length'] < 50
df_short_articles = df_raw_news[short_articles_mask]

# Shortest articles first
df_short_articles = df_short_articles.sort_values('content_length').reset_index(drop=True)

print(f"Found {len(df_short_articles)} short articles (<50 characters)")

# Save to a CSV file
print(f"Saving short articles to {SHORT_ARTICLES_FILE}...")
df_short_articles.to_csv(SHORT_ARTICLES_FILE, index=False, encoding='utf-8')
print("Save completed!")

# Print summary statistics. The percentage is reported as 0 for an empty
# input instead of raising ZeroDivisionError.
total_articles = len(df_raw_news)
short_percentage = (
    len(df_short_articles) / total_articles * 100 if total_articles else 0.0
)
print("\n=== Short Articles Statistics ===")
print(f"Total articles in raw data: {total_articles}")
print(f"Short articles (<50 characters): {len(df_short_articles)}")
print(f"Percentage of short articles: {short_percentage:.2f}%")

# Show up to ten examples
if len(df_short_articles) > 0:
    print("\n=== Sample Short Articles ===")
    print("Content Length | Date       | Content Preview")
    print("---------------|------------|----------------")
    # The column check does not depend on the row, so do it once.
    has_date_column = DATE_COLUMN in df_short_articles.columns
    for _, row in df_short_articles.head(10).iterrows():
        # Every row here is shorter than 50 characters, so the content is
        # shown in full; missing content is rendered as "N/A".
        content = row[CONTENT_COLUMN]
        content_preview = "N/A" if pd.isna(content) else str(content)
        date = row[DATE_COLUMN] if has_date_column else "N/A"
        if pd.isna(date):
            date = "N/A"
        # str() left-aligns the date under its header whatever type it was
        # parsed as.
        print(f"{int(row['content_length']):14} | {str(date):10} | {content_preview}")