In [1]:
import pandas as pd
import glob
import os
import ner_functions as nf

In [10]:
# Dictionary holding the aliases of every country.
# Keys are spelled like the per-country CSV base names (no spaces, e.g. "UnitedStates");
# presumably the NER helpers look aliases up by that name -- confirm in ner_functions.
country_aliases = {
    "UnitedStates": ["USA", "America", "US", "United States", "UnitedStates"],
    "Canada": ["Canada", "CA"],
    "UnitedKingdom": [
        "UK",
        "United Kingdom",
        "Britain",
        "England",
        "Scotland",
        "Wales",
        "Northern Ireland",
        "UnitedKingdom",
    ],
    "Australia": ["Australia", "AU", "Aussie"],
    "China": ["China", "PRC"],
    "Denmark": ["Denmark", "DK"],
    "Finland": ["Finland", "FI"],
    "France": ["France", "French Republic", "FR"],
    "Germany": ["Germany", "DE"],
    "Japan": ["Japan", "JP"],
    "Italy": ["Italy", "Italian Republic", "IT"],
    "Netherlands": ["Netherlands", "Holland", "NL"],
    "Norway": ["Norway", "NO"],
    "Portugal": ["Portugal", "PT"],
    "Singapore": ["Singapore", "SG"],
    "SouthKorea": ["South Korea", "KR", "SouthKorea"],
    "Spain": ["Spain", "ES"],
    "Sweden": ["Sweden", "SE"],
    "Switzerland": ["Switzerland", "Swiss Confederation", "Swiss", "CH"],
    "NewZealand": ["New Zealand", "NZ", "NewZealand"],
}

In [None]:
# Collect every per-country article CSV. Sorting keeps the processing order stable,
# because glob itself does not guarantee any particular order.
Path = "../Data/Articles/CNNArticles/*.csv"
lst_files = sorted(glob.glob(Path))

# Threshold used for the NER decision; it is passed straight to
# nf.perform_ner_on_dataframe.
threshold = 0

# Output folders: filtered articles and the NER results.
ner_dir = '../Data/NER/CNN_NER'
ner_results_dir = '../Data/NER/CNN_NER_Results'

# Create the output folders up front so a missing directory cannot make the
# writes fail after the NER pass has already run.
os.makedirs(ner_dir, exist_ok=True)
os.makedirs(ner_results_dir, exist_ok=True)

for file in lst_files:
    file_name = os.path.basename(file)  # Get the file name from the full file path
    country_name = os.path.splitext(file_name)[0]  # Remove the file extension

    # To run NER for a single country only, use a condition such as:
    # if country_name == "China":
    # Default: run NER for every country that has no NER output yet.
    if os.path.isfile('{0}/{1}.csv'.format(ner_dir, country_name)):
        print("File already exists: {0}.csv".format(country_name))
        continue

    print(file)
    df = pd.read_csv(file)

    # Nothing to do for an empty article file (no output is written for it).
    if df.shape[0] == 0:
        continue

    df = nf.preprocess_dataframe(df)
    df, df_ner_results = nf.perform_ner_on_dataframe(df, country_name, country_aliases, threshold)

    # Write the NER results first. The CNN_NER file doubles as the
    # "already processed" marker checked above, so it must be written last:
    # if a write fails, the country is simply processed again on the next run
    # instead of being skipped with its results file missing.
    # The row index is still written to both files, as before.
    df_ner_results.to_csv('{0}/{1}.csv'.format(ner_results_dir, country_name))
    df.to_csv('{0}/{1}.csv'.format(ner_dir, country_name))


../Data/Articles/CNNArticles\Australia.csv


Processing Australia:   0%|          | 0/8562 [00:00<?, ?it/s]

In [1]:
def merge_and_save_csv(file1, file2):
    """
    Combine the rows of two CSV files and write the de-duplicated result back to `file1`.

    Parameters:
    - file1 (str): Path of the first CSV file. It is overwritten with the merged rows.
    - file2 (str): Path of the second CSV file. It is only read.

    Returns:
    None
    """
    # Load both inputs (file1 first), stack file2's rows below file1's,
    # then drop rows that are exact duplicates of an earlier row.
    frames = [pd.read_csv(csv_path) for csv_path in (file1, file2)]
    merged = pd.concat(frames, axis=0).drop_duplicates()

    # Overwrite the first file; the row index is not written.
    merged.to_csv(file1, index=False)

# Usage:
# merge_and_save_csv("../Data/CNNArticles/UnitedStates.csv", "../Data/CNNArticles/UnitedStates2.csv")


In [5]:
# Merge China3.csv into China.csv; the merged rows overwrite China.csv.
# NOTE(review): other cells read articles from ../Data/Articles/CNNArticles/ --
# confirm that this directory is still the intended one.
file1, file2 = (
    "../Data/CNNArticles/China.csv",
    "../Data/CNNArticles/China3.csv",
)
merge_and_save_csv(file1, file2)

In [17]:
# Read every country's article file and print how many articles it holds
# after de-duplicating on (Date, Headline), keeping the last occurrence.
# Note: despite its name, lst_ner_files holds the raw article CSVs here.
lst_ner_files = glob.glob("../Data/Articles/CNNArticles/*.csv")
for file in lst_ner_files:
    # Country name = file name without directory and extension.
    file_name = os.path.basename(file)
    country_name = os.path.splitext(file_name)[0]
    df = pd.read_csv(file).drop_duplicates(subset=['Date', 'Headline'], keep='last')
    print(country_name, len(df))

Australia 8562
Canada 9089
China 15687
Denmark 2206
Finland 1827
France 8895
Germany 8634
Italy 7506
Japan 8589
Netherlands 3774
NewZealand 4492
Norway 2515
Portugal 1814
Singapore 4583
SouthKorea 8182
Spain 6145
Sweden 3236
Switzerland 3885
UnitedKingdom 1461
UnitedStates 21656


In [2]:
# Read every country's articles that remain after NER filtering, then print
# the date span and the number of remaining articles.
# Sorted so the report order is stable (glob does not guarantee an order).
lst_ner_files = sorted(glob.glob("../Data/NER/CNN_NER/*.csv"))
for file in lst_ner_files:
    file_name = os.path.basename(file)  # Get the file name from the full file path
    country_name = os.path.splitext(file_name)[0]  # Remove the file extension
    df = pd.read_csv(file)
    # Optional de-duplication, currently disabled:
    # df = df.drop_duplicates(['Date', 'Headline'], keep='last')
    if df.empty:
        # A file with a header but no rows would make .iloc[0] raise IndexError.
        print(country_name, "no rows")
    else:
        # Dates of the first and last rows in file order (not the min/max of the column).
        print(country_name, df['Date'].iloc[0], df['Date'].iloc[-1])
    print(country_name, df.shape[0])


Australia 2016-05-03 2023-08-09
Australia 5045
Canada 2016-05-10 2023-08-08
Canada 5023
China 2012-12-16 2023-08-11
China 11247
Denmark 2011-08-31 2023-08-01
Denmark 1047
Finland 2011-09-26 2023-07-26
Finland 887
France 2019-06-06 2023-08-08
France 2956
Germany 2017-07-06 2023-08-07
Germany 4470
Italy 2013-06-16 2023-08-04
Italy 4322
Japan 2016-11-02 2023-08-07
Japan 4984
Netherlands 2011-07-11 2023-08-02
Netherlands 1665
NewZealand 2011-06-29 2023-08-11
NewZealand 2249
Norway 2011-07-25 2023-08-10
Norway 1225
Portugal 2011-07-11 2023-08-03
Portugal 930
Singapore 2011-09-08 2023-08-10
Singapore 2518
SouthKorea 2013-03-26 2023-08-10
SouthKorea 3870
Spain 2010-08-04 2023-08-11
Spain 3585
Sweden 2011-09-22 2023-08-11
Sweden 1708
Switzerland 2010-11-30 2023-08-08
Switzerland 2051
UnitedKingdom 2022-06-06 2023-08-11
UnitedKingdom 756
UnitedStates 2011-04-05 2023-08-13
UnitedStates 17397


In [31]:
# # Read one country's NER_Results data and print its NER column (the Series footer shows the number of articles)
# path = "../Data/CNN_NER_Results/Netherlands.csv"
# df = pd.read_csv(path)
# df['NER'] = df['NER'].apply(json.loads)
# # Inspect the NER column; the original intent was to check the fifth entry of the first row's NER data
# print(df["NER"])


0       [{'entity': 'B-MISC', 'score': 0.9997494816780...
1       [{'entity': 'B-MISC', 'score': 0.9992812275886...
2       [{'entity': 'B-MISC', 'score': 0.9922473430633...
3       [{'entity': 'B-MISC', 'score': 0.9936847090721...
4       [{'entity': 'B-LOC', 'score': 0.99973839521408...
                              ...                        
3769    [{'entity': 'B-PER', 'score': 0.93637478351593...
3770    [{'entity': 'B-ORG', 'score': 0.94311094284057...
3771    [{'entity': 'B-LOC', 'score': 0.99974805116653...
3772    [{'entity': 'B-LOC', 'score': 0.99981027841567...
3773    [{'entity': 'B-LOC', 'score': 0.99936670064926...
Name: NER, Length: 3774, dtype: object
