In [21]:
# 导入库
import numpy as np
import pandas as pd
import glob
import os
import json
from dateutil.parser import parse
from dateutil.tz import gettz
from tqdm.notebook import tqdm

import warnings
warnings.filterwarnings('ignore', category=UserWarning, module='transformers')

import torch

from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

# use the first GPU if available, otherwise use CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# device = 0 if torch.cuda.is_available() else -1
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")

nlp = pipeline("ner", model=model, tokenizer=tokenizer, device=device)

In [3]:
# 合并headline和text
def combineHeadlineText(row):
    if isinstance(row["Headline"], str):
        return row["Headline"] + ". " + row["Text"]
    else:
        return row["Text"]

In [4]:
def preprocess_dataframe(df, use_parse=False):
    df = df.drop(['Unnamed: 0'], axis=1, errors='ignore')
    df = df.drop_duplicates(['Date', 'Headline'], keep='last')
    df['Text'] = df['Text'].astype(str)
    df['Text'] = df.apply(lambda row: combineHeadlineText(row), axis=1)
    
    if use_parse:
        df['Date'] = df['Date'].str.replace(r'Published: ', ' ')
        df['Date'] = df['Date'].str.replace(r'First', ' ')
        df['Date'] = df['Date'].apply(lambda date_str: parse(date_str, tzinfos={'ET': gettz('America/New_York')}))
        df['Date'] = df['Date'].dt.date
    else:
        df['Date'] = pd.to_datetime(df['Date'])
    
    df = df.reset_index(drop=True).sort_values(by=['Date'], ascending=True)
    
    return df

In [5]:
def process_entities(ner_results):
    # 首先，我们需要将NER的结果转换成一个更方便处理的格式
    entities = [{'word': d['word'], 'entity': d['entity'], 'score': d['score']} for d in ner_results]

    # 然后，我们创建一个新的列表来存储处理后的实体
    processed_entities = []
    current_entity = []
    for entity in entities:
        if entity['entity'].startswith('B-') or (entity['entity'].startswith('I-') and not current_entity):
            if current_entity:
                processed_entities.append(current_entity)
            current_entity = [entity]
        elif entity['entity'].startswith('I-') and current_entity:
            current_entity.append(entity)
    if current_entity:
        processed_entities.append(current_entity)

    return processed_entities

In [25]:
def json_serializable(item):
    """Convert non-serializable items to serializable."""
    if isinstance(item, np.float32):
        return float(item)
    raise TypeError(f"Type {type(item)} not serializable")

In [26]:
def perform_ner_on_dataframe(df, country_name):
    count = []
    check = []
    ner_results_data_list = []
    
    for i in tqdm(range(len(df)), desc=f"Processing {country_name}"):
        ner_results = nlp(df["Text"].iloc[i])
        processed_entities = process_entities(ner_results)

        country_instances = []
        country_check = []
        aliases = country_aliases.get(country_name, [country_name])
        for entity_group in processed_entities:
            words = [entity['word'] for entity in entity_group]
            entity_name = ' '.join(words)
            entity_type = entity_group[0]['entity']
            entity_score = sum(entity['score'] for entity in entity_group) / len(entity_group)
            if entity_type in ["B-LOC", "B-ORG"] and entity_score > 0.98:
                country_check.append(entity_name)
                if any(alias in entity_name for alias in aliases):
                    country_instances.append(entity_name)

        count.append(len(country_instances))
        check.append(country_check)
        
        ner_result = {
            'Date': df['Date'].iloc[i],
            'Headline': df['Headline'].iloc[i],
            'NER': json.dumps(ner_results, default=json_serializable)  # convert ner_results to string
        }
        ner_results_data_list.append(ner_result)

    df['Count'] = count
    df = df[df['Count'] >= 3]
    df = df.drop(['Count'], axis=1)
    df_ner_results = pd.DataFrame(ner_results_data_list)
    
    return df, df_ner_results

In [7]:
# 定义一个字典储存每个国家的别名
country_aliases = {
"UnitedStates": ["USA", "America", "US", "United States", "UnitedStates"],
"Canada": ["Canada", "CA"],
"UnitedKingdom": ["UK", "United Kingdom", "Britain", "England", "Scotland", "Wales", "Northern Ireland", "UnitedKingdom"],
"Australia": ["Australia", "AU", "Aussie"],
"China": ["China", "PRC"],
"Denmark": ["Denmark", "DK"],
"Finland": ["Finland", "FI"],
"France": ["France", "French Republic", "FR"],
"Germany": ["Germany", "DE"],
"Japan": ["Japan", "JP"],
"Italy": ["Italy", "Italian Republic", "IT"],
"Netherlands": ["Netherlands", "Holland", "NL"],
"Norway": ["Norway", "NO"],
"Portugal": ["Portugal", "PT"],
"Singapore": ["Singapore", "SG"],
"SouthKorea": ["South Korea", "KR", "SouthKorea"],
"Spain": ["Spain", "ES"],
"Sweden": ["Sweden", "SE"],
"Switzerland": ["Switzerland", "Swiss Confederation", "Swiss", "CH"],
"NewZealand": ["New Zealand", "NZ", "NewZealand"]
}

In [27]:
lst_files = []

Path = "../Data/CNNArticles/*.csv"
count = 0

for fname in glob.glob(Path):
    lst_files.append(fname)

lst_files = sorted(lst_files)

for file in lst_files:
    file_name = os.path.basename(file)  # Get the file name from the full file path
    country_name = os.path.splitext(file_name)[0]  # Remove the file extension

    if not os.path.isfile('../Data/CNN_NER/{0}.csv'.format(country_name)):
        print(file)
        df = pd.read_csv(file)

        if df.shape[0] != 0:
            
            df = preprocess_dataframe(df)

            df, df_ner_results = perform_ner_on_dataframe(df, country_name)


            # Convert the list of relevant articles back to a DataFrame
            df.to_csv(r'../Data/CNN_NER/{0}.csv'.format(country_name))
            df_ner_results.to_csv(r'../Data/CNN_NER_Results/{0}.csv'.format(country_name))


../Data/CNNArticles\Netherlands.csv


Processing Netherlands:   0%|          | 0/3774 [00:00<?, ?it/s]

In [34]:
# 读取UnitedStates和UnitedStates2的数据，合并为UnitedStates
df1 = pd.read_csv("../Data/CNNArticles/UnitedStates.csv")
df2 = pd.read_csv("../Data/CNNarticles/UnitedStates2.csv")
df = pd.concat([df1, df2], axis=0)
# 保存到UnitedStates.csv
df.to_csv(r'../Data/CNNArticles/UnitedStates.csv')

In [35]:
# 读取所有国家数据, 然后输出所得新闻的数量
lst_ner_files = []
for fname in glob.glob("../Data/CNNArticles/*.csv"):
    lst_ner_files.append(fname)
for file in lst_ner_files:
    file_name = os.path.basename(file)  # Get the file name from the full file path
    country_name = os.path.splitext(file_name)[0]  # Remove the file extension
    df = pd.read_csv(file)

    print(country_name, df.shape[0])

Australia 8605
Canada 10939
China 9229
Denmark 2215
Finland 1835
France 9742
Germany 9218
Italy 8167
Japan 9071
Netherlands 3779
NewZealand 4509
Norway 2518
Portugal 1815
Singapore 4605
SouthKorea 8595
Spain 6164
Sweden 3245
Switzerland 3888
UnitedKingdom 1462
UnitedStates 18586


In [28]:
# 读取所有国家的实体识别后剩余的数据, 然后输出剩余数据的数量
lst_ner_files = []
for fname in glob.glob("../Data/CNN_NER/*.csv"):
    lst_ner_files.append(fname)
for file in lst_ner_files:
    file_name = os.path.basename(file)  # Get the file name from the full file path
    country_name = os.path.splitext(file_name)[0]  # Remove the file extension
    df = pd.read_csv(file)
    print(country_name, df.shape[0])


Australia 1359
Canada 831
China 2833
Denmark 136
Finland 127
France 474
Germany 706
Italy 762
Japan 1337
Netherlands 107
NewZealand 717
Norway 200
Portugal 110
Singapore 499
SouthKorea 892
Spain 695
Sweden 262
Switzerland 121
UnitedKingdom 312
UnitedStates 5971


In [31]:
# 读取NER_Results数据, 然后输出所得新闻的数量
path = "../Data/CNN_NER_Results/Netherlands.csv"
df = pd.read_csv(path)
df['NER'] = df['NER'].apply(json.loads)
# 检查第一行的数据的NER列的第五行的数据
print(df["NER"])


0       [{'entity': 'B-MISC', 'score': 0.9997494816780...
1       [{'entity': 'B-MISC', 'score': 0.9992812275886...
2       [{'entity': 'B-MISC', 'score': 0.9922473430633...
3       [{'entity': 'B-MISC', 'score': 0.9936847090721...
4       [{'entity': 'B-LOC', 'score': 0.99973839521408...
                              ...                        
3769    [{'entity': 'B-PER', 'score': 0.93637478351593...
3770    [{'entity': 'B-ORG', 'score': 0.94311094284057...
3771    [{'entity': 'B-LOC', 'score': 0.99974805116653...
3772    [{'entity': 'B-LOC', 'score': 0.99981027841567...
3773    [{'entity': 'B-LOC', 'score': 0.99936670064926...
Name: NER, Length: 3774, dtype: object
