In [1]:
import numpy as np
import pandas as pd
import re
import glob
from   os import path
import os
from tqdm.notebook import tqdm
from dateutil.parser import parse
from dateutil.tz import gettz

import warnings
warnings.filterwarnings('ignore', category=UserWarning, module='transformers')

import torch

from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

# Pick the best available accelerator: a CUDA GPU first, then Apple-silicon MPS,
# and fall back to the CPU when neither backend is usable.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# Hugging Face checkpoint used for both the tokenizer and the model weights.
MODEL_NAME = "dslim/bert-base-NER"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)

# NER pipeline bound to the selected device; no aggregation strategy is passed here.
nlp = pipeline("ner", model=model, tokenizer=tokenizer, device=device)

In [2]:
def combineHeadlineText(row):
    """Return the article text prefixed by its headline when one is present.

    Parameters
    ----------
    row : mapping-like
        Must provide the keys "Headline" and "Text".

    Returns
    -------
    The string "<Headline>. <Text>" when the headline is a ``str``;
    otherwise the value stored under "Text", unchanged.
    """
    headline = row["Headline"]
    body = row["Text"]
    # A non-string headline (e.g. a missing value) is skipped entirely.
    if not isinstance(headline, str):
        return body
    return headline + ". " + body

In [3]:
# Dictionary mapping each country's canonical name to the list of aliases
# searched for in the NER output. Every list includes the canonical name itself.
country_aliases = {
    "United States": ["USA", "America", "United States of America", "US", "United States"],
    "Canada": ["Canada", "CA"],
    "United Kingdom": ["UK", "United Kingdom", "Britain", "England", "Scotland", "Wales", "Northern Ireland"],
    "Australia": ["Australia", "AU", "Aussie"],
    "China": ["China", "PRC", "People's Republic of China"],
    "Denmark": ["Denmark", "DK"],
    "Finland": ["Finland", "FI"],
    "France": ["France", "French Republic", "FR"],
    "Germany": ["Germany", "Federal Republic of Germany", "DE"],
    "Japan": ["Japan", "JP"],
    "Italy": ["Italy", "Italian Republic", "IT"],
    "Netherlands": ["Netherlands", "Holland", "NL"],
    "Norway": ["Norway", "NO"],
    "Portugal": ["Portugal", "PT"],
    "Singapore": ["Singapore", "SG"],
    "South Korea": ["South Korea", "Republic of Korea", "KR"],
    "Spain": ["Spain", "Kingdom of Spain", "ES"],
    "Sweden": ["Sweden", "SE"],
    "Switzerland": ["Switzerland", "Swiss Confederation", "CH"],
    "New Zealand": ["New Zealand", "NZ"],
}

In [8]:
# Glob pattern for the scraped article files and the two output folders.
Path = "../Data/ReutersArticles/*.csv"
NER_FILTERED_DIR = '../Data/Reuters_NER'
NER_ENTITY_DIR = '../Data/NER_Entity'

# Only entities carrying one of these tags, with a score above the threshold, are counted.
COUNTED_TAGS = ("B-LOC", "B-ORG")
MIN_ENTITY_SCORE = 0.98

# Create the output folders up front so a long NER run cannot fail at save time.
os.makedirs(NER_FILTERED_DIR, exist_ok=True)
os.makedirs(NER_ENTITY_DIR, exist_ok=True)

lst_files = sorted(glob.glob(Path))

for file in lst_files:
    file_name = os.path.basename(file)  # Get the file name from the full file path
    country_name = os.path.splitext(file_name)[0]  # Remove the file extension
    country_name = country_name.replace("_articles", "")  # Remove the "_articles" part of the file name

    # Run NER for every country that has not been processed yet; the filtered
    # output file doubles as the "already done" marker.
    if os.path.isfile('{0}/{1}.csv'.format(NER_FILTERED_DIR, country_name)):
        print("File already exists: {0}.csv".format(country_name))
        continue

    print(file)
    df = pd.read_csv(file)

    # Nothing to process (and nothing is written) for a file without rows.
    if df.shape[0] == 0:
        continue

    # The saved index column may or may not be present; do not fail when it is absent.
    df = df.drop(['Unnamed: 0'], axis=1, errors='ignore')
    df = df.drop_duplicates(['Date', 'Headline'], keep='last')
    df['Text'] = df['Text'].astype(str)
    df['Text'] = df.apply(lambda row: combineHeadlineText(row), axis=1)
    df['Date'] = pd.to_datetime(df['Date']).dt.date  # change date format to YYYY-MM-DD
    df = df.sort_values(by=['Date'], ascending=True)

    # The alias list does not depend on the article, so look it up once per country.
    aliases = country_aliases.get(country_name, [country_name])

    count = []         # per article: number of confident entities matching an alias
    check = []         # per article: words of all confident entities
    all_entities = []  # every entity dict returned by the pipeline, across all articles

    for text in tqdm(df["Text"], desc=f"Processing {country_name}"):
        ner_results = nlp(text)
        confident = [
            d for d in ner_results
            if d['entity'] in COUNTED_TAGS and d['score'] > MIN_ENTITY_SCORE
        ]
        # Check whether the country name or any of its aliases appears in the results.
        # NOTE(review): this is a case-sensitive substring test against each entry's
        # 'word', so short codes such as "US" also hit longer words that contain them,
        # and a multi-word alias only matches if a single entry's 'word' holds the
        # whole phrase - confirm this is the intended matching rule.
        country_instances = [
            d for d in confident
            if any(alias in d['word'] for alias in aliases)
        ]
        count.append(len(country_instances))
        check.append([d['word'] for d in confident])
        all_entities.extend(ner_results)

    # Build the entity table once instead of growing a DataFrame inside the loop.
    ner_results_data = pd.json_normalize(all_entities)

    df['Count'] = count
    df_check = df.copy()
    df_check['Check'] = check
    # Keep only the articles that mention the country at least once.
    df = df[df['Count'] > 0]
    df = df.drop(['Count'], axis=1)
    df.to_csv('{0}/{1}.csv'.format(NER_FILTERED_DIR, country_name))
    # Save df_check (all articles, with their Count and Check columns) as <country_name>_check.csv
    df_check.to_csv('{0}/{1}_check.csv'.format(NER_ENTITY_DIR, country_name))
    # Save the raw NER results to a CSV file
    ner_results_data.to_csv('{0}/{1}.csv'.format(NER_ENTITY_DIR, country_name), index=False)


File already exists: Australia.csv
../Data/ReutersArticles/Canada_articles.csv


Processing Canada:   0%|          | 0/8908 [00:00<?, ?it/s]

File already exists: China.csv
File already exists: Denmark.csv
File already exists: Finland.csv
File already exists: France.csv
File already exists: Germany.csv
File already exists: Italy.csv
../Data/ReutersArticles/Japan_articles.csv


Processing Japan:   0%|          | 0/12224 [00:00<?, ?it/s]

../Data/ReutersArticles/Netherlands_articles.csv


Processing Netherlands:   0%|          | 0/3661 [00:00<?, ?it/s]

../Data/ReutersArticles/New Zealand_articles.csv


Processing New Zealand:   0%|          | 0/4339 [00:00<?, ?it/s]

../Data/ReutersArticles/Norway_articles.csv


Processing Norway:   0%|          | 0/2444 [00:00<?, ?it/s]

../Data/ReutersArticles/Portugal_articles.csv


Processing Portugal:   0%|          | 0/2072 [00:00<?, ?it/s]

../Data/ReutersArticles/Singapore_articles.csv


Processing Singapore:   0%|          | 0/4214 [00:00<?, ?it/s]

../Data/ReutersArticles/South Korea_articles.csv


Processing South Korea:   0%|          | 0/4938 [00:00<?, ?it/s]

../Data/ReutersArticles/Spain_articles.csv


Processing Spain:   0%|          | 0/5374 [00:00<?, ?it/s]

../Data/ReutersArticles/Sweden_articles.csv


Processing Sweden:   0%|          | 0/2689 [00:00<?, ?it/s]

../Data/ReutersArticles/Switzerland_articles.csv


Processing Switzerland:   0%|          | 0/2726 [00:00<?, ?it/s]

../Data/ReutersArticles/United Kingdom_articles.csv


Processing United Kingdom:   0%|          | 0/14940 [00:00<?, ?it/s]

../Data/ReutersArticles/United States_articles.csv


Processing United States:   0%|          | 0/22907 [00:00<?, ?it/s]