In [1]:
import numpy as np
import pandas as pd
import re
import glob
from   os import path
import os
from tqdm.notebook import tqdm
from dateutil.parser import parse
from dateutil.tz import gettz

import warnings
warnings.filterwarnings('ignore', category=UserWarning, module='transformers')

import torch

from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

# Choose the compute device for inference: prefer the first CUDA GPU, then
# Apple-silicon MPS, and fall back to the CPU when no accelerator is present.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# Pretrained checkpoint used for both the tokenizer and the token classifier.
NER_MODEL_NAME = "dslim/bert-base-NER"

tokenizer = AutoTokenizer.from_pretrained(NER_MODEL_NAME)
model = AutoModelForTokenClassification.from_pretrained(NER_MODEL_NAME)

# `nlp(text)` runs named-entity recognition on `text` using the model above.
nlp = pipeline("ner", model=model, tokenizer=tokenizer, device=device)

In [2]:
def combineHeadlineText(row):
    """Merge an article's headline into its body text.

    Parameters
    ----------
    row : mapping
        Anything indexable by the keys "Headline" and "Text"
        (e.g. a DataFrame row passed by ``df.apply(..., axis=1)``).

    Returns
    -------
    The string "<Headline>. <Text>" when the headline is a ``str``;
    otherwise (e.g. a missing/NaN headline) the "Text" value unchanged.
    """
    headline = row["Headline"]
    body = row["Text"]

    # A non-string headline carries no usable text: fall back to the body alone.
    if not isinstance(headline, str):
        return body

    return headline + ". " + body

In [3]:
# Dictionary mapping each country's canonical name to the list of names and
# abbreviations that should count as a mention of that country.
# Every list includes the canonical name itself so a lookup never loses it.
# NOTE(review): the NER cell in this notebook tests aliases with substring
# containment (`alias in word`), so the short two-letter codes ("IT", "NO",
# "FI", ...) can also match inside longer tokens -- confirm this is intended.
country_aliases = {
"United States": ["United States", "USA", "America", "United States of America", "US"],
"Canada": ["Canada", "CA"],
"United Kingdom": ["UK", "United Kingdom", "Britain", "England", "Scotland", "Wales", "Northern Ireland"],
"Australia": ["Australia", "AU", "Aussie"],
"China": ["China", "PRC", "People's Republic of China"],
"Denmark": ["Denmark", "DK"],
"Finland": ["Finland", "FI"],
"France": ["France", "French Republic", "FR"],
"Germany": ["Germany", "Federal Republic of Germany", "DE"],
"Japan": ["Japan", "JP"],
"Italy": ["Italy", "Italian Republic", "IT"],
"Netherlands": ["Netherlands", "Holland", "NL"],
"Norway": ["Norway", "NO"],
"Portugal": ["Portugal", "PT"],
"Singapore": ["Singapore", "SG"],
"South Korea": ["South Korea", "Republic of Korea", "KR"],
"Spain": ["Spain", "Kingdom of Spain", "ES"],
"Sweden": ["Sweden", "SE"],
"Switzerland": ["Switzerland", "Swiss Confederation", "CH"],
"New Zealand": ["New Zealand", "NZ"]
}

In [7]:
# Filter each country's Reuters articles down to those in which the NER model
# confidently tags the country (or one of its aliases), and save:
#   ../Data/Reuters_NER/<country>.csv        articles with at least one hit
#   ../Data/NER_Entity/<country>_check.csv   all articles + hit count + confident entities
#   ../Data/NER_Entity/<country>.csv         every raw NER prediction

# Glob pattern for the input files; sorted so the processing order is deterministic.
Path = "../Data/ReutersArticles/*.csv"
lst_files = sorted(glob.glob(Path))

for file in lst_files:
    file_name = os.path.basename(file)  # Get the file name from the full file path
    country_name = os.path.splitext(file_name)[0]  # Remove the file extension

    # To run entity recognition for every country not yet processed, use instead:
    # if not os.path.isfile('../Data/MW_NER/{0}.csv'.format(country_name)):
    # Currently only Finland's data is processed. Compare the base name rather
    # than the full path: glob returns OS-specific separators, so a literal
    # path with "/" would never match on Windows.
    if file_name == 'Finland_articles.csv':
        print(file)
        df = pd.read_csv(file)
        country_name = country_name.replace("_articles", "")  # Remove the "_articles" part of the file name

        if df.shape[0] != 0:
            # 'Unnamed: 0' is the stray index column from an earlier to_csv;
            # tolerate files that do not have it.
            df = df.drop(['Unnamed: 0'], axis=1, errors='ignore')
            df = df.drop_duplicates(['Date', 'Headline'], keep='last')
            df['Text'] = df['Text'].astype(str)
            df['Text'] = df.apply(combineHeadlineText, axis=1)
            df['Date'] = pd.to_datetime(df['Date']).dt.date  # change date format to YYYY-MM-DD
            df = df.sort_values(by=['Date'], ascending=True)

            # Make sure the output folders exist before the long NER pass so a
            # missing directory cannot discard the results at the very end.
            os.makedirs('../Data/Reuters_NER', exist_ok=True)
            os.makedirs('../Data/NER_Entity', exist_ok=True)

            # The alias list is the same for every article: look it up once.
            aliases = country_aliases.get(country_name, [country_name])

            count = []            # per article: number of confident country mentions
            check = []            # per article: all confident B-LOC / B-ORG words
            entity_records = []   # every raw NER prediction, across all articles
            for text in tqdm(df["Text"], desc=f"Processing {country_name}"):
                ner_results = nlp(text)
                # Keep only high-confidence entity-initial location/organisation tokens.
                country_check = [
                    d for d in ner_results
                    if d['entity'] in ["B-LOC", "B-ORG"] and d['score'] > 0.98
                ]
                # Of those, the ones whose word contains the country name or any alias.
                country_instances = [
                    d for d in country_check
                    if any(alias in d['word'] for alias in aliases)
                ]
                count.append(len(country_instances))
                check.append([d['word'] for d in country_check])
                entity_records.extend(ner_results)

            # Build the results frame once; concatenating inside the loop
            # re-copies all accumulated rows on every iteration (quadratic).
            ner_results_data = pd.json_normalize(entity_records)

            df['Count'] = count
            df_check = df.copy()
            df_check['Check'] = check
            df = df[df['Count'] > 0]
            df = df.drop(['Count'], axis=1)
            df.to_csv(r'../Data/Reuters_NER/{0}.csv'.format(country_name))
            # Save df_check as a CSV named <country_name>_check.csv
            df_check.to_csv(r'../Data/NER_Entity/{0}_check.csv'.format(country_name))
            ner_results_data.to_csv(r'../Data/NER_Entity/{0}.csv'.format(country_name), index=False)  # Save the ner_results DataFrame to a CSV file


../Data/ReutersArticles/Finland_articles.csv


Processing Finland:   0%|          | 0/1592 [00:00<?, ?it/s]