In [1]:
# 导入库
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import re
import glob
import json

## 定义了一些函数

In [28]:
import pandas as pd

def extract_ner_entities(df):
    """
    Flatten the NER results stored in the 'NER' column into one row per tagged token.

    Parameters:
    - df (pd.DataFrame): DataFrame containing an 'NER' column. Each cell is either a
      JSON string encoding a list of dicts, or an already-decoded list/tuple of dicts.
      Every dict must provide the keys 'entity', 'word' and 'score'.
      Missing cells (None / NaN / pd.NA) are skipped.

    Returns:
    - entities_df (pd.DataFrame): DataFrame with the columns 'Entity_Type',
      'Entity_Word' and 'Score', one row per token, in the order the tokens
      appear in `df`.

    Raises:
    - KeyError: if `df` has no 'NER' column, or a token dict lacks a required key.
    - TypeError: if a cell is neither text, a list/tuple, nor a missing value.
    - json.JSONDecodeError: if a text cell is not valid JSON.
    """

    entities = []
    entity_words = []
    scores = []

    for raw in df['NER']:
        if isinstance(raw, (str, bytes, bytearray)):
            ner_list = json.loads(raw)
        elif isinstance(raw, (list, tuple)):
            # Already decoded (e.g. the column was parsed in an earlier step).
            ner_list = raw
        elif raw is None or raw is pd.NA or (isinstance(raw, float) and raw != raw):
            # Missing value (NaN is the only float that differs from itself):
            # the row has no NER output, so it contributes nothing.
            continue
        else:
            raise TypeError(
                f"Unsupported value in 'NER' column: {type(raw).__name__}"
            )

        # A JSON `null` decodes to None; treat it like an empty result list.
        for ner in ner_list or ():
            entities.append(ner['entity'])
            entity_words.append(ner['word'])
            scores.append(ner['score'])

    entities_df = pd.DataFrame({
        'Entity_Type': entities,
        'Entity_Word': entity_words,
        'Score': scores
    })

    return entities_df

def process_entities(ner_results):
    """
    Group consecutive BIO-tagged tokens into entity spans.

    Parameters:
    - ner_results (pd.DataFrame): DataFrame with the columns 'Entity_Word',
      'Entity_Type' and 'Score' (the layout produced by `extract_ner_entities`).
      Tags are expected to look like 'B-PER' / 'I-PER'.

    Returns:
    - processed_entities (list[list[dict]]): one inner list per entity span, in input
      order. Each token is a dict with the keys 'word', 'entity' and 'score'.

    Grouping rules:
    - A 'B-' tag always opens a new span.
    - An 'I-' tag extends the open span only when it carries the same type as that
      span; an 'I-' tag of a different type, or one with no open span, opens a new span.
    - Any other tag (for example 'O') closes the open span and is itself dropped.
    """
    # Walk the three columns in parallel instead of materialising a Series per row.
    tokens = [
        {'word': word, 'entity': tag, 'score': score}
        for word, tag, score in zip(
            ner_results['Entity_Word'],
            ner_results['Entity_Type'],
            ner_results['Score'],
        )
    ]

    processed_entities = []
    current_entity = []
    for token in tokens:
        tag = token['entity']
        is_begin = tag.startswith('B-')
        is_inside = tag.startswith('I-')

        # Type of the open span, taken from its first token ('B-PER' -> 'PER').
        open_type = current_entity[0]['entity'][2:] if current_entity else None

        if is_inside and current_entity and tag[2:] == open_type:
            current_entity.append(token)
            continue

        # Every other case ends the open span, if there is one.
        if current_entity:
            processed_entities.append(current_entity)
            current_entity = []

        if is_begin or is_inside:
            current_entity = [token]

    if current_entity:
        processed_entities.append(current_entity)

    return processed_entities

# Example usage:
# extracted_df = extract_ner_entities(australia_data)
# processed_results = process_entities(extracted_df)
# for entity_group in processed_results:
#     print(entity_group)


def text_display(text):
    """
    Print `text` with a blank line between sentences.

    A sentence boundary is a single whitespace character that directly follows
    '.' or '?', unless the preceding characters look like an abbreviation:
    word-char, '.', word-char, any char (as in 'e.g.'), or an uppercase letter,
    a lowercase letter and '.' (as in 'Mr.').

    Parameters:
    - text (str): the text to print.

    Returns:
    - None. The result is written to standard output.
    """
    sentence_boundary = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s')

    # Emit every sentence as its own paragraph.
    print(*sentence_boundary.split(text), sep="\n\n")

# CNN NER结果分析

In [57]:
# Load the data
cnn_articles_path = '../Data/Articles/CNNArticles/*.csv'
cnn_ner_path = '../Data/NER/count_3/CNN_NER_Results/*.csv'

# Number of articles drawn per file, and a fixed seed so that
# "Restart Kernel -> Run All" reproduces the same sample.
SAMPLE_SIZE = 20
SAMPLE_SEED = 42

# Sorted so the NER files and the article files can be paired by position.
# NOTE(review): this assumes both directories sort into matching order - confirm.
lst_files = sorted(glob.glob(cnn_ner_path))
lst_files_articles = sorted(glob.glob(cnn_articles_path))

all_selected_articles = []
# NOTE(review): range(1, 2) processes only the file pair at index 1 - confirm this
# is intended rather than a leftover from debugging.
for i in range(1, 2):
    file_ner = lst_files[i]
    file_articles = lst_files_articles[i]

    df = pd.read_csv(file_ner, index_col=0)
    df_articles = pd.read_csv(file_articles, index_col=0)

    # Randomly select SAMPLE_SIZE articles (rows); take all of them if there are fewer.
    # The copy keeps `df` itself unchanged when the whole frame is used.
    if len(df) >= SAMPLE_SIZE:
        selected_articles = df.sample(n=SAMPLE_SIZE, replace=False, random_state=SAMPLE_SEED)
    else:
        selected_articles = df.copy()

    # Headline -> Text lookup built once per file. The first article wins when a
    # headline occurs more than once; headlines without a match get NaN as Text.
    headline_to_text = (
        df_articles
        .dropna(subset=['Headline'])
        .drop_duplicates(subset='Headline', keep='first')
        .set_index('Headline')['Text']
    )
    selected_articles['Text'] = selected_articles['Headline'].map(headline_to_text)

    # Append the selected articles to the list
    all_selected_articles.append(selected_articles)

# Concatenate all the selected articles into a single DataFrame
combined_df = pd.concat(all_selected_articles, ignore_index=True)



In [58]:
# Display the combined sample: the selected NER rows together with their 'Text' column.
combined_df

Unnamed: 0,Date,Headline,Count,NER,Text
0,2018-10-31,Trump claims he can defy Constitution and end ...,0,"[{""entity"": ""B-PER"", ""score"": 0.99924743175506...","President Donald Trump offered a dramatic, if ..."
1,2018-05-25,2 men wanted after blast injures 15 people at ...,0,"[{""entity"": ""B-LOC"", ""score"": 0.99959534406661...",A manhunt is on in Canada after two men detona...
2,2021-11-09,Covid-19 drove deaths up 16% in OECD countries...,0,"[{""entity"": ""B-MISC"", ""score"": 0.9694918990135...",The Covid-19 pandemic caused a 16% rise in exp...
3,2020-05-08,A weak polar vortex will usher in record cold air,0,"[{""entity"": ""B-LOC"", ""score"": 0.99911135435104...",A weak polar vortex will help usher in record ...
4,2022-04-19,The bags look like well-known chips or candies...,0,"[{""entity"": ""B-MISC"", ""score"": 0.7330789566040...","At first glance, it looks like a single servin..."
5,2018-03-02,Republican governor asks Trump to reconsider t...,0,"[{""entity"": ""B-MISC"", ""score"": 0.9997137188911...",Wisconsin's Republican governor says President...
6,2018-12-18,"China is buying soybeans again, but Trump is s...",0,"[{""entity"": ""B-LOC"", ""score"": 0.99981015920639...",President Donald Trump said Monday that he wou...
7,2018-02-13,How one tweet inspired 120 people to shovel sn...,0,"[{""entity"": ""B-LOC"", ""score"": 0.99961709976196...",Jahmal Cole is a community organizer in Chatha...
8,2022-09-29,Myanmar court sentences Aung San Suu Kyi and A...,0,"[{""entity"": ""B-LOC"", ""score"": 0.99960035085678...",A Myanmar military court has sentenced ousted ...
9,2017-08-09,U.S. sanctions 8 more Venezuelan leaders tied ...,0,"[{""entity"": ""B-LOC"", ""score"": 0.99926501512527...",The Trump administration just got a little tou...


In [59]:
# Positional index (within combined_df) of the article to inspect.
num = 0
# Print that article's text, split into sentences by text_display.
text_display(combined_df['Text'].iloc[num])

President Donald Trump offered a dramatic, if legally dubious, promise in a new interview to unilaterally end birthright citizenship, ratcheting up his hardline immigration rhetoric with a week to go before critical midterm elections.

Trump's vow to end the right to citizenship for the children of non-citizens and unauthorized immigrants born on US soil came in an interview with Axios released Tuesday.

Such a step would be regarded as an affront to the US Constitution, which was amended 150 years ago to include the words: "All persons born or naturalized in the United States and subject to the jurisdiction thereof, are citizens of the United States." Trump did not say when he would sign the order, and some of his past promises to use executive action have gone unfulfilled.

But whether the President follows through on his threat or not, the issue joins a string of actions intended to thrust the matter of immigration into the front of voters' minds as they head to polls next week.

A 

In [60]:
# Keep the slice a one-row DataFrame (not a Series) so extract_ner_entities can read its 'NER' column.
article_row = combined_df.iloc[num:num+1]
extracted_df = extract_ner_entities(article_row)
processed_results = process_entities(extracted_df)

# One line per entity span: the tag of the span's first token, then its words joined by spaces.
for entity_group in processed_results:
    words = [token['word'] for token in entity_group]
    entity_name = ' '.join(words)
    entity_type = entity_group[0]['entity']
    print(f'{entity_type}: {entity_name}')

B-PER: Trump
B-ORG: Constitution
B-PER: Donald Trump
B-PER: Trump
B-LOC: US
B-PER: A
B-PER: ##xi
B-PER: ##os
B-MISC: US Constitution
B-LOC: United States
B-LOC: United States
B-PER: Trump
B-ORG: Fox News
B-LOC: Mexico
B-LOC: US
B-LOC: US
B-PER: Trump
B-LOC: United States
B-PER: Trump
B-ORG: HBO
B-LOC: Canada
B-ORG: Center
B-ORG: House
B-PER: Paul
B-PER: Ryan
B-LOC: Kentucky


In [53]:
# Check what a single row selected with .iloc[0] is: a pandas Series, not a DataFrame.
type(combined_df.iloc[0])

pandas.core.series.Series