In [121]:
import pandas as pd
seeds = pd.read_csv('2024/seeds_2024.csv')
references = pd.read_csv('2024/references_2024.csv')
citations = pd.read_csv('2024/citations_2024.csv')

In [122]:
len(seeds), len(references), len(citations)

(568, 7962, 1210)

1. 去除HTML标签和特殊字符

In [123]:
# 定义替换函数
def standardize_punctuation(text):
    if not isinstance(text, str):
        return text
    
    # 使用字典定义所有需要替换的字符
    replacements = {
        '–': '-', '—': '-',   # 破折号
        '“': '"', '”': '"',   # 双引号
        '‘': "'", '’': "'",   # 单引号
        '…': '...',           # 省略号
        '《': '"', '》': '"',    # 法语引号
        '（': '(', '）': ')',
        '【': '[', '】': ']',
        '；': ';', '：': ':',
        '？': '?', '！': '!',
        '，': ',', '。': '.'
    }
    
    # 逐个替换
    for old, new in replacements.items():
        text = text.replace(old, new)
    
    return text

In [124]:
# 应用替换到指定列
seeds['processed_abstract'] = seeds['abstract'].apply(standardize_punctuation)
references['processed_abstract'] = references['abstract'].apply(standardize_punctuation)
citations['processed_abstract'] = citations['abstract'].apply(standardize_punctuation)

In [125]:
from bs4 import BeautifulSoup
import re, html

def advanced_clean_text(text):
    """
    增强版文本清理，处理更多边缘情况
    """
    if not isinstance(text, str):
        return ""
    
    # 处理常见的"垃圾"Unicode字符
    text = re.sub(
        r'[\x00-\x1f\x7f-\x9f\u2000-\u200f\u2028-\u202f\u205f-\u206f\ufeff\ufff0-\uffff]',
        ' ', 
        text
    )
    
    # 处理HTML实体和标签
    text = html.unescape(text)
    text = BeautifulSoup(text, "html.parser").get_text(" ")
    
    # 处理URL(可选)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    
    # 处理特殊字符但保留有意义的标点
    text = re.sub(r"""[^\w\s@#\$%&\*\+-=<>/\\:;'"\.,\?\!\(\)\[\]\{\}]""", '', text)
    
    # 规范化空白
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

In [126]:
# 应用替换到指定列
seeds['processed_abstract'] = seeds['processed_abstract'].apply(advanced_clean_text)
references['processed_abstract'] = references['processed_abstract'].apply(advanced_clean_text)
citations['processed_abstract'] = citations['processed_abstract'].apply(advanced_clean_text)

  text = BeautifulSoup(text, "html.parser").get_text(" ")


2. 分词处理

In [127]:
import spacy
nlp = spacy.load("en_core_web_sm")

def tokenize_abstracts_df(df, text_column='processed_abstract'):
    """
    处理DataFrame中指定文本列的分词
    
    参数:
        df: pandas DataFrame
        text_column: 要处理的文本列名(默认为'abstract')
    
    返回:
        包含分词结果的新Series
    """
    return df[text_column].apply(lambda text: [token.text for token in nlp(text)]), df[text_column].apply(lambda text: [token.tag_ for token in nlp(text)])

In [128]:
# 应用替换到指定列
seeds['tokenized_abstract'], seeds['tokenized_tag'] = tokenize_abstracts_df(seeds)
references['tokenized_abstract'], references['tokenized_tag'] = tokenize_abstracts_df(references)
citations['tokenized_abstract'], citations['tokenized_tag'] = tokenize_abstracts_df(citations)

3. 词形还原

In [129]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

from nltk.corpus import wordnet
import nltk

nltk.download('wordnet')

def get_wordnet_pos(spacy_tag):
    """将 spaCy 的词性标签转换为 WordNet 的词性标签"""
    if spacy_tag.startswith('J'):  # 形容词
        return wordnet.ADJ
    elif spacy_tag.startswith('V'):  # 动词
        return wordnet.VERB
    elif spacy_tag.startswith('N'):  # 名词
        return wordnet.NOUN
    elif spacy_tag.startswith('R'):  # 副词
        return wordnet.ADV
    else:
        return wordnet.NOUN  # 默认作为名词处理
    
def lemmatize_tokens(row):
    """
    使用现有的 abstract 和 tag 列进行词形还原
    :param row: DataFrame 的一行，包含 'abstract' 和 'tag' 列
    :return: 词形还原后的单词列表
    """
    tokens = row['tokenized_abstract']
    tags = row['tokenized_tag']
    
    lemmatized_tokens = []
    for token, tag in zip(tokens, tags):
        wordnet_pos = get_wordnet_pos(tag)
        lemma = lemmatizer.lemmatize(token, pos=wordnet_pos)
        lemmatized_tokens.append(lemma.lower()) # 小写处理
    return lemmatized_tokens

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [130]:
# 应用替换到指定列
seeds['lemmatized_abstract'] = seeds.apply(lemmatize_tokens, axis=1)
references['lemmatized_abstract'] = references.apply(lemmatize_tokens, axis=1)
citations['lemmatized_abstract'] = citations.apply(lemmatize_tokens, axis=1)

4. 去除停用词

In [131]:
from nltk.corpus import stopwords
import nltk

# 确保NLTK停用词已下载（如果没有，运行第一次时会自动下载）
nltk.download('stopwords')

# 定义停用词
stop_words = set(stopwords.words('english'))

# 扩展领域词汇（AI + Finance）
domain_words = {
    # 通用AI术语
    'ai', 'artificial', 'intelligence', 'machine', 'learning', 'ml', 'deep', 'learning', 'dl',
    'neural', 'network', 'nn', 'transformer', 'attention', 'gpt', 'llm', 'bert', 'gpt-3', 'gpt-4',
    'nlp', 'natural', 'language', 'processing', 'computer', 'vision', 'cv', 'generative', 'gan',
    'reinforcement', 'learning', 'rl', 'supervised', 'unsupervised', 'semi-supervised',
    
    # 金融科技（FinTech）相关
    'finance', 'financial', 'fintech', 'banking', 'investment', 'stock', 'market', 'trading',
    'portfolio', 'risk', 'management', 'credit', 'loan', 'blockchain', 'crypto', 'cryptocurrency',
    'bitcoin', 'ethereum', 'defi', 'algorithmic', 'forex', 'quantitative', 'hedge', 'fund',
    'robo-advisor', 'fraud', 'detection', 'regulation', 'compliance', 'insurtech', 'payments',
    'wealth', 'management', 'asset', 'lending', 'mortgage', 'insurance', 'audit', 'accounting',
    'financial', 'inclusion', 'microfinance', 'crowdfunding', 'p2p', 'peer-to-peer',
    
    # 模型/方法相关
    'gcn', 'graph', 'convolutional', 'network', 'lstm', 'cnn', 'rnn', 'transformer', 'attention',
    'svm', 'random', 'forest', 'xgboost', 'clustering', 'regression', 'classification',
    'optimization', 'bayesian', 'forecasting', 'prediction', 'anomaly', 'detection'
}

def filter_tokens(df, text_columns='lemmatized_abstract'):
    """
    过滤停用词，但保留领域词汇
    
    参数:
        tokenized_abstracts: 分词后的数据集（List[List[str]]）
    
    返回:
        过滤后的分词结果（List[List[str]]）
    """
    filtered_abstracts = []
    for tokens in df[text_columns]:
        filtered = [
            t.lower() for t in tokens 
            if (t.lower() not in stop_words) or (t.lower() in domain_words)
        ]
        filtered_abstracts.append(filtered)
    return filtered_abstracts

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [132]:
# 应用替换到指定列
seeds['final_abstract'] = filter_tokens(seeds)
references['final_abstract'] = filter_tokens(references)
citations['final_abstract'] = filter_tokens(citations)

In [133]:
# 保存为新的CSV文件（使用UTF-8编码）
seeds.to_csv('2024/seeds_processed_abstract_2024.csv', index=False, encoding='utf-8-sig')
references.to_csv('2024/references_processed_abstract_2024.csv', index=False, encoding='utf-8-sig')
citations.to_csv('2024/citations_processed_abstract_2024.csv', index=False, encoding='utf-8-sig')