## 1 读原文件

In [None]:
import csv

def remove_quotes(string):
    """Strip wrapping quote characters from *string*.

    If the text both starts and ends with a single quote, every leading and
    trailing single quote is removed; otherwise the same check is made for
    double quotes. Text that is not wrapped in a matching pair is returned
    unchanged.
    """
    for quote in ("'", '"'):
        if string.startswith(quote) and string.endswith(quote):
            return string.strip(quote)
    return string

# Read the source CSV and build `text_list`: one dict per article holding the
# id, title, introduction, strategy and potential texts used by later cells.
with open('/ssd01/Codes/PersonalCodes/ZhangXianpeng/graduation_design/biological_ner/data/original_data/before_translate/biological_strategy.csv', 'r', encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile, delimiter=',', quotechar='"')
    # Skip the header row; the default keeps an empty file from raising
    # StopIteration.
    next(reader, None)

    # Collected article records.
    text_list = []

    for row in reader:
        # Column layout, as named by the original cell:
        #   0 id, 1-3 first/second/third function, 4 title, 5 URL,
        #   6 introduction, 7 strategy, 8 potential, 9 related innovation,
        #   10 related strategy, 11 reference.
        # Only the columns used below are read, so the builtin `id` is no
        # longer shadowed and rows are not required to have all 12 columns.
        doc_id = '##id:' + row[0]
        title = row[4]

        # Drop the quote characters wrapping each field; for the two
        # multi-part fields, turn the "', '" item separators into plain
        # punctuation.
        introduction = remove_quotes(row[6])
        strategy = remove_quotes(row[7]).replace("', '", ", ")
        potential = remove_quotes(row[8]).replace("', '", ". ")

        # Keep the article only if at least one text field is longer than
        # three characters.
        # NOTE(review): presumably this filters out empty/placeholder values
        # such as "[]" - confirm against the data.
        if len(introduction) > 3 or len(strategy) > 3 or len(potential) > 3:
            text_list.append({
                "id": doc_id,
                "title": title,
                "introduction": introduction,
                "strategy": strategy,
                "potential": potential,
            })


In [None]:
# print(len(text_list))
# print(text_list[0])

## 2. 进行依存句法分析 

In [None]:
import spacy
import re

# Load spaCy's small English pipeline.
nlp = spacy.load('en_core_web_sm')

# Run the dependency parser over every article and keep, per token, its text,
# dependency label, head token text and coarse part-of-speech tag.
parse_result_list = []
for doc in text_list:
    # Join the fields with a space. Plain concatenation fused the last word
    # of one field with the first word of the next (e.g. "...pollutantsFungi"),
    # which produced bogus tokens at every field boundary.
    text = " ".join(
        [doc["title"], doc["introduction"], doc["strategy"], doc["potential"]]
    )

    # Remove the literal four-character sequence backslash, "x", "a", "0".
    # NOTE(review): presumably an escaped non-breaking space left in the
    # source data; deleting it (rather than replacing it with a space) joins
    # the neighbouring words - confirm this is intended.
    text = text.replace("\\xa0", "")

    # Parse the cleaned text.
    result = nlp(text)

    parse_result = {
        "id": doc["id"],
        "title": doc["title"],
        "result": [
            {
                "text": token.text,
                "dep_": token.dep_,
                "head": token.head.text,
                "pos_": token.pos_,
            }
            for token in result
        ],
    }
    parse_result_list.append(parse_result)


In [None]:
# Sanity check: print how many documents were parsed, then evaluate the first
# parse record as the final bare expression of the cell so it can be displayed.
print(len(parse_result_list))
parse_result_list[0]

## 3.将句法分析的结果存入csv表格 

In [None]:
# Path of the CSV file that receives one row per parsed token.
csv_file = '/ssd01/Codes/PersonalCodes/ZhangXianpeng/graduation_design/biological_ner/dependency_parsing_data/parsing_result.csv'

# Write every token of every parsed document as one CSV row.
with open(csv_file, 'w', newline='', encoding="utf-8") as file:
    # Let csv.writer do the quoting. The previous hand-written rows only
    # replaced commas, so a token containing a double quote or a newline
    # produced rows that csv.reader could not parse back correctly.
    # lineterminator="\n" keeps the line endings the file had before.
    writer = csv.writer(file, lineterminator="\n")

    # Header row.
    writer.writerow(["id", "title", "text", "dep_", "head", "pos_"])

    # Data rows. Fields are written verbatim; commas no longer need to be
    # rewritten to "-" because the writer quotes them.
    for parsed_doc in parse_result_list:
        for token in parsed_doc["result"]:
            writer.writerow([
                parsed_doc["id"],
                parsed_doc["title"],
                token["text"],
                token["dep_"],
                token["head"],
                token["pos_"],
            ])

In [None]:
# 统计句法依存关系的数量和词性的数量
import csv

# Gather the distinct dependency labels and POS tags present in the parsing
# result CSV and print both collections.
file_path = '/ssd01/Codes/PersonalCodes/ZhangXianpeng/graduation_design/biological_ner/dependency_parsing_data/parsing_result.csv'
with open(file_path, "r", encoding="utf-8") as input_file:
    reader = csv.DictReader(input_file)
    seen_deps = set()
    seen_pos = set()
    for row in reader:
        # Sets discard repeats as the rows stream by.
        seen_deps.add(row["dep_"])
        seen_pos.add(row["pos_"])
dep_list = list(seen_deps)
pos_list = list(seen_pos)
print("依存关系汇总：", dep_list)
print("词性汇总：", pos_list)

### 依存关系（Dependency Relations）：
nsubj: 名词主语    
dobj: 直接宾语    
iobj: 间接宾语    
csubj: 从句主语    
cc: 并列关系中的连接词
conj: 并列关系中的其他词    
advmod: 状语    
amod: 形容词修饰语（修饰名词的形容词）    
neg: 否定词    
det: 冠词、限定词
aux: 助动词   
auxpass: 被动语态的助动词    
prep: 介词    
pobj: 介词宾语     
nummod: 数量修饰词
quantmod: 量词修饰词    
compound: 复合词    
appos: 同位语    
mark: 标记词      
advcl: 状语从句        
acl: 从句修饰    
parataxis: 平行结构    
discourse: 跨句关系     
ROOT: 根节点（spaCy 输出的标签为大写 `ROOT`）

### 词性（Part-of-Speech, POS）：
ADJ: 形容词    ADP: 介词    ADV: 副词    AUX: 助动词    CCONJ: 并列连词
DET: 冠词、限定词    INTJ: 感叹词    NOUN: 名词    NUM: 数词    PART: 助词
PRON: 代词    PROPN: 专有名词    PUNCT: 标点符号    SCONJ: 从属连词
SYM: 符号     VERB: 动词     X: 其他

#### 一般名词和专有名词是结构，或者功能对象
#### 一般动词是行为
#### 科学效应一般也都是动词和名词

## 4. 获得功能知识的句法分析结果 

In [None]:
import csv

input_file_path = '/ssd01/Codes/PersonalCodes/ZhangXianpeng/graduation_design/biological_ner/dependency_parsing_data/parsing_result.csv'
output_file_path = '/ssd01/Codes/PersonalCodes/ZhangXianpeng/graduation_design/biological_ner/dependency_parsing_data/parsing_result_F.csv'

# Dependency labels kept for "function" knowledge. spaCy reports the sentence
# root as upper-case "ROOT", so the previous comparison against 'root' never
# matched; the lower-case spelling is still accepted for existing files.
function_deps = {'ROOT', 'root', 'dobj', 'nsubjpass'}
# Part-of-speech tags kept for "function" knowledge.
function_pos = {'NOUN', 'PROPN', 'VERB', 'ADJ'}

# Copy to the output file every token row whose dependency label (column 3)
# and POS tag (column 5) are both in the sets above.
with open(input_file_path, "r", encoding="utf-8") as input_file, open(output_file_path, 'w', newline='', encoding="utf-8") as output_file:
    reader = csv.reader(input_file)
    writer = csv.writer(output_file)
    output_file.write("id,title,text,dep_,head,pos_\n")

    # The input header row is also read here, but its values ("dep_",
    # "pos_") match neither set, so it is not copied a second time.
    for row in reader:
        if row[3] in function_deps and row[5] in function_pos:
            writer.writerow(row)

## 5. 获得结构知识的句法分析结果 

In [None]:
import csv

input_file_path = '/ssd01/Codes/PersonalCodes/ZhangXianpeng/graduation_design/biological_ner/dependency_parsing_data/parsing_result.csv'
output_file_path = '/ssd01/Codes/PersonalCodes/ZhangXianpeng/graduation_design/biological_ner/dependency_parsing_data/parsing_result_S.csv'

# Dependency labels and POS tags that mark a token row as "structure" knowledge.
structure_deps = ('amod', 'nsubj', 'agent')
structure_pos = ('NOUN', 'PROPN', 'VERB', 'ADJ')

# Copy the matching token rows from the parsing result into the structure file.
with open(input_file_path, "r", encoding="utf-8") as input_file, open(output_file_path, 'w', newline='', encoding="utf-8") as output_file:
    reader = csv.reader(input_file)
    writer = csv.writer(output_file)
    output_file.write("id,title,text,dep_,head,pos_\n")

    for row in reader:
        # Column 3 is the dependency label; skip rows with other labels.
        if row[3] not in structure_deps:
            continue
        # Column 5 is the POS tag; skip rows with other tags.
        if row[5] not in structure_pos:
            continue
        writer.writerow(row)

## 6.功能结构知识的过滤 
### 6.1获得主题相关词汇

In [None]:
import csv

def remove_quotes(string):
    """Return *string* without the quote characters that wrap it.

    When the first and last characters are both single quotes, all leading
    and trailing single quotes are stripped; failing that, the same is done
    for double quotes. Any other text comes back untouched.
    """
    first_char, last_char = string[:1], string[-1:]
    if first_char == "'" and last_char == "'":
        return string.strip("'")
    if first_char == '"' and last_char == '"':
        return string.strip('"')
    return string

# Read the source CSV and build `text_list`: one dict per article holding the
# id, title, introduction, strategy and potential texts used by the TF-IDF
# cells below.
with open('/ssd01/Codes/PersonalCodes/ZhangXianpeng/graduation_design/biological_ner/data/original_data/before_translate/biological_strategy.csv', 'r', encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile, delimiter=',', quotechar='"')
    # Skip the header row; the default keeps an empty file from raising
    # StopIteration.
    next(reader, None)

    # Collected article records.
    text_list = []

    for row in reader:
        # Column layout, as named by the original cell:
        #   0 id, 1-3 first/second/third function, 4 title, 5 URL,
        #   6 introduction, 7 strategy, 8 potential, 9 related innovation,
        #   10 related strategy, 11 reference.
        # Only the columns used below are read, so the builtin `id` is no
        # longer shadowed and rows are not required to have all 12 columns.
        doc_id = '##id:' + row[0]
        title = row[4]

        # Drop the quote characters wrapping each field; for the two
        # multi-part fields, turn the "', '" item separators into plain
        # punctuation.
        introduction = remove_quotes(row[6])
        strategy = remove_quotes(row[7]).replace("', '", ", ")
        potential = remove_quotes(row[8]).replace("', '", ". ")

        # Keep the article only if at least one text field is longer than
        # three characters.
        # NOTE(review): presumably this filters out empty/placeholder values
        # such as "[]" - confirm against the data.
        if len(introduction) > 3 or len(strategy) > 3 or len(potential) > 3:
            text_list.append({
                "id": doc_id,
                "title": title,
                "introduction": introduction,
                "strategy": strategy,
                "potential": potential,
            })


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import json

def get_top_keywords(text: str, k: int = 20):
    """Return the *k* highest-weighted terms of *text* with their TF-IDF values.

    The vectorizer is fitted on this single text only, so every term gets the
    same IDF factor and the ranking is effectively by normalised term
    frequency.

    Args:
        text: The document to analyse.
        k: Maximum number of keywords to return.

    Returns:
        A list of ``(term, weight)`` pairs sorted by descending weight, at
        most *k* long.
    """
    # "english" selects scikit-learn's built-in English stop-word list, the
    # same one as ENGLISH_STOP_WORDS. Passing that frozenset directly is
    # rejected by parameter validation in recent scikit-learn releases, which
    # accept only "english", a list or None.
    vectorizer = TfidfVectorizer(stop_words="english")

    # Vectorise the single document; the result is a 1 x n_terms matrix.
    tfidf_matrix = vectorizer.fit_transform([text])

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on old installations.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    # Map each term that occurs in the document to its weight.
    keywords = {}
    for col in tfidf_matrix.nonzero()[1]:
        keywords[feature_names[col]] = tfidf_matrix[0, col]

    # Highest weight first; ties keep their original relative order because
    # sorted() is stable.
    sorted_keywords = sorted(keywords.items(), key=lambda x: x[1], reverse=True)
    top_keywords = sorted_keywords[:k]

    return top_keywords

def write_json(doc_tfidf, output_file):
    """Append *doc_tfidf* to *output_file* as one JSON line.

    The file is opened in append mode with UTF-8 encoding, and non-ASCII
    characters are written as-is rather than escaped.
    """
    with open(output_file, 'a', encoding='utf-8') as sink:
        serialized = json.dumps(doc_tfidf, ensure_ascii=False)
        sink.writelines([serialized, "\n"])

In [None]:
tfidf_result = '/ssd01/Codes/PersonalCodes/ZhangXianpeng/graduation_design/biological_ner/dependency_parsing_data/tfidf_result.jsonl'

# write_json opens the file in append mode, so empty it first. Otherwise
# every re-run of this cell adds a second copy of each record, and the
# filtering cells then see every document twice.
with open(tfidf_result, 'w', encoding='utf-8'):
    pass

# Compute the top keywords of every article and store one JSON line per article.
for doc in text_list:
    # Join the fields with a space. Plain concatenation fused the last word
    # of one field with the first word of the next, creating terms that
    # appear in no field.
    text = " ".join(
        [doc["title"], doc["introduction"], doc["strategy"], doc["potential"]]
    )

    # Remove the literal four-character sequence backslash, "x", "a", "0".
    # NOTE(review): presumably an escaped non-breaking space left in the
    # source data - confirm that deleting it outright is intended.
    text = text.replace("\\xa0", "")

    # Record the article id, title and its top keywords with their weights.
    doc_tfidf = {
        "id": doc["id"],
        "title": doc["title"],
        "keywords": get_top_keywords(text),
    }
    write_json(doc_tfidf, tfidf_result)

In [None]:
# NOTE(review): in the visible code `keywords` is only assigned inside
# get_top_keywords, so at notebook scope this expression raises NameError
# unless the name is defined in a cell not shown here - confirm or remove.
keywords

### 6.2根据主题相关词汇筛选功能结构知识
#### 6.2.1筛选功能

In [1]:
import json

# Load the TF-IDF JSONL file into a list of single-entry dicts of the form
# {article id: [[keyword, weight], ...]}.
teidf_result_list = []
tfidf_result = '/ssd01/Codes/PersonalCodes/ZhangXianpeng/graduation_design/biological_ner/dependency_parsing_data/tfidf_result.jsonl'
# The file is written as UTF-8 with ensure_ascii=False, so read it with the
# same encoding instead of relying on the platform default.
with open(tfidf_result, 'r', encoding='utf-8') as file:
    for line in file:
        # Skip blank lines (e.g. a trailing newline) rather than letting
        # json.loads fail on an empty string.
        if not line.strip():
            continue
        line_dict = json.loads(line)
        doc = {}
        doc[line_dict['id']] = line_dict['keywords']
        teidf_result_list.append(doc)

In [2]:
# Evaluate the first loaded record ({article id: [[keyword, weight], ...]})
# so it can be inspected.
teidf_result_list[0]

{'##id:0': [['fungi', 0.43515327467107956],
  ['molecules', 0.2769157202452324],
  ['enzymes', 0.23735633163877065],
  ['use', 0.23735633163877065],
  ['break', 0.19779694303230888],
  ['chemicals', 0.19779694303230888],
  ['energy', 0.1582375544258471],
  ['pollutants', 0.11867816581938533],
  ['chemical', 0.11867816581938533],
  ['living', 0.11867816581938533],
  ['environment', 0.11867816581938533],
  ['natural', 0.11867816581938533],
  ['make', 0.11867816581938533],
  ['organisms', 0.11867816581938533],
  ['certain', 0.07911877721292356],
  ['pollution', 0.07911877721292356],
  ['process', 0.07911877721292356],
  ['waste', 0.07911877721292356],
  ['water', 0.07911877721292356],
  ['substances', 0.07911877721292356]]}

In [6]:
import csv
import json

input_file_path = '/ssd01/Codes/PersonalCodes/ZhangXianpeng/graduation_design/biological_ner/dependency_parsing_data/parsing_result_F.csv'
output_file_path = '/ssd01/Codes/PersonalCodes/ZhangXianpeng/graduation_design/biological_ner/dependency_parsing_data/parsing_tfidf_F.csv'

# Build an article id -> keyword set lookup once, instead of scanning every
# document's keyword list for every CSV row. Entries that share an id are
# merged, so repeated records do not multiply the output.
keywords_by_id = {}
for entry in teidf_result_list:
    for doc_id, scored_keywords in entry.items():
        # Each element is a [keyword, weight] pair; only the keyword is used.
        keywords_by_id.setdefault(doc_id, set()).update(
            pair[0].lower() for pair in scored_keywords
        )

# Keep the function-knowledge rows whose token (column 2) or head (column 4)
# is one of the top keywords of the same article (column 0).
with open(input_file_path, "r", encoding="utf-8") as input_file, open(output_file_path, 'w', newline='', encoding="utf-8") as output_file:
    reader = csv.reader(input_file)
    writer = csv.writer(output_file)
    output_file.write("id,title,text,dep_,head,pos_\n")

    for row in reader:
        # Rows whose id has no keywords (including the input header row,
        # whose id field is "id") are skipped.
        doc_keywords = keywords_by_id.get(row[0])
        if not doc_keywords:
            continue
        # The stored keywords are lower-case (see the sample record above),
        # so compare case-insensitively; otherwise capitalised tokens such
        # as sentence-initial words could never match. Each row is written
        # at most once, even when the token and the head match different
        # keywords.
        if row[2].lower() in doc_keywords or row[4].lower() in doc_keywords:
            writer.writerow(row)

#### 6.2.2筛选结构 

In [8]:
import csv

input_file_path = '/ssd01/Codes/PersonalCodes/ZhangXianpeng/graduation_design/biological_ner/dependency_parsing_data/parsing_result_S.csv'
output_file_path = '/ssd01/Codes/PersonalCodes/ZhangXianpeng/graduation_design/biological_ner/dependency_parsing_data/parsing_tfidf_S.csv'

# Build an article id -> keyword set lookup once, instead of scanning every
# document's keyword list for every CSV row. Entries that share an id are
# merged, so repeated records do not multiply the output.
keywords_by_id = {}
for entry in teidf_result_list:
    for doc_id, scored_keywords in entry.items():
        # Each element is a [keyword, weight] pair; only the keyword is used.
        keywords_by_id.setdefault(doc_id, set()).update(
            pair[0].lower() for pair in scored_keywords
        )

# Keep the structure-knowledge rows whose token (column 2) or head (column 4)
# is one of the top keywords of the same article (column 0).
with open(input_file_path, "r", encoding="utf-8") as input_file, open(output_file_path, 'w', newline='', encoding="utf-8") as output_file:
    reader = csv.reader(input_file)
    writer = csv.writer(output_file)
    output_file.write("id,title,text,dep_,head,pos_\n")

    for row in reader:
        # Rows whose id has no keywords (including the input header row,
        # whose id field is "id") are skipped.
        doc_keywords = keywords_by_id.get(row[0])
        if not doc_keywords:
            continue
        # The stored keywords are lower-case (see the sample record above),
        # so compare case-insensitively; otherwise capitalised tokens such
        # as sentence-initial words could never match. Each row is written
        # at most once, even when the token and the head match different
        # keywords.
        if row[2].lower() in doc_keywords or row[4].lower() in doc_keywords:
            writer.writerow(row)

## 7. 获得结构-功能知识对 
通过关联 nsubj（名词性主语）和 agent（被动语态中由 by 引出的施事者）两种依存关系来实现

（1）如果结构知识中的结构词作为主语出现在功能知识中，那么这个结构知识和功能知识就可以形成一个结构-功能知识对

（2）如果结构知识和功能知识中具有相同的动词，那么也可以形成一个结构-功能知识对

## 8.功能结构知识对的存储 