#### 优化章节名和参考文献名

##### 章节名

In [2]:
import pandas as pd

# Load the scraped section headings (one heading per row in the `section` column).
df = pd.read_csv("../Data/plos_section.csv")
sections = df['section'].tolist()

print(len(sections))
# Deduplicate while keeping first-seen order. `list(set(...))` returns the
# elements in an order that changes between kernel restarts, which makes any
# later preview such as `sections[:5]` non-reproducible.
sections = df['section'].drop_duplicates().tolist()
print(len(sections))

55580
55580


In [3]:
# Preview the first five deduplicated section names.
sections[:5]

[nan,
 'Methodological Considerations and Study Limitations',
 '2. Flow field model of micro groove seal',
 '9 Supplementary materials',
 'Ethical approval and informed consent']

In [None]:
"""
    1. 对于不规范但确实属于章节名的数据
    2. 对于不规范且不属于章节名的数据  （可能是爬取过程中存在的问题，这部分数据如何处理，去除吗？）

    但重点是如何利用章节名信息🤔， 比如说我认为在method部分共现的更加重要，这依据何在？
"""

##### 参考文献名

In [1]:
"""
    1. 通过建立字典，去优化期刊名
    2. 通过从Web of Science进行检索获取参考文献

    这一步是肯定要做的,且这一步骤是需要针对所有论文进行处理的
"""
import pandas as pd

In [2]:
# Read the reference-source table and discard every row that has a missing value.
df = pd.read_csv('data/refer_source.csv').dropna()
df.shape

(13024187, 2)

In [3]:
from collections import Counter

sources = df['Refer_Source'].tolist()

# Number of occurrences of each reference-source title (non-string entries are
# ignored). The previous hand-written loop stored 0 for the first occurrence,
# so every count was one too low. The Counter is converted back to a plain
# dict so that looking up an unknown title still raises KeyError.
title_to_nums = dict(Counter(title for title in sources if isinstance(title, str)))

In [4]:
# Distinct reference-source titles. This is a live view of the dict keys,
# not a copy, so it reflects later changes to `title_to_nums`.
unique_sources = title_to_nums.keys()
print(len(unique_sources))

887593


In [None]:
df_match = pd.read_csv("data/match_journal.csv")
match_journals = df_match['MATCH Source'].tolist()

# cnt           : total number of references whose source is a matched journal
# error_cnt     : number of matched journals that never occur in `title_to_nums`
# error_journals: those journals, kept for inspection
cnt = 0
error_cnt = 0
error_journals = []
for journal in set(match_journals):
    # Explicit membership test instead of a bare `except:`, which would also
    # have hidden unrelated errors.
    if journal in title_to_nums:
        cnt += title_to_nums[journal]
    else:
        error_journals.append(journal)
        error_cnt += 1

print(cnt)
# Share of all reference sources covered by the matched journals.
print(cnt/len(sources))
print(error_cnt)

In [None]:
import os

list_dir = "data/standard journal list"
files = os.listdir(list_dir)

journals = []

for file in files:
    df = pd.read_csv(os.path.join(list_dir, file))
    # The journal name is in 'Journal title' in some lists and in 'Title' in
    # others. Choose the column explicitly rather than through a bare `except:`.
    title_column = 'Journal title' if 'Journal title' in df.columns else 'Title'
    journals.extend(df[title_column].tolist())

# Add the Scopus source list, keeping only rows whose source type is 'Journal'.
scopus_df = pd.read_excel("data/standard journal(scopus)/ext_list_August_2024.xlsx")
scopus_df = scopus_df[scopus_df['Source Type'] == 'Journal']

journals.extend(scopus_df['Source Title'].tolist())
len(set(journals))

In [7]:
# The commented-out lines below appear to be the one-off step that first wrote
# the collected `journals` to data/standard_journals.csv.
# df_standard = pd.DataFrame()
# df_standard['Source'] = list(set(journals))
# df_standard.to_csv("data/standard_journals.csv", index=False)

# Reload the saved standard journal list from disk.
# NOTE(review): a later cell writes to this same CSV path, so the count shown
# here depends on which cells have already been run.
df_standard = pd.read_csv("data/standard_journals.csv")
stan_journals = df_standard['Source'].tolist()
len(set(stan_journals))

68121

In [12]:
# Journal names from SciSciNet

with open("data/sciscinet/SciSciNet_Journals.tsv", 'r', encoding='utf-8') as f:
    datas = f.readlines()

# The journal name is taken from the second tab-separated field of each line.
# NOTE(review): if the TSV has a header row, its column name is collected as a
# journal too - confirm and skip the first line if so.
sci_journals = []
for data in datas:
    if not data.strip():
        # Ignore blank lines instead of failing on elems[1].
        continue
    # Strip the newline so it cannot stick to the name when it is the last field.
    elems = data.rstrip('\n').split('\t')
    sci_journals.append(elems[1])

# Set union works whether `stan_journals` is still a list or is already a set,
# so this cell can be re-run (list + list on a set raised TypeError before).
stan_journals = set(stan_journals) | set(sci_journals)
print(len(stan_journals))

# Save the merged list. This overwrites the CSV that `stan_journals` was loaded from.
df_standard = pd.DataFrame({'Source': list(stan_journals)})
df_standard.to_csv("data/standard_journals.csv", index=False)

99302

In [7]:
# Journal names from OpenAlex
import json

# Read as UTF-8 explicitly so the result does not depend on the platform's
# default encoding (the SciSciNet cell opens its file the same way).
with open("data/openalex/journal", 'r', encoding='utf-8') as f:
    datas = json.load(f)

# Each record's `display_name` is taken as the journal name.
openalex_journal = [data['display_name'] for data in datas]
print(len(set(openalex_journal)))

245324


In [9]:
# Reload the saved standard list and test whether every name in it also
# appears among the OpenAlex journal names.
df = pd.read_csv("data/standard_journals.csv")
stan_journals = df['Source'].to_list()

set(stan_journals) <= set(openalex_journal)

False

In [13]:
# Merge the standard list with the OpenAlex names. Set union works whether
# `stan_journals` is a list or already a set, so the cell can be re-run
# (list + list on a set raised TypeError before).
stan_journals = set(stan_journals) | set(openalex_journal)
print(len(stan_journals))

# Save the combined list to a separate "plus" file.
df = pd.DataFrame({'Source': list(stan_journals)})
df.to_csv("data/standard_journals_plus.csv", index=False)

302923


In [8]:
"Journal of Psycholinguistic Research" in stan_journals

True

##### 标准化参考文献名

##### 1. 自己构建缩写名称

两种期刊名缩写格式: 

https://www.sohu.com/a/673180484_121252874

https://www.lcgdbzz.org/news/bkgf/58444b6f-f790-4e2d-99d2-38b932ef9ddd.htm

    原： JOURNAL OF CONTROLLED RELEASE

    缩写（1） J CONTROL RELEASE
    缩写（2） J. Control. Release

In [67]:
vowels = ['a', 'e', 'i', 'o', 'u']
func_words = ['The', 'of', 'the']

# Cache of long word -> shortened form, shared across calls so that a given
# word is always shortened the same way.
word_to_abbr = {}


def _abbreviate_word(word):
    """Shorten one long word and remember the result in `word_to_abbr`.

    Rules, in order: drop a trailing 'ogy' or 'ics' (3 characters); for a
    trailing 'try' drop the last 5 characters; otherwise cut the word just
    before the earliest lowercase vowel hit that is not at position 0. For
    each vowel only its first occurrence is considered. If there is no such
    hit, the word is returned unchanged.
    """
    if word in word_to_abbr:
        return word_to_abbr[word]

    if word.endswith(('ogy', 'ics')):
        short = word[:-3]
    elif word.endswith('try'):
        short = word[:-5]
    else:
        hits = [idx for idx in (word.find(vowel) for vowel in vowels) if idx > 0]
        short = word[:min(hits)] if hits else word

    word_to_abbr[word] = short
    return short


def generate_abbreviation(title):
    """Build three candidate abbreviations for a journal title.

    Args:
        title: Journal title. Words are separated by any run of whitespace.

    Returns:
        None when `title` is not a string or has fewer than two words.
        Otherwise a tuple `(abbr_1, abbr_2, abbr_3)`:
          * abbr_1 - initials of every word, e.g. "British Medical Journal" -> "BMJ";
          * abbr_2 - initials with the words in `func_words` removed;
          * abbr_3 - words in `func_words` removed, words of up to 5 characters
            kept as they are, longer words shortened, joined by single spaces.
    """
    if not isinstance(title, str):
        return None

    # split() without an argument collapses repeated whitespace. With
    # split(" ") a double space produced an empty word and word[0] raised
    # IndexError.
    words = title.split()
    if len(words) <= 1:
        return None

    abbr_1 = "".join(word[0] for word in words)

    # Function words are dropped for both abbr_2 and abbr_3. Previously
    # abbr_3 kept them, contrary to its own stated purpose.
    content_words = [word for word in words if word not in func_words]
    abbr_2 = "".join(word[0] for word in content_words)

    abbr_3 = " ".join(
        word if len(word) <= 5 else _abbreviate_word(word)
        for word in content_words
    )

    return abbr_1, abbr_2, abbr_3

##### 2. 基于编辑距离算法

可用第三方库： python-Levenshtein, editdistance, textdistance

In [4]:
# Convert the results into a dictionary so later code can look them up easily

import pandas as pd
import json

# A candidate is accepted only when its edit distance is below this value;
# names that are too far apart must not be grouped together.
MAX_EDIT_DISTANCE = 10

df = pd.read_csv("data/close_form_journal.csv")
frame_titles = ["Raw J", "New D", "New J", "New D Second", "New J Second"]

word_to_close_form = {}
for row in df.itertuples():
    # itertuples() puts the index at row[0], so the positions used here are
    # the 2nd..6th CSV columns.
    # NOTE(review): presumably these correspond to `frame_titles` after a
    # leading unnamed column - confirm against the CSV header.
    candidates = []
    word_to_close_form[row[2]] = candidates
    try:
        if row[3] < MAX_EDIT_DISTANCE:
            candidates.append(row[4])
        if row[5] < MAX_EDIT_DISTANCE:
            candidates.append(row[6])
    except TypeError:
        # A distance that cannot be compared with a number (e.g. text): keep
        # what was collected so far for this row. Only TypeError is tolerated,
        # so other problems are no longer hidden by a bare `except:`.
        pass

json_datas = json.dumps(word_to_close_form, indent=4)
with open("data/close_form_journal.json", "w") as f:
    f.write(json_datas)

##### 3. 从NLM官网爬取

In [2]:
import json

# Read the crawled NLM results as UTF-8 explicitly, so the result does not
# depend on the platform's default encoding.
with open("data/nlm/part_3_test_results.json", 'r', encoding='utf-8') as f:
    datas = json.load(f)

type(datas)

dict

In [7]:
# First, use the dictionary keys to find which URLs were not crawled successfully
import pandas as pd

crawl_urls = list(datas.keys())

df = pd.read_csv("data/nlm/standard_journals_nlm_url.csv")
all_urls = df['URL'].tolist()

# Build each set once, then report: URLs still missing, and distinct URLs in total.
all_url_set = set(all_urls)
missing_url_set = all_url_set - set(crawl_urls)
print(len(missing_url_set))
print(len(all_url_set))

# miss_df = pd.DataFrame()
# miss_df['URL'] = list(set(all_urls)-set(crawl_urls))
# miss_df.to_csv("data/nlm/miss_urls", index=False)

99242
99302


In [5]:
# Then use the dictionary values to map standard journal names to their abbreviations
from tqdm import tqdm

NLM_ABBR_PREFIX = "NLM Title Abbreviation:"

abbr_dict = {}
# `title` values of records that could not be paired with abbreviations.
# They are kept for inspection instead of being dropped silently.
skipped_titles = []

for items in tqdm(datas.values()):
    titles = items['title']
    # The attributes are stored under 'attri' in some records and 'attr' in others.
    attrs = items['attri'] if 'attri' in items else items['attr']

    if isinstance(titles, str):
        # Single title: the attributes are stored as they are.
        # NOTE(review): the multi-title branch stores only the abbreviation
        # text - confirm that the different value shape here is intended.
        abbr_dict[titles] = attrs
        continue

    # Several titles: keep only the abbreviation entries, without the label.
    simple_attrs = [
        attr.replace(NLM_ABBR_PREFIX, "").strip()
        for attr in attrs
        if NLM_ABBR_PREFIX in attr
    ]

    # Titles and abbreviations are paired by position, so the counts must
    # match. An explicit check replaces `assert` inside `try/except: pass`.
    if not isinstance(titles, (list, tuple)) or len(simple_attrs) != len(titles):
        skipped_titles.append(titles)
        continue

    for title, abbr in zip(titles, simple_attrs):
        abbr_dict[title] = abbr

100%|██████████| 60/60 [00:00<?, ?it/s]


In [6]:
# Serialize the title -> abbreviation mapping and write it to disk as
# indented JSON.
json_datas = json.dumps(abbr_dict, indent=4)
with open("data/abbreviation_nlm.json", 'w') as f:
    f.write(json_datas)